From 160676acc17d3217fa46daf273b9caebe5046dd1 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 27 Jan 2022 21:56:48 +0700 Subject: [PATCH 01/11] add mapping for channel_tvart_url --- tubearchivist/home/src/es/index_mapping.json | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tubearchivist/home/src/es/index_mapping.json b/tubearchivist/home/src/es/index_mapping.json index db413fb..3272ed3 100644 --- a/tubearchivist/home/src/es/index_mapping.json +++ b/tubearchivist/home/src/es/index_mapping.json @@ -25,6 +25,10 @@ "type": "keyword", "index": false }, + "channel_tvart_url": { + "type": "keyword", + "index": false + }, "channel_thumb_url": { "type": "keyword", "index": false @@ -84,6 +88,10 @@ "type": "keyword", "index": false }, + "channel_tvart_url": { + "type": "keyword", + "index": false + }, "channel_thumb_url": { "type": "keyword", "index": false From 0749202d5df6ad3cd6699b170949449d2c264409 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 27 Jan 2022 22:50:02 +0700 Subject: [PATCH 02/11] add fallback for none ascii channel names, #127 #146 --- tubearchivist/home/src/download/yt_dlp_handler.py | 3 +++ tubearchivist/home/src/index/video.py | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/tubearchivist/home/src/download/yt_dlp_handler.py b/tubearchivist/home/src/download/yt_dlp_handler.py index 6d17a33..be71049 100644 --- a/tubearchivist/home/src/download/yt_dlp_handler.py +++ b/tubearchivist/home/src/download/yt_dlp_handler.py @@ -212,6 +212,9 @@ class VideoDownloader: host_uid = self.config["application"]["HOST_UID"] host_gid = self.config["application"]["HOST_GID"] channel_name = clean_string(vid_dict["channel"]["channel_name"]) + if len(channel_name) <= 3: + # fall back to channel id + channel_name = vid_dict["channel"]["channel_id"] # make archive folder with correct permissions new_folder = os.path.join(videos, channel_name) if not os.path.exists(new_folder): diff --git a/tubearchivist/home/src/index/video.py 
b/tubearchivist/home/src/index/video.py index 240a1b3..00db3f6 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -125,6 +125,10 @@ class YoutubeVideo(YouTubeItem): """build media_url for where file will be located""" channel_name = self.json_data["channel"]["channel_name"] clean_channel_name = clean_string(channel_name) + if len(clean_channel_name) <= 3: + # fall back to channel id + clean_channel_name = self.json_data["channel"]["channel_id"] + timestamp = self.json_data["published"].replace("-", "") youtube_id = self.json_data["youtube_id"] title = self.json_data["title"] From 8e860d4f016b2cc05ce49c4f81bbe59a7bceda7a Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 27 Jan 2022 23:39:07 +0700 Subject: [PATCH 03/11] fix last page error for more than 10k results, #156 --- tubearchivist/home/src/index/generic.py | 6 ++++++ tubearchivist/home/templates/home/base.html | 16 ++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/tubearchivist/home/src/index/generic.py b/tubearchivist/home/src/index/generic.py index af96abf..054a882 100644 --- a/tubearchivist/home/src/index/generic.py +++ b/tubearchivist/home/src/index/generic.py @@ -122,6 +122,7 @@ class Pagination: "page_from": page_from, "prev_pages": prev_pages, "current_page": page_get, + "max_hits": False, } if self.search_get: pagination.update({"search_get": self.search_get}) @@ -131,6 +132,11 @@ class Pagination: """validate pagination with total_hits after making api call""" page_get = self.page_get max_pages = math.ceil(total_hits / self.page_size) + if total_hits > 10000: + # es returns maximal 10000 results + self.pagination["max_hits"] = True + max_pages = max_pages - 1 + if page_get < max_pages and max_pages > 1: self.pagination["last_page"] = max_pages else: diff --git a/tubearchivist/home/templates/home/base.html b/tubearchivist/home/templates/home/base.html index 3bd4899..da47e17 100644 --- 
a/tubearchivist/home/templates/home/base.html +++ b/tubearchivist/home/templates/home/base.html @@ -109,9 +109,21 @@ {% endif %} {% if pagination.last_page > 0 %} {% if pagination.search_get %} - Last ({{ pagination.last_page }}) + + {% if pagination.max_hits %} + Max ({{ pagination.last_page }}) + {% else %} + Last ({{ pagination.last_page }}) + {% endif %} + {% else %} - Last ({{ pagination.last_page }}) + + {% if pagination.max_hits %} + Max ({{ pagination.last_page }}) + {% else %} + Last ({{ pagination.last_page }}) + {% endif %} + {% endif %} {% endif %} {% endif %} From e5e83287ab52dc6dc41ed681bfe89ab5c30f59c3 Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 30 Jan 2022 08:33:10 +0700 Subject: [PATCH 04/11] fix channel delete for channel_id fallback folders --- tubearchivist/home/src/index/channel.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tubearchivist/home/src/index/channel.py b/tubearchivist/home/src/index/channel.py index 50a0696..57a1c4e 100644 --- a/tubearchivist/home/src/index/channel.py +++ b/tubearchivist/home/src/index/channel.py @@ -198,6 +198,9 @@ class YoutubeChannel(YouTubeItem): """get folder where media files get stored""" channel_name = self.json_data["channel_name"] folder_name = clean_string(channel_name) + if len(folder_name) <= 3: + # fall back to channel id + folder_name = self.json_data["channel_id"] folder_path = os.path.join(self.app_conf["videos"], folder_name) return folder_path From 365a2bf59fba337363b40ea5ccddbc7721225d93 Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 30 Jan 2022 20:10:29 +0700 Subject: [PATCH 05/11] add frontend subtitle dl settings --- tubearchivist/home/config.json | 2 ++ tubearchivist/home/src/frontend/forms.py | 10 ++++++++++ tubearchivist/home/templates/home/settings.html | 11 +++++++++++ 3 files changed, 23 insertions(+) diff --git a/tubearchivist/home/config.json b/tubearchivist/home/config.json index 82c76f4..8c4249a 100644 --- a/tubearchivist/home/config.json +++ 
b/tubearchivist/home/config.json @@ -23,6 +23,8 @@ "format": false, "add_metadata": false, "add_thumbnail": false, + "subtitle": false, + "subtitle_source": false, "throttledratelimit": false, "integrate_ryd": false }, diff --git a/tubearchivist/home/src/frontend/forms.py b/tubearchivist/home/src/frontend/forms.py index 86b1884..77648cf 100644 --- a/tubearchivist/home/src/frontend/forms.py +++ b/tubearchivist/home/src/frontend/forms.py @@ -68,6 +68,12 @@ class ApplicationSettingsForm(forms.Form): ("1", "enable Cast"), ] + SUBTITLE_SOURCE_CHOICES = [ + ("", "-- change subtitle source settings"), + ("auto", "also download auto generated"), + ("user", "only download uploader"), + ] + subscriptions_channel_size = forms.IntegerField(required=False) downloads_limit_count = forms.IntegerField(required=False) downloads_limit_speed = forms.IntegerField(required=False) @@ -81,6 +87,10 @@ class ApplicationSettingsForm(forms.Form): downloads_add_thumbnail = forms.ChoiceField( widget=forms.Select, choices=THUMBNAIL_CHOICES, required=False ) + downloads_subtitle = forms.CharField(required=False) + downloads_subtitle_source = forms.ChoiceField( + widget=forms.Select, choices=SUBTITLE_SOURCE_CHOICES, required=False + ) downloads_integrate_ryd = forms.ChoiceField( widget=forms.Select, choices=RYD_CHOICES, required=False ) diff --git a/tubearchivist/home/templates/home/settings.html b/tubearchivist/home/templates/home/settings.html index 0716d3e..3b9ffe9 100644 --- a/tubearchivist/home/templates/home/settings.html +++ b/tubearchivist/home/templates/home/settings.html @@ -94,6 +94,17 @@ Embed thumbnail into the mediafile.
{{ app_form.downloads_add_thumbnail }} +
+

Subtitles download setting: {{ config.downloads.subtitle }}
+ Choose which subtitles to download: add comma-separated two-letter ISO language codes,
+ e.g. en, de

+ {{ app_form.downloads_subtitle }}

+
+
+

Subtitle source settings: {{ config.downloads.subtitle_source }}

+ Download only user-generated subtitles, or also the less accurate auto-generated ones.
+ {{ app_form.downloads_subtitle_source }} +

Integrations

From a82e78f8bffc0d07706307913b7c9c36df847ef1 Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 30 Jan 2022 23:57:58 +0700 Subject: [PATCH 06/11] index selected subtitles if available --- tubearchivist/home/src/index/video.py | 77 +++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index 00db3f6..25c6b98 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -14,6 +14,75 @@ from home.src.ta.helper import DurationConverter, clean_string from ryd_client import ryd_client +class YoutubeSubtitle(YouTubeItem): + """handle video subtitle functionality""" + + def __init__(self, youtube_meta, config): + self.youtube_meta = youtube_meta + self.youtube_id = youtube_meta["id"] + self.config = config + self.languages = self.get_lang_list() + self.source = self.config["downloads"]["subtitle_source"] + + def get_lang_list(self): + """return desired languages list""" + languages_raw = self.config["downloads"]["subtitle"] + languages = [i.strip() for i in languages_raw.split(",")] + return languages + + def get_subtitles(self): + """check what to do""" + if not self.languages: + # no subtitles + return False + + relevant_subtitles = self.get_user_subtitles() + if relevant_subtitles: + return relevant_subtitles + + if self.source == "auto": + relevant_auto = self.get_auto_caption() + return relevant_auto + + def get_auto_caption(self): + """get auto_caption subtitles""" + print(f"{self.youtube_id}: get auto generated subtitles") + all_subtitles = self.youtube_meta.get("automatic_captions") + + if not all_subtitles: + return False + + relevant_subtitles = [] + + for language in self.languages: + all_formats = all_subtitles.get(language) + subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] + subtitle.update({"lang": language, "source": "auto"}) + relevant_subtitles.append(subtitle) + break + + return relevant_subtitles + + def 
get_user_subtitles(self): + """get subtitles uploaded from channel owner""" + print(f"{self.youtube_id}: get user uploaded subtitles") + all_subtitles = self.youtube_meta.get("subtitles") + + if not all_subtitles: + return False + + relevant_subtitles = [] + + for language in self.languages: + all_formats = all_subtitles.get(language) + subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] + subtitle.update({"lang": language, "source": "user"}) + relevant_subtitles.append(subtitle) + break + + return relevant_subtitles + + class YoutubeVideo(YouTubeItem): """represents a single youtube video""" @@ -37,6 +106,7 @@ class YoutubeVideo(YouTubeItem): self._add_stats() self.add_file_path() self.add_player() + self._check_subtitles() if self.config["downloads"]["integrate_ryd"]: self._get_ryd_stats() @@ -167,6 +237,13 @@ class YoutubeVideo(YouTubeItem): return True + def _check_subtitles(self): + """optionally add subtitles""" + handler = YoutubeSubtitle(self.youtube_meta, self.config) + subtitles = handler.get_subtitles() + if subtitles: + self.json_data["subtitles"] = subtitles + def index_new_video(youtube_id): """combined classes to create new video in index""" From ad2647c4bacdec5a85d55f6b21fe5de525bba8d0 Mon Sep 17 00:00:00 2001 From: simon Date: Fri, 4 Feb 2022 17:13:30 +0700 Subject: [PATCH 07/11] upgrade libs --- tubearchivist/requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tubearchivist/requirements.txt b/tubearchivist/requirements.txt index d08678e..815d2c0 100644 --- a/tubearchivist/requirements.txt +++ b/tubearchivist/requirements.txt @@ -1,12 +1,12 @@ beautifulsoup4==4.10.0 celery==5.2.3 -Django==4.0.1 +Django==4.0.2 django-cors-headers==3.11.0 djangorestframework==3.13.1 -Pillow==9.0.0 -redis==4.1.1 +Pillow==9.0.1 +redis==4.1.2 requests==2.27.1 ryd-client==0.0.3 uWSGI==2.0.20 whitenoise==5.3.0 -yt_dlp==2022.1.21 +yt_dlp==2022.2.4 From 7aaf140ccb78f81097f339b8271defb0e29dd216 Mon Sep 17 00:00:00 2001 From: 
simon Date: Fri, 4 Feb 2022 17:14:00 +0700 Subject: [PATCH 08/11] index subtitle url to video --- tubearchivist/home/src/index/video.py | 51 +++++++++++++++++---------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index 25c6b98..b88196a 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -14,24 +14,24 @@ from home.src.ta.helper import DurationConverter, clean_string from ryd_client import ryd_client -class YoutubeSubtitle(YouTubeItem): +class YoutubeSubtitle: """handle video subtitle functionality""" - def __init__(self, youtube_meta, config): - self.youtube_meta = youtube_meta - self.youtube_id = youtube_meta["id"] + def __init__(self, config, youtube_meta, media_url, youtube_id): self.config = config - self.languages = self.get_lang_list() - self.source = self.config["downloads"]["subtitle_source"] + self.youtube_meta = youtube_meta + self.media_url = media_url + self.youtube_id = youtube_id + self.languages = False - def get_lang_list(self): - """return desired languages list""" + def sub_conf_parse(self): + """add additional conf values to self""" languages_raw = self.config["downloads"]["subtitle"] - languages = [i.strip() for i in languages_raw.split(",")] - return languages + self.languages = [i.strip() for i in languages_raw.split(",")] def get_subtitles(self): """check what to do""" + self.sub_conf_parse() if not self.languages: # no subtitles return False @@ -40,10 +40,12 @@ class YoutubeSubtitle(YouTubeItem): if relevant_subtitles: return relevant_subtitles - if self.source == "auto": + if self.config["downloads"]["subtitle_source"] == "auto": relevant_auto = self.get_auto_caption() return relevant_auto + return False + def get_auto_caption(self): """get auto_caption subtitles""" print(f"{self.youtube_id}: get auto generated subtitles") @@ -54,10 +56,13 @@ class YoutubeSubtitle(YouTubeItem): relevant_subtitles = [] 
- for language in self.languages: - all_formats = all_subtitles.get(language) + for lang in self.languages: + media_url = self.media_url.replace(".mp4", f"-{lang}.vtt") + all_formats = all_subtitles.get(lang) subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] - subtitle.update({"lang": language, "source": "auto"}) + subtitle.update( + {"lang": lang, "source": "auto", "media_url": media_url} + ) relevant_subtitles.append(subtitle) break @@ -73,17 +78,20 @@ class YoutubeSubtitle(YouTubeItem): relevant_subtitles = [] - for language in self.languages: - all_formats = all_subtitles.get(language) + for lang in self.languages: + media_url = self.media_url.replace(".mp4", f"-{lang}.vtt") + all_formats = all_subtitles.get(lang) subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] - subtitle.update({"lang": language, "source": "user"}) + subtitle.update( + {"lang": lang, "source": "user", "media_url": media_url} + ) relevant_subtitles.append(subtitle) break return relevant_subtitles -class YoutubeVideo(YouTubeItem): +class YoutubeVideo(YouTubeItem, YoutubeSubtitle): """represents a single youtube video""" es_path = False @@ -239,7 +247,12 @@ class YoutubeVideo(YouTubeItem): def _check_subtitles(self): """optionally add subtitles""" - handler = YoutubeSubtitle(self.youtube_meta, self.config) + handler = YoutubeSubtitle( + self.config, + self.youtube_meta, + media_url=self.json_data["media_url"], + youtube_id=self.youtube_id, + ) subtitles = handler.get_subtitles() if subtitles: self.json_data["subtitles"] = subtitles From 3ef35a9d531106b2d7cf11120e28e9c6c8bf2d3e Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 5 Feb 2022 09:30:34 +0700 Subject: [PATCH 09/11] raise FileNotFoundError to catch for reindex --- tubearchivist/home/src/index/video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index b88196a..612af90 100644 --- a/tubearchivist/home/src/index/video.py +++ 
b/tubearchivist/home/src/index/video.py @@ -174,7 +174,7 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): vid_path = os.path.join(cache_path, file_cached) return vid_path - return False + raise FileNotFoundError def add_player(self): """add player information for new videos""" From ac531affb56a9bcc12da04018212f45ce4e6c3f4 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 5 Feb 2022 16:07:11 +0700 Subject: [PATCH 10/11] standardize country specific subtitle language codes --- tubearchivist/home/src/index/video.py | 28 +++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index 612af90..5b92198 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -68,11 +68,21 @@ class YoutubeSubtitle: return relevant_subtitles + def _normalize_lang(self): + """normalize country specific language keys""" + all_subtitles = self.youtube_meta.get("subtitles") + all_keys = list(all_subtitles.keys()) + for key in all_keys: + lang = key.split("-")[0] + old = all_subtitles.pop(key) + all_subtitles[lang] = old + + return all_subtitles + def get_user_subtitles(self): """get subtitles uploaded from channel owner""" print(f"{self.youtube_id}: get user uploaded subtitles") - all_subtitles = self.youtube_meta.get("subtitles") - + all_subtitles = self._normalize_lang() if not all_subtitles: return False @@ -90,6 +100,19 @@ class YoutubeSubtitle: return relevant_subtitles + def download_subtitles(self, relevant_subtitles): + """download subtitle files to archive""" + for subtitle in relevant_subtitles: + dest_path = os.path.join( + self.config["application"]["videos"], subtitle["media_url"] + ) + response = requests.get(subtitle["url"]) + if response.ok: + with open(dest_path, "w", encoding="utf-8") as subfile: + subfile.write(response.text) + else: + print(f"{self.youtube_id}: failed to download subtitle") + class YoutubeVideo(YouTubeItem, 
YoutubeSubtitle): """represents a single youtube video""" @@ -256,6 +279,7 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): subtitles = handler.get_subtitles() if subtitles: self.json_data["subtitles"] = subtitles + handler.download_subtitles(relevant_subtitles=subtitles) def index_new_video(youtube_id): From 8fe00e2152103ddf0724f5627baab1e80bda3a08 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 5 Feb 2022 17:46:14 +0700 Subject: [PATCH 11/11] process api return values for frontend use --- tubearchivist/api/views.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tubearchivist/api/views.py b/tubearchivist/api/views.py index 7ab8b47..f0923aa 100644 --- a/tubearchivist/api/views.py +++ b/tubearchivist/api/views.py @@ -26,13 +26,13 @@ class ApiBaseView(APIView): self.response = {"data": False} self.status_code = False self.context = False + self.default_conf = AppConfig().config def config_builder(self): """build confic context""" - default_conf = AppConfig().config self.context = { - "es_url": default_conf["application"]["es_url"], - "es_auth": default_conf["application"]["es_auth"], + "es_url": self.default_conf["application"]["es_url"], + "es_auth": self.default_conf["application"]["es_auth"], } def get_document(self, document_id): @@ -48,6 +48,19 @@ class ApiBaseView(APIView): self.response["data"] = False self.status_code = response.status_code + def process_keys(self): + """process keys for frontend""" + all_keys = self.response["data"].keys() + if "media_url" in all_keys: + media_url = self.response["data"]["media_url"] + self.response["data"]["media_url"] = f"/media/{media_url}" + if "vid_thumb_url" in all_keys: + youtube_id = self.response["data"]["youtube_id"] + vid_thumb_url = ThumbManager().vid_thumb_path(youtube_id) + cache_dir = self.default_conf["application"]["cache_dir"] + new_thumb = f"{cache_dir}/{vid_thumb_url}" + self.response["data"]["vid_thumb_url"] = new_thumb + def get_paginate(self): """add 
pagination detail to response""" self.response["paginate"] = False @@ -75,6 +88,7 @@ class VideoApiView(ApiBaseView): """get request""" self.config_builder() self.get_document(video_id) + self.process_keys() return Response(self.response, status=self.status_code)