From b7bfeaf2154f8e02d16d50c55261caa07799b326 Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 2 May 2023 11:44:51 +0700 Subject: [PATCH 1/8] add channel tags to mapping --- tubearchivist/home/src/es/index_mapping.json | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tubearchivist/home/src/es/index_mapping.json b/tubearchivist/home/src/es/index_mapping.json index c748204..34baf87 100644 --- a/tubearchivist/home/src/es/index_mapping.json +++ b/tubearchivist/home/src/es/index_mapping.json @@ -39,6 +39,16 @@ "channel_last_refresh": { "type": "date" }, + "channel_tags": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, "channel_overwrites": { "properties": { "download_format": { @@ -121,6 +131,16 @@ "channel_last_refresh": { "type": "date" }, + "channel_tags": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, "channel_overwrites": { "properties": { "download_format": { From 2b66786728691e1284f47433f3f79a8c65e2270b Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 2 May 2023 11:45:34 +0700 Subject: [PATCH 2/8] switch channel index to yt-dlp, index tags --- tubearchivist/home/src/index/channel.py | 225 ++++++------------ tubearchivist/home/src/index/generic.py | 15 +- tubearchivist/home/src/index/playlist.py | 1 - tubearchivist/home/src/index/video.py | 1 - .../home/templates/home/channel_id_about.html | 13 +- tubearchivist/requirements.txt | 1 - 6 files changed, 89 insertions(+), 167 deletions(-) diff --git a/tubearchivist/home/src/index/channel.py b/tubearchivist/home/src/index/channel.py index 84b1add..44a1c7a 100644 --- a/tubearchivist/home/src/index/channel.py +++ b/tubearchivist/home/src/index/channel.py @@ -6,158 +6,15 @@ functionality: import json import os -import re from datetime import datetime -import requests -from bs4 import BeautifulSoup from home.src.download import queue # 
partial import from home.src.download.thumbnails import ThumbManager from home.src.download.yt_dlp_base import YtWrap from home.src.es.connect import ElasticWrap, IndexPaginate from home.src.index.generic import YouTubeItem from home.src.index.playlist import YoutubePlaylist -from home.src.ta.helper import clean_string, requests_headers - - -class ChannelScraper: - """custom scraper using bs4 to scrape channel about page - will be able to be integrated into yt-dlp - once #2237 and #2350 are merged upstream - """ - - def __init__(self, channel_id): - self.channel_id = channel_id - self.soup = False - self.yt_json = False - self.json_data = False - - def get_json(self): - """main method to return channel dict""" - self.get_soup() - self._extract_yt_json() - if self._is_deactivated(): - return False - - self._parse_channel_main() - self._parse_channel_meta() - return self.json_data - - def get_soup(self): - """return soup from youtube""" - print(f"{self.channel_id}: scrape channel data from youtube") - url = f"https://www.youtube.com/channel/{self.channel_id}/about?hl=en" - cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"} - response = requests.get( - url, cookies=cookies, headers=requests_headers(), timeout=10 - ) - if response.ok: - channel_page = response.text - else: - print(f"{self.channel_id}: failed to extract channel info") - raise ConnectionError - self.soup = BeautifulSoup(channel_page, "html.parser") - - def _extract_yt_json(self): - """parse soup and get ytInitialData json""" - all_scripts = self.soup.find("body").find_all("script") - for script in all_scripts: - if "var ytInitialData = " in str(script): - script_content = str(script) - break - # extract payload - script_content = script_content.split("var ytInitialData = ")[1] - json_raw = script_content.rstrip(";") - self.yt_json = json.loads(json_raw) - - def _is_deactivated(self): - """check if channel is deactivated""" - alerts = self.yt_json.get("alerts") - if not alerts: - return False - - 
for alert in alerts: - alert_text = alert["alertRenderer"]["text"]["simpleText"] - print(f"{self.channel_id}: failed to extract, {alert_text}") - return True - - def _parse_channel_main(self): - """extract maintab values from scraped channel json data""" - main_tab = self.yt_json["header"]["c4TabbedHeaderRenderer"] - # build and return dict - self.json_data = { - "channel_active": True, - "channel_last_refresh": int(datetime.now().timestamp()), - "channel_subs": self._get_channel_subs(main_tab), - "channel_name": main_tab["title"], - "channel_banner_url": self._get_thumbnails(main_tab, "banner"), - "channel_tvart_url": self._get_thumbnails(main_tab, "tvBanner"), - "channel_id": self.channel_id, - "channel_subscribed": False, - } - - @staticmethod - def _get_thumbnails(main_tab, thumb_name): - """extract banner url from main_tab""" - try: - all_banners = main_tab[thumb_name]["thumbnails"] - banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"] - except KeyError: - banner = False - - return banner - - @staticmethod - def _get_channel_subs(main_tab): - """process main_tab to get channel subs as int""" - try: - sub_text_simple = main_tab["subscriberCountText"]["simpleText"] - sub_text = sub_text_simple.split(" ")[0] - if sub_text[-1] == "K": - channel_subs = int(float(sub_text.replace("K", "")) * 1000) - elif sub_text[-1] == "M": - channel_subs = int(float(sub_text.replace("M", "")) * 1000000) - elif int(sub_text) >= 0: - channel_subs = int(sub_text) - else: - message = f"{sub_text} not dealt with" - print(message) - except KeyError: - channel_subs = 0 - - return channel_subs - - def _parse_channel_meta(self): - """extract meta tab values from channel payload""" - # meta tab - meta_tab = self.yt_json["metadata"]["channelMetadataRenderer"] - all_thumbs = meta_tab["avatar"]["thumbnails"] - thumb_url = sorted(all_thumbs, key=lambda k: k["width"])[-1]["url"] - # stats tab - renderer = "twoColumnBrowseResultsRenderer" - all_tabs = 
self.yt_json["contents"][renderer]["tabs"] - for tab in all_tabs: - if "tabRenderer" in tab.keys(): - if tab["tabRenderer"]["title"] == "About": - about_tab = tab["tabRenderer"]["content"][ - "sectionListRenderer" - ]["contents"][0]["itemSectionRenderer"]["contents"][0][ - "channelAboutFullMetadataRenderer" - ] - break - try: - channel_views_text = about_tab["viewCountText"]["simpleText"] - channel_views = int(re.sub(r"\D", "", channel_views_text)) - except KeyError: - channel_views = 0 - - self.json_data.update( - { - "channel_description": meta_tab["description"], - "channel_thumb_url": thumb_url, - "channel_views": channel_views, - } - ) +from home.src.ta.helper import clean_string class YoutubeChannel(YouTubeItem): @@ -166,36 +23,93 @@ class YoutubeChannel(YouTubeItem): es_path = False index_name = "ta_channel" yt_base = "https://www.youtube.com/channel/" + yt_obs = { + "extract_flat": True, + "allow_playlist_files": True, + } def __init__(self, youtube_id, task=False): super().__init__(youtube_id) - self.es_path = f"{self.index_name}/_doc/{youtube_id}" self.all_playlists = False self.task = task + def build_yt_url(self): + """overwrite base to use channel about page""" + return f"{self.yt_base}{self.youtube_id}/about" + def build_json(self, upload=False, fallback=False): """get from es or from youtube""" self.get_from_es() if self.json_data: return - self.get_from_youtube(fallback) + self.get_from_youtube() + if not self.youtube_meta and fallback: + self._video_fallback(fallback) + else: + self._process_youtube_meta() + self.get_channel_art() if upload: self.upload_to_es() - return - def get_from_youtube(self, fallback=False): - """use bs4 to scrape channel about page""" - self.json_data = ChannelScraper(self.youtube_id).get_json() + def _process_youtube_meta(self): + """extract relevant fields""" + self.youtube_meta["thumbnails"].reverse() + self.json_data = { + "channel_active": True, + "channel_description": self.youtube_meta.get("description", False), + 
"channel_id": self.youtube_id, + "channel_last_refresh": int(datetime.now().timestamp()), + "channel_name": self.youtube_meta["uploader"], + "channel_subs": self.youtube_meta.get("channel_follower_count", 0), + "channel_subscribed": False, + "channel_tags": self._parse_tags(self.youtube_meta.get("tags")), + "channel_banner_url": self._get_banner_art(), + "channel_thumb_url": self._get_thumb_art(), + "channel_tvart_url": self._get_tv_art(), + "channel_views": self.youtube_meta.get("view_count", 0), + } - if not self.json_data and fallback: - self._video_fallback(fallback) + def _parse_tags(self, tags): + """parse channel tags""" + if not tags: + return False - if not self.json_data: - return + joined = " ".join(tags) + return [i.strip() for i in joined.split('"') if i and not i == " "] - self.get_channel_art() + def _get_thumb_art(self): + """extract thumb art""" + for i in self.youtube_meta["thumbnails"]: + if not i.get("width"): + continue + if i.get("width") == i.get("height"): + return i["url"] + + return False + + def _get_tv_art(self): + """extract tv artwork""" + for i in self.youtube_meta["thumbnails"]: + if i.get("id") == "avatar_uncropped": + return i["url"] + if not i.get("width"): + continue + if i["width"] // i["height"] < 2: + return i["url"] + + return False + + def _get_banner_art(self): + """extract banner artwork""" + for i in self.youtube_meta["thumbnails"]: + if not i.get("width"): + continue + if i["width"] // i["height"] > 5: + return i["url"] + + return False def _video_fallback(self, fallback): """use video metadata as fallback""" @@ -209,6 +123,7 @@ class YoutubeChannel(YouTubeItem): "channel_tvart_url": False, "channel_id": self.youtube_id, "channel_subscribed": False, + "channel_tags": False, "channel_description": False, "channel_thumb_url": False, "channel_views": 0, diff --git a/tubearchivist/home/src/index/generic.py b/tubearchivist/home/src/index/generic.py index 105f66b..921eb10 100644 --- a/tubearchivist/home/src/index/generic.py 
+++ b/tubearchivist/home/src/index/generic.py @@ -15,8 +15,8 @@ class YouTubeItem: """base class for youtube""" es_path = False - index_name = False - yt_base = False + index_name = "" + yt_base = "" yt_obs = { "skip_download": True, "noplaylist": True, @@ -24,18 +24,21 @@ class YouTubeItem: def __init__(self, youtube_id): self.youtube_id = youtube_id + self.es_path = f"{self.index_name}/_doc/{youtube_id}" self.config = AppConfig().config self.app_conf = self.config["application"] self.youtube_meta = False self.json_data = False + def build_yt_url(self): + """build youtube url""" + return self.yt_base + self.youtube_id + def get_from_youtube(self): """use yt-dlp to get meta data from youtube""" print(f"{self.youtube_id}: get metadata from youtube") - url = self.yt_base + self.youtube_id - response = YtWrap(self.yt_obs, self.config).extract(url) - - self.youtube_meta = response + url = self.build_yt_url() + self.youtube_meta = YtWrap(self.yt_obs, self.config).extract(url) def get_from_es(self): """get indexed data from elastic search""" diff --git a/tubearchivist/home/src/index/playlist.py b/tubearchivist/home/src/index/playlist.py index 375082e..656da4a 100644 --- a/tubearchivist/home/src/index/playlist.py +++ b/tubearchivist/home/src/index/playlist.py @@ -26,7 +26,6 @@ class YoutubePlaylist(YouTubeItem): def __init__(self, youtube_id): super().__init__(youtube_id) - self.es_path = f"{self.index_name}/_doc/{youtube_id}" self.all_members = False self.nav = False self.all_youtube_ids = [] diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index f3ade1c..b4daae2 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -138,7 +138,6 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): self.channel_id = False self.video_overwrites = video_overwrites self.video_type = video_type - self.es_path = f"{self.index_name}/_doc/{youtube_id}" self.offline_import = False def build_json(self, 
youtube_meta_overwrite=False, media_path=False): diff --git a/tubearchivist/home/templates/home/channel_id_about.html b/tubearchivist/home/templates/home/channel_id_about.html index dc38b7a..fb55e0d 100644 --- a/tubearchivist/home/templates/home/channel_id_about.html +++ b/tubearchivist/home/templates/home/channel_id_about.html @@ -81,11 +81,18 @@ {% endif %} -
-

Customize {{ channel_info.channel_name }}

-
+ {% if channel_info.channel_tags %} +
+
+ {% for tag in channel_info.channel_tags %} + {{ tag }} + {% endfor %} +
+
+ {% endif %}
+

Customize {{ channel_info.channel_name }}

{% csrf_token %}
diff --git a/tubearchivist/requirements.txt b/tubearchivist/requirements.txt index 54cad2c..35bc193 100644 --- a/tubearchivist/requirements.txt +++ b/tubearchivist/requirements.txt @@ -1,4 +1,3 @@ -beautifulsoup4==4.12.2 celery==5.2.7 Django==4.2 django-auth-ldap==4.3.0 From b95a65939601f87236db5eaa6795b0a3b31901f2 Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 2 May 2023 12:42:35 +0700 Subject: [PATCH 3/8] fix empty channel_subs parsing --- tubearchivist/home/src/index/channel.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tubearchivist/home/src/index/channel.py b/tubearchivist/home/src/index/channel.py index 44a1c7a..ec8a4db 100644 --- a/tubearchivist/home/src/index/channel.py +++ b/tubearchivist/home/src/index/channel.py @@ -56,13 +56,14 @@ class YoutubeChannel(YouTubeItem): def _process_youtube_meta(self): """extract relevant fields""" self.youtube_meta["thumbnails"].reverse() + channel_subs = self.youtube_meta.get("channel_follower_count") or 0 self.json_data = { "channel_active": True, "channel_description": self.youtube_meta.get("description", False), "channel_id": self.youtube_id, "channel_last_refresh": int(datetime.now().timestamp()), "channel_name": self.youtube_meta["uploader"], - "channel_subs": self.youtube_meta.get("channel_follower_count", 0), + "channel_subs": channel_subs, "channel_subscribed": False, "channel_tags": self._parse_tags(self.youtube_meta.get("tags")), "channel_banner_url": self._get_banner_art(), @@ -96,7 +97,7 @@ class YoutubeChannel(YouTubeItem): return i["url"] if not i.get("width"): continue - if i["width"] // i["height"] < 2: + if i["width"] // i["height"] < 2 and not i["width"] == i["height"]: return i["url"] return False @@ -164,6 +165,8 @@ class YoutubeChannel(YouTubeItem): # add ingest pipeline processors = [] for field, value in self.json_data.items(): + if not value: + continue line = {"set": {"field": "channel." 
+ field, "value": value}} processors.append(line) data = {"description": self.youtube_id, "processors": processors} From 170839362ef1146e2e7ca1c04502f487df2a27cf Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 4 May 2023 11:48:38 +0700 Subject: [PATCH 4/8] fix ignoring progress message when not initiated with task --- tubearchivist/home/src/es/backup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tubearchivist/home/src/es/backup.py b/tubearchivist/home/src/es/backup.py index 97092f0..df1a481 100644 --- a/tubearchivist/home/src/es/backup.py +++ b/tubearchivist/home/src/es/backup.py @@ -32,7 +32,8 @@ class ElasticBackup: if not self.reason: raise ValueError("missing backup reason in ElasticBackup") - self.task.send_progress(["Scanning your index."]) + if self.task: + self.task.send_progress(["Scanning your index."]) for index in self.index_config: index_name = index["index_name"] print(f"backup: export in progress for {index_name}") @@ -42,7 +43,8 @@ class ElasticBackup: self.backup_index(index_name) - self.task.send_progress(["Compress files to zip archive."]) + if self.task: + self.task.send_progress(["Compress files to zip archive."]) self.zip_it() if self.reason == "auto": self.rotate_backup() From e092a29b13953145449015b7a9e41a9308b57133 Mon Sep 17 00:00:00 2001 From: simon Date: Fri, 5 May 2023 19:20:51 +0700 Subject: [PATCH 5/8] bump libs --- tubearchivist/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tubearchivist/requirements.txt b/tubearchivist/requirements.txt index 35bc193..8b26b8f 100644 --- a/tubearchivist/requirements.txt +++ b/tubearchivist/requirements.txt @@ -1,11 +1,11 @@ celery==5.2.7 -Django==4.2 +Django==4.2.1 django-auth-ldap==4.3.0 django-cors-headers==3.14.0 djangorestframework==3.14.0 Pillow==9.5.0 redis==4.5.4 -requests==2.29.0 +requests==2.30.0 ryd-client==0.0.6 uWSGI==2.0.21 whitenoise==6.4.0 From b47687535a413720f86e3638bf5f13675fa7829c Mon Sep 17 00:00:00 2001 From: 
simon Date: Fri, 5 May 2023 19:21:09 +0700 Subject: [PATCH 6/8] undo faulty channel sync to videos --- tubearchivist/home/src/index/channel.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tubearchivist/home/src/index/channel.py b/tubearchivist/home/src/index/channel.py index ec8a4db..eef5cdb 100644 --- a/tubearchivist/home/src/index/channel.py +++ b/tubearchivist/home/src/index/channel.py @@ -165,8 +165,6 @@ class YoutubeChannel(YouTubeItem): # add ingest pipeline processors = [] for field, value in self.json_data.items(): - if not value: - continue line = {"set": {"field": "channel." + field, "value": value}} processors.append(line) data = {"description": self.youtube_id, "processors": processors} From 0d2d3353a900d27331a4de613c4149af385caaf4 Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 7 May 2023 09:47:03 +0700 Subject: [PATCH 7/8] add extractor lang, #316 --- tubearchivist/home/config.json | 1 + tubearchivist/home/src/frontend/forms.py | 1 + tubearchivist/home/src/index/generic.py | 8 +++++++- tubearchivist/home/templates/home/settings.html | 5 +++++ 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tubearchivist/home/config.json b/tubearchivist/home/config.json index 5c91fd1..2a30ced 100644 --- a/tubearchivist/home/config.json +++ b/tubearchivist/home/config.json @@ -33,6 +33,7 @@ "comment_sort": "top", "cookie_import": false, "throttledratelimit": false, + "extractor_lang": false, "integrate_ryd": false, "integrate_sponsorblock": false }, diff --git a/tubearchivist/home/src/frontend/forms.py b/tubearchivist/home/src/frontend/forms.py index 19b3058..edde00c 100644 --- a/tubearchivist/home/src/frontend/forms.py +++ b/tubearchivist/home/src/frontend/forms.py @@ -122,6 +122,7 @@ class ApplicationSettingsForm(forms.Form): downloads_autodelete_days = forms.IntegerField(required=False) downloads_format = forms.CharField(required=False) downloads_format_sort = forms.CharField(required=False) + downloads_extractor_lang = 
forms.CharField(required=False) downloads_add_metadata = forms.ChoiceField( widget=forms.Select, choices=METADATA_CHOICES, required=False ) diff --git a/tubearchivist/home/src/index/generic.py b/tubearchivist/home/src/index/generic.py index 921eb10..6e82e54 100644 --- a/tubearchivist/home/src/index/generic.py +++ b/tubearchivist/home/src/index/generic.py @@ -37,8 +37,14 @@ class YouTubeItem: def get_from_youtube(self): """use yt-dlp to get meta data from youtube""" print(f"{self.youtube_id}: get metadata from youtube") + obs_request = self.yt_obs.copy() + if self.config["downloads"]["extractor_lang"]: + langs = self.config["downloads"]["extractor_lang"] + langs_list = [i.strip() for i in langs.split(",")] + obs_request["extractor_args"] = {"youtube": {"lang": langs_list}} + url = self.build_yt_url() - self.youtube_meta = YtWrap(self.yt_obs, self.config).extract(url) + self.youtube_meta = YtWrap(obs_request, self.config).extract(url) def get_from_es(self): """get indexed data from elastic search""" diff --git a/tubearchivist/home/templates/home/settings.html b/tubearchivist/home/templates/home/settings.html index e739fde..25228dd 100644 --- a/tubearchivist/home/templates/home/settings.html +++ b/tubearchivist/home/templates/home/settings.html @@ -108,6 +108,11 @@ {{ app_form.downloads_format_sort }}
+
+

Prefer translated metadata language: {{ config.downloads.extractor_lang }}

+ This will change the language videos get indexed in. This is only available if the uploader provides translations. Add as a two-letter ISO language code; check the documentation for which languages are available.
+ {{ app_form.downloads_extractor_lang}} +

Current metadata embed setting: {{ config.downloads.add_metadata }}

Metadata is not embedded into the downloaded files by default.
From 206921baf0b5521786783a927b0f28103f10f51b Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 7 May 2023 09:58:07 +0700 Subject: [PATCH 8/8] add unstable footer --- tubearchivist/config/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tubearchivist/config/settings.py b/tubearchivist/config/settings.py index 1c31326..e50f050 100644 --- a/tubearchivist/config/settings.py +++ b/tubearchivist/config/settings.py @@ -256,4 +256,4 @@ CORS_ALLOW_HEADERS = list(default_headers) + [ # TA application settings TA_UPSTREAM = "https://github.com/tubearchivist/tubearchivist" -TA_VERSION = "v0.3.5" +TA_VERSION = "v0.3.6-unstable"