diff --git a/tubearchivist/config/settings.py b/tubearchivist/config/settings.py index 1c31326..e50f050 100644 --- a/tubearchivist/config/settings.py +++ b/tubearchivist/config/settings.py @@ -256,4 +256,4 @@ CORS_ALLOW_HEADERS = list(default_headers) + [ # TA application settings TA_UPSTREAM = "https://github.com/tubearchivist/tubearchivist" -TA_VERSION = "v0.3.5" +TA_VERSION = "v0.3.6-unstable" diff --git a/tubearchivist/home/config.json b/tubearchivist/home/config.json index 5c91fd1..2a30ced 100644 --- a/tubearchivist/home/config.json +++ b/tubearchivist/home/config.json @@ -33,6 +33,7 @@ "comment_sort": "top", "cookie_import": false, "throttledratelimit": false, + "extractor_lang": false, "integrate_ryd": false, "integrate_sponsorblock": false }, diff --git a/tubearchivist/home/src/es/backup.py b/tubearchivist/home/src/es/backup.py index 97092f0..df1a481 100644 --- a/tubearchivist/home/src/es/backup.py +++ b/tubearchivist/home/src/es/backup.py @@ -32,7 +32,8 @@ class ElasticBackup: if not self.reason: raise ValueError("missing backup reason in ElasticBackup") - self.task.send_progress(["Scanning your index."]) + if self.task: + self.task.send_progress(["Scanning your index."]) for index in self.index_config: index_name = index["index_name"] print(f"backup: export in progress for {index_name}") @@ -42,7 +43,8 @@ class ElasticBackup: self.backup_index(index_name) - self.task.send_progress(["Compress files to zip archive."]) + if self.task: + self.task.send_progress(["Compress files to zip archive."]) self.zip_it() if self.reason == "auto": self.rotate_backup() diff --git a/tubearchivist/home/src/es/index_mapping.json b/tubearchivist/home/src/es/index_mapping.json index c748204..34baf87 100644 --- a/tubearchivist/home/src/es/index_mapping.json +++ b/tubearchivist/home/src/es/index_mapping.json @@ -39,6 +39,16 @@ "channel_last_refresh": { "type": "date" }, + "channel_tags": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": 
"keyword", + "ignore_above": 256 + } + } + }, "channel_overwrites": { "properties": { "download_format": { @@ -121,6 +131,16 @@ "channel_last_refresh": { "type": "date" }, + "channel_tags": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, "channel_overwrites": { "properties": { "download_format": { diff --git a/tubearchivist/home/src/frontend/forms.py b/tubearchivist/home/src/frontend/forms.py index 19b3058..edde00c 100644 --- a/tubearchivist/home/src/frontend/forms.py +++ b/tubearchivist/home/src/frontend/forms.py @@ -122,6 +122,7 @@ class ApplicationSettingsForm(forms.Form): downloads_autodelete_days = forms.IntegerField(required=False) downloads_format = forms.CharField(required=False) downloads_format_sort = forms.CharField(required=False) + downloads_extractor_lang = forms.CharField(required=False) downloads_add_metadata = forms.ChoiceField( widget=forms.Select, choices=METADATA_CHOICES, required=False ) diff --git a/tubearchivist/home/src/index/channel.py b/tubearchivist/home/src/index/channel.py index 84b1add..eef5cdb 100644 --- a/tubearchivist/home/src/index/channel.py +++ b/tubearchivist/home/src/index/channel.py @@ -6,158 +6,15 @@ functionality: import json import os -import re from datetime import datetime -import requests -from bs4 import BeautifulSoup from home.src.download import queue # partial import from home.src.download.thumbnails import ThumbManager from home.src.download.yt_dlp_base import YtWrap from home.src.es.connect import ElasticWrap, IndexPaginate from home.src.index.generic import YouTubeItem from home.src.index.playlist import YoutubePlaylist -from home.src.ta.helper import clean_string, requests_headers - - -class ChannelScraper: - """custom scraper using bs4 to scrape channel about page - will be able to be integrated into yt-dlp - once #2237 and #2350 are merged upstream - """ - - def __init__(self, channel_id): - self.channel_id = channel_id - 
self.soup = False - self.yt_json = False - self.json_data = False - - def get_json(self): - """main method to return channel dict""" - self.get_soup() - self._extract_yt_json() - if self._is_deactivated(): - return False - - self._parse_channel_main() - self._parse_channel_meta() - return self.json_data - - def get_soup(self): - """return soup from youtube""" - print(f"{self.channel_id}: scrape channel data from youtube") - url = f"https://www.youtube.com/channel/{self.channel_id}/about?hl=en" - cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"} - response = requests.get( - url, cookies=cookies, headers=requests_headers(), timeout=10 - ) - if response.ok: - channel_page = response.text - else: - print(f"{self.channel_id}: failed to extract channel info") - raise ConnectionError - self.soup = BeautifulSoup(channel_page, "html.parser") - - def _extract_yt_json(self): - """parse soup and get ytInitialData json""" - all_scripts = self.soup.find("body").find_all("script") - for script in all_scripts: - if "var ytInitialData = " in str(script): - script_content = str(script) - break - # extract payload - script_content = script_content.split("var ytInitialData = ")[1] - json_raw = script_content.rstrip(";") - self.yt_json = json.loads(json_raw) - - def _is_deactivated(self): - """check if channel is deactivated""" - alerts = self.yt_json.get("alerts") - if not alerts: - return False - - for alert in alerts: - alert_text = alert["alertRenderer"]["text"]["simpleText"] - print(f"{self.channel_id}: failed to extract, {alert_text}") - return True - - def _parse_channel_main(self): - """extract maintab values from scraped channel json data""" - main_tab = self.yt_json["header"]["c4TabbedHeaderRenderer"] - # build and return dict - self.json_data = { - "channel_active": True, - "channel_last_refresh": int(datetime.now().timestamp()), - "channel_subs": self._get_channel_subs(main_tab), - "channel_name": main_tab["title"], - "channel_banner_url": 
self._get_thumbnails(main_tab, "banner"), - "channel_tvart_url": self._get_thumbnails(main_tab, "tvBanner"), - "channel_id": self.channel_id, - "channel_subscribed": False, - } - - @staticmethod - def _get_thumbnails(main_tab, thumb_name): - """extract banner url from main_tab""" - try: - all_banners = main_tab[thumb_name]["thumbnails"] - banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"] - except KeyError: - banner = False - - return banner - - @staticmethod - def _get_channel_subs(main_tab): - """process main_tab to get channel subs as int""" - try: - sub_text_simple = main_tab["subscriberCountText"]["simpleText"] - sub_text = sub_text_simple.split(" ")[0] - if sub_text[-1] == "K": - channel_subs = int(float(sub_text.replace("K", "")) * 1000) - elif sub_text[-1] == "M": - channel_subs = int(float(sub_text.replace("M", "")) * 1000000) - elif int(sub_text) >= 0: - channel_subs = int(sub_text) - else: - message = f"{sub_text} not dealt with" - print(message) - except KeyError: - channel_subs = 0 - - return channel_subs - - def _parse_channel_meta(self): - """extract meta tab values from channel payload""" - # meta tab - meta_tab = self.yt_json["metadata"]["channelMetadataRenderer"] - all_thumbs = meta_tab["avatar"]["thumbnails"] - thumb_url = sorted(all_thumbs, key=lambda k: k["width"])[-1]["url"] - # stats tab - renderer = "twoColumnBrowseResultsRenderer" - all_tabs = self.yt_json["contents"][renderer]["tabs"] - for tab in all_tabs: - if "tabRenderer" in tab.keys(): - if tab["tabRenderer"]["title"] == "About": - about_tab = tab["tabRenderer"]["content"][ - "sectionListRenderer" - ]["contents"][0]["itemSectionRenderer"]["contents"][0][ - "channelAboutFullMetadataRenderer" - ] - break - try: - channel_views_text = about_tab["viewCountText"]["simpleText"] - channel_views = int(re.sub(r"\D", "", channel_views_text)) - except KeyError: - channel_views = 0 - - self.json_data.update( - { - "channel_description": meta_tab["description"], - 
"channel_thumb_url": thumb_url, - "channel_views": channel_views, - } - ) +from home.src.ta.helper import clean_string class YoutubeChannel(YouTubeItem): @@ -166,36 +23,94 @@ class YoutubeChannel(YouTubeItem): es_path = False index_name = "ta_channel" yt_base = "https://www.youtube.com/channel/" + yt_obs = { + "extract_flat": True, + "allow_playlist_files": True, + } def __init__(self, youtube_id, task=False): super().__init__(youtube_id) - self.es_path = f"{self.index_name}/_doc/{youtube_id}" self.all_playlists = False self.task = task + def build_yt_url(self): + """overwrite base to use channel about page""" + return f"{self.yt_base}{self.youtube_id}/about" + def build_json(self, upload=False, fallback=False): """get from es or from youtube""" self.get_from_es() if self.json_data: return - self.get_from_youtube(fallback) + self.get_from_youtube() + if not self.youtube_meta and fallback: + self._video_fallback(fallback) + else: + self._process_youtube_meta() + self.get_channel_art() if upload: self.upload_to_es() - return - def get_from_youtube(self, fallback=False): - """use bs4 to scrape channel about page""" - self.json_data = ChannelScraper(self.youtube_id).get_json() + def _process_youtube_meta(self): + """extract relevant fields""" + self.youtube_meta["thumbnails"].reverse() + channel_subs = self.youtube_meta.get("channel_follower_count") or 0 + self.json_data = { + "channel_active": True, + "channel_description": self.youtube_meta.get("description", False), + "channel_id": self.youtube_id, + "channel_last_refresh": int(datetime.now().timestamp()), + "channel_name": self.youtube_meta["uploader"], + "channel_subs": channel_subs, + "channel_subscribed": False, + "channel_tags": self._parse_tags(self.youtube_meta.get("tags")), + "channel_banner_url": self._get_banner_art(), + "channel_thumb_url": self._get_thumb_art(), + "channel_tvart_url": self._get_tv_art(), + "channel_views": self.youtube_meta.get("view_count", 0), + } - if not self.json_data and fallback: 
- self._video_fallback(fallback) + def _parse_tags(self, tags): + """parse channel tags""" + if not tags: + return False - if not self.json_data: - return + joined = " ".join(tags) + return [i.strip() for i in joined.split('"') if i and not i == " "] - self.get_channel_art() + def _get_thumb_art(self): + """extract thumb art""" + for i in self.youtube_meta["thumbnails"]: + if not i.get("width"): + continue + if i.get("width") == i.get("height"): + return i["url"] + + return False + + def _get_tv_art(self): + """extract tv artwork""" + for i in self.youtube_meta["thumbnails"]: + if i.get("id") == "avatar_uncropped": + return i["url"] + if not i.get("width"): + continue + if i["width"] // i["height"] < 2 and not i["width"] == i["height"]: + return i["url"] + + return False + + def _get_banner_art(self): + """extract banner artwork""" + for i in self.youtube_meta["thumbnails"]: + if not i.get("width"): + continue + if i["width"] // i["height"] > 5: + return i["url"] + + return False def _video_fallback(self, fallback): """use video metadata as fallback""" @@ -209,6 +124,7 @@ class YoutubeChannel(YouTubeItem): "channel_tvart_url": False, "channel_id": self.youtube_id, "channel_subscribed": False, + "channel_tags": False, "channel_description": False, "channel_thumb_url": False, "channel_views": 0, diff --git a/tubearchivist/home/src/index/generic.py b/tubearchivist/home/src/index/generic.py index 105f66b..6e82e54 100644 --- a/tubearchivist/home/src/index/generic.py +++ b/tubearchivist/home/src/index/generic.py @@ -15,8 +15,8 @@ class YouTubeItem: """base class for youtube""" es_path = False - index_name = False - yt_base = False + index_name = "" + yt_base = "" yt_obs = { "skip_download": True, "noplaylist": True, @@ -24,18 +24,27 @@ class YouTubeItem: def __init__(self, youtube_id): self.youtube_id = youtube_id + self.es_path = f"{self.index_name}/_doc/{youtube_id}" self.config = AppConfig().config self.app_conf = self.config["application"] self.youtube_meta = False 
self.json_data = False + def build_yt_url(self): + """build youtube url""" + return self.yt_base + self.youtube_id + def get_from_youtube(self): """use yt-dlp to get meta data from youtube""" print(f"{self.youtube_id}: get metadata from youtube") - url = self.yt_base + self.youtube_id - response = YtWrap(self.yt_obs, self.config).extract(url) + obs_request = self.yt_obs.copy() + if self.config["downloads"]["extractor_lang"]: + langs = self.config["downloads"]["extractor_lang"] + langs_list = [i.strip() for i in langs.split(",")] + obs_request["extractor_args"] = {"youtube": {"lang": langs_list}} - self.youtube_meta = response + url = self.build_yt_url() + self.youtube_meta = YtWrap(obs_request, self.config).extract(url) def get_from_es(self): """get indexed data from elastic search""" diff --git a/tubearchivist/home/src/index/playlist.py b/tubearchivist/home/src/index/playlist.py index 375082e..656da4a 100644 --- a/tubearchivist/home/src/index/playlist.py +++ b/tubearchivist/home/src/index/playlist.py @@ -26,7 +26,6 @@ class YoutubePlaylist(YouTubeItem): def __init__(self, youtube_id): super().__init__(youtube_id) - self.es_path = f"{self.index_name}/_doc/{youtube_id}" self.all_members = False self.nav = False self.all_youtube_ids = [] diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index f3ade1c..b4daae2 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -138,7 +138,6 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): self.channel_id = False self.video_overwrites = video_overwrites self.video_type = video_type - self.es_path = f"{self.index_name}/_doc/{youtube_id}" self.offline_import = False def build_json(self, youtube_meta_overwrite=False, media_path=False): diff --git a/tubearchivist/home/templates/home/channel_id_about.html b/tubearchivist/home/templates/home/channel_id_about.html index dc38b7a..fb55e0d 100644 --- 
a/tubearchivist/home/templates/home/channel_id_about.html +++ b/tubearchivist/home/templates/home/channel_id_about.html @@ -81,11 +81,18 @@ {% endif %} -
-

Customize {{ channel_info.channel_name }}

-
+ {% if channel_info.channel_tags %} +
+
+ {% for tag in channel_info.channel_tags %} + {{ tag }} + {% endfor %} +
+
+ {% endif %}
+

Customize {{ channel_info.channel_name }}

{% csrf_token %}
diff --git a/tubearchivist/home/templates/home/settings.html b/tubearchivist/home/templates/home/settings.html index e739fde..25228dd 100644 --- a/tubearchivist/home/templates/home/settings.html +++ b/tubearchivist/home/templates/home/settings.html @@ -108,6 +108,11 @@ {{ app_form.downloads_format_sort }}
+
+

Prefer translated metadata language: {{ config.downloads.extractor_lang }}

+ This changes the language this video gets indexed in. Translated metadata is only available if the uploader provides translations. Add a two letter ISO language code; check the documentation for the list of available languages.
+ {{ app_form.downloads_extractor_lang}} +

Current metadata embed setting: {{ config.downloads.add_metadata }}

Metadata is not embedded into the downloaded files by default.
diff --git a/tubearchivist/requirements.txt b/tubearchivist/requirements.txt index 54cad2c..8b26b8f 100644 --- a/tubearchivist/requirements.txt +++ b/tubearchivist/requirements.txt @@ -1,12 +1,11 @@ -beautifulsoup4==4.12.2 celery==5.2.7 -Django==4.2 +Django==4.2.1 django-auth-ldap==4.3.0 django-cors-headers==3.14.0 djangorestframework==3.14.0 Pillow==9.5.0 redis==4.5.4 -requests==2.29.0 +requests==2.30.0 ryd-client==0.0.6 uWSGI==2.0.21 whitenoise==6.4.0