diff --git a/tubearchivist/api/views.py b/tubearchivist/api/views.py index 898c721d..7ae37340 100644 --- a/tubearchivist/api/views.py +++ b/tubearchivist/api/views.py @@ -361,7 +361,7 @@ class ChannelApiListView(ApiBaseView): must_list = [] if query_filter: if query_filter not in self.valid_filter: - message = f"invalid url query filder: {query_filter}" + message = f"invalid url query filter: {query_filter}" print(message) return Response({"message": message}, status=400) @@ -657,7 +657,7 @@ class DownloadApiListView(ApiBaseView): must_list = [] if query_filter: if query_filter not in self.valid_filter: - message = f"invalid url query filder: {query_filter}" + message = f"invalid url query filter: {query_filter}" print(message) return Response({"message": message}, status=400) diff --git a/tubearchivist/config/settings.py b/tubearchivist/config/settings.py index 8057a101..274fd9f4 100644 --- a/tubearchivist/config/settings.py +++ b/tubearchivist/config/settings.py @@ -276,4 +276,4 @@ CORS_ALLOW_HEADERS = list(default_headers) + [ # TA application settings TA_UPSTREAM = "https://github.com/tubearchivist/tubearchivist" -TA_VERSION = "v0.4.8" +TA_VERSION = "v0.4.9-unstable" diff --git a/tubearchivist/home/src/download/queue.py b/tubearchivist/home/src/download/queue.py index 661f39b9..5f0a86bb 100644 --- a/tubearchivist/home/src/download/queue.py +++ b/tubearchivist/home/src/download/queue.py @@ -324,6 +324,10 @@ class PendingList(PendingIndex): else: vid_type = VideoTypeEnum.VIDEOS + if not vid.get("channel"): + print(f"{youtube_id}: skip video not part of channel") + return False + return self._parse_youtube_details(vid, vid_type) @staticmethod diff --git a/tubearchivist/home/src/download/subscriptions.py b/tubearchivist/home/src/download/subscriptions.py index a276dcee..34cd87e2 100644 --- a/tubearchivist/home/src/download/subscriptions.py +++ b/tubearchivist/home/src/download/subscriptions.py @@ -39,11 +39,15 @@ class ChannelSubscription: return all_channels def 
get_last_youtube_videos( - self, channel_id, limit=True, query_filter=VideoTypeEnum.UNKNOWN + self, + channel_id, + limit=True, + query_filter=None, + channel_overwrites=None, ): """get a list of last videos from channel""" - queries = self._build_queries(query_filter, limit) - + query_handler = VideoQueryBuilder(self.config, channel_overwrites) + queries = query_handler.build_queries(query_filter) last_videos = [] for vid_type_enum, limit_amount in queries: @@ -51,55 +55,25 @@ class ChannelSubscription: "skip_download": True, "extract_flat": True, } + vid_type = vid_type_enum.value + if limit: obs["playlistend"] = limit_amount - vid_type = vid_type_enum.value - channel = YtWrap(obs, self.config).extract( - f"https://www.youtube.com/channel/{channel_id}/{vid_type}" - ) - if not channel: + url = f"https://www.youtube.com/channel/{channel_id}/{vid_type}" + channel_query = YtWrap(obs, self.config).extract(url) + if not channel_query: continue + last_videos.extend( - [(i["id"], i["title"], vid_type) for i in channel["entries"]] + [ + (i["id"], i["title"], vid_type) + for i in channel_query["entries"] + ] ) return last_videos - def _build_queries(self, query_filter, limit): - """build query list for vid_type""" - limit_map = { - "videos": self.config["subscriptions"]["channel_size"], - "streams": self.config["subscriptions"]["live_channel_size"], - "shorts": self.config["subscriptions"]["shorts_channel_size"], - } - - queries = [] - - if query_filter and query_filter.value != "unknown": - if limit: - query_limit = limit_map.get(query_filter.value) - else: - query_limit = False - - queries.append((query_filter, query_limit)) - - return queries - - for query_item, default_limit in limit_map.items(): - if not default_limit: - # is deactivated in config - continue - - if limit: - query_limit = default_limit - else: - query_limit = False - - queries.append((VideoTypeEnum(query_item), query_limit)) - - return queries - def find_missing(self): """add missing videos from 
subscribed channels to pending""" all_channels = self.get_channels() @@ -112,7 +86,10 @@ class ChannelSubscription: for idx, channel in enumerate(all_channels): channel_id = channel["channel_id"] print(f"{channel_id}: find missing videos.") - last_videos = self.get_last_youtube_videos(channel_id) + last_videos = self.get_last_youtube_videos( + channel_id, + channel_overwrites=channel.get("channel_overwrites"), + ) if last_videos: ids_to_add = is_missing([i[0] for i in last_videos]) @@ -144,6 +121,92 @@ class ChannelSubscription: channel.sync_to_videos() +class VideoQueryBuilder: + """Build queries for yt-dlp.""" + + def __init__(self, config: dict, channel_overwrites: dict | None = None): + self.config = config + self.channel_overwrites = channel_overwrites or {} + + def build_queries( + self, video_type: VideoTypeEnum | None, limit: bool = True + ) -> list[tuple[VideoTypeEnum, int | None]]: + """Build queries for all or specific video type.""" + query_methods = { + VideoTypeEnum.VIDEOS: self.videos_query, + VideoTypeEnum.STREAMS: self.streams_query, + VideoTypeEnum.SHORTS: self.shorts_query, + } + + if video_type: + # build query for specific type + query_method = query_methods.get(video_type) + if query_method: + query = query_method(limit) + if query[1] != 0: + return [query] + return [] + + # Build and return queries for all video types + queries = [] + for build_query in query_methods.values(): + query = build_query(limit) + if query[1] != 0: + queries.append(query) + + return queries + + def videos_query(self, limit: bool) -> tuple[VideoTypeEnum, int | None]: + """Build query for videos.""" + return self._build_generic_query( + video_type=VideoTypeEnum.VIDEOS, + overwrite_key="subscriptions_channel_size", + config_key="channel_size", + limit=limit, + ) + + def streams_query(self, limit: bool) -> tuple[VideoTypeEnum, int | None]: + """Build query for streams.""" + return self._build_generic_query( + video_type=VideoTypeEnum.STREAMS, + 
overwrite_key="subscriptions_live_channel_size", + config_key="live_channel_size", + limit=limit, + ) + + def shorts_query(self, limit: bool) -> tuple[VideoTypeEnum, int | None]: + """Build query for shorts.""" + return self._build_generic_query( + video_type=VideoTypeEnum.SHORTS, + overwrite_key="subscriptions_shorts_channel_size", + config_key="shorts_channel_size", + limit=limit, + ) + + def _build_generic_query( + self, + video_type: VideoTypeEnum, + overwrite_key: str, + config_key: str, + limit: bool, + ) -> tuple[VideoTypeEnum, int | None]: + """Generic query for video page scraping.""" + if not limit: + return (video_type, None) + + if ( + overwrite_key in self.channel_overwrites + and self.channel_overwrites[overwrite_key] is not None + ): + overwrite = self.channel_overwrites[overwrite_key] + return (video_type, overwrite) + + if overwrite := self.config["subscriptions"].get(config_key): + return (video_type, overwrite) + + return (video_type, 0) + + class PlaylistSubscription: """manage the playlist download functionality""" diff --git a/tubearchivist/home/src/download/yt_dlp_handler.py b/tubearchivist/home/src/download/yt_dlp_handler.py index c1a150c2..5c4ad48d 100644 --- a/tubearchivist/home/src/download/yt_dlp_handler.py +++ b/tubearchivist/home/src/download/yt_dlp_handler.py @@ -50,9 +50,10 @@ class VideoDownloader(DownloaderBase): self.obs = False self._build_obs() - def run_queue(self, auto_only=False) -> int: + def run_queue(self, auto_only=False) -> tuple[int, int]: """setup download queue in redis loop until no more items""" downloaded = 0 + failed = 0 while True: video_data = self._get_next(auto_only) if self.task.is_stopped() or not video_data: @@ -66,6 +67,7 @@ class VideoDownloader(DownloaderBase): success = self._dl_single_vid(youtube_id, channel_id) if not success: + failed += 1 continue self._notify(video_data, "Add video metadata to index", progress=1) @@ -82,7 +84,7 @@ class VideoDownloader(DownloaderBase): # post processing 
DownloadPostProcess(self.task).run() - return downloaded + return downloaded, failed def _notify(self, video_data, message, progress=False): """send progress notification to task""" @@ -153,6 +155,7 @@ class VideoDownloader(DownloaderBase): "continuedl": True, "writethumbnail": False, "noplaylist": True, + "color": "no_color", } def _build_obs_user(self): @@ -219,12 +222,6 @@ class VideoDownloader(DownloaderBase): self._set_overwrites(obs, channel_id) dl_cache = os.path.join(self.CACHE_DIR, "download") - # check if already in cache to continue from there - all_cached = ignore_filelist(os.listdir(dl_cache)) - for file_name in all_cached: - if youtube_id in file_name: - obs["outtmpl"] = os.path.join(dl_cache, file_name) - success, message = YtWrap(obs, self.config).download(youtube_id) if not success: self._handle_error(youtube_id, message) diff --git a/tubearchivist/home/src/es/index_mapping.json b/tubearchivist/home/src/es/index_mapping.json index f74252d1..1635b6b1 100644 --- a/tubearchivist/home/src/es/index_mapping.json +++ b/tubearchivist/home/src/es/index_mapping.json @@ -75,6 +75,15 @@ }, "integrate_sponsorblock": { "type": "boolean" + }, + "subscriptions_channel_size": { + "type": "long" + }, + "subscriptions_live_channel_size": { + "type": "long" + }, + "subscriptions_shorts_channel_size": { + "type": "long" } } } @@ -169,6 +178,15 @@ }, "integrate_sponsorblock": { "type": "boolean" + }, + "subscriptions_channel_size": { + "type": "long" + }, + "subscriptions_live_channel_size": { + "type": "long" + }, + "subscriptions_shorts_channel_size": { + "type": "long" } } } diff --git a/tubearchivist/home/src/frontend/forms.py b/tubearchivist/home/src/frontend/forms.py index 7f42797c..5286a932 100644 --- a/tubearchivist/home/src/frontend/forms.py +++ b/tubearchivist/home/src/frontend/forms.py @@ -259,3 +259,12 @@ class ChannelOverwriteForm(forms.Form): integrate_sponsorblock = forms.ChoiceField( widget=forms.Select, choices=SP_CHOICES, required=False ) + 
subscriptions_channel_size = forms.IntegerField( + label=False, required=False + ) + subscriptions_live_channel_size = forms.IntegerField( + label=False, required=False + ) + subscriptions_shorts_channel_size = forms.IntegerField( + label=False, required=False + ) diff --git a/tubearchivist/home/src/index/channel.py b/tubearchivist/home/src/index/channel.py index 3a6af34f..d55e46c4 100644 --- a/tubearchivist/home/src/index/channel.py +++ b/tubearchivist/home/src/index/channel.py @@ -6,46 +6,16 @@ functionality: import json import os -import re from datetime import datetime -import requests from home.src.download.thumbnails import ThumbManager from home.src.download.yt_dlp_base import YtWrap from home.src.es.connect import ElasticWrap, IndexPaginate from home.src.index.generic import YouTubeItem from home.src.index.playlist import YoutubePlaylist -from home.src.ta.helper import requests_headers from home.src.ta.settings import EnvironmentSettings -def banner_extractor(channel_id: str) -> dict[str, str] | None: - """workaround for new channel renderer, upstream #9893""" - url = f"https://www.youtube.com/channel/{channel_id}?hl=en" - cookies = {"SOCS": "CAI"} - response = requests.get( - url, cookies=cookies, headers=requests_headers(), timeout=30 - ) - if not response.ok: - return None - - matched_urls = re.findall( - r'"(https://yt3.googleusercontent.com/[^"]+=w(\d{3,4})-fcrop64[^"]*)"', - response.text, - ) - if not matched_urls: - return None - - sorted_urls = sorted(matched_urls, key=lambda x: int(x[1]), reverse=True) - banner = sorted_urls[0][0] - channel_art_fallback = { - "channel_banner_url": banner, - "channel_tvart_url": banner.split("-fcrop64")[0], - } - - return channel_art_fallback - - class YoutubeChannel(YouTubeItem): """represents a single youtube channel""" @@ -87,7 +57,7 @@ class YoutubeChannel(YouTubeItem): "channel_id": self.youtube_id, "channel_last_refresh": int(datetime.now().timestamp()), "channel_name": self.youtube_meta["uploader"], - 
"channel_subs": self._extract_follower_count(), + "channel_subs": self.youtube_meta.get("channel_follower_count", 0), "channel_subscribed": False, "channel_tags": self._parse_tags(self.youtube_meta.get("tags")), "channel_banner_url": self._get_banner_art(), @@ -95,34 +65,6 @@ class YoutubeChannel(YouTubeItem): "channel_tvart_url": self._get_tv_art(), "channel_views": self.youtube_meta.get("view_count") or 0, } - self._inject_fallback() - - def _inject_fallback(self): - """fallback channel art work, workaround for upstream #9893""" - if self.json_data["channel_banner_url"]: - return - - print(f"{self.youtube_id}: attempt art fallback extraction") - fallback = banner_extractor(self.youtube_id) - if fallback: - print(f"{self.youtube_id}: fallback succeeded: {fallback}") - self.json_data.update(fallback) - - def _extract_follower_count(self) -> int: - """workaround for upstream #9893, extract subs from first video""" - subs = self.youtube_meta.get("channel_follower_count") - if subs is not None: - return subs - - entries = self.youtube_meta.get("entries", []) - if entries: - first_entry = entries[0] - if isinstance(first_entry, dict): - subs_entry = first_entry.get("channel_follower_count") - if subs_entry is not None: - return subs_entry - - return 0 def _parse_tags(self, tags): """parse channel tags""" @@ -375,23 +317,30 @@ class YoutubeChannel(YouTubeItem): "autodelete_days", "index_playlists", "integrate_sponsorblock", + "subscriptions_channel_size", + "subscriptions_live_channel_size", + "subscriptions_shorts_channel_size", ] to_write = self.json_data.get("channel_overwrites", {}) for key, value in overwrites.items(): if key not in valid_keys: raise ValueError(f"invalid overwrite key: {key}") - if value == "disable": + elif value == "disable": to_write[key] = False continue - if value in [0, "0"]: + elif value == "0": if key in to_write: del to_write[key] continue - if value == "1": + elif value == "1": to_write[key] = True continue - if value: + elif 
isinstance(value, int) and int(value) < 0: + if key in to_write: + del to_write[key] + continue + elif value is not None and value != "": to_write.update({key: value}) self.json_data["channel_overwrites"] = to_write diff --git a/tubearchivist/home/tasks.py b/tubearchivist/home/tasks.py index 89f15a6b..c890fa64 100644 --- a/tubearchivist/home/tasks.py +++ b/tubearchivist/home/tasks.py @@ -7,6 +7,7 @@ Functionality: """ from celery import Task, shared_task +from celery.exceptions import Retry from home.src.download.queue import PendingList from home.src.download.subscriptions import ( SubscriptionHandler, @@ -114,7 +115,13 @@ def update_subscribed(self): return None -@shared_task(name="download_pending", bind=True, base=BaseTask) +@shared_task( + name="download_pending", + bind=True, + base=BaseTask, + max_retries=3, + default_retry_delay=10, +) def download_pending(self, auto_only=False): """download latest pending videos""" manager = TaskManager() @@ -124,11 +131,20 @@ def download_pending(self, auto_only=False): return None manager.init(self) - downloader = VideoDownloader(task=self) - videos_downloaded = downloader.run_queue(auto_only=auto_only) + try: + downloader = VideoDownloader(task=self) + downloaded, failed = downloader.run_queue(auto_only=auto_only) - if videos_downloaded: - return f"downloaded {videos_downloaded} video(s)." + if failed: + print(f"[task][{self.name}] Videos failed, retry.") + self.send_progress("Videos failed, retry.") + raise self.retry() + + except Retry as exc: + raise exc + + if downloaded: + return f"downloaded {downloaded} video(s)." return None diff --git a/tubearchivist/home/templates/home/channel_id_about.html b/tubearchivist/home/templates/home/channel_id_about.html index cd965f31..56fae3dc 100644 --- a/tubearchivist/home/templates/home/channel_id_about.html +++ b/tubearchivist/home/templates/home/channel_id_about.html @@ -107,16 +107,20 @@ {{ channel_info.channel_overwrites.download_format }} {% else %} False - {% endif %}
+ {% endif %}Auto delete watched videos after x days: - {% if channel_info.channel_overwrites.autodelete_days %} + {% if channel_info.channel_overwrites.autodelete_days is not None %} {{ channel_info.channel_overwrites.autodelete_days }} {% else %} False - {% endif %}
+ {% endif %}Disable standard videos, shorts, or streams for this channel by setting their page size to 0 (zero).
Disable the page size overwrite for this channel by setting it to a negative value.<br>
YouTube page size: + {% if channel_info.channel_overwrites.subscriptions_channel_size is not None %} + {{ channel_info.channel_overwrites.subscriptions_channel_size }} + {% else %} + False + {% endif %}
+ Videos to scan to find new items for the Rescan subscriptions task, max recommended 50.YouTube Live page size: + {% if channel_info.channel_overwrites.subscriptions_live_channel_size is not None %} + {{ channel_info.channel_overwrites.subscriptions_live_channel_size }} + {% else %} + False + {% endif %}
+ Live Videos to scan to find new items for the Rescan subscriptions task, max recommended 50.YouTube Shorts page size: + {% if channel_info.channel_overwrites.subscriptions_shorts_channel_size is not None %} + {{ channel_info.channel_overwrites.subscriptions_shorts_channel_size }} + {% else %} + False + {% endif %}
+ Shorts Videos to scan to find new items for the Rescan subscriptions task, max recommended 50.