Channel page size overwrite, download retry, #build

Changed:
- Added page size overwrite per channel
- Added retry logic for download failure
- Fixed the logic for continuing downloads from the cache
This commit is contained in:
Simon 2024-07-06 16:52:52 +02:00
commit 9b84169fab
No known key found for this signature in database
GPG Key ID: 2C15AA5E89985DD4
11 changed files with 228 additions and 135 deletions

View File

@ -361,7 +361,7 @@ class ChannelApiListView(ApiBaseView):
must_list = []
if query_filter:
if query_filter not in self.valid_filter:
message = f"invalid url query filder: {query_filter}"
message = f"invalid url query filter: {query_filter}"
print(message)
return Response({"message": message}, status=400)
@ -657,7 +657,7 @@ class DownloadApiListView(ApiBaseView):
must_list = []
if query_filter:
if query_filter not in self.valid_filter:
message = f"invalid url query filder: {query_filter}"
message = f"invalid url query filter: {query_filter}"
print(message)
return Response({"message": message}, status=400)

View File

@ -276,4 +276,4 @@ CORS_ALLOW_HEADERS = list(default_headers) + [
# TA application settings
TA_UPSTREAM = "https://github.com/tubearchivist/tubearchivist"
TA_VERSION = "v0.4.8"
TA_VERSION = "v0.4.9-unstable"

View File

@ -324,6 +324,10 @@ class PendingList(PendingIndex):
else:
vid_type = VideoTypeEnum.VIDEOS
if not vid.get("channel"):
print(f"{youtube_id}: skip video not part of channel")
return False
return self._parse_youtube_details(vid, vid_type)
@staticmethod

View File

@ -39,11 +39,15 @@ class ChannelSubscription:
return all_channels
def get_last_youtube_videos(
self, channel_id, limit=True, query_filter=VideoTypeEnum.UNKNOWN
self,
channel_id,
limit=True,
query_filter=None,
channel_overwrites=None,
):
"""get a list of last videos from channel"""
queries = self._build_queries(query_filter, limit)
query_handler = VideoQueryBuilder(self.config, channel_overwrites)
queries = query_handler.build_queries(query_filter)
last_videos = []
for vid_type_enum, limit_amount in queries:
@ -51,55 +55,25 @@ class ChannelSubscription:
"skip_download": True,
"extract_flat": True,
}
vid_type = vid_type_enum.value
if limit:
obs["playlistend"] = limit_amount
vid_type = vid_type_enum.value
channel = YtWrap(obs, self.config).extract(
f"https://www.youtube.com/channel/{channel_id}/{vid_type}"
)
if not channel:
url = f"https://www.youtube.com/channel/{channel_id}/{vid_type}"
channel_query = YtWrap(obs, self.config).extract(url)
if not channel_query:
continue
last_videos.extend(
[(i["id"], i["title"], vid_type) for i in channel["entries"]]
[
(i["id"], i["title"], vid_type)
for i in channel_query["entries"]
]
)
return last_videos
def _build_queries(self, query_filter, limit):
    """Return a list of (video type, page size) tuples to query.

    A query_filter other than "unknown" yields exactly one tuple for
    that type. Otherwise one tuple is returned per video type whose
    configured page size is truthy. With limit off, the page size in
    each tuple is False.
    """
    subscriptions = self.config["subscriptions"]
    page_sizes = {
        "videos": subscriptions["channel_size"],
        "streams": subscriptions["live_channel_size"],
        "shorts": subscriptions["shorts_channel_size"],
    }

    if query_filter and query_filter.value != "unknown":
        size = page_sizes.get(query_filter.value) if limit else False
        return [(query_filter, size)]

    # a falsy page size means the type is deactivated in config
    return [
        (VideoTypeEnum(type_name), size if limit else False)
        for type_name, size in page_sizes.items()
        if size
    ]
def find_missing(self):
"""add missing videos from subscribed channels to pending"""
all_channels = self.get_channels()
@ -112,7 +86,10 @@ class ChannelSubscription:
for idx, channel in enumerate(all_channels):
channel_id = channel["channel_id"]
print(f"{channel_id}: find missing videos.")
last_videos = self.get_last_youtube_videos(channel_id)
last_videos = self.get_last_youtube_videos(
channel_id,
channel_overwrites=channel.get("channel_overwrites"),
)
if last_videos:
ids_to_add = is_missing([i[0] for i in last_videos])
@ -144,6 +121,92 @@ class ChannelSubscription:
channel.sync_to_videos()
class VideoQueryBuilder:
    """Build (video type, page size) queries for yt-dlp channel scans.

    The page size is taken from the channel overwrites when a value is
    set there, otherwise from the "subscriptions" section of the config.
    """

    def __init__(self, config: dict, channel_overwrites: dict | None = None):
        self.config = config
        # a missing overwrite dict behaves like an empty one
        self.channel_overwrites = channel_overwrites or {}

    def build_queries(
        self, video_type: VideoTypeEnum | None, limit: bool = True
    ) -> list[tuple[VideoTypeEnum, int | None]]:
        """Return the queries for one video type, or for all of them.

        Queries that resolve to a page size of 0 are left out.
        """
        builders = {
            VideoTypeEnum.VIDEOS: self.videos_query,
            VideoTypeEnum.STREAMS: self.streams_query,
            VideoTypeEnum.SHORTS: self.shorts_query,
        }

        # NOTE(review): a type without a dedicated builder is assumed to
        # fall through to the all-types path below, confirm against the
        # original indentation.
        builder = builders.get(video_type) if video_type else None
        if builder is not None:
            single = builder(limit)
            return [single] if single[1] != 0 else []

        candidates = (build(limit) for build in builders.values())
        return [query for query in candidates if query[1] != 0]

    def videos_query(self, limit: bool) -> tuple[VideoTypeEnum, int | None]:
        """Return the query tuple for regular videos."""
        return self._build_generic_query(
            video_type=VideoTypeEnum.VIDEOS,
            overwrite_key="subscriptions_channel_size",
            config_key="channel_size",
            limit=limit,
        )

    def streams_query(self, limit: bool) -> tuple[VideoTypeEnum, int | None]:
        """Return the query tuple for streams."""
        return self._build_generic_query(
            video_type=VideoTypeEnum.STREAMS,
            overwrite_key="subscriptions_live_channel_size",
            config_key="live_channel_size",
            limit=limit,
        )

    def shorts_query(self, limit: bool) -> tuple[VideoTypeEnum, int | None]:
        """Return the query tuple for shorts."""
        return self._build_generic_query(
            video_type=VideoTypeEnum.SHORTS,
            overwrite_key="subscriptions_shorts_channel_size",
            config_key="shorts_channel_size",
            limit=limit,
        )

    def _build_generic_query(
        self,
        video_type: VideoTypeEnum,
        overwrite_key: str,
        config_key: str,
        limit: bool,
    ) -> tuple[VideoTypeEnum, int | None]:
        """Resolve the page size for one video type.

        Returns None as size when limit is off, the channel overwrite
        when it is set to anything but None, else the truthy config
        value, else 0.
        """
        if not limit:
            return (video_type, None)

        # an overwrite of 0 is a valid value, only None counts as unset
        channel_size = self.channel_overwrites.get(overwrite_key)
        if channel_size is not None:
            return (video_type, channel_size)

        config_size = self.config["subscriptions"].get(config_key)
        return (video_type, config_size if config_size else 0)
class PlaylistSubscription:
"""manage the playlist download functionality"""

View File

@ -50,9 +50,10 @@ class VideoDownloader(DownloaderBase):
self.obs = False
self._build_obs()
def run_queue(self, auto_only=False) -> int:
def run_queue(self, auto_only=False) -> tuple[int, int]:
"""setup download queue in redis loop until no more items"""
downloaded = 0
failed = 0
while True:
video_data = self._get_next(auto_only)
if self.task.is_stopped() or not video_data:
@ -66,6 +67,7 @@ class VideoDownloader(DownloaderBase):
success = self._dl_single_vid(youtube_id, channel_id)
if not success:
failed += 1
continue
self._notify(video_data, "Add video metadata to index", progress=1)
@ -82,7 +84,7 @@ class VideoDownloader(DownloaderBase):
# post processing
DownloadPostProcess(self.task).run()
return downloaded
return downloaded, failed
def _notify(self, video_data, message, progress=False):
"""send progress notification to task"""
@ -153,6 +155,7 @@ class VideoDownloader(DownloaderBase):
"continuedl": True,
"writethumbnail": False,
"noplaylist": True,
"color": "no_color",
}
def _build_obs_user(self):
@ -219,12 +222,6 @@ class VideoDownloader(DownloaderBase):
self._set_overwrites(obs, channel_id)
dl_cache = os.path.join(self.CACHE_DIR, "download")
# check if already in cache to continue from there
all_cached = ignore_filelist(os.listdir(dl_cache))
for file_name in all_cached:
if youtube_id in file_name:
obs["outtmpl"] = os.path.join(dl_cache, file_name)
success, message = YtWrap(obs, self.config).download(youtube_id)
if not success:
self._handle_error(youtube_id, message)

View File

@ -75,6 +75,15 @@
},
"integrate_sponsorblock": {
"type": "boolean"
},
"subscriptions_channel_size": {
"type": "long"
},
"subscriptions_live_channel_size": {
"type": "long"
},
"subscriptions_shorts_channel_size": {
"type": "long"
}
}
}
@ -169,6 +178,15 @@
},
"integrate_sponsorblock": {
"type": "boolean"
},
"subscriptions_channel_size": {
"type": "long"
},
"subscriptions_live_channel_size": {
"type": "long"
},
"subscriptions_shorts_channel_size": {
"type": "long"
}
}
}

View File

@ -259,3 +259,12 @@ class ChannelOverwriteForm(forms.Form):
integrate_sponsorblock = forms.ChoiceField(
widget=forms.Select, choices=SP_CHOICES, required=False
)
subscriptions_channel_size = forms.IntegerField(
label=False, required=False
)
subscriptions_live_channel_size = forms.IntegerField(
label=False, required=False
)
subscriptions_shorts_channel_size = forms.IntegerField(
label=False, required=False
)

View File

@ -6,46 +6,16 @@ functionality:
import json
import os
import re
from datetime import datetime
import requests
from home.src.download.thumbnails import ThumbManager
from home.src.download.yt_dlp_base import YtWrap
from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.index.generic import YouTubeItem
from home.src.index.playlist import YoutubePlaylist
from home.src.ta.helper import requests_headers
from home.src.ta.settings import EnvironmentSettings
def banner_extractor(channel_id: str) -> dict[str, str] | None:
    """Scrape channel art from the channel page, workaround for upstream #9893.

    Requests the public channel page and searches the response text for
    banner image URLs that carry a width marker. The URL with the
    largest width is used.

    Returns a dict with "channel_banner_url" and "channel_tvart_url",
    or None if the request fails, the response is not ok, or no banner
    URL is found.
    """
    url = f"https://www.youtube.com/channel/{channel_id}?hl=en"
    # NOTE(review): the SOCS cookie presumably skips the consent
    # interstitial, confirm.
    cookies = {"SOCS": "CAI"}
    try:
        response = requests.get(
            url, cookies=cookies, headers=requests_headers(), timeout=30
        )
    except requests.RequestException as err:
        # this is a best effort fallback, a network error must not
        # abort the caller
        print(f"{channel_id}: banner fallback request failed: {err}")
        return None

    if not response.ok:
        return None

    # dots in the host are escaped so only the literal host matches,
    # group 1 is the full url, group 2 the width
    matched_urls = re.findall(
        r'"(https://yt3\.googleusercontent\.com/'
        r'[^"]+=w(\d{3,4})-fcrop64[^"]*)"',
        response.text,
    )
    if not matched_urls:
        return None

    # max returns the first widest match, same pick as a stable
    # descending sort
    banner = max(matched_urls, key=lambda match: int(match[1]))[0]

    return {
        "channel_banner_url": banner,
        "channel_tvart_url": banner.split("-fcrop64")[0],
    }
class YoutubeChannel(YouTubeItem):
"""represents a single youtube channel"""
@ -87,7 +57,7 @@ class YoutubeChannel(YouTubeItem):
"channel_id": self.youtube_id,
"channel_last_refresh": int(datetime.now().timestamp()),
"channel_name": self.youtube_meta["uploader"],
"channel_subs": self._extract_follower_count(),
"channel_subs": self.youtube_meta.get("channel_follower_count", 0),
"channel_subscribed": False,
"channel_tags": self._parse_tags(self.youtube_meta.get("tags")),
"channel_banner_url": self._get_banner_art(),
@ -95,34 +65,6 @@ class YoutubeChannel(YouTubeItem):
"channel_tvart_url": self._get_tv_art(),
"channel_views": self.youtube_meta.get("view_count") or 0,
}
self._inject_fallback()
def _inject_fallback(self):
    """Fill in channel art from the fallback, workaround for upstream #9893.

    Does nothing when json_data already has a truthy
    "channel_banner_url". Otherwise the result of banner_extractor, if
    any, is merged into json_data.
    """
    if not self.json_data["channel_banner_url"]:
        print(f"{self.youtube_id}: attempt art fallback extraction")
        art = banner_extractor(self.youtube_id)
        if art:
            print(f"{self.youtube_id}: fallback succeeded: {art}")
            self.json_data.update(art)
def _extract_follower_count(self) -> int:
    """Return the subscriber count, workaround for upstream #9893.

    Uses "channel_follower_count" from youtube_meta when it is not
    None, else the same key from the first item of "entries" when that
    item is a dict, else 0.
    """
    meta = self.youtube_meta
    count = meta.get("channel_follower_count")
    if count is not None:
        return count

    entries = meta.get("entries", [])
    first = entries[0] if entries else None
    if isinstance(first, dict):
        nested_count = first.get("channel_follower_count")
        if nested_count is not None:
            return nested_count

    return 0
def _parse_tags(self, tags):
"""parse channel tags"""
@ -375,23 +317,30 @@ class YoutubeChannel(YouTubeItem):
"autodelete_days",
"index_playlists",
"integrate_sponsorblock",
"subscriptions_channel_size",
"subscriptions_live_channel_size",
"subscriptions_shorts_channel_size",
]
to_write = self.json_data.get("channel_overwrites", {})
for key, value in overwrites.items():
if key not in valid_keys:
raise ValueError(f"invalid overwrite key: {key}")
if value == "disable":
elif value == "disable":
to_write[key] = False
continue
if value in [0, "0"]:
elif value == "0":
if key in to_write:
del to_write[key]
continue
if value == "1":
elif value == "1":
to_write[key] = True
continue
if value:
elif isinstance(value, int) and int(value) < 0:
if key in to_write:
del to_write[key]
continue
elif value is not None and value != "":
to_write.update({key: value})
self.json_data["channel_overwrites"] = to_write

View File

@ -7,6 +7,7 @@ Functionality:
"""
from celery import Task, shared_task
from celery.exceptions import Retry
from home.src.download.queue import PendingList
from home.src.download.subscriptions import (
SubscriptionHandler,
@ -114,7 +115,13 @@ def update_subscribed(self):
return None
@shared_task(name="download_pending", bind=True, base=BaseTask)
@shared_task(
name="download_pending",
bind=True,
base=BaseTask,
max_retries=3,
default_retry_delay=10,
)
def download_pending(self, auto_only=False):
"""download latest pending videos"""
manager = TaskManager()
@ -124,11 +131,20 @@ def download_pending(self, auto_only=False):
return None
manager.init(self)
downloader = VideoDownloader(task=self)
videos_downloaded = downloader.run_queue(auto_only=auto_only)
try:
downloader = VideoDownloader(task=self)
downloaded, failed = downloader.run_queue(auto_only=auto_only)
if videos_downloaded:
return f"downloaded {videos_downloaded} video(s)."
if failed:
print(f"[task][{self.name}] Videos failed, retry.")
self.send_progress("Videos failed, retry.")
raise self.retry()
except Retry as exc:
raise exc
if downloaded:
return f"downloaded {downloaded} video(s)."
return None

View File

@ -107,16 +107,20 @@
{{ channel_info.channel_overwrites.download_format }}
{% else %}
False
{% endif %}</span></p>
{% endif %}</span><br>
Enter "disable" to disable this override.
</p>
{{ channel_overwrite_form.download_format }}<br>
</div>
<div class="overwrite-form-item">
<p>Auto delete watched videos after x days: <span class="settings-current">
{% if channel_info.channel_overwrites.autodelete_days %}
{% if channel_info.channel_overwrites.autodelete_days is not None %}
{{ channel_info.channel_overwrites.autodelete_days }}
{% else %}
False
{% endif %}</span></p>
{% endif %}</span><br>
Enter a negative number to disable this override.
</p>
{{ channel_overwrite_form.autodelete_days }}<br>
</div>
<div class="overwrite-form-item">
@ -139,6 +143,39 @@
{% endif %}</span></p>
{{ channel_overwrite_form.integrate_sponsorblock }}<br>
</div>
<h3>Page Size Overrides</h3><br>
<p>Disable standard videos, shorts, or streams for this channel by setting their page size to 0 (zero).</p><br>
<p>Disable the page size override for this channel by setting it to a negative value.</p><br>
<div class="overwrite-form-item">
<p>YouTube page size: <span class="settings-current">
{% if channel_info.channel_overwrites.subscriptions_channel_size is not None %}
{{ channel_info.channel_overwrites.subscriptions_channel_size }}
{% else %}
False
{% endif %}</span></p>
<i>Videos to scan to find new items for the <b>Rescan subscriptions</b> task, max recommended 50.</i><br>
{{ channel_overwrite_form.subscriptions_channel_size }}<br>
</div>
<div class="overwrite-form-item">
<p>YouTube Live page size: <span class="settings-current">
{% if channel_info.channel_overwrites.subscriptions_live_channel_size is not None %}
{{ channel_info.channel_overwrites.subscriptions_live_channel_size }}
{% else %}
False
{% endif %}</span></p>
<i>Live Videos to scan to find new items for the <b>Rescan subscriptions</b> task, max recommended 50.</i><br>
{{ channel_overwrite_form.subscriptions_live_channel_size }}<br>
</div>
<div class="overwrite-form-item">
<p>YouTube Shorts page size: <span class="settings-current">
{% if channel_info.channel_overwrites.subscriptions_shorts_channel_size is not None %}
{{ channel_info.channel_overwrites.subscriptions_shorts_channel_size }}
{% else %}
False
{% endif %}</span></p>
<i>Shorts Videos to scan to find new items for the <b>Rescan subscriptions</b> task, max recommended 50.</i><br>
{{ channel_overwrite_form.subscriptions_shorts_channel_size }}<br>
</div><br>
<button type="submit">Save Channel Overwrites</button>
</form>
</div>
@ -146,4 +183,4 @@
{% endif %}
</div>
<script type="text/javascript" src="{% static 'progress.js' %}"></script>
{% endblock content %}
{% endblock content %}

View File

@ -3,12 +3,12 @@ celery==5.4.0
Django==5.0.6
django-auth-ldap==4.8.0
django-celery-beat==2.6.0
django-cors-headers==4.3.1
djangorestframework==3.15.1
Pillow==10.3.0
redis==5.0.4
requests==2.32.1
django-cors-headers==4.4.0
djangorestframework==3.15.2
Pillow==10.4.0
redis==5.0.7
requests==2.32.3
ryd-client==0.0.6
uWSGI==2.0.25.1
whitenoise==6.6.0
yt-dlp @ git+https://github.com/yt-dlp/yt-dlp@8e15177b4113c355989881e4e030f695a9b59c3a
uWSGI==2.0.26
whitenoise==6.7.0
yt-dlp==2024.7.2