"""
|
|
functionality:
|
|
- handle yt_dlp
|
|
- build options and post processor
|
|
- download video files
|
|
- move to archive
|
|
"""
|
|
|
|
import os
|
|
import shutil
|
|
from datetime import datetime
|
|
|
|
from home.src.download.queue import PendingList
|
|
from home.src.download.subscriptions import PlaylistSubscription
|
|
from home.src.download.yt_dlp_base import YtWrap
|
|
from home.src.es.connect import ElasticWrap, IndexPaginate
|
|
from home.src.index.channel import YoutubeChannel
|
|
from home.src.index.comments import CommentList
|
|
from home.src.index.playlist import YoutubePlaylist
|
|
from home.src.index.video import YoutubeVideo, index_new_video
|
|
from home.src.index.video_constants import VideoTypeEnum
|
|
from home.src.ta.config import AppConfig
|
|
from home.src.ta.helper import ignore_filelist
|
|
|
|
|
|
class DownloadPostProcess:
|
|
"""handle task to run after download queue finishes"""
|
|
|
|
def __init__(self, download):
|
|
self.download = download
|
|
self.now = int(datetime.now().timestamp())
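        # placeholder, replaced with a loaded PendingList in run()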
        self.pending = False

    def run(self):
        """run all functions"""
        self.pending = PendingList()
        self.pending.get_download()
        self.pending.get_channels()
        self.pending.get_indexed()
        self.auto_delete_all()
        self.auto_delete_overwrites()
        self.validate_playlists()
        self.get_comments()

    def auto_delete_all(self):
        """handle auto delete"""
        autodelete_days = self.download.config["downloads"]["autodelete_days"]
        if not autodelete_days:
            return

        print(f"auto delete older than {autodelete_days} days")
        now_lte = self.now - autodelete_days * 24 * 60 * 60
        data = {
            "query": {"range": {"player.watched_date": {"lte": now_lte}}},
            "sort": [{"player.watched_date": {"order": "asc"}}],
        }
        self._auto_delete_watched(data)

    def auto_delete_overwrites(self):
        """handle per channel auto delete from overwrites"""
        for channel_id, value in self.pending.channel_overwrites.items():
            if "autodelete_days" in value:
                autodelete_days = value.get("autodelete_days")
                print(f"{channel_id}: delete older than {autodelete_days}d")
                now_lte = self.now - autodelete_days * 24 * 60 * 60
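                # same cutoff, limited to watched videos of this channel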
                must_list = [
                    {"range": {"player.watched_date": {"lte": now_lte}}},
                    {"term": {"channel.channel_id": {"value": channel_id}}},
                ]
                data = {
                    "query": {"bool": {"must": must_list}},
                    "sort": [{"player.watched_date": {"order": "desc"}}],
                }
                self._auto_delete_watched(data)

    @staticmethod
    def _auto_delete_watched(data):
        """delete watched videos after x days"""
        to_delete = IndexPaginate("ta_video", data).get_results()
        if not to_delete:
            return

        for video in to_delete:
            youtube_id = video["youtube_id"]
            print(f"{youtube_id}: auto delete video")
            YoutubeVideo(youtube_id).delete_media_file()

        print("add deleted to ignore list")
        vids = [{"type": "video", "url": i["youtube_id"]} for i in to_delete]
        pending = PendingList(youtube_ids=vids)
        pending.parse_url_list()
        pending.add_to_pending(status="ignore")

    def validate_playlists(self):
"""look for playlist needing to update"""
        for id_c, channel_id in enumerate(self.download.channels):
            channel = YoutubeChannel(channel_id, task=self.download.task)
            overwrites = self.pending.channel_overwrites.get(channel_id, False)
            if overwrites and overwrites.get("index_playlists"):
                # validate from remote
                channel.index_channel_playlists()
                continue

            # validate from local
            playlists = channel.get_indexed_playlists(active_only=True)
            all_channel_playlist = [i["playlist_id"] for i in playlists]
            self._validate_channel_playlist(all_channel_playlist, id_c)

    def _validate_channel_playlist(self, all_channel_playlist, id_c):
        """scan channel for playlists that need updating"""
        all_youtube_ids = [i["youtube_id"] for i in self.pending.all_videos]
        for id_p, playlist_id in enumerate(all_channel_playlist):
            playlist = YoutubePlaylist(playlist_id)
            playlist.all_youtube_ids = all_youtube_ids
            playlist.build_json(scrape=True)
            if not playlist.json_data:
                playlist.deactivate()
                continue

            playlist.add_vids_to_playlist()
            playlist.upload_to_es()
            self._notify_playlist_progress(all_channel_playlist, id_c, id_p)

    def _notify_playlist_progress(self, all_channel_playlist, id_c, id_p):
"""notify to UI"""
        if not self.download.task:
            return

        total_channel = len(self.download.channels)
        total_playlist = len(all_channel_playlist)

        message = [
            f"Post Processing Channels: {id_c}/{total_channel}",
            f"Validate Playlists {id_p + 1}/{total_playlist}",
        ]
        progress = (id_c + 1) / total_channel
        self.download.task.send_progress(message, progress=progress)

    def get_comments(self):
        """get comments from youtube"""
        CommentList(self.download.videos, task=self.download.task).index()


class VideoDownloader:
    """
    handle the video download functionality
    if not initiated with list, take from queue
    """

    def __init__(self, youtube_id_list=False, task=False):
        self.obs = False
        self.video_overwrites = False
        self.youtube_id_list = youtube_id_list
        self.task = task
        self.config = AppConfig().config
        self._build_obs()
        self.channels = set()
        self.videos = set()

    def run_queue(self, auto_only=False):
"""setup download queue in redis loop until no more items"""
        self._get_overwrites()
        while True:
            video_data = self._get_next(auto_only)
            if self.task.is_stopped() or not video_data:
                self._reset_auto()
                break

            youtube_id = video_data.get("youtube_id")
            print(f"{youtube_id}: Downloading video")
            self._notify(video_data, "Validate download format")

            success = self._dl_single_vid(youtube_id)
            if not success:
                continue

            self._notify(video_data, "Add video metadata to index")

            vid_dict = index_new_video(
                youtube_id,
                video_overwrites=self.video_overwrites,
                video_type=VideoTypeEnum(video_data["vid_type"]),
            )
            self.channels.add(vid_dict["channel"]["channel_id"])
            self.videos.add(vid_dict["youtube_id"])

            self._notify(video_data, "Move downloaded file to archive")
            self.move_to_archive(vid_dict)
            self._delete_from_pending(youtube_id)

        # post processing
        self._add_subscribed_channels()
        DownloadPostProcess(self).run()

        return self.videos

    def _notify(self, video_data, message):
        """send progress notification to task"""
        if not self.task:
            return
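
        # VideoTypeEnum values are plural, e.g. "videos" becomes "Video"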
        typ = VideoTypeEnum(video_data["vid_type"]).value.rstrip("s").title()
        title = video_data.get("title")
        self.task.send_progress([f"Processing {typ}: {title}", message])

    def _get_next(self, auto_only):
        """get next item in queue"""
        must_list = [{"term": {"status": {"value": "pending"}}}]
        must_not_list = [{"exists": {"field": "message"}}]
        if auto_only:
            must_list.append({"term": {"auto_start": {"value": True}}})

        data = {
            "size": 1,
            "query": {"bool": {"must": must_list, "must_not": must_not_list}},
            "sort": [
                {"auto_start": {"order": "desc"}},
                {"timestamp": {"order": "asc"}},
            ],
        }
        path = "ta_download/_search"
        response, _ = ElasticWrap(path).get(data=data)
        if not response["hits"]["hits"]:
            return False

        return response["hits"]["hits"][0]["_source"]

    def _get_overwrites(self):
"""get channel overwrites"""
        pending = PendingList()
        pending.get_download()
        pending.get_channels()
        self.video_overwrites = pending.video_overwrites

    def _progress_hook(self, response):
        """process the progress_hooks from yt_dlp"""
        progress = False
        try:
            size = response.get("_total_bytes_str")
            if size.strip() == "N/A":
                size = response.get("_total_bytes_estimate_str", "N/A")

            percent = response["_percent_str"]
            progress = float(percent.strip("%")) / 100
            speed = response["_speed_str"]
            eta = response["_eta_str"]
            message = f"{percent} of {size} at {speed} - time left: {eta}"
        except KeyError:
            message = "processing"

        if self.task:
            title = response["info_dict"]["title"]
            self.task.send_progress([title, message], progress=progress)

    def _build_obs(self):
        """collection to build all obs passed to yt-dlp"""
        self._build_obs_basic()
        self._build_obs_user()
        self._build_obs_postprocessors()

    def _build_obs_basic(self):
        """initial obs"""
        self.obs = {
            "merge_output_format": "mp4",
            "outtmpl": (
                self.config["application"]["cache_dir"]
                + "/download/%(id)s.mp4"
            ),
            "progress_hooks": [self._progress_hook],
            "noprogress": True,
            "continuedl": True,
            "writethumbnail": False,
            "noplaylist": True,
        }

    def _build_obs_user(self):
        """build user customized options"""
        if self.config["downloads"]["format"]:
            self.obs["format"] = self.config["downloads"]["format"]
        if self.config["downloads"]["format_sort"]:
            format_sort = self.config["downloads"]["format_sort"]
            format_sort_list = [i.strip() for i in format_sort.split(",")]
            self.obs["format_sort"] = format_sort_list
        if self.config["downloads"]["limit_speed"]:
            self.obs["ratelimit"] = (
                self.config["downloads"]["limit_speed"] * 1024
            )

        throttle = self.config["downloads"]["throttledratelimit"]
        if throttle:
            self.obs["throttledratelimit"] = throttle * 1024

    def _build_obs_postprocessors(self):
        """add postprocessor to obs"""
        postprocessors = []

        if self.config["downloads"]["add_metadata"]:
            postprocessors.append(
                {
                    "key": "FFmpegMetadata",
                    "add_chapters": True,
                    "add_metadata": True,
                }
            )
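            # map title and uploader into the media tags, blank the album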
            postprocessors.append(
                {
                    "key": "MetadataFromField",
                    "formats": [
                        "%(title)s:%(meta_title)s",
                        "%(uploader)s:%(meta_artist)s",
                        ":(?P<album>)",
                    ],
                    "when": "pre_process",
                }
            )

        if self.config["downloads"]["add_thumbnail"]:
            postprocessors.append(
                {
                    "key": "EmbedThumbnail",
                    "already_have_thumbnail": True,
                }
            )
            self.obs["writethumbnail"] = True

        self.obs["postprocessors"] = postprocessors

    def get_format_overwrites(self, youtube_id):
        """get overwrites from single video"""
        overwrites = self.video_overwrites.get(youtube_id, False)
        if overwrites:
            return overwrites.get("download_format", False)

        return False

    def _dl_single_vid(self, youtube_id):
        """download single video"""
        obs = self.obs.copy()
        format_overwrite = self.get_format_overwrites(youtube_id)
        if format_overwrite:
            obs["format"] = format_overwrite

        dl_cache = self.config["application"]["cache_dir"] + "/download/"

        # check if already in cache to continue from there
        all_cached = ignore_filelist(os.listdir(dl_cache))
        for file_name in all_cached:
            if youtube_id in file_name:
                obs["outtmpl"] = os.path.join(dl_cache, file_name)

        success, message = YtWrap(obs, self.config).download(youtube_id)
        if not success:
            self._handle_error(youtube_id, message)

        if self.obs["writethumbnail"]:
            # webp files don't get cleaned up automatically
            all_cached = ignore_filelist(os.listdir(dl_cache))
            to_clean = [i for i in all_cached if not i.endswith(".mp4")]
            for file_name in to_clean:
                file_path = os.path.join(dl_cache, file_name)
                os.remove(file_path)

        return success

    @staticmethod
    def _handle_error(youtube_id, message):
        """store error message"""
        data = {"doc": {"message": message}}
        _, _ = ElasticWrap(f"ta_download/_update/{youtube_id}").post(data=data)

    def move_to_archive(self, vid_dict):
        """move downloaded video from cache to archive"""
        videos = self.config["application"]["videos"]
        host_uid = self.config["application"]["HOST_UID"]
        host_gid = self.config["application"]["HOST_GID"]
        # make folder
        folder = os.path.join(videos, vid_dict["channel"]["channel_id"])
        if not os.path.exists(folder):
            os.makedirs(folder)
            if host_uid and host_gid:
                os.chown(folder, host_uid, host_gid)
        # move media file
        media_file = vid_dict["youtube_id"] + ".mp4"
        cache_dir = self.config["application"]["cache_dir"]
        old_path = os.path.join(cache_dir, "download", media_file)
        new_path = os.path.join(videos, vid_dict["media_url"])
        # move media file and fix permission
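        # copyfile skips permission bits that can fail on some filesystems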
        shutil.move(old_path, new_path, copy_function=shutil.copyfile)
        if host_uid and host_gid:
            os.chown(new_path, host_uid, host_gid)

    @staticmethod
    def _delete_from_pending(youtube_id):
"""delete downloaded video from pending index if its there"""
        path = f"ta_download/_doc/{youtube_id}?refresh=true"
        _, _ = ElasticWrap(path).delete()

    def _add_subscribed_channels(self):
"""add all channels subscribed to refresh"""
        all_subscribed = PlaylistSubscription().get_playlists()
        if not all_subscribed:
            return

        channel_ids = [i["playlist_channel_id"] for i in all_subscribed]
        for channel_id in channel_ids:
            self.channels.add(channel_id)

        return

    def _reset_auto(self):
        """reset autostart to defaults after queue stop"""
        path = "ta_download/_update_by_query"
        data = {
            "query": {"term": {"auto_start": {"value": True}}},
            "script": {
                "source": "ctx._source.auto_start = false",
                "lang": "painless",
            },
        }
        response, _ = ElasticWrap(path).post(data=data)
        updated = response.get("updated")
        if updated:
            print(f"[download] reset auto start on {updated} videos.")