Clark 85b56300b3
Move user configuration from Redis to ES (#533)
* ES Client must bootstrap itself to be the source of config

If this is not done a cyclic loop is created between the config loader and the ES client.
This lays the groundwork for ES being the source of all app config.

* auto_download is not used anymore

* Add UserConfig class that encapsulates user config storage

This class will allow the rest of the code to 'not care' about how user properties are stored.
This requires the addition of a ta_users index in ES.

* Create migration task for user config transfer

* Replace getters and setters for each property

Strongly type the user configuration
Migrate missed sponsorblock ID

* Other DB settings will be another PR
2023-09-21 21:46:55 +07:00

424 lines
15 KiB
Python

"""
functionality:
- handle yt_dlp
- build options and post processor
- download video files
- move to archive
"""
import os
import shutil
from datetime import datetime
from home.src.download.queue import PendingList
from home.src.download.subscriptions import PlaylistSubscription
from home.src.download.yt_dlp_base import YtWrap
from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.index.channel import YoutubeChannel
from home.src.index.comments import CommentList
from home.src.index.playlist import YoutubePlaylist
from home.src.index.video import YoutubeVideo, index_new_video
from home.src.index.video_constants import VideoTypeEnum
from home.src.ta.config import AppConfig
from home.src.ta.helper import ignore_filelist
class DownloadPostProcess:
    """handle task to run after download queue finishes"""

    def __init__(self, download):
        self.download = download
        self.now = int(datetime.now().timestamp())
        self.pending = False

    def run(self):
        """run all post processing steps in order"""
        self.pending = PendingList()
        self.pending.get_download()
        self.pending.get_channels()
        self.pending.get_indexed()
        for step in (
            self.auto_delete_all,
            self.auto_delete_overwrites,
            self.validate_playlists,
            self.get_comments,
        ):
            step()

    def auto_delete_all(self):
        """delete watched videos older than the global config threshold"""
        days = self.download.config["downloads"]["autodelete_days"]
        if not days:
            return

        print(f"auto delete older than {days} days")
        threshold = self.now - days * 24 * 60 * 60
        query = {
            "query": {"range": {"player.watched_date": {"lte": threshold}}},
            "sort": [{"player.watched_date": {"order": "asc"}}],
        }
        self._auto_delete_watched(query)

    def auto_delete_overwrites(self):
        """apply per channel auto delete from channel overwrites"""
        for channel_id, overwrites in self.pending.channel_overwrites.items():
            if "autodelete_days" not in overwrites:
                continue

            days = overwrites.get("autodelete_days")
            print(f"{channel_id}: delete older than {days}d")
            threshold = self.now - days * 24 * 60 * 60
            query = {
                "query": {
                    "bool": {
                        "must": [
                            {
                                "range": {
                                    "player.watched_date": {"lte": threshold}
                                }
                            },
                            {
                                "term": {
                                    "channel.channel_id": {
                                        "value": channel_id
                                    }
                                }
                            },
                        ]
                    }
                },
                "sort": [{"player.watched_date": {"order": "desc"}}],
            }
            self._auto_delete_watched(query)

    @staticmethod
    def _auto_delete_watched(data):
        """delete all videos matching the query and ignore them afterwards"""
        matches = IndexPaginate("ta_video", data).get_results()
        if not matches:
            return

        for hit in matches:
            video_id = hit["youtube_id"]
            print(f"{video_id}: auto delete video")
            YoutubeVideo(video_id).delete_media_file()

        print("add deleted to ignore list")
        urls = [{"type": "video", "url": hit["youtube_id"]} for hit in matches]
        pending_handler = PendingList(youtube_ids=urls)
        pending_handler.parse_url_list()
        pending_handler.add_to_pending(status="ignore")

    def validate_playlists(self):
        """check downloaded channels for playlists needing an update"""
        for channel_index, channel_id in enumerate(self.download.channels):
            channel = YoutubeChannel(channel_id, task=self.download.task)
            overwrites = self.pending.channel_overwrites.get(channel_id, False)
            if overwrites and overwrites.get("index_playlists"):
                # channel is set to index playlists: validate from remote
                channel.index_channel_playlists()
                continue

            # otherwise only revalidate locally indexed playlists
            local_playlists = channel.get_indexed_playlists(active_only=True)
            playlist_ids = [i["playlist_id"] for i in local_playlists]
            self._validate_channel_playlist(playlist_ids, channel_index)

    def _validate_channel_playlist(self, all_channel_playlist, id_c):
        """rebuild each playlist of a channel and sync its video list"""
        known_ids = [i["youtube_id"] for i in self.pending.all_videos]
        for id_p, playlist_id in enumerate(all_channel_playlist):
            playlist = YoutubePlaylist(playlist_id)
            playlist.all_youtube_ids = known_ids
            playlist.build_json(scrape=True)
            if not playlist.json_data:
                # playlist is gone upstream, mark it inactive
                playlist.deactivate()
                continue

            playlist.add_vids_to_playlist()
            playlist.upload_to_es()
            self._notify_playlist_progress(all_channel_playlist, id_c, id_p)

    def _notify_playlist_progress(self, all_channel_playlist, id_c, id_p):
        """send playlist validation progress to the UI task"""
        task = self.download.task
        if not task:
            return

        channel_count = len(self.download.channels)
        playlist_count = len(all_channel_playlist)
        task.send_progress(
            [
                f"Post Processing Channels: {id_c}/{channel_count}",
                f"Validate Playlists {id_p + 1}/{playlist_count}",
            ],
            progress=(id_c + 1) / channel_count,
        )

    def get_comments(self):
        """fetch comments for all freshly downloaded videos"""
        CommentList(self.download.videos, task=self.download.task).index()
class VideoDownloader:
    """
    handle the video download functionality
    if not initiated with list, take from queue
    """

    def __init__(self, youtube_id_list=False, task=False):
        self.obs = False  # yt-dlp options dict, built by _build_obs
        self.video_overwrites = False  # per video overwrites from pending
        self.youtube_id_list = youtube_id_list
        self.task = task
        self.config = AppConfig().config
        self._build_obs()
        self.channels = set()  # channel ids of downloaded videos
        self.videos = set()  # youtube ids of downloaded videos

    def run_queue(self, auto_only=False):
        """setup download queue in redis loop until no more items

        return set of successfully downloaded youtube ids
        """
        self._get_overwrites()
        while True:
            video_data = self._get_next(auto_only)
            # stop on empty queue or when task got stopped from the UI;
            # guard self.task, the downloader can run without a task
            if not video_data or (self.task and self.task.is_stopped()):
                self._reset_auto()
                break

            youtube_id = video_data.get("youtube_id")
            print(f"{youtube_id}: Downloading video")
            self._notify(video_data, "Validate download format")

            success = self._dl_single_vid(youtube_id)
            if not success:
                # error is stored on the queue item, move on to next
                continue

            self._notify(video_data, "Add video metadata to index")
            vid_dict = index_new_video(
                youtube_id,
                video_overwrites=self.video_overwrites,
                video_type=VideoTypeEnum(video_data["vid_type"]),
            )
            self.channels.add(vid_dict["channel"]["channel_id"])
            self.videos.add(vid_dict["youtube_id"])

            self._notify(video_data, "Move downloaded file to archive")
            self.move_to_archive(vid_dict)
            self._delete_from_pending(youtube_id)

        # post processing
        self._add_subscribed_channels()
        DownloadPostProcess(self).run()

        return self.videos

    def _notify(self, video_data, message):
        """send progress notification to task, noop without task"""
        if not self.task:
            return

        typ = VideoTypeEnum(video_data["vid_type"]).value.rstrip("s").title()
        title = video_data.get("title")
        self.task.send_progress([f"Processing {typ}: {title}", message])

    def _get_next(self, auto_only):
        """get next pending item from ES queue, or False when empty

        auto_start items sort first, then oldest timestamp
        """
        must_list = [{"term": {"status": {"value": "pending"}}}]
        # skip items that already failed with an error message
        must_not_list = [{"exists": {"field": "message"}}]
        if auto_only:
            must_list.append({"term": {"auto_start": {"value": True}}})

        data = {
            "size": 1,
            "query": {"bool": {"must": must_list, "must_not": must_not_list}},
            "sort": [
                {"auto_start": {"order": "desc"}},
                {"timestamp": {"order": "asc"}},
            ],
        }
        path = "ta_download/_search"
        response, _ = ElasticWrap(path).get(data=data)
        if not response["hits"]["hits"]:
            return False

        return response["hits"]["hits"][0]["_source"]

    def _get_overwrites(self):
        """get per video download format overwrites from pending list"""
        pending = PendingList()
        pending.get_download()
        pending.get_channels()
        self.video_overwrites = pending.video_overwrites

    def _progress_hook(self, response):
        """process the progress_hooks from yt_dlp

        response is the yt-dlp status dict, keys are not guaranteed
        """
        progress = False
        try:
            # keys can be missing or None depending on download state,
            # fall back to a generic message in that case
            size = response.get("_total_bytes_str") or "N/A"
            if size.strip() == "N/A":
                size = response.get("_total_bytes_estimate_str", "N/A")

            percent = response["_percent_str"]
            progress = float(percent.strip("%")) / 100
            speed = response["_speed_str"]
            eta = response["_eta_str"]
            message = f"{percent} of {size} at {speed} - time left: {eta}"
        except (KeyError, AttributeError, ValueError):
            message = "processing"

        if self.task:
            title = response["info_dict"]["title"]
            self.task.send_progress([title, message], progress=progress)

    def _build_obs(self):
        """collection to build all obs passed to yt-dlp"""
        self._build_obs_basic()
        self._build_obs_user()
        self._build_obs_postprocessors()

    def _build_obs_basic(self):
        """initial static obs, download into cache dir as mp4"""
        self.obs = {
            "merge_output_format": "mp4",
            "outtmpl": (
                self.config["application"]["cache_dir"]
                + "/download/%(id)s.mp4"
            ),
            "progress_hooks": [self._progress_hook],
            "noprogress": True,
            "continuedl": True,
            "writethumbnail": False,
            "noplaylist": True,
        }

    def _build_obs_user(self):
        """add user customized options from app config"""
        if self.config["downloads"]["format"]:
            self.obs["format"] = self.config["downloads"]["format"]
        if self.config["downloads"]["format_sort"]:
            format_sort = self.config["downloads"]["format_sort"]
            format_sort_list = [i.strip() for i in format_sort.split(",")]
            self.obs["format_sort"] = format_sort_list
        if self.config["downloads"]["limit_speed"]:
            # config value is KB/s, yt-dlp expects bytes/s
            self.obs["ratelimit"] = (
                self.config["downloads"]["limit_speed"] * 1024
            )

        throttle = self.config["downloads"]["throttledratelimit"]
        if throttle:
            self.obs["throttledratelimit"] = throttle * 1024

    def _build_obs_postprocessors(self):
        """add postprocessors to obs based on config"""
        postprocessors = []

        if self.config["downloads"]["add_metadata"]:
            postprocessors.append(
                {
                    "key": "FFmpegMetadata",
                    "add_chapters": True,
                    "add_metadata": True,
                }
            )
            postprocessors.append(
                {
                    "key": "MetadataFromField",
                    "formats": [
                        "%(title)s:%(meta_title)s",
                        "%(uploader)s:%(meta_artist)s",
                        ":(?P<album>)",
                    ],
                    "when": "pre_process",
                }
            )

        if self.config["downloads"]["add_thumbnail"]:
            postprocessors.append(
                {
                    "key": "EmbedThumbnail",
                    "already_have_thumbnail": True,
                }
            )
            self.obs["writethumbnail"] = True

        self.obs["postprocessors"] = postprocessors

    def get_format_overwrites(self, youtube_id):
        """return download format overwrite for single video, or False"""
        overwrites = self.video_overwrites.get(youtube_id, False)
        if overwrites:
            return overwrites.get("download_format", False)

        return False

    def _dl_single_vid(self, youtube_id):
        """download single video, return True on success"""
        obs = self.obs.copy()
        format_overwrite = self.get_format_overwrites(youtube_id)
        if format_overwrite:
            obs["format"] = format_overwrite

        dl_cache = self.config["application"]["cache_dir"] + "/download/"

        # check if already in cache to continue from there
        all_cached = ignore_filelist(os.listdir(dl_cache))
        for file_name in all_cached:
            if youtube_id in file_name:
                obs["outtmpl"] = os.path.join(dl_cache, file_name)

        success, message = YtWrap(obs, self.config).download(youtube_id)
        if not success:
            self._handle_error(youtube_id, message)

        if self.obs["writethumbnail"]:
            # webp files don't get cleaned up automatically
            all_cached = ignore_filelist(os.listdir(dl_cache))
            to_clean = [i for i in all_cached if not i.endswith(".mp4")]
            for file_name in to_clean:
                file_path = os.path.join(dl_cache, file_name)
                os.remove(file_path)

        return success

    @staticmethod
    def _handle_error(youtube_id, message):
        """store error message on the queue item in ES"""
        data = {"doc": {"message": message}}
        _, _ = ElasticWrap(f"ta_download/_update/{youtube_id}").post(data=data)

    def move_to_archive(self, vid_dict):
        """move downloaded video from cache to archive"""
        videos = self.config["application"]["videos"]
        host_uid = self.config["application"]["HOST_UID"]
        host_gid = self.config["application"]["HOST_GID"]

        # make folder, exist_ok guards against creation race
        folder = os.path.join(videos, vid_dict["channel"]["channel_id"])
        if not os.path.exists(folder):
            os.makedirs(folder, exist_ok=True)
            if host_uid and host_gid:
                os.chown(folder, host_uid, host_gid)

        # move media file
        media_file = vid_dict["youtube_id"] + ".mp4"
        cache_dir = self.config["application"]["cache_dir"]
        old_path = os.path.join(cache_dir, "download", media_file)
        new_path = os.path.join(videos, vid_dict["media_url"])

        # move media file and fix permission
        shutil.move(old_path, new_path, copy_function=shutil.copyfile)
        if host_uid and host_gid:
            os.chown(new_path, host_uid, host_gid)

    @staticmethod
    def _delete_from_pending(youtube_id):
        """delete downloaded video from pending index if its there"""
        path = f"ta_download/_doc/{youtube_id}?refresh=true"
        _, _ = ElasticWrap(path).delete()

    def _add_subscribed_channels(self):
        """add all channels of subscribed playlists for post processing"""
        all_subscribed = PlaylistSubscription().get_playlists()
        if not all_subscribed:
            return

        self.channels.update(
            i["playlist_channel_id"] for i in all_subscribed
        )

    def _reset_auto(self):
        """reset autostart to defaults after queue stop"""
        path = "ta_download/_update_by_query"
        data = {
            "query": {"term": {"auto_start": {"value": True}}},
            "script": {
                "source": "ctx._source.auto_start = false",
                "lang": "painless",
            },
        }
        response, _ = ElasticWrap(path).post(data=data)
        updated = response.get("updated")
        if updated:
            print(f"[download] reset auto start on {updated} videos.")