tubearchivist/tubearchivist/home/src/index/reindex.py

310 lines
11 KiB
Python

"""
functionality:
- periodically refresh documents
- index and update in es
"""
import os
import shutil
from datetime import datetime
from math import ceil
from time import sleep
from home.src.download.queue import PendingList
from home.src.download.thumbnails import ThumbManager
from home.src.download.yt_dlp_base import CookieHandler
from home.src.download.yt_dlp_handler import VideoDownloader
from home.src.es.connect import ElasticWrap
from home.src.index.channel import YoutubeChannel
from home.src.index.playlist import YoutubePlaylist
from home.src.index.video import YoutubeVideo
from home.src.ta.config import AppConfig
class Reindex:
    """check for outdated documents and refresh data from youtube"""

    # es field that marks a document as still active, per index
    MATCH_FIELD = {
        "ta_video": "active",
        "ta_channel": "channel_active",
        "ta_playlist": "playlist_active",
    }
    # overshoot the daily quota by 20% so the queue catches up after
    # skipped runs
    MULTIPLY = 1.2
    # default es max_result_window is 10000, stay below it
    MAX_DAILY = 9999

    def __init__(self):
        # config
        # timestamp() instead of strftime("%s"): "%s" is a glibc-only
        # extension and is not portable; both yield epoch seconds
        self.now = int(datetime.now().timestamp())
        self.config = AppConfig().config
        self.interval = self.config["scheduler"]["check_reindex_days"]
        # scan results, populated by check_outdated()
        self.all_youtube_ids = False
        self.all_channel_ids = False
        self.all_playlist_ids = False

    def check_cookie(self):
        """validate cookie if enabled

        NOTE(review): the validation result is discarded, only the side
        effects of CookieHandler.validate() persist — confirm intended
        """
        if self.config["downloads"]["cookie_import"]:
            valid = CookieHandler(self.config).validate()
            if not valid:
                return

    def _get_daily(self):
        """get daily refresh quotas for videos, channels and playlists

        returns tuple (video_daily, channel_daily, playlist_daily)
        """
        video_daily = self._daily_should("ta_video")
        channel_daily = self._daily_should("ta_channel")
        playlist_daily = self._daily_should("ta_playlist")
        return (video_daily, channel_daily, playlist_daily)

    def _daily_should(self, index):
        """daily refresh quota for a single index

        capped for every index, not only ta_video: es rejects any
        search with size >= 10000 by default
        """
        total = self._get_total_hits(index)
        daily = ceil(total / self.interval * self.MULTIPLY)
        if daily > self.MAX_DAILY:
            daily = self.MAX_DAILY
        return daily

    def _get_total_hits(self, index):
        """get total count of active documents in index"""
        match_field = self.MATCH_FIELD[index]
        # filter_path trims the response down to the hit counter
        path = f"{index}/_search?filter_path=hits.total"
        data = {"query": {"match": {match_field: True}}}
        response, _ = ElasticWrap(path).post(data=data)
        total_hits = response["hits"]["total"]["value"]
        return total_hits

    def _get_unrated_vids(self):
        """get max 200 videos without rating if ryd integration is enabled"""
        data = {
            "size": 200,
            "query": {
                "bool": {
                    "must_not": [{"exists": {"field": "stats.average_rating"}}]
                }
            },
        }
        response, _ = ElasticWrap("ta_video/_search").get(data=data)
        missing_rating = [i["_id"] for i in response["hits"]["hits"]]
        # extend scan result; may overlap with outdated vids
        self.all_youtube_ids = self.all_youtube_ids + missing_rating

    def _get_outdated(self, index, active_field, refresh_field, size):
        """shared query: oldest active documents past the refresh interval

        returns list of document ids, oldest refresh first
        """
        now_lte = self.now - self.interval * 24 * 60 * 60
        data = {
            "size": size,
            "query": {
                "bool": {
                    "must": [
                        {"match": {active_field: True}},
                        {"range": {refresh_field: {"lte": now_lte}}},
                    ]
                }
            },
            "sort": [{refresh_field: {"order": "asc"}}],
            "_source": False,
        }
        response, _ = ElasticWrap(f"{index}/_search").get(data=data)
        return [i["_id"] for i in response["hits"]["hits"]]

    def _get_outdated_vids(self, size):
        """get daily videos to refresh"""
        return self._get_outdated(
            "ta_video", "active", "vid_last_refresh", size
        )

    def _get_outdated_channels(self, size):
        """get daily channels to refresh"""
        return self._get_outdated(
            "ta_channel", "channel_active", "channel_last_refresh", size
        )

    def _get_outdated_playlists(self, size):
        """get daily outdated playlists to refresh"""
        return self._get_outdated(
            "ta_playlist", "playlist_active", "playlist_last_refresh", size
        )

    def check_outdated(self):
        """add missing vids and channels to the scan attributes"""
        video_daily, channel_daily, playlist_daily = self._get_daily()
        self.all_youtube_ids = self._get_outdated_vids(video_daily)
        self.all_channel_ids = self._get_outdated_channels(channel_daily)
        self.all_playlist_ids = self._get_outdated_playlists(playlist_daily)
        integrate_ryd = self.config["downloads"]["integrate_ryd"]
        if integrate_ryd:
            self._get_unrated_vids()

    @staticmethod
    def _reindex_single_video(youtube_id):
        """refresh data for single video, preserving local-only fields"""
        video = YoutubeVideo(youtube_id)
        # read current state to carry over fields youtube can't provide
        video.get_from_es()
        player = video.json_data["player"]
        date_downloaded = video.json_data["date_downloaded"]
        channel_dict = video.json_data["channel"]
        playlist = video.json_data.get("playlist")
        subtitles = video.json_data.get("subtitles")
        # get new metadata from remote
        video.build_json()
        if not video.youtube_meta:
            # no longer available upstream, mark inactive instead
            video.deactivate()
            return
        video.delete_subtitles(subtitles=subtitles)
        video.check_subtitles()
        # add back local state
        video.json_data["player"] = player
        video.json_data["date_downloaded"] = date_downloaded
        video.json_data["channel"] = channel_dict
        if playlist:
            video.json_data["playlist"] = playlist
        video.upload_to_es()
        # refresh thumbnail from the (possibly changed) remote url
        thumb_handler = ThumbManager()
        thumb_handler.delete_vid_thumb(youtube_id)
        to_download = (youtube_id, video.json_data["vid_thumb_url"])
        thumb_handler.download_vid([to_download], notify=False)
        return

    @staticmethod
    def _reindex_single_channel(channel_id):
        """refresh channel data and sync to videos"""
        channel = YoutubeChannel(channel_id)
        channel.get_from_es()
        # carry over local-only fields
        subscribed = channel.json_data["channel_subscribed"]
        overwrites = channel.json_data.get("channel_overwrites", False)
        channel.get_from_youtube()
        channel.json_data["channel_subscribed"] = subscribed
        if overwrites:
            channel.json_data["channel_overwrites"] = overwrites
        channel.upload_to_es()
        channel.sync_to_videos()

    @staticmethod
    def _reindex_single_playlist(playlist_id, all_indexed_ids):
        """refresh playlist data, deactivate if gone upstream"""
        playlist = YoutubePlaylist(playlist_id)
        playlist.get_from_es()
        subscribed = playlist.json_data["playlist_subscribed"]
        playlist.all_youtube_ids = all_indexed_ids
        playlist.build_json(scrape=True)
        if not playlist.json_data:
            playlist.deactivate()
            return
        playlist.json_data["playlist_subscribed"] = subscribed
        playlist.upload_to_es()
        return

    def reindex(self):
        """reindex what's needed, call check_outdated() first"""
        # sleep between requests to not hammer youtube
        sleep_interval = self.config["downloads"]["sleep_interval"]
        # videos
        print(f"reindexing {len(self.all_youtube_ids)} videos")
        for youtube_id in self.all_youtube_ids:
            try:
                self._reindex_single_video(youtube_id)
            except FileNotFoundError:
                # handle channel name change here
                ChannelUrlFixer(youtube_id, self.config).run()
                self._reindex_single_video(youtube_id)
            if sleep_interval:
                sleep(sleep_interval)
        # channels
        print(f"reindexing {len(self.all_channel_ids)} channels")
        for channel_id in self.all_channel_ids:
            self._reindex_single_channel(channel_id)
            if sleep_interval:
                sleep(sleep_interval)
        # playlist
        print(f"reindexing {len(self.all_playlist_ids)} playlists")
        if self.all_playlist_ids:
            handler = PendingList()
            handler.get_download()
            handler.get_indexed()
            all_indexed_ids = [i["youtube_id"] for i in handler.all_videos]
            for playlist_id in self.all_playlist_ids:
                self._reindex_single_playlist(playlist_id, all_indexed_ids)
                if sleep_interval:
                    sleep(sleep_interval)
class ChannelUrlFixer:
    """fix not matching channel names in reindex"""

    def __init__(self, youtube_id, config):
        self.youtube_id = youtube_id
        self.config = config
        # populated lazily by get_as_is()
        self.video = False

    def run(self):
        """check whether the on-disk folder matches, fix if needed"""
        print(f"{self.youtube_id}: failed to build channel path, try to fix.")
        path_on_disk, folder_on_disk = self.get_as_is()
        if not os.path.exists(path_on_disk):
            print(f"giving up reindex, video in video: {self.video.json_data}")
            raise ValueError

        _, expected_folder = self.get_as_should()

        if folder_on_disk == expected_folder:
            print(f"{self.youtube_id}: skip channel url fixer")
            return

        self.process(path_on_disk)

    def get_as_is(self):
        """load the video document and resolve its current media path"""
        self.video = YoutubeVideo(self.youtube_id)
        self.video.get_from_es()
        base_folder = self.config["application"]["videos"]
        current_path = os.path.join(
            base_folder, self.video.json_data["media_url"]
        )
        current_folder, _ = os.path.split(current_path)
        return current_path, current_folder

    def get_as_should(self):
        """refresh metadata from remote and resolve the expected path"""
        self.video.get_from_youtube()
        self.video.add_file_path()
        base_folder = self.config["application"]["videos"]
        expected_path = os.path.join(
            base_folder, self.video.json_data["media_url"]
        )
        expected_folder, _ = os.path.split(expected_path)
        return expected_path, expected_folder

    def process(self, video_path_is):
        """move the file through the cache and re-archive it"""
        print(f"{self.youtube_id}: fixing channel rename.")
        cache_dir = self.config["application"]["cache_dir"]
        staging_path = os.path.join(
            cache_dir, "download", self.youtube_id + ".mp4"
        )
        shutil.move(video_path_is, staging_path)
        VideoDownloader().move_to_archive(self.video.json_data)
        self.video.update_media_url()