"""
functionality:
- periodically refresh documents
- index and update in es
"""

import json
import os
import shutil
from datetime import datetime
from time import sleep

from home.src.download.queue import PendingList
from home.src.download.subscriptions import ChannelSubscription
from home.src.download.thumbnails import ThumbManager
from home.src.download.yt_dlp_base import CookieHandler
from home.src.download.yt_dlp_handler import VideoDownloader
from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.index.channel import YoutubeChannel
from home.src.index.comments import Comments
from home.src.index.playlist import YoutubePlaylist
from home.src.index.video import YoutubeVideo
from home.src.ta.config import AppConfig
from home.src.ta.ta_redis import RedisQueue


class ReindexBase:
    """base config class for reindex task"""

    REINDEX_CONFIG = {
        "video": {
            "index_name": "ta_video",
            "queue_name": "reindex:ta_video",
            "active_key": "active",
            "refresh_key": "vid_last_refresh",
        },
        "channel": {
            "index_name": "ta_channel",
            "queue_name": "reindex:ta_channel",
            "active_key": "channel_active",
            "refresh_key": "channel_last_refresh",
        },
        "playlist": {
            "index_name": "ta_playlist",
            "queue_name": "reindex:ta_playlist",
            "active_key": "playlist_active",
            "refresh_key": "playlist_last_refresh",
        },
    }

    MULTIPLY = 1.2
    DAYS3 = 60 * 60 * 24 * 3

    def __init__(self):
        self.config = AppConfig().config
        self.now = int(datetime.now().timestamp())

    def populate(self, all_ids, reindex_config):
        """add all to reindex ids to redis queue"""
        if not all_ids:
            return

        RedisQueue(queue_name=reindex_config["queue_name"]).add_list(all_ids)
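
# A quick sketch of how this base class is used: each REINDEX_CONFIG entry
# pairs an ES index with its redis queue and refresh fields, so subclasses
# only pass ids plus a config dict around. With hypothetical ids:
#
#   base = ReindexBase()
#   base.populate(["video_id_1", "video_id_2"], base.REINDEX_CONFIG["video"])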


class ReindexPopulate(ReindexBase):
    """add outdated and recent documents to reindex queue"""

    def __init__(self):
        super().__init__()
        self.interval = self.config["scheduler"]["check_reindex_days"]

    def add_recent(self):
        """add recent videos to refresh"""
        gte = datetime.fromtimestamp(self.now - self.DAYS3).date().isoformat()
        must_list = [
            {"term": {"active": {"value": True}}},
            {"range": {"published": {"gte": gte}}},
        ]
        data = {
            "size": 10000,
            "query": {"bool": {"must": must_list}},
            "sort": [{"published": {"order": "desc"}}],
        }
        response, _ = ElasticWrap("ta_video/_search").get(data=data)
        hits = response["hits"]["hits"]
        if not hits:
            return

        all_ids = [i["_source"]["youtube_id"] for i in hits]
        reindex_config = self.REINDEX_CONFIG.get("video")
        self.populate(all_ids, reindex_config)

    def add_outdated(self):
        """add outdated documents"""
        for reindex_config in self.REINDEX_CONFIG.values():
            total_hits = self._get_total_hits(reindex_config)
            daily_should = self._get_daily_should(total_hits)
            all_ids = self._get_outdated_ids(reindex_config, daily_should)
            self.populate(all_ids, reindex_config)

    @staticmethod
    def _get_total_hits(reindex_config):
        """get total hits from index"""
        index_name = reindex_config["index_name"]
        active_key = reindex_config["active_key"]
        path = f"{index_name}/_search?filter_path=hits.total"
        data = {"query": {"match": {active_key: True}}}
        response, _ = ElasticWrap(path).post(data=data)
        total_hits = response["hits"]["total"]["value"]
        return total_hits

    def _get_daily_should(self, total_hits):
        """calc how many should reindex daily"""
        daily_should = int((total_hits // self.interval + 1) * self.MULTIPLY)
        if daily_should >= 10000:
            daily_should = 9999

        return daily_should
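
    # Worked example for _get_daily_should, assuming 9000 active videos and
    # check_reindex_days set to 90: 9000 // 90 + 1 = 101 docs, then
    # int(101 * 1.2) = 121 per day, capped at 9999 to stay inside the
    # default ES result window of 10000.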

    def _get_outdated_ids(self, reindex_config, daily_should):
        """get outdated from index_name"""
        index_name = reindex_config["index_name"]
        refresh_key = reindex_config["refresh_key"]
        now_lte = self.now - self.interval * 24 * 60 * 60
        must_list = [
            {"match": {reindex_config["active_key"]: True}},
            {"range": {refresh_key: {"lte": now_lte}}},
        ]
        data = {
            "size": daily_should,
            "query": {"bool": {"must": must_list}},
            "sort": [{refresh_key: {"order": "asc"}}],
            "_source": False,
        }
        response, _ = ElasticWrap(f"{index_name}/_search").get(data=data)

        all_ids = [i["_id"] for i in response["hits"]["hits"]]
        return all_ids
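
    # For the "video" config above, the generated search body looks like
    # this (size and epoch timestamp illustrative):
    #   {
    #       "size": 121,
    #       "query": {"bool": {"must": [
    #           {"match": {"active": True}},
    #           {"range": {"vid_last_refresh": {"lte": 1670000000}}},
    #       ]}},
    #       "sort": [{"vid_last_refresh": {"order": "asc"}}],
    #       "_source": False,
    #   }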


class ReindexManual(ReindexBase):
    """
    manually add ids to reindex queue from API
    data_example = {
        "video": ["video1", "video2", "video3"],
        "channel": ["channel1", "channel2", "channel3"],
        "playlist": ["playlist1", "playlist2"],
    }
    extract_videos to also reindex all videos of channel/playlist
    """

    def __init__(self, extract_videos=False):
        super().__init__()
        self.extract_videos = extract_videos
        self.data = False

    def extract_data(self, data):
        """process data"""
        self.data = data
        for key, values in self.data.items():
            reindex_config = self.REINDEX_CONFIG.get(key)
            if not reindex_config:
                print(f"reindex type {key} not valid")
                raise ValueError

            self.process_index(reindex_config, values)

    def process_index(self, index_config, values):
        """process values per index"""
        index_name = index_config["index_name"]
        if index_name == "ta_video":
            self._add_videos(values)
        elif index_name == "ta_channel":
            self._add_channels(values)
        elif index_name == "ta_playlist":
            self._add_playlists(values)

    def _add_videos(self, values):
        """add list of videos to reindex queue"""
        if not values:
            return

        RedisQueue("reindex:ta_video").add_list(values)

    def _add_channels(self, values):
        """add list of channels to reindex queue"""
        RedisQueue("reindex:ta_channel").add_list(values)

        if self.extract_videos:
            for channel_id in values:
                all_videos = self._get_channel_videos(channel_id)
                self._add_videos(all_videos)

    def _add_playlists(self, values):
        """add list of playlists to reindex queue"""
        RedisQueue("reindex:ta_playlist").add_list(values)

        if self.extract_videos:
            for playlist_id in values:
                all_videos = self._get_playlist_videos(playlist_id)
                self._add_videos(all_videos)

    def _get_channel_videos(self, channel_id):
        """get all videos from channel"""
        data = {
            "query": {"term": {"channel.channel_id": {"value": channel_id}}},
            "_source": ["youtube_id"],
        }
        all_results = IndexPaginate("ta_video", data).get_results()
        return [i["youtube_id"] for i in all_results]

    def _get_playlist_videos(self, playlist_id):
        """get all videos from playlist"""
        data = {
            "query": {"term": {"playlist.keyword": {"value": playlist_id}}},
            "_source": ["youtube_id"],
        }
        all_results = IndexPaginate("ta_video", data).get_results()
        return [i["youtube_id"] for i in all_results]
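
# Usage sketch, mirroring what the API endpoint sends (channel id is
# hypothetical): queue one channel plus all of its indexed videos.
#
#   ReindexManual(extract_videos=True).extract_data(
#       {"channel": ["UCxxxxxxxxxxxxxxxxxxxxxx"]}
#   )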


class Reindex(ReindexBase):
    """reindex all documents from redis queue"""

    def __init__(self, task=False):
        super().__init__()
        self.task = task
        self.all_indexed_ids = False

    def reindex_all(self):
        """reindex all in queue"""
        if not self.cookie_is_valid():
            print("[reindex] cookie invalid, exiting...")
            return

        for name, index_config in self.REINDEX_CONFIG.items():
            if not RedisQueue(index_config["queue_name"]).has_item():
                continue

            total = RedisQueue(index_config["queue_name"]).length()
            while True:
                has_next = self.reindex_index(name, index_config, total)
                if not has_next:
                    break

    def reindex_index(self, name, index_config, total):
        """reindex all of a single index"""
        reindex = self.get_reindex_map(index_config["index_name"])
        youtube_id = RedisQueue(index_config["queue_name"]).get_next()
        if youtube_id:
            self._notify(name, index_config, total)
            reindex(youtube_id)
            sleep_interval = self.config["downloads"].get("sleep_interval", 0)
            sleep(sleep_interval)

        return bool(youtube_id)

    def get_reindex_map(self, index_name):
        """return def to run for index"""
        def_map = {
            "ta_video": self._reindex_single_video,
            "ta_channel": self._reindex_single_channel,
            "ta_playlist": self._reindex_single_playlist,
        }

        return def_map.get(index_name)

    def _notify(self, name, index_config, total):
        """send notification back to task"""
        remaining = RedisQueue(index_config["queue_name"]).length()
        idx = total - remaining
        message = [f"Reindexing {name.title()}s {idx}/{total}"]
        progress = idx / total
        self.task.send_progress(message, progress=progress)
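
    # Progress example: with total=50 videos queued at the start and 30
    # still remaining, idx is 20, so the task shows "Reindexing Videos
    # 20/50" at progress 0.4.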

    def _reindex_single_video(self, youtube_id):
        """wrapper to handle channel name changes"""
        try:
            self._reindex_single_video_call(youtube_id)
        except FileNotFoundError:
            # media file not at the expected path, likely a channel rename:
            # fix the folder first, then retry once
            ChannelUrlFixer(youtube_id, self.config).run()
            self._reindex_single_video_call(youtube_id)

    def _reindex_single_video_call(self, youtube_id):
        """refresh data for single video"""
        video = YoutubeVideo(youtube_id)

        # read current state
        video.get_from_es()
        es_meta = video.json_data.copy()

        # get new
        video.build_json()
        if not video.youtube_meta:
            video.deactivate()
            return

        video.delete_subtitles(subtitles=es_meta.get("subtitles"))
        video.check_subtitles()

        # add back
        video.json_data["player"] = es_meta.get("player")
        video.json_data["date_downloaded"] = es_meta.get("date_downloaded")
        video.json_data["channel"] = es_meta.get("channel")
        if es_meta.get("playlist"):
            video.json_data["playlist"] = es_meta.get("playlist")

        video.upload_to_es()

        if es_meta.get("media_url") != video.json_data["media_url"]:
            self._rename_media_file(
                es_meta.get("media_url"), video.json_data["media_url"]
            )

        thumb_handler = ThumbManager(youtube_id)
        thumb_handler.delete_video_thumb()
        thumb_handler.download_video_thumb(video.json_data["vid_thumb_url"])

        Comments(youtube_id, config=self.config).reindex_comments()
        return

    def _rename_media_file(self, media_url_is, media_url_should):
        """handle title change"""
        print(f"[reindex] fix media_url {media_url_is} to {media_url_should}")
        videos = self.config["application"]["videos"]
        old_path = os.path.join(videos, media_url_is)
        new_path = os.path.join(videos, media_url_should)
        # both paths live under the videos folder, so a plain rename is safe
        os.rename(old_path, new_path)

    @staticmethod
    def _reindex_single_channel(channel_id):
        """refresh channel data and sync to videos"""
        # read current state
        channel = YoutubeChannel(channel_id)
        channel.get_from_es()
        es_meta = channel.json_data.copy()

        # get new
        channel.get_from_youtube()
        if not channel.youtube_meta:
            channel.deactivate()
            channel.get_from_es()
            channel.sync_to_videos()
            return

        channel.process_youtube_meta()
        channel.get_channel_art()

        # add back
        channel.json_data["channel_subscribed"] = es_meta["channel_subscribed"]
        overwrites = es_meta.get("channel_overwrites")
        if overwrites:
            channel.json_data["channel_overwrites"] = overwrites

        channel.upload_to_es()
        ChannelFullScan(channel_id).scan()

    def _reindex_single_playlist(self, playlist_id):
        """refresh playlist data"""
        self._get_all_videos()
        playlist = YoutubePlaylist(playlist_id)
        playlist.get_from_es()
        subscribed = playlist.json_data["playlist_subscribed"]
        playlist.all_youtube_ids = self.all_indexed_ids
        playlist.build_json(scrape=True)
        if not playlist.json_data:
            playlist.deactivate()
            return

        playlist.json_data["playlist_subscribed"] = subscribed
        playlist.upload_to_es()
        return

    def _get_all_videos(self):
        """add all videos for playlist index validation"""
        if self.all_indexed_ids:
            return

        handler = PendingList()
        handler.get_download()
        handler.get_indexed()
        self.all_indexed_ids = [i["youtube_id"] for i in handler.all_videos]

    def cookie_is_valid(self):
        """return true if cookie is enabled and valid"""
        if not self.config["downloads"]["cookie_import"]:
            # is not activated, continue reindex
            return True

        valid = CookieHandler(self.config).validate()
        return valid
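
# Usage sketch: a periodic task would fill the queues first, then drain
# them. The task object is assumed to expose send_progress(), as used by
# _notify() above.
#
#   populate = ReindexPopulate()
#   populate.add_recent()
#   populate.add_outdated()
#   Reindex(task=task).reindex_all()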


class ReindexProgress(ReindexBase):
    """
    get progress of reindex task
    request_type: key of self.REINDEX_CONFIG
    request_id: id of request_type
    return = {
        "state": "running" | "queued" | False
        "total_queued": int
        "in_queue_name": "queue_name"
    }
    """

    def __init__(self, request_type=False, request_id=False):
        super().__init__()
        self.request_type = request_type
        self.request_id = request_id

    def get_progress(self):
        """get progress from task"""
        queue_name, request_type = self._get_queue_name()
        total = self._get_total_in_queue(queue_name)
        progress = {
            "total_queued": total,
            "type": request_type,
        }
        state = self._get_state(total, queue_name)
        progress.update(state)

        return progress

    def _get_queue_name(self):
        """return queue_name, queue_type, raise exception on error"""
        if not self.request_type:
            return "all", "all"

        reindex_config = self.REINDEX_CONFIG.get(self.request_type)
        if not reindex_config:
            print(f"reindex_config not found: {self.request_type}")
            raise ValueError

        return reindex_config["queue_name"], self.request_type

    def _get_total_in_queue(self, queue_name):
        """get all items in queue"""
        total = 0
        if queue_name == "all":
            queues = [i["queue_name"] for i in self.REINDEX_CONFIG.values()]
            for queue in queues:
                total += len(RedisQueue(queue).get_all())
        else:
            total += len(RedisQueue(queue_name).get_all())

        return total

    def _get_state(self, total, queue_name):
        """get state based on request_id"""
        state_dict = {}
        if self.request_id:
            state = RedisQueue(queue_name).in_queue(self.request_id)
            state_dict.update({"id": self.request_id, "state": state})
            return state_dict

        if total:
            state = "running"
        else:
            state = "empty"

        state_dict.update({"state": state})
        return state_dict
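
# Example (hypothetical id): query the state of one queued video.
#
#   progress = ReindexProgress(
#       request_type="video", request_id="video_id_1"
#   ).get_progress()
#
# Depending on queue state, this returns something like
# {"total_queued": 3, "type": "video", "id": "video_id_1", "state": "queued"}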


class ChannelUrlFixer:
    """fix not matching channel names in reindex"""

    def __init__(self, youtube_id, config):
        self.youtube_id = youtube_id
        self.config = config
        self.video = False

    def run(self):
        """check and run if needed"""
        print(f"{self.youtube_id}: failed to build channel path, try to fix.")
        video_path_is, video_folder_is = self.get_as_is()
        if not os.path.exists(video_path_is):
            print(f"giving up reindex, video not found: {self.video.json_data}")
            raise ValueError

        _, video_folder_should = self.get_as_should()
        if video_folder_is != video_folder_should:
            self.process(video_path_is)
        else:
            print(f"{self.youtube_id}: skip channel url fixer")

    def get_as_is(self):
        """get video object as is"""
        self.video = YoutubeVideo(self.youtube_id)
        self.video.get_from_es()
        video_path_is = os.path.join(
            self.config["application"]["videos"],
            self.video.json_data["media_url"],
        )
        video_folder_is = os.path.split(video_path_is)[0]
        return video_path_is, video_folder_is

    def get_as_should(self):
        """add fresh metadata from remote"""
        self.video.get_from_youtube()
        self.video.add_file_path()
        video_path_should = os.path.join(
            self.config["application"]["videos"],
            self.video.json_data["media_url"],
        )
        video_folder_should = os.path.split(video_path_should)[0]
        return video_path_should, video_folder_should

    def process(self, video_path_is):
        """fix filepath"""
        print(f"{self.youtube_id}: fixing channel rename.")
        cache_dir = self.config["application"]["cache_dir"]
        new_path = os.path.join(cache_dir, "download", self.youtube_id + ".mp4")
        # move to the download cache first, then let the downloader rebuild
        # the expected channel folder; copyfile instead of the default copy2
        # skips copying file metadata, which can fail across filesystems
        shutil.move(video_path_is, new_path, copy_function=shutil.copyfile)
        VideoDownloader().move_to_archive(self.video.json_data)
        self.video.update_media_url()


class ChannelFullScan:
    """
    update from v0.3.0 to v0.3.1
    full scan of channel to fix vid_type mismatch
    """

    def __init__(self, channel_id):
        self.channel_id = channel_id
        self.to_update = False

    def scan(self):
        """match local with remote"""
        print(f"{self.channel_id}: start full scan")
        all_local_videos = self._get_all_local()
        all_remote_videos = self._get_all_remote()
        self.to_update = []
        for video in all_local_videos:
            video_id = video["youtube_id"]
            remote_match = [i for i in all_remote_videos if i[0] == video_id]
            if not remote_match:
                print(f"{video_id}: no remote match found")
                continue

            expected_type = remote_match[0][-1]
            if video["vid_type"] != expected_type:
                self.to_update.append(
                    {
                        "video_id": video_id,
                        "vid_type": expected_type,
                    }
                )

        self.update()

    def _get_all_remote(self):
        """get all channel videos"""
        sub = ChannelSubscription()
        all_remote_videos = sub.get_last_youtube_videos(
            self.channel_id, limit=False
        )
        return all_remote_videos

    def _get_all_local(self):
        """get all local indexed channel_videos"""
        channel = YoutubeChannel(self.channel_id)
        all_local_videos = channel.get_channel_videos()
        return all_local_videos

    def update(self):
        """build bulk query for updates"""
        if not self.to_update:
            print(f"{self.channel_id}: nothing to update")
            return

        print(f"{self.channel_id}: fixing {len(self.to_update)} videos")
        bulk_list = []
        for video in self.to_update:
            action = {
                "update": {"_id": video.get("video_id"), "_index": "ta_video"}
            }
            source = {"doc": {"vid_type": video.get("vid_type")}}
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(source))

        # add last newline
        bulk_list.append("\n")
        data = "\n".join(bulk_list)
        _, _ = ElasticWrap("_bulk").post(data=data, ndjson=True)
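
# The payload sent to _bulk is NDJSON: one action line plus one partial-doc
# line per video, terminated by the trailing newline ES requires. For a
# single fix (hypothetical id and type) the body would read:
#
#   {"update": {"_id": "video_id_1", "_index": "ta_video"}}
#   {"doc": {"vid_type": "shorts"}}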