tubearchivist/tubearchivist/home/src/index/reindex.py

310 lines
11 KiB
Python

"""
functionality:
- periodically refresh documents
- index and update in es
"""
import os
import shutil
from datetime import datetime
from math import ceil
from time import sleep
from home.src.download.queue import PendingList
from home.src.download.thumbnails import ThumbManager
from home.src.download.yt_dlp_base import CookieHandler
from home.src.download.yt_dlp_handler import VideoDownloader
from home.src.es.connect import ElasticWrap
from home.src.index.channel import YoutubeChannel
from home.src.index.playlist import YoutubePlaylist
from home.src.index.video import YoutubeVideo
from home.src.ta.config import AppConfig
class Reindex:
    """check for outdated documents and refresh data from youtube"""

    # es field that marks a document as still active, per index
    MATCH_FIELD = {
        "ta_video": "active",
        "ta_channel": "channel_active",
        "ta_playlist": "playlist_active",
    }
    # overshoot the daily quota by 20% so the queue catches up after
    # skipped runs
    MULTIPLY = 1.2
    # default es max_result_window is 10000, stay below it
    MAX_DAILY = 9999

    def __init__(self):
        # config
        # timestamp() instead of strftime("%s"): "%s" is a glibc-only
        # extension and is not portable; both yield epoch seconds
        self.now = int(datetime.now().timestamp())
        self.config = AppConfig().config
        self.interval = self.config["scheduler"]["check_reindex_days"]
        # scan results, populated by check_outdated()
        self.all_youtube_ids = False
        self.all_channel_ids = False
        self.all_playlist_ids = False

    def check_cookie(self):
        """validate cookie if enabled

        NOTE(review): the validation result is discarded, only the side
        effects of CookieHandler.validate() persist — confirm intended
        """
        if self.config["downloads"]["cookie_import"]:
            valid = CookieHandler(self.config).validate()
            if not valid:
                return

    def _get_daily(self):
        """get daily refresh quotas for videos, channels and playlists

        returns tuple (video_daily, channel_daily, playlist_daily)
        """
        video_daily = self._daily_should("ta_video")
        channel_daily = self._daily_should("ta_channel")
        playlist_daily = self._daily_should("ta_playlist")
        return (video_daily, channel_daily, playlist_daily)

    def _daily_should(self, index):
        """daily refresh quota for a single index

        capped for every index, not only ta_video: es rejects any
        search with size >= 10000 by default
        """
        total = self._get_total_hits(index)
        daily = ceil(total / self.interval * self.MULTIPLY)
        if daily > self.MAX_DAILY:
            daily = self.MAX_DAILY
        return daily

    def _get_total_hits(self, index):
        """get total count of active documents in index"""
        match_field = self.MATCH_FIELD[index]
        # filter_path trims the response down to the hit counter
        path = f"{index}/_search?filter_path=hits.total"
        data = {"query": {"match": {match_field: True}}}
        response, _ = ElasticWrap(path).post(data=data)
        total_hits = response["hits"]["total"]["value"]
        return total_hits

    def _get_unrated_vids(self):
        """get max 200 videos without rating if ryd integration is enabled"""
        data = {
            "size": 200,
            "query": {
                "bool": {
                    "must_not": [{"exists": {"field": "stats.average_rating"}}]
                }
            },
        }
        response, _ = ElasticWrap("ta_video/_search").get(data=data)
        missing_rating = [i["_id"] for i in response["hits"]["hits"]]
        # extend scan result; may overlap with outdated vids
        self.all_youtube_ids = self.all_youtube_ids + missing_rating

    def _get_outdated(self, index, active_field, refresh_field, size):
        """shared query: oldest active documents past the refresh interval

        returns list of document ids, oldest refresh first
        """
        now_lte = self.now - self.interval * 24 * 60 * 60
        data = {
            "size": size,
            "query": {
                "bool": {
                    "must": [
                        {"match": {active_field: True}},
                        {"range": {refresh_field: {"lte": now_lte}}},
                    ]
                }
            },
            "sort": [{refresh_field: {"order": "asc"}}],
            "_source": False,
        }
        response, _ = ElasticWrap(f"{index}/_search").get(data=data)
        return [i["_id"] for i in response["hits"]["hits"]]

    def _get_outdated_vids(self, size):
        """get daily videos to refresh"""
        return self._get_outdated(
            "ta_video", "active", "vid_last_refresh", size
        )

    def _get_outdated_channels(self, size):
        """get daily channels to refresh"""
        return self._get_outdated(
            "ta_channel", "channel_active", "channel_last_refresh", size
        )

    def _get_outdated_playlists(self, size):
        """get daily outdated playlists to refresh"""
        return self._get_outdated(
            "ta_playlist", "playlist_active", "playlist_last_refresh", size
        )

    def check_outdated(self):
        """add missing vids and channels to the scan attributes"""
        video_daily, channel_daily, playlist_daily = self._get_daily()
        self.all_youtube_ids = self._get_outdated_vids(video_daily)
        self.all_channel_ids = self._get_outdated_channels(channel_daily)
        self.all_playlist_ids = self._get_outdated_playlists(playlist_daily)
        integrate_ryd = self.config["downloads"]["integrate_ryd"]
        if integrate_ryd:
            self._get_unrated_vids()

    @staticmethod
    def _reindex_single_video(youtube_id):
        """refresh data for single video, preserving local-only fields"""
        video = YoutubeVideo(youtube_id)
        # read current state to carry over fields youtube can't provide
        video.get_from_es()
        player = video.json_data["player"]
        date_downloaded = video.json_data["date_downloaded"]
        channel_dict = video.json_data["channel"]
        playlist = video.json_data.get("playlist")
        subtitles = video.json_data.get("subtitles")
        # get new metadata from remote
        video.build_json()
        if not video.youtube_meta:
            # no longer available upstream, mark inactive instead
            video.deactivate()
            return
        video.delete_subtitles(subtitles=subtitles)
        video.check_subtitles()
        # add back local state
        video.json_data["player"] = player
        video.json_data["date_downloaded"] = date_downloaded
        video.json_data["channel"] = channel_dict
        if playlist:
            video.json_data["playlist"] = playlist
        video.upload_to_es()
        # refresh thumbnail from the (possibly changed) remote url
        thumb_handler = ThumbManager()
        thumb_handler.delete_vid_thumb(youtube_id)
        to_download = (youtube_id, video.json_data["vid_thumb_url"])
        thumb_handler.download_vid([to_download], notify=False)
        return

    @staticmethod
    def _reindex_single_channel(channel_id):
        """refresh channel data and sync to videos"""
        channel = YoutubeChannel(channel_id)
        channel.get_from_es()
        # carry over local-only fields
        subscribed = channel.json_data["channel_subscribed"]
        overwrites = channel.json_data.get("channel_overwrites", False)
        channel.get_from_youtube()
        channel.json_data["channel_subscribed"] = subscribed
        if overwrites:
            channel.json_data["channel_overwrites"] = overwrites
        channel.upload_to_es()
        channel.sync_to_videos()

    @staticmethod
    def _reindex_single_playlist(playlist_id, all_indexed_ids):
        """refresh playlist data, deactivate if gone upstream"""
        playlist = YoutubePlaylist(playlist_id)
        playlist.get_from_es()
        subscribed = playlist.json_data["playlist_subscribed"]
        playlist.all_youtube_ids = all_indexed_ids
        playlist.build_json(scrape=True)
        if not playlist.json_data:
            playlist.deactivate()
            return
        playlist.json_data["playlist_subscribed"] = subscribed
        playlist.upload_to_es()
        return

    def reindex(self):
        """reindex what's needed, call check_outdated() first"""
        # sleep between requests to not hammer youtube
        sleep_interval = self.config["downloads"]["sleep_interval"]
        # videos
        print(f"reindexing {len(self.all_youtube_ids)} videos")
        for youtube_id in self.all_youtube_ids:
            try:
                self._reindex_single_video(youtube_id)
            except FileNotFoundError:
                # handle channel name change here
                ChannelUrlFixer(youtube_id, self.config).run()
                self._reindex_single_video(youtube_id)
            if sleep_interval:
                sleep(sleep_interval)
        # channels
        print(f"reindexing {len(self.all_channel_ids)} channels")
        for channel_id in self.all_channel_ids:
            self._reindex_single_channel(channel_id)
            if sleep_interval:
                sleep(sleep_interval)
        # playlist
        print(f"reindexing {len(self.all_playlist_ids)} playlists")
        if self.all_playlist_ids:
            handler = PendingList()
            handler.get_download()
            handler.get_indexed()
            all_indexed_ids = [i["youtube_id"] for i in handler.all_videos]
            for playlist_id in self.all_playlist_ids:
                self._reindex_single_playlist(playlist_id, all_indexed_ids)
                if sleep_interval:
                    sleep(sleep_interval)
class ChannelUrlFixer:
    """fix not matching channel names in reindex"""

    def __init__(self, youtube_id, config):
        self.youtube_id = youtube_id
        self.config = config
        # populated lazily by get_as_is()
        self.video = False

    def run(self):
        """check whether the on-disk folder matches, fix if needed"""
        print(f"{self.youtube_id}: failed to build channel path, try to fix.")
        path_on_disk, folder_on_disk = self.get_as_is()
        if not os.path.exists(path_on_disk):
            print(f"giving up reindex, video in video: {self.video.json_data}")
            raise ValueError

        _, expected_folder = self.get_as_should()

        if folder_on_disk == expected_folder:
            print(f"{self.youtube_id}: skip channel url fixer")
            return

        self.process(path_on_disk)

    def get_as_is(self):
        """load the video document and resolve its current media path"""
        self.video = YoutubeVideo(self.youtube_id)
        self.video.get_from_es()
        base_folder = self.config["application"]["videos"]
        current_path = os.path.join(
            base_folder, self.video.json_data["media_url"]
        )
        current_folder, _ = os.path.split(current_path)
        return current_path, current_folder

    def get_as_should(self):
        """refresh metadata from remote and resolve the expected path"""
        self.video.get_from_youtube()
        self.video.add_file_path()
        base_folder = self.config["application"]["videos"]
        expected_path = os.path.join(
            base_folder, self.video.json_data["media_url"]
        )
        expected_folder, _ = os.path.split(expected_path)
        return expected_path, expected_folder

    def process(self, video_path_is):
        """move the file through the cache and re-archive it"""
        print(f"{self.youtube_id}: fixing channel rename.")
        cache_dir = self.config["application"]["cache_dir"]
        staging_path = os.path.join(
            cache_dir, "download", self.youtube_id + ".mp4"
        )
        shutil.move(video_path_is, staging_path)
        VideoDownloader().move_to_archive(self.video.json_data)
        self.video.update_media_url()