# tubearchivist/tubearchivist/home/src/index/channel.py

"""
functionality:
- get metadata from youtube for a channel
- index and update in es
"""

import json
import os
from datetime import datetime

from home.src.download import queue  # partial import
from home.src.download.thumbnails import ThumbManager
from home.src.download.yt_dlp_base import YtWrap
from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.index.generic import YouTubeItem
from home.src.index.playlist import YoutubePlaylist
from home.src.ta.helper import clean_string


class YoutubeChannel(YouTubeItem):
    """represents a single youtube channel

    Builds the channel document from elasticsearch or from youtube,
    extracts the artwork urls and offers helpers to sync, delete and index
    everything that belongs to the channel.
    """

    es_path = False
    index_name = "ta_channel"
    yt_base = "https://www.youtube.com/channel/"
    # NOTE(review): presumably the request options consumed by the
    # YouTubeItem base class, confirm there
    yt_obs = {
        "extract_flat": True,
        "allow_playlist_files": True,
    }

    def __init__(self, youtube_id, task=False):
        """store channel id and the optional task used for progress"""
        super().__init__(youtube_id)
        # becomes a list of (playlist_id, title) after get_all_playlists
        self.all_playlists = False
        # object exposing send_progress, False when running without task
        self.task = task

    def build_yt_url(self):
        """overwrite base to use channel about page"""
        return f"{self.yt_base}{self.youtube_id}/about"

    def build_json(self, upload=False, fallback=False):
        """get from es or from youtube

        upload: call upload_to_es with the freshly built document
        fallback: video metadata dict to build a minimal document from
            when the channel request returned nothing, needs "uploader"
        """
        self.get_from_es()
        if self.json_data:
            # json_data is populated after the es lookup, nothing to build
            return

        self.get_from_youtube()
        if not self.youtube_meta and fallback:
            self._video_fallback(fallback)
        else:
            # NOTE(review): also reached with empty youtube_meta when no
            # fallback is passed, process_youtube_meta raises in that case
            self.process_youtube_meta()
            self.get_channel_art()

        if upload:
            self.upload_to_es()

    def process_youtube_meta(self):
        """extract relevant fields from youtube_meta into json_data"""
        # reversed in place, the art helpers iterate in this new order
        self.youtube_meta["thumbnails"].reverse()
        # the key can be present with a null value, normalize to 0
        channel_subs = self.youtube_meta.get("channel_follower_count") or 0
        self.json_data = {
            "channel_active": True,
            "channel_description": self.youtube_meta.get("description", False),
            "channel_id": self.youtube_id,
            "channel_last_refresh": int(datetime.now().timestamp()),
            "channel_name": self.youtube_meta["uploader"],
            "channel_subs": channel_subs,
            "channel_subscribed": False,
            "channel_tags": self._parse_tags(self.youtube_meta.get("tags")),
            "channel_banner_url": self._get_banner_art(),
            "channel_thumb_url": self._get_thumb_art(),
            "channel_tvart_url": self._get_tv_art(),
            "channel_views": self.youtube_meta.get("view_count", 0),
        }

    def _parse_tags(self, tags):
        """parse channel tags

        The tags get joined with spaces and split on double quotes, so a
        quoted multi word tag ends up as one entry and the text between two
        quoted sections as another.

        returns False for empty input, else a list of stripped strings
        """
        if not tags:
            return False

        segments = (i.strip() for i in " ".join(tags).split('"'))
        # drop everything that is empty once stripped
        return [i for i in segments if i]

    def _get_thumb_art(self):
        """extract thumb art, first square thumbnail or False"""
        for thumb in self.youtube_meta["thumbnails"]:
            if not thumb.get("width"):
                continue
            if thumb.get("width") == thumb.get("height"):
                return thumb["url"]

        return False

    def _get_tv_art(self):
        """extract tv artwork

        Takes the entry with id "avatar_uncropped" if it comes first, else
        the first non square thumbnail with a width/height ratio below 2.
        returns False when nothing matches
        """
        for thumb in self.youtube_meta["thumbnails"]:
            if thumb.get("id") == "avatar_uncropped":
                return thumb["url"]
            # skip entries without usable dimensions, avoids dividing by
            # a missing or zero height
            if not thumb.get("width") or not thumb.get("height"):
                continue
            is_square = thumb["width"] == thumb["height"]
            if thumb["width"] // thumb["height"] < 2 and not is_square:
                return thumb["url"]

        return False

    def _get_banner_art(self):
        """extract banner artwork, first thumbnail with ratio above 5"""
        for thumb in self.youtube_meta["thumbnails"]:
            # skip entries without usable dimensions, avoids dividing by
            # a missing or zero height
            if not thumb.get("width") or not thumb.get("height"):
                continue
            if thumb["width"] // thumb["height"] > 5:
                return thumb["url"]

        return False

    def _video_fallback(self, fallback):
        """use video metadata as fallback

        fallback: dict with at least "uploader", builds an inactive
        channel document without any artwork
        """
        print(f"{self.youtube_id}: fallback to video metadata")
        self.json_data = {
            "channel_active": False,
            "channel_last_refresh": int(datetime.now().timestamp()),
            # same null handling as in process_youtube_meta
            "channel_subs": fallback.get("channel_follower_count") or 0,
            "channel_name": fallback["uploader"],
            "channel_banner_url": False,
            "channel_tvart_url": False,
            "channel_id": self.youtube_id,
            "channel_subscribed": False,
            "channel_tags": False,
            "channel_description": False,
            "channel_thumb_url": False,
            "channel_views": 0,
        }
        self._info_json_fallback()

    def _info_json_fallback(self):
        """read channel info.json for additional metadata

        Looks for <cache_dir>/import/<channel_id>.info.json, merges subs
        and description into json_data and removes the file afterwards.
        """
        info_json = os.path.join(
            self.config["application"]["cache_dir"],
            "import",
            f"{self.youtube_id}.info.json",
        )
        if not os.path.exists(info_json):
            return

        print(f"{self.youtube_id}: read info.json file")
        with open(info_json, "r", encoding="utf-8") as f:
            content = json.load(f)

        self.json_data.update(
            {
                # same null handling as in process_youtube_meta
                "channel_subs": content.get("channel_follower_count") or 0,
                "channel_description": content.get("description", False),
            }
        )
        os.remove(info_json)

    def get_channel_art(self):
        """download channel art for new channels"""
        urls = (
            self.json_data["channel_thumb_url"],
            self.json_data["channel_banner_url"],
            self.json_data["channel_tvart_url"],
        )
        ThumbManager(self.youtube_id, item_type="channel").download(urls)

    def sync_to_videos(self):
        """sync new channel_dict to all videos of channel

        Registers an ingest pipeline named after the channel id that sets
        every json_data field below "channel." and runs it over all videos
        matching the channel id.
        """
        # add ingest pipeline
        processors = [
            {"set": {"field": f"channel.{field}", "value": value}}
            for field, value in self.json_data.items()
        ]
        data = {"description": self.youtube_id, "processors": processors}
        ingest_path = f"_ingest/pipeline/{self.youtube_id}"
        _, _ = ElasticWrap(ingest_path).put(data)
        # apply pipeline
        data = {"query": {"match": {"channel.channel_id": self.youtube_id}}}
        update_path = f"ta_video/_update_by_query?pipeline={self.youtube_id}"
        _, _ = ElasticWrap(update_path).post(data)

    def get_folder_path(self):
        """get folder where media files get stored"""
        channel_name = self.json_data["channel_name"]
        folder_name = clean_string(channel_name)
        if len(folder_name) <= 3:
            # fall back to channel id
            folder_name = self.json_data["channel_id"]
        folder_path = os.path.join(self.app_conf["videos"], folder_name)
        return folder_path

    def delete_es_videos(self):
        """delete all channel documents from elasticsearch"""
        data = {
            "query": {
                "term": {"channel.channel_id": {"value": self.youtube_id}}
            }
        }
        _, _ = ElasticWrap("ta_video/_delete_by_query").post(data)

    def delete_es_comments(self):
        """delete all comments from this channel"""
        data = {
            "query": {
                "term": {"comment_channel_id": {"value": self.youtube_id}}
            }
        }
        _, _ = ElasticWrap("ta_comment/_delete_by_query").post(data)

    def delete_playlists(self):
        """delete all indexed playlist from es"""
        all_playlists = self.get_indexed_playlists()
        for playlist in all_playlists:
            playlist_id = playlist["playlist_id"]
            YoutubePlaylist(playlist_id).delete_metadata()

    def delete_channel(self):
        """delete channel and all videos

        Removes the files in the channel folder, then the indexed
        playlists, videos, comments and the channel document.
        raises FileNotFoundError when json_data is empty after the es
        lookup
        """
        print(f"{self.youtube_id}: delete channel")
        self.get_from_es()
        if not self.json_data:
            raise FileNotFoundError

        folder_path = self.get_folder_path()
        print(f"{self.youtube_id}: delete all media files")
        try:
            all_videos = os.listdir(folder_path)
            for video in all_videos:
                video_path = os.path.join(folder_path, video)
                os.remove(video_path)
            os.rmdir(folder_path)
        except FileNotFoundError:
            # missing folder is fine, keep going with the index cleanup
            print(f"no videos found for {folder_path}")

        print(f"{self.youtube_id}: delete indexed playlists")
        self.delete_playlists()
        print(f"{self.youtube_id}: delete indexed videos")
        self.delete_es_videos()
        self.delete_es_comments()
        self.del_in_es()

    def index_channel_playlists(self):
        """add all playlists of channel to index"""
        print(f"{self.youtube_id}: index all playlists")
        self.get_from_es()
        channel_name = self.json_data["channel_name"]
        if self.task:
            # task defaults to False, only notify when one was passed
            self.task.send_progress([f"{channel_name}: Looking for Playlists"])
        self.get_all_playlists()
        if not self.all_playlists:
            print(f"{self.youtube_id}: no playlists found.")
            return

        all_youtube_ids = self.get_all_video_ids()
        total = len(self.all_playlists)
        for idx, playlist in enumerate(self.all_playlists):
            if self.task:
                self._notify_single_playlist(idx, total)

            self._index_single_playlist(playlist, all_youtube_ids)
            print("add playlist: " + playlist[1])

    def _notify_single_playlist(self, idx, total):
        """send notification with 1 based progress for playlist idx"""
        channel_name = self.json_data["channel_name"]
        message = [
            f"{channel_name}: Scanning channel for playlists",
            f"Progress: {idx + 1}/{total}",
        ]
        self.task.send_progress(message, progress=(idx + 1) / total)

    @staticmethod
    def _index_single_playlist(playlist, all_youtube_ids):
        """add single playlist if needed

        playlist: (playlist_id, title) tuple
        Playlists without metadata or without any downloaded entry are
        skipped.
        """
        playlist = YoutubePlaylist(playlist[0])
        playlist.all_youtube_ids = all_youtube_ids
        playlist.build_json()
        if not playlist.json_data:
            return

        entries = playlist.json_data["playlist_entries"]
        if not any(i["downloaded"] for i in entries):
            return

        playlist.upload_to_es()
        playlist.add_vids_to_playlist()
        playlist.get_playlist_art()

    @staticmethod
    def get_all_video_ids():
        """match all playlists with videos"""
        handler = queue.PendingList()
        handler.get_download()
        handler.get_indexed()
        all_youtube_ids = [i["youtube_id"] for i in handler.all_videos]

        return all_youtube_ids

    def get_channel_videos(self):
        """get all videos from channel, youtube_id and vid_type only"""
        data = {
            "query": {
                "term": {"channel.channel_id": {"value": self.youtube_id}}
            },
            "_source": ["youtube_id", "vid_type"],
        }
        all_videos = IndexPaginate("ta_video", data).get_results()
        return all_videos

    def get_all_playlists(self):
        """get all playlists owned by this channel

        Stores a list of (playlist_id, title) tuples in all_playlists.
        """
        url = (
            f"https://www.youtube.com/channel/{self.youtube_id}"
            + "/playlists?view=1&sort=dd&shelf_id=0"
        )
        obs = {"skip_download": True, "extract_flat": True}
        playlists = YtWrap(obs, self.config).extract(url)
        if not playlists:
            # NOTE(review): extract presumably returns a falsy value when
            # the request fails, confirm in YtWrap. Treated as no playlists
            # instead of failing on the subscript below.
            self.all_playlists = False
            return

        all_entries = [(i["id"], i["title"]) for i in playlists["entries"]]
        self.all_playlists = all_entries

    def get_indexed_playlists(self, active_only=False):
        """get all indexed playlists from channel

        active_only: additionally require playlist_active to be true
        """
        must_list = [
            {"term": {"playlist_channel_id": {"value": self.youtube_id}}}
        ]
        if active_only:
            must_list.append({"term": {"playlist_active": {"value": True}}})

        data = {"query": {"bool": {"must": must_list}}}

        all_playlists = IndexPaginate("ta_playlist", data).get_results()
        return all_playlists

    def get_overwrites(self):
        """get all per channel overwrites, False if none are stored"""
        return self.json_data.get("channel_overwrites", False)

    def set_overwrites(self, overwrites):
        """set per channel overwrites

        overwrites: dict of key to value, handled per value as follows
            "disable": store False
            0 or "0": remove the key
            "1": store True
            other truthy value: store as is
            other falsy value: ignored
        raises ValueError on a key that is not a valid overwrite, nothing
        gets modified in that case
        """
        valid_keys = [
            "download_format",
            "autodelete_days",
            "index_playlists",
            "integrate_sponsorblock",
        ]

        # validate everything first so a bad key can't leave the stored
        # overwrites half updated
        for key in overwrites:
            if key not in valid_keys:
                raise ValueError(f"invalid overwrite key: {key}")

        # stored value may be missing or falsy, start from an empty dict
        to_write = self.json_data.get("channel_overwrites") or {}
        for key, value in overwrites.items():
            if value == "disable":
                to_write[key] = False
            elif value in [0, "0"]:
                to_write.pop(key, None)
            elif value == "1":
                to_write[key] = True
            elif value:
                to_write[key] = value

        self.json_data["channel_overwrites"] = to_write


def channel_overwrites(channel_id, overwrites):
    """apply per channel overwrites and propagate them

    Builds the channel document for channel_id, merges overwrites into it
    with set_overwrites, then calls upload_to_es and sync_to_videos.
    """
    handler = YoutubeChannel(channel_id)
    handler.build_json()
    handler.set_overwrites(overwrites)
    # store the channel first, then push the change to its videos
    handler.upload_to_es()
    handler.sync_to_videos()