""" functionality: - get metadata from youtube for a channel - index and update in es """ import json import os from datetime import datetime from home.src.download import queue # partial import from home.src.download.thumbnails import ThumbManager from home.src.download.yt_dlp_base import YtWrap from home.src.es.connect import ElasticWrap, IndexPaginate from home.src.index.generic import YouTubeItem from home.src.index.playlist import YoutubePlaylist from home.src.ta.helper import clean_string class YoutubeChannel(YouTubeItem): """represents a single youtube channel""" es_path = False index_name = "ta_channel" yt_base = "https://www.youtube.com/channel/" yt_obs = { "extract_flat": True, "allow_playlist_files": True, } def __init__(self, youtube_id, task=False): super().__init__(youtube_id) self.all_playlists = False self.task = task def build_yt_url(self): """overwrite base to use channel about page""" return f"{self.yt_base}{self.youtube_id}/about" def build_json(self, upload=False, fallback=False): """get from es or from youtube""" self.get_from_es() if self.json_data: return self.get_from_youtube() if not self.youtube_meta and fallback: self._video_fallback(fallback) else: self._process_youtube_meta() self.get_channel_art() if upload: self.upload_to_es() def _process_youtube_meta(self): """extract relevant fields""" self.youtube_meta["thumbnails"].reverse() channel_subs = self.youtube_meta.get("channel_follower_count") or 0 self.json_data = { "channel_active": True, "channel_description": self.youtube_meta.get("description", False), "channel_id": self.youtube_id, "channel_last_refresh": int(datetime.now().timestamp()), "channel_name": self.youtube_meta["uploader"], "channel_subs": channel_subs, "channel_subscribed": False, "channel_tags": self._parse_tags(self.youtube_meta.get("tags")), "channel_banner_url": self._get_banner_art(), "channel_thumb_url": self._get_thumb_art(), "channel_tvart_url": self._get_tv_art(), "channel_views": self.youtube_meta.get("view_count", 0), } def _parse_tags(self, tags): """parse channel tags""" if not tags: return False joined = " ".join(tags) return [i.strip() for i in joined.split('"') if i and not i == " "] def _get_thumb_art(self): """extract thumb art""" for i in self.youtube_meta["thumbnails"]: if not i.get("width"): continue if i.get("width") == i.get("height"): return i["url"] return False def _get_tv_art(self): """extract tv artwork""" for i in self.youtube_meta["thumbnails"]: if i.get("id") == "avatar_uncropped": return i["url"] if not i.get("width"): continue if i["width"] // i["height"] < 2 and not i["width"] == i["height"]: return i["url"] return False def _get_banner_art(self): """extract banner artwork""" for i in self.youtube_meta["thumbnails"]: if not i.get("width"): continue if i["width"] // i["height"] > 5: return i["url"] return False def _video_fallback(self, fallback): """use video metadata as fallback""" print(f"{self.youtube_id}: fallback to video metadata") self.json_data = { "channel_active": False, "channel_last_refresh": int(datetime.now().timestamp()), "channel_subs": fallback.get("channel_follower_count", 0), "channel_name": fallback["uploader"], "channel_banner_url": False, "channel_tvart_url": False, "channel_id": self.youtube_id, "channel_subscribed": False, "channel_tags": False, "channel_description": False, "channel_thumb_url": False, "channel_views": 0, } self._info_json_fallback() def _info_json_fallback(self): """read channel info.json for additional metadata""" info_json = os.path.join( self.config["application"]["cache_dir"], "import", f"{self.youtube_id}.info.json", ) if os.path.exists(info_json): print(f"{self.youtube_id}: read info.json file") with open(info_json, "r", encoding="utf-8") as f: content = json.loads(f.read()) self.json_data.update( { "channel_subs": content.get("channel_follower_count", 0), "channel_description": content.get("description", False), } ) os.remove(info_json) def get_channel_art(self): """download channel art for new channels""" urls = ( self.json_data["channel_thumb_url"], self.json_data["channel_banner_url"], self.json_data["channel_tvart_url"], ) ThumbManager(self.youtube_id, item_type="channel").download(urls) def sync_to_videos(self): """sync new channel_dict to all videos of channel""" # add ingest pipeline processors = [] for field, value in self.json_data.items(): line = {"set": {"field": "channel." + field, "value": value}} processors.append(line) data = {"description": self.youtube_id, "processors": processors} ingest_path = f"_ingest/pipeline/{self.youtube_id}" _, _ = ElasticWrap(ingest_path).put(data) # apply pipeline data = {"query": {"match": {"channel.channel_id": self.youtube_id}}} update_path = f"ta_video/_update_by_query?pipeline={self.youtube_id}" _, _ = ElasticWrap(update_path).post(data) def get_folder_path(self): """get folder where media files get stored""" channel_name = self.json_data["channel_name"] folder_name = clean_string(channel_name) if len(folder_name) <= 3: # fall back to channel id folder_name = self.json_data["channel_id"] folder_path = os.path.join(self.app_conf["videos"], folder_name) return folder_path def delete_es_videos(self): """delete all channel documents from elasticsearch""" data = { "query": { "term": {"channel.channel_id": {"value": self.youtube_id}} } } _, _ = ElasticWrap("ta_video/_delete_by_query").post(data) def delete_es_comments(self): """delete all comments from this channel""" data = { "query": { "term": {"comment_channel_id": {"value": self.youtube_id}} } } _, _ = ElasticWrap("ta_comment/_delete_by_query").post(data) def delete_playlists(self): """delete all indexed playlist from es""" all_playlists = self.get_indexed_playlists() for playlist in all_playlists: playlist_id = playlist["playlist_id"] YoutubePlaylist(playlist_id).delete_metadata() def delete_channel(self): """delete channel and all videos""" print(f"{self.youtube_id}: delete channel") self.get_from_es() if not self.json_data: raise FileNotFoundError folder_path = self.get_folder_path() print(f"{self.youtube_id}: delete all media files") try: all_videos = os.listdir(folder_path) for video in all_videos: video_path = os.path.join(folder_path, video) os.remove(video_path) os.rmdir(folder_path) except FileNotFoundError: print(f"no videos found for {folder_path}") print(f"{self.youtube_id}: delete indexed playlists") self.delete_playlists() print(f"{self.youtube_id}: delete indexed videos") self.delete_es_videos() self.delete_es_comments() self.del_in_es() def index_channel_playlists(self): """add all playlists of channel to index""" print(f"{self.youtube_id}: index all playlists") self.get_from_es() channel_name = self.json_data["channel_name"] self.task.send_progress([f"{channel_name}: Looking for Playlists"]) self.get_all_playlists() if not self.all_playlists: print(f"{self.youtube_id}: no playlists found.") return all_youtube_ids = self.get_all_video_ids() total = len(self.all_playlists) for idx, playlist in enumerate(self.all_playlists): if self.task: self._notify_single_playlist(idx, total) self._index_single_playlist(playlist, all_youtube_ids) print("add playlist: " + playlist[1]) def _notify_single_playlist(self, idx, total): """send notification""" channel_name = self.json_data["channel_name"] message = [ f"{channel_name}: Scanning channel for playlists", f"Progress: {idx + 1}/{total}", ] self.task.send_progress(message, progress=(idx + 1) / total) @staticmethod def _index_single_playlist(playlist, all_youtube_ids): """add single playlist if needed""" playlist = YoutubePlaylist(playlist[0]) playlist.all_youtube_ids = all_youtube_ids playlist.build_json() if not playlist.json_data: return entries = playlist.json_data["playlist_entries"] downloaded = [i for i in entries if i["downloaded"]] if not downloaded: return playlist.upload_to_es() playlist.add_vids_to_playlist() playlist.get_playlist_art() @staticmethod def get_all_video_ids(): """match all playlists with videos""" handler = queue.PendingList() handler.get_download() handler.get_indexed() all_youtube_ids = [i["youtube_id"] for i in handler.all_videos] return all_youtube_ids def get_channel_videos(self): """get all videos from channel""" data = { "query": { "term": {"channel.channel_id": {"value": self.youtube_id}} }, "_source": ["youtube_id", "vid_type"], } all_videos = IndexPaginate("ta_video", data).get_results() return all_videos def get_all_playlists(self): """get all playlists owned by this channel""" url = ( f"https://www.youtube.com/channel/{self.youtube_id}" + "/playlists?view=1&sort=dd&shelf_id=0" ) obs = {"skip_download": True, "extract_flat": True} playlists = YtWrap(obs, self.config).extract(url) all_entries = [(i["id"], i["title"]) for i in playlists["entries"]] self.all_playlists = all_entries def get_indexed_playlists(self, active_only=False): """get all indexed playlists from channel""" must_list = [ {"term": {"playlist_channel_id": {"value": self.youtube_id}}} ] if active_only: must_list.append({"term": {"playlist_active": {"value": True}}}) data = {"query": {"bool": {"must": must_list}}} all_playlists = IndexPaginate("ta_playlist", data).get_results() return all_playlists def get_overwrites(self): """get all per channel overwrites""" return self.json_data.get("channel_overwrites", False) def set_overwrites(self, overwrites): """set per channel overwrites""" valid_keys = [ "download_format", "autodelete_days", "index_playlists", "integrate_sponsorblock", ] to_write = self.json_data.get("channel_overwrites", {}) for key, value in overwrites.items(): if key not in valid_keys: raise ValueError(f"invalid overwrite key: {key}") if value == "disable": to_write[key] = False continue if value in [0, "0"]: if key in to_write: del to_write[key] continue if value == "1": to_write[key] = True continue if value: to_write.update({key: value}) self.json_data["channel_overwrites"] = to_write def channel_overwrites(channel_id, overwrites): """collection to overwrite settings per channel""" channel = YoutubeChannel(channel_id) channel.build_json() channel.set_overwrites(overwrites) channel.upload_to_es() channel.sync_to_videos()