tubearchivist/tubearchivist/home/src/index/channel.py

"""
functionality:
- get metadata from youtube for a channel
- index and update in es
"""

import json
import os
from datetime import datetime

from home.src.download import queue  # partial import
from home.src.download.thumbnails import ThumbManager
from home.src.download.yt_dlp_base import YtWrap
from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.index.generic import YouTubeItem
from home.src.index.playlist import YoutubePlaylist
from home.src.ta.helper import clean_string


class YoutubeChannel(YouTubeItem):
"""represents a single youtube channel"""
es_path = False
index_name = "ta_channel"
yt_base = "https://www.youtube.com/channel/"
yt_obs = {
"extract_flat": True,
"allow_playlist_files": True,
}
def __init__(self, youtube_id, task=False):
super().__init__(youtube_id)
self.all_playlists = False
self.task = task

    def build_yt_url(self):
        """overwrite base to use channel about page"""
        return f"{self.yt_base}{self.youtube_id}/about"

    def build_json(self, upload=False, fallback=False):
        """get from es or from youtube"""
        self.get_from_es()
        if self.json_data:
            return

        self.get_from_youtube()
        if not self.youtube_meta and fallback:
            self._video_fallback(fallback)
        else:
            self._process_youtube_meta()
            self.get_channel_art()

        if upload:
            self.upload_to_es()
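
    # Minimal usage sketch (assumes a reachable es instance; "UC..." is
    # a placeholder channel id):
    #   channel = YoutubeChannel("UC...")
    #   channel.build_json(upload=True)  # fetch and index if not in es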

    def _process_youtube_meta(self):
        """extract relevant fields"""
        self.youtube_meta["thumbnails"].reverse()
        channel_subs = self.youtube_meta.get("channel_follower_count") or 0
        self.json_data = {
            "channel_active": True,
            "channel_description": self.youtube_meta.get("description", False),
            "channel_id": self.youtube_id,
            "channel_last_refresh": int(datetime.now().timestamp()),
            "channel_name": self.youtube_meta["uploader"],
            "channel_subs": channel_subs,
            "channel_subscribed": False,
            "channel_tags": self._parse_tags(self.youtube_meta.get("tags")),
            "channel_banner_url": self._get_banner_art(),
            "channel_thumb_url": self._get_thumb_art(),
            "channel_tvart_url": self._get_tv_art(),
            "channel_views": self.youtube_meta.get("view_count", 0),
        }

    def _parse_tags(self, tags):
        """parse channel tags"""
        if not tags:
            return False

        joined = " ".join(tags)

        return [i.strip() for i in joined.split('"') if i and not i == " "]
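
    # The three artwork helpers below walk the thumbnail list (reversed
    # above, so the largest variants come first) and pick urls by shape:
    # square entries are the channel thumb, moderately wide non-square
    # entries the tv artwork, very wide entries (w/h > 5) the banner.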

    def _get_thumb_art(self):
        """extract thumb art"""
        for i in self.youtube_meta["thumbnails"]:
            if not i.get("width"):
                continue

            if i.get("width") == i.get("height"):
                return i["url"]

        return False

    def _get_tv_art(self):
        """extract tv artwork"""
        for i in self.youtube_meta["thumbnails"]:
            if i.get("id") == "avatar_uncropped":
                return i["url"]

            if not i.get("width"):
                continue

            if i["width"] // i["height"] < 2 and not i["width"] == i["height"]:
                return i["url"]

        return False

    def _get_banner_art(self):
        """extract banner artwork"""
        for i in self.youtube_meta["thumbnails"]:
            if not i.get("width"):
                continue

            if i["width"] // i["height"] > 5:
                return i["url"]

        return False

    def _video_fallback(self, fallback):
        """use video metadata as fallback"""
        print(f"{self.youtube_id}: fallback to video metadata")
        self.json_data = {
            "channel_active": False,
            "channel_last_refresh": int(datetime.now().timestamp()),
            "channel_subs": fallback.get("channel_follower_count", 0),
            "channel_name": fallback["uploader"],
            "channel_banner_url": False,
            "channel_tvart_url": False,
            "channel_id": self.youtube_id,
            "channel_subscribed": False,
            "channel_tags": False,
            "channel_description": False,
            "channel_thumb_url": False,
            "channel_views": 0,
        }
        self._info_json_fallback()

    def _info_json_fallback(self):
        """read channel info.json for additional metadata"""
        info_json = os.path.join(
            self.config["application"]["cache_dir"],
            "import",
            f"{self.youtube_id}.info.json",
        )
        if os.path.exists(info_json):
            print(f"{self.youtube_id}: read info.json file")
            with open(info_json, "r", encoding="utf-8") as f:
                content = json.loads(f.read())

            self.json_data.update(
                {
                    "channel_subs": content.get("channel_follower_count", 0),
                    "channel_description": content.get("description", False),
                }
            )
            os.remove(info_json)
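
    # The fallback looks for <cache_dir>/import/<channel_id>.info.json,
    # the kind of metadata file yt-dlp writes with --write-info-json,
    # and deletes it after a successful import.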

    def get_channel_art(self):
        """download channel art for new channels"""
        urls = (
            self.json_data["channel_thumb_url"],
            self.json_data["channel_banner_url"],
            self.json_data["channel_tvart_url"],
        )
        ThumbManager(self.youtube_id, item_type="channel").download(urls)

    def sync_to_videos(self):
        """sync new channel_dict to all videos of channel"""
        # add ingest pipeline
        processors = []
        for field, value in self.json_data.items():
            line = {"set": {"field": "channel." + field, "value": value}}
            processors.append(line)

        data = {"description": self.youtube_id, "processors": processors}
        ingest_path = f"_ingest/pipeline/{self.youtube_id}"
        _, _ = ElasticWrap(ingest_path).put(data)

        # apply pipeline
        data = {"query": {"match": {"channel.channel_id": self.youtube_id}}}
        update_path = f"ta_video/_update_by_query?pipeline={self.youtube_id}"
        _, _ = ElasticWrap(update_path).post(data)
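
    # Equivalent raw elasticsearch requests, as a sketch (the channel
    # id doubles as the pipeline name):
    #   PUT  _ingest/pipeline/<channel_id>
    #        {"description": ..., "processors": [{"set": ...}, ...]}
    #   POST ta_video/_update_by_query?pipeline=<channel_id>
    #        {"query": {"match": {"channel.channel_id": <channel_id>}}}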

    def get_folder_path(self):
        """get folder where media files get stored"""
        channel_name = self.json_data["channel_name"]
        folder_name = clean_string(channel_name)
        if len(folder_name) <= 3:
            # fall back to channel id
            folder_name = self.json_data["channel_id"]

        folder_path = os.path.join(self.app_conf["videos"], folder_name)
        return folder_path

    def delete_es_videos(self):
        """delete all channel documents from elasticsearch"""
        data = {
            "query": {
                "term": {"channel.channel_id": {"value": self.youtube_id}}
            }
        }
        _, _ = ElasticWrap("ta_video/_delete_by_query").post(data)

    def delete_es_comments(self):
        """delete all comments from this channel"""
        data = {
            "query": {
                "term": {"comment_channel_id": {"value": self.youtube_id}}
            }
        }
        _, _ = ElasticWrap("ta_comment/_delete_by_query").post(data)

    def delete_playlists(self):
        """delete all indexed playlists from es"""
        all_playlists = self.get_indexed_playlists()
        for playlist in all_playlists:
            playlist_id = playlist["playlist_id"]
            YoutubePlaylist(playlist_id).delete_metadata()

    def delete_channel(self):
        """delete channel and all videos"""
        print(f"{self.youtube_id}: delete channel")
        self.get_from_es()
        if not self.json_data:
            raise FileNotFoundError

        folder_path = self.get_folder_path()
        print(f"{self.youtube_id}: delete all media files")
        try:
            all_videos = os.listdir(folder_path)
            for video in all_videos:
                video_path = os.path.join(folder_path, video)
                os.remove(video_path)

            os.rmdir(folder_path)
        except FileNotFoundError:
            print(f"no videos found for {folder_path}")

        print(f"{self.youtube_id}: delete indexed playlists")
        self.delete_playlists()

        print(f"{self.youtube_id}: delete indexed videos")
        self.delete_es_videos()
        self.delete_es_comments()
        self.del_in_es()

    def index_channel_playlists(self):
        """add all playlists of channel to index"""
        print(f"{self.youtube_id}: index all playlists")
        self.get_from_es()
        channel_name = self.json_data["channel_name"]
        if self.task:
            # only notify when running inside a task
            self.task.send_progress([f"{channel_name}: Looking for Playlists"])

        self.get_all_playlists()
        if not self.all_playlists:
            print(f"{self.youtube_id}: no playlists found.")
            return

        all_youtube_ids = self.get_all_video_ids()
        total = len(self.all_playlists)
        for idx, playlist in enumerate(self.all_playlists):
            if self.task:
                self._notify_single_playlist(idx, total)

            self._index_single_playlist(playlist, all_youtube_ids)
            print("add playlist: " + playlist[1])

    def _notify_single_playlist(self, idx, total):
        """send notification"""
        channel_name = self.json_data["channel_name"]
        message = [
            f"{channel_name}: Scanning channel for playlists",
            f"Progress: {idx + 1}/{total}",
        ]
        self.task.send_progress(message, progress=(idx + 1) / total)

    @staticmethod
    def _index_single_playlist(playlist, all_youtube_ids):
        """add single playlist if needed"""
        playlist = YoutubePlaylist(playlist[0])
        playlist.all_youtube_ids = all_youtube_ids
        playlist.build_json()
        if not playlist.json_data:
            return

        entries = playlist.json_data["playlist_entries"]
        downloaded = [i for i in entries if i["downloaded"]]
        if not downloaded:
            return

        playlist.upload_to_es()
        playlist.add_vids_to_playlist()
        playlist.get_playlist_art()

    @staticmethod
    def get_all_video_ids():
        """collect all video ids to match playlists against"""
        handler = queue.PendingList()
        handler.get_download()
        handler.get_indexed()
        all_youtube_ids = [i["youtube_id"] for i in handler.all_videos]
        return all_youtube_ids
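
    # The ids are collected once and handed to every playlist build, so
    # each playlist can flag its entries as downloaded without extra es
    # round trips per playlist.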

    def get_channel_videos(self):
        """get all videos from channel"""
        data = {
            "query": {
                "term": {"channel.channel_id": {"value": self.youtube_id}}
            },
            "_source": ["youtube_id", "vid_type"],
        }
        all_videos = IndexPaginate("ta_video", data).get_results()
        return all_videos

    def get_all_playlists(self):
        """get all playlists owned by this channel"""
        url = (
            f"https://www.youtube.com/channel/{self.youtube_id}"
            + "/playlists?view=1&sort=dd&shelf_id=0"
        )
        obs = {"skip_download": True, "extract_flat": True}
        playlists = YtWrap(obs, self.config).extract(url)
        if not playlists:
            # extraction failed, leave all_playlists empty
            self.all_playlists = []
            return

        all_entries = [(i["id"], i["title"]) for i in playlists["entries"]]
        self.all_playlists = all_entries

    def get_indexed_playlists(self, active_only=False):
        """get all indexed playlists from channel"""
        must_list = [
            {"term": {"playlist_channel_id": {"value": self.youtube_id}}}
        ]
        if active_only:
            must_list.append({"term": {"playlist_active": {"value": True}}})

        data = {"query": {"bool": {"must": must_list}}}
        all_playlists = IndexPaginate("ta_playlist", data).get_results()
        return all_playlists

    def get_overwrites(self):
        """get all per channel overwrites"""
        return self.json_data.get("channel_overwrites", False)

    def set_overwrites(self, overwrites):
        """set per channel overwrites"""
        valid_keys = [
            "download_format",
            "autodelete_days",
            "index_playlists",
            "integrate_sponsorblock",
        ]

        to_write = self.json_data.get("channel_overwrites", {})
        for key, value in overwrites.items():
            if key not in valid_keys:
                raise ValueError(f"invalid overwrite key: {key}")

            if value == "disable":
                to_write[key] = False
                continue

            if value in [0, "0"]:
                if key in to_write:
                    del to_write[key]

                continue

            if value == "1":
                to_write[key] = True
                continue

            if value:
                to_write.update({key: value})

        self.json_data["channel_overwrites"] = to_write


def channel_overwrites(channel_id, overwrites):
    """apply per channel overwrites and sync them to all videos"""
    channel = YoutubeChannel(channel_id)
    channel.build_json()
    channel.set_overwrites(overwrites)
    channel.upload_to_es()
    channel.sync_to_videos()
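
# Minimal usage sketch (assumes es is reachable; the id and format
# string are placeholders):
#   channel_overwrites("UC...", {"download_format": "bestvideo+bestaudio"})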