# tubearchivist/tubearchivist/home/src/index.py
"""
Functionality:
- index new videos into elasticsearch
- extract video info with yt_dlp
- scrape youtube channel page if needed
"""
import json
import os
import re
from datetime import datetime
import requests
import yt_dlp
from bs4 import BeautifulSoup
from home.src.config import AppConfig
from home.src.es import ElasticWrap, IndexPaginate
from home.src.helper import DurationConverter, UrlListParser, clean_string
from home.src.thumbnails import ThumbManager
from ryd_client import ryd_client
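
# usage sketch (illustrative): index a single new video, e.g. after a
# completed download task:
#   json_data = index_new_video("<youtube_id>")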


class YouTubeItem:
"""base class for youtube"""
es_path = False
index_name = False
yt_base = False
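    # shared yt-dlp options; subclasses set es_path, index_name and yt_base,
    # and may override yt_obs (YoutubePlaylist swaps in extract_flat)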
yt_obs = {
"quiet": True,
"default_search": "ytsearch",
"skip_download": True,
"check_formats": "selected",
"noplaylist": True,
}
def __init__(self, youtube_id):
self.youtube_id = youtube_id
self.config = False
self.app_conf = False
self.youtube_meta = False
self.json_data = False
self._get_conf()
def _get_conf(self):
"""read user conf"""
self.config = AppConfig().config
self.app_conf = self.config["application"]
def get_from_youtube(self):
"""use yt-dlp to get meta data from youtube"""
print(f"{self.youtube_id}: get metadata from youtube")
try:
yt_item = yt_dlp.YoutubeDL(self.yt_obs)
response = yt_item.extract_info(self.yt_base + self.youtube_id)
except (
yt_dlp.utils.ExtractorError,
yt_dlp.utils.DownloadError,
):
print(f"{self.youtube_id}: failed to get info from youtube")
self.youtube_meta = False
self.youtube_meta = response
def get_from_es(self):
"""get indexed data from elastic search"""
print(f"{self.youtube_id}: get metadata from es")
        response, _ = ElasticWrap(self.es_path).get()
source = response.get("_source")
self.json_data = source
def upload_to_es(self):
"""add json_data to elastic"""
_, _ = ElasticWrap(self.es_path).put(self.json_data, refresh=True)
def deactivate(self):
"""deactivate document in es"""
        # map each index to its active-flag field name
        key_match = {
            "ta_video": "active",
            "ta_channel": "channel_active",
            "ta_playlist": "playlist_active",
        }
update_path = f"{self.index_name}/_update/{self.youtube_id}"
data = {
"script": f"ctx._source.{key_match.get(self.index_name)} = false"
}
_, _ = ElasticWrap(update_path).post(data)
def del_in_es(self):
"""delete item from elastic search"""
print(f"{self.youtube_id}: delete from es")
_, _ = ElasticWrap(self.es_path).delete()


class YoutubeVideo(YouTubeItem):
"""represents a single youtube video"""
es_path = False
index_name = "ta_video"
yt_base = "https://www.youtube.com/watch?v="
def __init__(self, youtube_id):
super().__init__(youtube_id)
self.channel_id = False
self.es_path = f"{self.index_name}/_doc/{youtube_id}"
def build_json(self):
"""build json dict of video"""
self.get_from_youtube()
if not self.youtube_meta:
return
self._process_youtube_meta()
self._add_channel()
self._add_stats()
self.add_file_path()
self.add_player()
if self.config["downloads"]["integrate_ryd"]:
self._get_ryd_stats()
return
def _process_youtube_meta(self):
"""extract relevant fields from youtube"""
# extract
self.channel_id = self.youtube_meta["channel_id"]
upload_date = self.youtube_meta["upload_date"]
upload_date_time = datetime.strptime(upload_date, "%Y%m%d")
published = upload_date_time.strftime("%Y-%m-%d")
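        # "%s" formats seconds since epoch, a glibc extension (linux only)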
last_refresh = int(datetime.now().strftime("%s"))
# build json_data basics
self.json_data = {
"title": self.youtube_meta["title"],
"description": self.youtube_meta["description"],
"category": self.youtube_meta["categories"],
"vid_thumb_url": self.youtube_meta["thumbnail"],
"tags": self.youtube_meta["tags"],
"published": published,
"vid_last_refresh": last_refresh,
"date_downloaded": last_refresh,
"youtube_id": self.youtube_id,
"active": True,
}
def _add_channel(self):
"""add channel dict to video json_data"""
channel = YoutubeChannel(self.channel_id)
channel.build_json(upload=True)
self.json_data.update({"channel": channel.json_data})
def _add_stats(self):
"""add stats dicst to json_data"""
# likes
like_count = self.youtube_meta.get("like_count", 0)
dislike_count = self.youtube_meta.get("dislike_count", 0)
self.json_data.update(
{
"stats": {
"view_count": self.youtube_meta["view_count"],
"like_count": like_count,
"dislike_count": dislike_count,
"average_rating": self.youtube_meta["average_rating"],
}
}
)
def build_dl_cache_path(self):
"""find video path in dl cache"""
cache_dir = self.app_conf["cache_dir"]
cache_path = f"{cache_dir}/download/"
all_cached = os.listdir(cache_path)
for file_cached in all_cached:
if self.youtube_id in file_cached:
vid_path = os.path.join(cache_path, file_cached)
return vid_path
        raise FileNotFoundError
def add_player(self):
"""add player information for new videos"""
try:
# when indexing from download task
vid_path = self.build_dl_cache_path()
except FileNotFoundError:
# when reindexing
base = self.app_conf["videos"]
vid_path = os.path.join(base, self.json_data["media_url"])
duration_handler = DurationConverter()
duration = duration_handler.get_sec(vid_path)
duration_str = duration_handler.get_str(duration)
self.json_data.update(
{
"player": {
"watched": False,
"duration": duration,
"duration_str": duration_str,
}
}
)
def add_file_path(self):
"""build media_url for where file will be located"""
channel_name = self.json_data["channel"]["channel_name"]
clean_channel_name = clean_string(channel_name)
timestamp = self.json_data["published"].replace("-", "")
youtube_id = self.json_data["youtube_id"]
title = self.json_data["title"]
clean_title = clean_string(title)
filename = f"{timestamp}_{youtube_id}_{clean_title}.mp4"
media_url = os.path.join(clean_channel_name, filename)
self.json_data["media_url"] = media_url
def delete_media_file(self):
"""delete video file, meta data"""
self.get_from_es()
video_base = self.app_conf["videos"]
media_url = self.json_data["media_url"]
print(f"{self.youtube_id}: delete {media_url} from file system")
to_delete = os.path.join(video_base, media_url)
os.remove(to_delete)
self.del_in_es()
def _get_ryd_stats(self):
"""get optional stats from returnyoutubedislikeapi.com"""
try:
print(f"{self.youtube_id}: get ryd stats")
result = ryd_client.get(self.youtube_id)
except requests.exceptions.ConnectionError:
print(f"{self.youtube_id}: failed to query ryd api, skipping")
return False
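        # a 404 status presumably means ryd does not track this video; skip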
if result["status"] == 404:
return False
dislikes = {
"dislike_count": result["dislikes"],
"average_rating": result["rating"],
}
self.json_data["stats"].update(dislikes)
return True


class ChannelScraper:
    """custom scraper using bs4 to scrape the channel about page;
    can be replaced by yt-dlp once #2237 and #2350 are merged upstream
    """
def __init__(self, channel_id):
self.channel_id = channel_id
self.soup = False
self.yt_json = False
self.json_data = False
def get_json(self):
"""main method to return channel dict"""
self.get_soup()
self._extract_yt_json()
self._parse_channel_main()
self._parse_channel_meta()
return self.json_data
def get_soup(self):
"""return soup from youtube"""
print(f"{self.channel_id}: scrape channel data from youtube")
url = f"https://www.youtube.com/channel/{self.channel_id}/about?hl=en"
cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"}
response = requests.get(url, cookies=cookies)
if response.ok:
channel_page = response.text
else:
print(f"{self.channel_id}: failed to extract channel info")
raise ConnectionError
self.soup = BeautifulSoup(channel_page, "html.parser")
    def _extract_yt_json(self):
        """parse soup and get ytInitialData json"""
        # youtube embeds the channel payload as an inline ytInitialData script
        script_content = False
        all_scripts = self.soup.find("body").find_all("script")
        for script in all_scripts:
            if "var ytInitialData = " in str(script):
                script_content = str(script)
                break
        if not script_content:
            raise ValueError(f"{self.channel_id}: ytInitialData not found")
        # extract payload; rstrip drops the trailing ";</script>" characters
        script_content = script_content.split("var ytInitialData = ")[1]
        json_raw = script_content.rstrip(";</script>")
        self.yt_json = json.loads(json_raw)
def _parse_channel_main(self):
"""extract maintab values from scraped channel json data"""
main_tab = self.yt_json["header"]["c4TabbedHeaderRenderer"]
# build and return dict
self.json_data = {
"channel_active": True,
"channel_last_refresh": int(datetime.now().strftime("%s")),
"channel_subs": self._get_channel_subs(main_tab),
"channel_name": main_tab["title"],
"channel_banner_url": self._get_thumbnails(main_tab, "banner"),
"channel_tvart_url": self._get_thumbnails(main_tab, "tvBanner"),
"channel_id": self.channel_id,
"channel_subscribed": False,
}
@staticmethod
def _get_thumbnails(main_tab, thumb_name):
"""extract banner url from main_tab"""
try:
all_banners = main_tab[thumb_name]["thumbnails"]
banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"]
except KeyError:
banner = False
return banner
@staticmethod
def _get_channel_subs(main_tab):
"""process main_tab to get channel subs as int"""
try:
sub_text_simple = main_tab["subscriberCountText"]["simpleText"]
sub_text = sub_text_simple.split(" ")[0]
if sub_text[-1] == "K":
channel_subs = int(float(sub_text.replace("K", "")) * 1000)
elif sub_text[-1] == "M":
channel_subs = int(float(sub_text.replace("M", "")) * 1000000)
            elif sub_text.isdigit():
                channel_subs = int(sub_text)
            else:
                print(f"{sub_text} not dealt with")
                channel_subs = 0
        except KeyError:
            channel_subs = 0

        return channel_subs
def _parse_channel_meta(self):
"""extract meta tab values from channel payload"""
# meta tab
meta_tab = self.yt_json["metadata"]["channelMetadataRenderer"]
all_thumbs = meta_tab["avatar"]["thumbnails"]
thumb_url = sorted(all_thumbs, key=lambda k: k["width"])[-1]["url"]
# stats tab
renderer = "twoColumnBrowseResultsRenderer"
all_tabs = self.yt_json["contents"][renderer]["tabs"]
        about_tab = False
        for tab in all_tabs:
            if "tabRenderer" in tab.keys():
                if tab["tabRenderer"]["title"] == "About":
                    about_tab = tab["tabRenderer"]["content"][
                        "sectionListRenderer"
                    ]["contents"][0]["itemSectionRenderer"]["contents"][0][
                        "channelAboutFullMetadataRenderer"
                    ]
                    break
        try:
            channel_views_text = about_tab["viewCountText"]["simpleText"]
            channel_views = int(re.sub(r"\D", "", channel_views_text))
        except (KeyError, TypeError):
            channel_views = 0
self.json_data.update(
{
"channel_description": meta_tab["description"],
"channel_thumb_url": thumb_url,
"channel_views": channel_views,
}
)


class YoutubeChannel(YouTubeItem):
"""represents a single youtube channel"""
es_path = False
index_name = "ta_channel"
yt_base = "https://www.youtube.com/channel/"
def __init__(self, youtube_id):
super().__init__(youtube_id)
self.es_path = f"{self.index_name}/_doc/{youtube_id}"
def build_json(self, upload=False):
"""get from es or from youtube"""
self.get_from_es()
if self.json_data:
return
self.get_from_youtube()
if upload:
self.upload_to_es()
return
def get_from_youtube(self):
"""use bs4 to scrape channel about page"""
self.json_data = ChannelScraper(self.youtube_id).get_json()
self.get_channel_art()
def get_channel_art(self):
"""download channel art for new channels"""
channel_id = self.youtube_id
channel_thumb = self.json_data["channel_thumb_url"]
channel_banner = self.json_data["channel_banner_url"]
ThumbManager().download_chan(
[(channel_id, channel_thumb, channel_banner)]
)
def sync_to_videos(self):
"""sync new channel_dict to all videos of channel"""
# add ingest pipeline
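        # one set-processor per channel field mirrors the fresh channel data
        # into every matching video document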
processors = []
for field, value in self.json_data.items():
line = {"set": {"field": "channel." + field, "value": value}}
processors.append(line)
data = {"description": self.youtube_id, "processors": processors}
ingest_path = f"_ingest/pipeline/{self.youtube_id}"
_, _ = ElasticWrap(ingest_path).put(data)
# apply pipeline
data = {"query": {"match": {"channel.channel_id": self.youtube_id}}}
update_path = f"ta_video/_update_by_query?pipeline={self.youtube_id}"
_, _ = ElasticWrap(update_path).post(data)
def get_folder_path(self):
"""get folder where media files get stored"""
channel_name = self.json_data["channel_name"]
folder_name = clean_string(channel_name)
folder_path = os.path.join(self.app_conf["videos"], folder_name)
return folder_path
def delete_es_videos(self):
"""delete all channel documents from elasticsearch"""
data = {
"query": {
"term": {"channel.channel_id": {"value": self.youtube_id}}
}
}
_, _ = ElasticWrap("ta_video/_delete_by_query").post(data)
def delete_playlists(self):
"""delete all indexed playlist from es"""
all_playlists = self.get_indexed_playlists()
for playlist in all_playlists:
playlist_id = playlist["playlist_id"]
YoutubePlaylist(playlist_id).delete_metadata()
def delete_channel(self):
"""delete channel and all videos"""
print(f"{self.youtube_id}: delete channel")
self.get_from_es()
folder_path = self.get_folder_path()
print(f"{self.youtube_id}: delete all media files")
try:
all_videos = os.listdir(folder_path)
for video in all_videos:
video_path = os.path.join(folder_path, video)
os.remove(video_path)
os.rmdir(folder_path)
except FileNotFoundError:
print(f"no videos found for {folder_path}")
print(f"{self.youtube_id}: delete indexed playlists")
self.delete_playlists()
print(f"{self.youtube_id}: delete indexed videos")
self.delete_es_videos()
self.del_in_es()
def get_all_playlists(self):
"""get all playlists owned by this channel"""
url = (
f"https://www.youtube.com/channel/{self.youtube_id}"
+ "/playlists?view=1&sort=dd&shelf_id=0"
)
obs = {
"quiet": True,
"skip_download": True,
"extract_flat": True,
}
playlists = yt_dlp.YoutubeDL(obs).extract_info(url)
all_entries = [(i["id"], i["title"]) for i in playlists["entries"]]
return all_entries
def get_indexed_playlists(self):
"""get all indexed playlists from channel"""
data = {
"query": {
"term": {"playlist_channel_id": {"value": self.youtube_id}}
},
"sort": [{"playlist_channel.keyword": {"order": "desc"}}],
}
all_playlists = IndexPaginate("ta_playlist", data).get_results()
return all_playlists


class YoutubePlaylist(YouTubeItem):
"""represents a single youtube playlist"""
es_path = False
index_name = "ta_playlist"
yt_obs = {
"default_search": "ytsearch",
"quiet": True,
"skip_download": True,
"extract_flat": True,
}
yt_base = "https://www.youtube.com/playlist?list="
def __init__(self, youtube_id):
super().__init__(youtube_id)
self.es_path = f"{self.index_name}/_doc/{youtube_id}"
self.all_members = False
self.nav = False
self.all_youtube_ids = []
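        # callers fill all_youtube_ids so get_entries can flag downloaded vids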
    def build_json(self, scrape=False):
        """collection to create json_data"""
        if not scrape:
            self.get_from_es()
        if scrape or not self.json_data:
            self.get_from_youtube()
            if not self.youtube_meta:
                # scrape failed: leave json_data falsy to signal deactivate
                self.json_data = False
                return

            self.process_youtube_meta()
            self.get_entries()
            self.json_data["playlist_entries"] = self.all_members
            self.get_playlist_art()
def process_youtube_meta(self):
"""extract relevant fields from youtube"""
self.json_data = {
"playlist_id": self.youtube_id,
"playlist_active": True,
"playlist_subscribed": False,
"playlist_name": self.youtube_meta["title"],
"playlist_channel": self.youtube_meta["channel"],
"playlist_channel_id": self.youtube_meta["channel_id"],
"playlist_thumbnail": self.youtube_meta["thumbnails"][-1]["url"],
"playlist_description": self.youtube_meta["description"] or False,
"playlist_last_refresh": int(datetime.now().strftime("%s")),
}
def get_entries(self, playlistend=False):
"""get all videos in playlist"""
        if playlistend:
            # playlistend is not implemented yet
            print(playlistend)
all_members = []
for idx, entry in enumerate(self.youtube_meta["entries"]):
if self.all_youtube_ids:
downloaded = entry["id"] in self.all_youtube_ids
else:
downloaded = False
if not entry["uploader"]:
continue
to_append = {
"youtube_id": entry["id"],
"title": entry["title"],
"uploader": entry["uploader"],
"idx": idx,
"downloaded": downloaded,
}
all_members.append(to_append)
self.all_members = all_members
@staticmethod
def get_playlist_art():
"""download artwork of playlist"""
thumbnails = ThumbManager()
missing_playlists = thumbnails.get_missing_playlists()
thumbnails.download_playlist(missing_playlists)
def add_vids_to_playlist(self):
"""sync the playlist id to videos"""
script = (
'if (!ctx._source.containsKey("playlist")) '
+ "{ctx._source.playlist = [params.playlist]} "
+ "else if (!ctx._source.playlist.contains(params.playlist)) "
+ "{ctx._source.playlist.add(params.playlist)} "
+ "else {ctx.op = 'none'}"
)
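        # the _bulk api expects ndjson: one action line, then one source line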
bulk_list = []
for entry in self.json_data["playlist_entries"]:
video_id = entry["youtube_id"]
action = {"update": {"_id": video_id, "_index": "ta_video"}}
source = {
"script": {
"source": script,
"lang": "painless",
"params": {"playlist": self.youtube_id},
}
}
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(source))
# add last newline
bulk_list.append("\n")
query_str = "\n".join(bulk_list)
ElasticWrap("_bulk").post(query_str, ndjson=True)
def update_playlist(self):
"""update metadata for playlist with data from YouTube"""
self.get_from_es()
subscribed = self.json_data["playlist_subscribed"]
        self.build_json(scrape=True)
if not self.json_data:
# return false to deactivate
return False
self.json_data["playlist_subscribed"] = subscribed
self.upload_to_es()
return True
def build_nav(self, youtube_id):
"""find next and previous in playlist of a given youtube_id"""
all_entries_available = self.json_data["playlist_entries"]
all_entries = [i for i in all_entries_available if i["downloaded"]]
current = [i for i in all_entries if i["youtube_id"] == youtube_id]
# stop if not found or playlist of 1
        if not current or len(all_entries) < 2:
return
current_idx = all_entries.index(current[0])
if current_idx == 0:
previous_item = False
else:
previous_item = all_entries[current_idx - 1]
prev_thumb = ThumbManager().vid_thumb_path(
previous_item["youtube_id"]
)
previous_item["vid_thumb"] = prev_thumb
if current_idx == len(all_entries) - 1:
next_item = False
else:
next_item = all_entries[current_idx + 1]
next_thumb = ThumbManager().vid_thumb_path(next_item["youtube_id"])
next_item["vid_thumb"] = next_thumb
self.nav = {
"playlist_meta": {
"current_idx": current[0]["idx"],
"playlist_id": self.youtube_id,
"playlist_name": self.json_data["playlist_name"],
"playlist_channel": self.json_data["playlist_channel"],
},
"playlist_previous": previous_item,
"playlist_next": next_item,
}
return
def delete_metadata(self):
"""delete metadata for playlist"""
script = (
"ctx._source.playlist.removeAll("
+ "Collections.singleton(params.playlist)) "
)
data = {
"query": {
"term": {"playlist.keyword": {"value": self.youtube_id}}
},
"script": {
"source": script,
"lang": "painless",
"params": {"playlist": self.youtube_id},
},
}
_, _ = ElasticWrap("ta_video/_update_by_query").post(data)
self.del_in_es()
def delete_videos_playlist(self):
"""delete playlist with all videos"""
print(f"{self.youtube_id}: delete playlist")
self.get_from_es()
all_youtube_id = [
i["youtube_id"]
for i in self.json_data["playlist_entries"]
if i["downloaded"]
]
for youtube_id in all_youtube_id:
YoutubeVideo(youtube_id).delete_media_file()
self.delete_metadata()


class WatchState:
"""handle watched checkbox for videos and channels"""
CONFIG = AppConfig().config
ES_URL = CONFIG["application"]["es_url"]
ES_AUTH = CONFIG["application"]["es_auth"]
HEADERS = {"Content-type": "application/json"}
def __init__(self, youtube_id):
self.youtube_id = youtube_id
self.stamp = int(datetime.now().strftime("%s"))
def mark_as_watched(self):
"""update es with new watched value"""
url_type = self.dedect_type()
if url_type == "video":
self.mark_vid_watched()
elif url_type == "channel":
self.mark_channel_watched()
elif url_type == "playlist":
self.mark_playlist_watched()
print(f"marked {self.youtube_id} as watched")
def mark_as_unwatched(self):
"""revert watched state to false"""
url_type = self.dedect_type()
if url_type == "video":
self.mark_vid_watched(revert=True)
print(f"revert {self.youtube_id} as unwatched")
def dedect_type(self):
"""find youtube id type"""
print(self.youtube_id)
url_process = UrlListParser(self.youtube_id).process_list()
url_type = url_process[0]["type"]
return url_type
def mark_vid_watched(self, revert=False):
"""change watched status of single video"""
url = self.ES_URL + "/ta_video/_update/" + self.youtube_id
data = {
"doc": {"player": {"watched": True, "watched_date": self.stamp}}
}
if revert:
data["doc"]["player"]["watched"] = False
payload = json.dumps(data)
request = requests.post(
url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH
)
if not request.ok:
print(request.text)
raise ValueError("failed to mark video as watched")
def mark_channel_watched(self):
"""change watched status of every video in channel"""
data = {
"query": {
"bool": {
"must": [
{
"term": {
"channel.channel_id": {
"value": self.youtube_id
}
}
},
{"term": {"player.watched": {"value": False}}},
]
}
},
"script": {
"source": "ctx._source.player['watched'] = true",
"lang": "painless",
},
}
payload = json.dumps(data)
url = f"{self.ES_URL}/ta_video/_update_by_query"
request = requests.post(
url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH
)
if not request.ok:
print(request.text)
raise ValueError("failed mark channel as watched")
def mark_playlist_watched(self):
"""change watched state of all videos in playlist"""
data = {
"query": {
"bool": {
"must": [
{
"term": {
"playlist.keyword": {"value": self.youtube_id}
}
},
{"term": {"player.watched": {"value": False}}},
]
}
},
"script": {
"source": "ctx._source.player['watched'] = true",
"lang": "painless",
},
}
payload = json.dumps(data)
url = f"{self.ES_URL}/ta_video/_update_by_query"
request = requests.post(
url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH
)
if not request.ok:
print(request.text)
raise ValueError("failed mark playlist as watched")


def index_new_video(youtube_id):
"""combined classes to create new video in index"""
video = YoutubeVideo(youtube_id)
video.build_json()
if not video.json_data:
raise ValueError("failed to get metadata for " + youtube_id)
video.upload_to_es()
return video.json_data