# tubearchivist/tubearchivist/home/src/index/video.py
"""
functionality:
- get metadata from youtube for a video
- index and update in es
"""

import os
from datetime import datetime
import requests
from django.conf import settings
from home.src.es.connect import ElasticWrap
from home.src.index import channel as ta_channel
from home.src.index import comments as ta_comments
from home.src.index import playlist as ta_playlist
from home.src.index.generic import YouTubeItem
from home.src.index.subtitle import YoutubeSubtitle
from home.src.index.video_constants import VideoTypeEnum
from home.src.index.video_streams import DurationConverter
from home.src.ta.helper import clean_string, randomizor
from home.src.ta.ta_redis import RedisArchivist
from ryd_client import ryd_client


class SponsorBlock:
"""handle sponsor block integration"""

    API = "https://sponsor.ajay.app/api"

    def __init__(self, user_id=False):
self.user_id = user_id
self.user_agent = f"{settings.TA_UPSTREAM} {settings.TA_VERSION}"
self.last_refresh = int(datetime.now().timestamp())

    def get_sb_id(self):
"""get sponsorblock userid or generate if needed"""
if not self.user_id:
print("missing request user id")
raise ValueError
key = f"{self.user_id}:id_sponsorblock"
sb_id = RedisArchivist().get_message(key)
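        # get_message comes back as {"status": False} when no id is stored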
if not sb_id["status"]:
sb_id = {"status": randomizor(32)}
RedisArchivist().set_message(key, sb_id)
return sb_id

    def get_timestamps(self, youtube_id):
"""get timestamps from the API"""
url = f"{self.API}/skipSegments?videoID={youtube_id}"
headers = {"User-Agent": self.user_agent}
print(f"{youtube_id}: get sponsorblock timestamps")
try:
response = requests.get(url, headers=headers, timeout=10)
except requests.ReadTimeout:
print(f"{youtube_id}: sponsorblock API timeout")
return False
if not response.ok:
print(f"{youtube_id}: sponsorblock failed: {response.status_code}")
if response.status_code == 503:
return False
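            # other errors: store an empty result so the lookup
            # is not retried on every single refresh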
sponsor_dict = {
"last_refresh": self.last_refresh,
"is_enabled": True,
"segments": [],
}
else:
all_segments = response.json()
sponsor_dict = self._get_sponsor_dict(all_segments)
return sponsor_dict

    def _get_sponsor_dict(self, all_segments):
"""format and process response"""
_ = [i.pop("description", None) for i in all_segments]
        # unlocked: at least one segment is still open for edits
        has_unlocked = not all(i.get("locked") for i in all_segments)
sponsor_dict = {
"last_refresh": self.last_refresh,
"has_unlocked": has_unlocked,
"is_enabled": True,
"segments": all_segments,
}
return sponsor_dict

    def post_timestamps(self, youtube_id, start_time, end_time):
"""post timestamps to api"""
user_id = self.get_sb_id().get("status")
data = {
"videoID": youtube_id,
"startTime": start_time,
"endTime": end_time,
"category": "sponsor",
"userID": user_id,
"userAgent": self.user_agent,
}
url = f"{self.API}/skipSegments?videoID={youtube_id}"
print(f"post: {data}")
print(f"to: {url}")
return {"success": True}, 200

    def vote_on_segment(self, uuid, vote):
"""send vote on existing segment"""
user_id = self.get_sb_id().get("status")
data = {
"UUID": uuid,
"userID": user_id,
"type": vote,
}
url = f"{self.API}/api/voteOnSponsorTime"
print(f"post: {data}")
print(f"to: {url}")
return {"success": True}, 200


class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
"""represents a single youtube video"""

    es_path = False
index_name = "ta_video"
yt_base = "https://www.youtube.com/watch?v="

    def __init__(
self,
youtube_id,
video_overwrites=False,
video_type=VideoTypeEnum.VIDEOS,
):
super().__init__(youtube_id)
self.channel_id = False
self.video_overwrites = video_overwrites
self.video_type = video_type
self.es_path = f"{self.index_name}/_doc/{youtube_id}"
self.offline_import = False

    def build_json(self, youtube_meta_overwrite=False, media_path=False):
"""build json dict of video"""
self.get_from_youtube()
if not self.youtube_meta and not youtube_meta_overwrite:
return
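
        # nothing from youtube: fall back to caller-provided metadata,
        # e.g. from a manual import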
if not self.youtube_meta:
self.youtube_meta = youtube_meta_overwrite
self.offline_import = True
self._process_youtube_meta()
self._add_channel()
self._add_stats()
self.add_file_path()
self.add_player(media_path)
if self.config["downloads"]["integrate_ryd"]:
self._get_ryd_stats()
if self._check_get_sb():
self._get_sponsorblock()
return

    def _check_get_sb(self):
"""check if need to run sponsor block"""
integrate = self.config["downloads"]["integrate_sponsorblock"]
if self.video_overwrites:
single_overwrite = self.video_overwrites.get(self.youtube_id)
if not single_overwrite:
return integrate
if "integrate_sponsorblock" in single_overwrite:
return single_overwrite.get("integrate_sponsorblock")
return integrate

    def _process_youtube_meta(self):
"""extract relevant fields from youtube"""
# extract
self.channel_id = self.youtube_meta["channel_id"]
upload_date = self.youtube_meta["upload_date"]
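        # yt-dlp delivers upload_date as YYYYMMDD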
upload_date_time = datetime.strptime(upload_date, "%Y%m%d")
published = upload_date_time.strftime("%Y-%m-%d")
last_refresh = int(datetime.now().timestamp())
# base64_blur = ThumbManager().get_base64_blur(self.youtube_id)
base64_blur = False
# build json_data basics
self.json_data = {
"title": self.youtube_meta["title"],
"description": self.youtube_meta["description"],
"category": self.youtube_meta["categories"],
"vid_thumb_url": self.youtube_meta["thumbnail"],
"vid_thumb_base64": base64_blur,
"tags": self.youtube_meta["tags"],
"published": published,
"vid_last_refresh": last_refresh,
"date_downloaded": last_refresh,
"youtube_id": self.youtube_id,
# Using .value to make json encodable
"vid_type": self.video_type.value,
"active": True,
}

    def _add_channel(self):
"""add channel dict to video json_data"""
channel = ta_channel.YoutubeChannel(self.channel_id)
channel.build_json(upload=True, fallback=self.youtube_meta)
self.json_data.update({"channel": channel.json_data})

    def _add_stats(self):
        """add stats dict to json_data"""
# likes
like_count = self.youtube_meta.get("like_count", 0)
dislike_count = self.youtube_meta.get("dislike_count", 0)
average_rating = self.youtube_meta.get("average_rating", 0)
self.json_data.update(
{
"stats": {
"view_count": self.youtube_meta["view_count"],
"like_count": like_count,
"dislike_count": dislike_count,
"average_rating": average_rating,
}
}
)

    def build_dl_cache_path(self):
"""find video path in dl cache"""
cache_dir = self.app_conf["cache_dir"]
cache_path = f"{cache_dir}/download/"
all_cached = os.listdir(cache_path)
for file_cached in all_cached:
if self.youtube_id in file_cached:
vid_path = os.path.join(cache_path, file_cached)
return vid_path
raise FileNotFoundError

    def add_player(self, media_path=False):
"""add player information for new videos"""
vid_path = self._get_vid_path(media_path)
duration_handler = DurationConverter()
duration = duration_handler.get_sec(vid_path)
duration_str = duration_handler.get_str(duration)
self.json_data.update(
{
"player": {
"watched": False,
"duration": duration,
"duration_str": duration_str,
}
}
)

    def _get_vid_path(self, media_path=False):
"""get path of media file"""
if media_path:
return media_path
try:
# when indexing from download task
vid_path = self.build_dl_cache_path()
except FileNotFoundError as err:
# when reindexing needs to handle title rename
channel = os.path.split(self.json_data["media_url"])[0]
channel_dir = os.path.join(self.app_conf["videos"], channel)
all_files = os.listdir(channel_dir)
for file in all_files:
if self.youtube_id in file and file.endswith(".mp4"):
vid_path = os.path.join(channel_dir, file)
break
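            # for/else: the else branch runs only if no file matched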
else:
raise FileNotFoundError("could not find video file") from err
return vid_path

    def add_file_path(self):
"""build media_url for where file will be located"""
channel_name = self.json_data["channel"]["channel_name"]
clean_channel_name = clean_string(channel_name)
if len(clean_channel_name) <= 3:
# fall back to channel id
clean_channel_name = self.json_data["channel"]["channel_id"]
timestamp = self.json_data["published"].replace("-", "")
youtube_id = self.json_data["youtube_id"]
title = self.json_data["title"]
clean_title = clean_string(title)
filename = f"{timestamp}_{youtube_id}_{clean_title}.mp4"
media_url = os.path.join(clean_channel_name, filename)
self.json_data["media_url"] = media_url

    def delete_media_file(self):
"""delete video file, meta data"""
print(f"{self.youtube_id}: delete video")
self.get_from_es()
if not self.json_data:
raise FileNotFoundError
video_base = self.app_conf["videos"]
media_url = self.json_data.get("media_url")
file_path = os.path.join(video_base, media_url)
try:
os.remove(file_path)
except FileNotFoundError:
print(f"{self.youtube_id}: failed {media_url}, continue.")
self.del_in_playlists()
self.del_in_es()
self.delete_subtitles()
self.delete_comments()

    def del_in_playlists(self):
"""remove downloaded in playlist"""
all_playlists = self.json_data.get("playlist")
if not all_playlists:
return
for playlist_id in all_playlists:
print(f"{playlist_id}: delete video {self.youtube_id}")
playlist = ta_playlist.YoutubePlaylist(playlist_id)
playlist.get_from_es()
entries = playlist.json_data["playlist_entries"]
for idx, entry in enumerate(entries):
if entry["youtube_id"] == self.youtube_id:
playlist.json_data["playlist_entries"][idx].update(
{"downloaded": False}
)
playlist.upload_to_es()

    def delete_subtitles(self, subtitles=False):
"""delete indexed subtitles"""
print(f"{self.youtube_id}: delete subtitles")
YoutubeSubtitle(self).delete(subtitles=subtitles)

    def delete_comments(self):
"""delete comments from es"""
comments = ta_comments.Comments(self.youtube_id, config=self.config)
comments.check_config()
if comments.is_activated:
comments.delete_comments()

    def _get_ryd_stats(self):
"""get optional stats from returnyoutubedislikeapi.com"""
# pylint: disable=broad-except
try:
print(f"{self.youtube_id}: get ryd stats")
result = ryd_client.get(self.youtube_id)
except Exception as err:
print(f"{self.youtube_id}: failed to query ryd api {err}")
return
if result["status"] == 404:
return
dislikes = {
"dislike_count": result.get("dislikes", 0),
"average_rating": result.get("rating", 0),
}
self.json_data["stats"].update(dislikes)

    def _get_sponsorblock(self):
"""get optional sponsorblock timestamps from sponsor.ajay.app"""
sponsorblock = SponsorBlock().get_timestamps(self.youtube_id)
if sponsorblock:
self.json_data["sponsorblock"] = sponsorblock

    def check_subtitles(self, subtitle_files=False):
"""optionally add subtitles"""
if self.offline_import and subtitle_files:
indexed = self._offline_subtitles(subtitle_files)
self.json_data["subtitles"] = indexed
return
handler = YoutubeSubtitle(self)
subtitles = handler.get_subtitles()
if subtitles:
indexed = handler.download_subtitles(relevant_subtitles=subtitles)
self.json_data["subtitles"] = indexed

    def _offline_subtitles(self, subtitle_files):
"""import offline subtitles"""
base_name, _ = os.path.splitext(self.json_data["media_url"])
subtitles = []
for subtitle in subtitle_files:
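            # subtitle files are expected as <media>.<lang>.vtt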
lang = subtitle.split(".")[-2]
subtitle_media_url = f"{base_name}.{lang}.vtt"
to_add = {
"ext": "vtt",
"url": False,
"name": lang,
"lang": lang,
"source": "file",
"media_url": subtitle_media_url,
}
subtitles.append(to_add)
return subtitles

    def update_media_url(self):
"""update only media_url in es for reindex channel rename"""
data = {"doc": {"media_url": self.json_data["media_url"]}}
path = f"{self.index_name}/_update/{self.youtube_id}"
_, _ = ElasticWrap(path).post(data=data)


def index_new_video(
youtube_id, video_overwrites=False, video_type=VideoTypeEnum.VIDEOS
):
"""combined classes to create new video in index"""
video = YoutubeVideo(
youtube_id, video_overwrites=video_overwrites, video_type=video_type
)
video.build_json()
if not video.json_data:
        raise ValueError(f"failed to get metadata for {youtube_id}")
video.check_subtitles()
video.upload_to_es()
return video.json_data
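

# Example usage (sketch, not part of the module):
#
#   json_data = index_new_video("dQw4w9WgXcQ")
#   print(json_data["title"], json_data["player"]["duration_str"])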