""" |
|
functionality: |
|
- get metadata from youtube for a video |
|
- index and update in es |
|
""" |
|
|
|
import json |
|
import os |
|
from datetime import datetime |
|
|
|
import requests |
|
from django.conf import settings |
|
from home.src.es.connect import ElasticWrap |
|
from home.src.index import channel as ta_channel |
|
from home.src.index import playlist as ta_playlist |
|
from home.src.index.generic import YouTubeItem |
|
from home.src.ta.helper import ( |
|
DurationConverter, |
|
clean_string, |
|
randomizor, |
|
requests_headers, |
|
) |
|
from home.src.ta.ta_redis import RedisArchivist |
|
from ryd_client import ryd_client |
|
|
|
|
|
class YoutubeSubtitle:
    """handle video subtitle functionality"""

    def __init__(self, video):
        self.video = video
        self.languages = False

    def _sub_conf_parse(self):
        """add additional conf values to self"""
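        # subtitle languages come from config as a comma separated string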
        languages_raw = self.video.config["downloads"]["subtitle"]
        if languages_raw:
            self.languages = [i.strip() for i in languages_raw.split(",")]

    def get_subtitles(self):
        """get relevant subtitles based on configured languages"""
        self._sub_conf_parse()
        if not self.languages:
            # no subtitles
            return False

        relevant_subtitles = []
        for lang in self.languages:
            user_sub = self._get_user_subtitles(lang)
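            # user uploaded subtitles take precedence over auto captions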
            if user_sub:
                relevant_subtitles.append(user_sub)
                continue

            if self.video.config["downloads"]["subtitle_source"] == "auto":
                auto_cap = self._get_auto_caption(lang)
                if auto_cap:
                    relevant_subtitles.append(auto_cap)

        return relevant_subtitles

    def _get_auto_caption(self, lang):
        """get auto_caption subtitles"""
        print(f"{self.video.youtube_id}-{lang}: get auto generated subtitles")
        all_subtitles = self.video.youtube_meta.get("automatic_captions")

        if not all_subtitles:
            return False

        video_media_url = self.video.json_data["media_url"]
        media_url = video_media_url.replace(".mp4", f".{lang}.vtt")
        all_formats = all_subtitles.get(lang)
        if not all_formats:
            return False

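        # downstream parsing expects the json3 representation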
        subtitle = [i for i in all_formats if i["ext"] == "json3"][0]
        subtitle.update(
            {"lang": lang, "source": "auto", "media_url": media_url}
        )

        return subtitle

    def _normalize_lang(self):
        """normalize country specific language keys"""
        all_subtitles = self.video.youtube_meta.get("subtitles")
        if not all_subtitles:
            return False

        all_keys = list(all_subtitles.keys())
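        # collapse region keys like en-US down to the bare language code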
        for key in all_keys:
            lang = key.split("-")[0]
            old = all_subtitles.pop(key)
            if lang == "live_chat":
                continue
            all_subtitles[lang] = old

        return all_subtitles

    def _get_user_subtitles(self, lang):
        """get subtitles uploaded from channel owner"""
        print(f"{self.video.youtube_id}-{lang}: get user uploaded subtitles")
        all_subtitles = self._normalize_lang()
        if not all_subtitles:
            return False

        video_media_url = self.video.json_data["media_url"]
        media_url = video_media_url.replace(".mp4", f".{lang}.vtt")
        all_formats = all_subtitles.get(lang)
        if not all_formats:
            # no user subtitles found
            return False

        subtitle = [i for i in all_formats if i["ext"] == "json3"][0]
        subtitle.update(
            {"lang": lang, "source": "user", "media_url": media_url}
        )

        return subtitle

    def download_subtitles(self, relevant_subtitles):
        """download subtitle files to archive"""
        videos_base = self.video.config["application"]["videos"]
        for subtitle in relevant_subtitles:
            dest_path = os.path.join(videos_base, subtitle["media_url"])
            source = subtitle["source"]
            lang = subtitle.get("lang")
            response = requests.get(
                subtitle["url"], headers=requests_headers()
            )
            if not response.ok:
                print(f"{self.video.youtube_id}: failed to download subtitle")
                print(response.text)
                continue

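            # convert the json3 response to WebVTT and optionally index it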
            parser = SubtitleParser(response.text, lang, source)
            parser.process()
            subtitle_str = parser.get_subtitle_str()
            self._write_subtitle_file(dest_path, subtitle_str)
            if self.video.config["downloads"]["subtitle_index"]:
                query_str = parser.create_bulk_import(self.video, source)
                self._index_subtitle(query_str)

    @staticmethod
    def _write_subtitle_file(dest_path, subtitle_str):
        """write subtitle file to disk"""
        # create folder here for first video of channel
        os.makedirs(os.path.split(dest_path)[0], exist_ok=True)
        with open(dest_path, "w", encoding="utf-8") as subfile:
            subfile.write(subtitle_str)

    @staticmethod
    def _index_subtitle(query_str):
        """send subtitle to es for indexing"""
        _, _ = ElasticWrap("_bulk").post(data=query_str, ndjson=True)

    def delete(self, subtitles=False):
        """delete subtitles from index and filesystem"""
        youtube_id = self.video.youtube_id
        videos_base = self.video.config["application"]["videos"]
        # delete files
        if subtitles:
            files = [i["media_url"] for i in subtitles]
        else:
            if not self.video.json_data.get("subtitles"):
                return

            files = [i["media_url"] for i in self.video.json_data["subtitles"]]

        for file_name in files:
            file_path = os.path.join(videos_base, file_name)
            try:
                os.remove(file_path)
            except FileNotFoundError:
                print(f"{youtube_id}: {file_path} failed to delete")
        # delete from index
        path = "ta_subtitle/_delete_by_query?refresh=true"
        data = {"query": {"term": {"youtube_id": {"value": youtube_id}}}}
        _, _ = ElasticWrap(path).post(data=data)


class SubtitleParser:
    """parse subtitle str from youtube"""

    def __init__(self, subtitle_str, lang, source):
        self.subtitle_raw = json.loads(subtitle_str)
        self.lang = lang
        self.source = source
        self.all_cues = False

    def process(self):
        """extract relevant cue data"""
        all_events = self.subtitle_raw.get("events")
        if self.source == "auto":
            all_events = self._flat_auto_caption(all_events)

        self.all_cues = []
        for idx, event in enumerate(all_events):
            if "dDurationMs" not in event or "segs" not in event:
                # some events won't have a duration or segs
                print(f"skipping subtitle event without content: {event}")
                continue

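            # build a cue with WebVTT timestamps from the event timings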
            cue = {
                "start": self._ms_conv(event["tStartMs"]),
                "end": self._ms_conv(event["tStartMs"] + event["dDurationMs"]),
                "text": "".join([i.get("utf8") for i in event["segs"]]),
                "idx": idx + 1,
            }
            self.all_cues.append(cue)

    @staticmethod
    def _flat_auto_caption(all_events):
        """flatten autocaption segments"""
        flatten = []
        for event in all_events:
            if "segs" not in event.keys():
                continue
            text = "".join([i.get("utf8") for i in event.get("segs")])
            if not text.strip():
                continue

            if flatten:
                # fix overlapping retiming issue
                last = flatten[-1]
                if "dDurationMs" not in last or "segs" not in last:
                    # some events won't have a duration or segs
                    print(f"skipping subtitle event without content: {event}")
                    continue

                last_end = last["tStartMs"] + last["dDurationMs"]
                if event["tStartMs"] < last_end:
                    joined = last["segs"][0]["utf8"] + "\n" + text
                    last["segs"][0]["utf8"] = joined
                    continue

            event.update({"segs": [{"utf8": text}]})
            flatten.append(event)

        return flatten

    @staticmethod
    def _ms_conv(ms):
        """convert ms to timestamp"""
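        # format as HH:MM:SS.mmm, the timestamp format WebVTT expects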
        hours = str((ms // (1000 * 60 * 60)) % 24).zfill(2)
        minutes = str((ms // (1000 * 60)) % 60).zfill(2)
        secs = str((ms // 1000) % 60).zfill(2)
        millis = str(ms % 1000).zfill(3)

        return f"{hours}:{minutes}:{secs}.{millis}"

    def get_subtitle_str(self):
        """create vtt text str from cues"""
        subtitle_str = f"WEBVTT\nKind: captions\nLanguage: {self.lang}"

        for cue in self.all_cues:
            stamp = f"{cue.get('start')} --> {cue.get('end')}"
            cue_text = f"\n\n{cue.get('idx')}\n{stamp}\n{cue.get('text')}"
            subtitle_str = subtitle_str + cue_text

        return subtitle_str

    def create_bulk_import(self, video, source):
        """subtitle lines for es import"""
        documents = self._create_documents(video, source)
        bulk_list = []

        for document in documents:
            document_id = document.get("subtitle_fragment_id")
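            # es bulk format: one action line followed by one document line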
            action = {"index": {"_index": "ta_subtitle", "_id": document_id}}
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(document))

        bulk_list.append("\n")
        query_str = "\n".join(bulk_list)

        return query_str

    def _create_documents(self, video, source):
        """process documents"""
        documents = self._chunk_list(video.youtube_id)
        channel = video.json_data.get("channel")
        meta_dict = {
            "youtube_id": video.youtube_id,
            "title": video.json_data.get("title"),
            "subtitle_channel": channel.get("channel_name"),
            "subtitle_channel_id": channel.get("channel_id"),
            "subtitle_last_refresh": int(datetime.now().strftime("%s")),
            "subtitle_lang": self.lang,
            "subtitle_source": source,
        }

        _ = [i.update(meta_dict) for i in documents]

        return documents

    def _chunk_list(self, youtube_id):
        """join cues for bulk import"""
        chunk_list = []

        chunk = {}
        for cue in self.all_cues:
            if chunk:
                text = f"{chunk.get('subtitle_line')} {cue.get('text')}\n"
                chunk["subtitle_line"] = text
            else:
                idx = len(chunk_list) + 1
                chunk = {
                    "subtitle_index": idx,
                    "subtitle_line": cue.get("text"),
                    "subtitle_start": cue.get("start"),
                }

            chunk["subtitle_fragment_id"] = f"{youtube_id}-{self.lang}-{idx}"

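            # finish a chunk after every fifth cue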
if cue["idx"] % 5 == 0: |
|
chunk["subtitle_end"] = cue.get("end") |
|
chunk_list.append(chunk) |
|
chunk = {} |
|
|
|
return chunk_list |
|
|
|
|
|
class SponsorBlock: |
|
"""handle sponsor block integration""" |
|
|
|
API = "https://sponsor.ajay.app/api" |
|
|
|
def __init__(self, user_id=False): |
|
self.user_id = user_id |
|
self.user_agent = f"{settings.TA_UPSTREAM} {settings.TA_VERSION}" |
|
self.last_refresh = int(datetime.now().strftime("%s")) |
|
|
|
def get_sb_id(self): |
|
"""get sponsorblock userid or generate if needed""" |
|
if not self.user_id: |
|
print("missing request user id") |
|
raise ValueError |
|
|
|
key = f"{self.user_id}:id_sponsorblock" |
|
sb_id = RedisArchivist().get_message(key) |
|
if not sb_id["status"]: |
|
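            # generate a new random id and cache it in redis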
            sb_id = {"status": randomizor(32)}
            RedisArchivist().set_message(key, sb_id)

        return sb_id

    def get_timestamps(self, youtube_id):
        """get timestamps from the API"""
        url = f"{self.API}/skipSegments?videoID={youtube_id}"
        headers = {"User-Agent": self.user_agent}
        print(f"{youtube_id}: get sponsorblock timestamps")
        response = requests.get(url, headers=headers)
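        # on api errors fall back to an empty segment list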
        if not response.ok:
            print(f"{youtube_id}: sponsorblock failed: {response.text}")
            sponsor_dict = {
                "last_refresh": self.last_refresh,
                "is_enabled": True,
                "segments": [],
            }
        else:
            all_segments = response.json()
            sponsor_dict = self._get_sponsor_dict(all_segments)

        return sponsor_dict

    def _get_sponsor_dict(self, all_segments):
        """format and process response"""
        has_unlocked = False
        cleaned_segments = []
        for segment in all_segments:
            if not segment["locked"]:
                has_unlocked = True
            del segment["userID"]
            del segment["description"]
            cleaned_segments.append(segment)

        sponsor_dict = {
            "last_refresh": self.last_refresh,
            "has_unlocked": has_unlocked,
            "is_enabled": True,
            "segments": cleaned_segments,
        }
        return sponsor_dict

    def post_timestamps(self, youtube_id, start_time, end_time):
        """post timestamps to api"""
        user_id = self.get_sb_id().get("status")
        data = {
            "videoID": youtube_id,
            "startTime": start_time,
            "endTime": end_time,
            "category": "sponsor",
            "userID": user_id,
            "userAgent": self.user_agent,
        }
        url = f"{self.API}/skipSegments?videoID={youtube_id}"
        print(f"post: {data}")
        print(f"to: {url}")

        return {"success": True}, 200

    def vote_on_segment(self, uuid, vote):
        """send vote on existing segment"""
        user_id = self.get_sb_id().get("status")
        data = {
            "UUID": uuid,
            "userID": user_id,
            "type": vote,
        }
        url = f"{self.API}/voteOnSponsorTime"
        print(f"post: {data}")
        print(f"to: {url}")

        return {"success": True}, 200


class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
    """represents a single youtube video"""

    es_path = False
    index_name = "ta_video"
    yt_base = "https://www.youtube.com/watch?v="

    def __init__(self, youtube_id, video_overwrites=False):
        super().__init__(youtube_id)
        self.channel_id = False
        self.video_overwrites = video_overwrites
        self.es_path = f"{self.index_name}/_doc/{youtube_id}"

    def build_json(self):
        """build json dict of video"""
        self.get_from_youtube()
        if not self.youtube_meta:
            return

        self._process_youtube_meta()
        self._add_channel()
        self._add_stats()
        self.add_file_path()
        self.add_player()
        if self.config["downloads"]["integrate_ryd"]:
            self._get_ryd_stats()

        if self._check_get_sb():
            self._get_sponsorblock()

        return

    def _check_get_sb(self):
        """check if need to run sponsor block"""
        integrate = self.config["downloads"]["integrate_sponsorblock"]

        if self.video_overwrites:
            single_overwrite = self.video_overwrites.get(self.youtube_id)
            if not single_overwrite:
                return integrate

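            # a per video overwrite takes precedence over the global config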
if "integrate_sponsorblock" in single_overwrite: |
|
return single_overwrite.get("integrate_sponsorblock") |
|
|
|
return integrate |
|
|
|
def _process_youtube_meta(self): |
|
"""extract relevant fields from youtube""" |
|
# extract |
|
self.channel_id = self.youtube_meta["channel_id"] |
|
upload_date = self.youtube_meta["upload_date"] |
|
upload_date_time = datetime.strptime(upload_date, "%Y%m%d") |
|
published = upload_date_time.strftime("%Y-%m-%d") |
|
last_refresh = int(datetime.now().strftime("%s")) |
|
# base64_blur = ThumbManager().get_base64_blur(self.youtube_id) |
|
base64_blur = False |
|
# build json_data basics |
|
self.json_data = { |
|
"title": self.youtube_meta["title"], |
|
"description": self.youtube_meta["description"], |
|
"category": self.youtube_meta["categories"], |
|
"vid_thumb_url": self.youtube_meta["thumbnail"], |
|
"vid_thumb_base64": base64_blur, |
|
"tags": self.youtube_meta["tags"], |
|
"published": published, |
|
"vid_last_refresh": last_refresh, |
|
"date_downloaded": last_refresh, |
|
"youtube_id": self.youtube_id, |
|
"active": True, |
|
} |
|
|
|
def _add_channel(self): |
|
"""add channel dict to video json_data""" |
|
channel = ta_channel.YoutubeChannel(self.channel_id) |
|
channel.build_json(upload=True) |
|
self.json_data.update({"channel": channel.json_data}) |
|
|
|
def _add_stats(self): |
|
"""add stats dicst to json_data""" |
|
# likes |
|
like_count = self.youtube_meta.get("like_count", 0) |
|
dislike_count = self.youtube_meta.get("dislike_count", 0) |
|
self.json_data.update( |
|
{ |
|
"stats": { |
|
"view_count": self.youtube_meta["view_count"], |
|
"like_count": like_count, |
|
"dislike_count": dislike_count, |
|
"average_rating": self.youtube_meta["average_rating"], |
|
} |
|
} |
|
) |
|
|
|
def build_dl_cache_path(self): |
|
"""find video path in dl cache""" |
|
cache_dir = self.app_conf["cache_dir"] |
|
cache_path = f"{cache_dir}/download/" |
|
all_cached = os.listdir(cache_path) |
|
for file_cached in all_cached: |
|
if self.youtube_id in file_cached: |
|
vid_path = os.path.join(cache_path, file_cached) |
|
return vid_path |
|
|
|
raise FileNotFoundError |
|
|
|
def add_player(self): |
|
"""add player information for new videos""" |
|
try: |
|
# when indexing from download task |
|
vid_path = self.build_dl_cache_path() |
|
except FileNotFoundError as err: |
|
# when reindexing needs to handle title rename |
|
channel = os.path.split(self.json_data["media_url"])[0] |
|
channel_dir = os.path.join(self.app_conf["videos"], channel) |
|
all_files = os.listdir(channel_dir) |
|
for file in all_files: |
|
if self.youtube_id in file and file.endswith(".mp4"): |
|
vid_path = os.path.join(channel_dir, file) |
|
break |
|
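            # for-else: the loop finished without finding a matching file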
            else:
                raise FileNotFoundError("could not find video file") from err

        duration_handler = DurationConverter()
        duration = duration_handler.get_sec(vid_path)
        duration_str = duration_handler.get_str(duration)
        self.json_data.update(
            {
                "player": {
                    "watched": False,
                    "duration": duration,
                    "duration_str": duration_str,
                }
            }
        )

    def add_file_path(self):
        """build media_url for where file will be located"""
        channel_name = self.json_data["channel"]["channel_name"]
        clean_channel_name = clean_string(channel_name)
        if len(clean_channel_name) <= 3:
            # fall back to channel id
            clean_channel_name = self.json_data["channel"]["channel_id"]

        timestamp = self.json_data["published"].replace("-", "")
        youtube_id = self.json_data["youtube_id"]
        title = self.json_data["title"]
        clean_title = clean_string(title)
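        # media_url follows <channel>/<date>_<id>_<title>.mp4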
        filename = f"{timestamp}_{youtube_id}_{clean_title}.mp4"
        media_url = os.path.join(clean_channel_name, filename)
        self.json_data["media_url"] = media_url

    def delete_media_file(self):
        """delete video file, meta data"""
        print(f"{self.youtube_id}: delete video")
        self.get_from_es()
        video_base = self.app_conf["videos"]
        media_url = self.json_data.get("media_url")
        file_path = os.path.join(video_base, media_url)
        try:
            os.remove(file_path)
        except FileNotFoundError:
            print(f"{self.youtube_id}: failed {media_url}, continue.")

        self.del_in_playlists()
        self.del_in_es()
        self.delete_subtitles()

    def del_in_playlists(self):
        """remove downloaded in playlist"""
        all_playlists = self.json_data.get("playlist")
        if not all_playlists:
            return

        for playlist_id in all_playlists:
            print(f"{playlist_id}: delete video {self.youtube_id}")
            playlist = ta_playlist.YoutubePlaylist(playlist_id)
            playlist.get_from_es()
            entries = playlist.json_data["playlist_entries"]
            for idx, entry in enumerate(entries):
                if entry["youtube_id"] == self.youtube_id:
                    playlist.json_data["playlist_entries"][idx].update(
                        {"downloaded": False}
                    )
            playlist.upload_to_es()

    def delete_subtitles(self, subtitles=False):
        """delete indexed subtitles"""
        print(f"{self.youtube_id}: delete subtitles")
        YoutubeSubtitle(self).delete(subtitles=subtitles)

    def _get_ryd_stats(self):
        """get optional stats from returnyoutubedislikeapi.com"""
        try:
            print(f"{self.youtube_id}: get ryd stats")
            result = ryd_client.get(self.youtube_id)
        except requests.exceptions.ConnectionError:
            print(f"{self.youtube_id}: failed to query ryd api, skipping")
            return False

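        # a 404 from ryd means no data is available for this video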
        if result["status"] == 404:
            return False

        dislikes = {
            "dislike_count": result["dislikes"],
            "average_rating": result["rating"],
        }
        self.json_data["stats"].update(dislikes)

        return True

    def _get_sponsorblock(self):
        """get optional sponsorblock timestamps from sponsor.ajay.app"""
        sponsorblock = SponsorBlock().get_timestamps(self.youtube_id)
        if sponsorblock:
            self.json_data["sponsorblock"] = sponsorblock

    def check_subtitles(self):
        """optionally add subtitles"""
        handler = YoutubeSubtitle(self)
        subtitles = handler.get_subtitles()
        if subtitles:
            self.json_data["subtitles"] = subtitles
            handler.download_subtitles(relevant_subtitles=subtitles)

    def update_media_url(self):
        """update only media_url in es for reindex channel rename"""
        data = {"doc": {"media_url": self.json_data["media_url"]}}
        path = f"{self.index_name}/_update/{self.youtube_id}"
        _, _ = ElasticWrap(path).post(data=data)


def index_new_video(youtube_id, video_overwrites=False):
    """combined classes to create new video in index"""
    video = YoutubeVideo(youtube_id, video_overwrites=video_overwrites)
    video.build_json()
    if not video.json_data:
        raise ValueError("failed to get metadata for " + youtube_id)

    video.check_subtitles()
    video.upload_to_es()
    return video.json_data