diff --git a/docker-compose.yml b/docker-compose.yml index ce79327..b300949 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -33,7 +33,7 @@ services: depends_on: - archivist-es archivist-es: - image: docker.elastic.co/elasticsearch/elasticsearch:7.16.2 + image: docker.elastic.co/elasticsearch/elasticsearch:7.17.0 container_name: archivist-es restart: always environment: @@ -54,4 +54,4 @@ volumes: media: cache: redis: - es: \ No newline at end of file + es: diff --git a/tubearchivist/api/README.md b/tubearchivist/api/README.md index 0e74a11..73dd4fc 100644 --- a/tubearchivist/api/README.md +++ b/tubearchivist/api/README.md @@ -23,10 +23,6 @@ response = requests.get(url, headers=headers) ## Video Item View /api/video/\/ -## Video Player View -returns all relevant information to create video player -/api/video/\/player - ## Channel List View /api/channel/ diff --git a/tubearchivist/api/urls.py b/tubearchivist/api/urls.py index a6c6801..d39dc30 100644 --- a/tubearchivist/api/urls.py +++ b/tubearchivist/api/urls.py @@ -6,7 +6,6 @@ from api.views import ( DownloadApiListView, DownloadApiView, PlaylistApiView, - VideoApiPlayerView, VideoApiView, ) from django.urls import path @@ -17,11 +16,6 @@ urlpatterns = [ VideoApiView.as_view(), name="api-video", ), - path( - "video//player/", - VideoApiPlayerView.as_view(), - name="api-video-player", - ), path( "channel/", ChannelApiListView.as_view(), diff --git a/tubearchivist/api/views.py b/tubearchivist/api/views.py index f0923aa..620c063 100644 --- a/tubearchivist/api/views.py +++ b/tubearchivist/api/views.py @@ -60,6 +60,12 @@ class ApiBaseView(APIView): cache_dir = self.default_conf["application"]["cache_dir"] new_thumb = f"{cache_dir}/{vid_thumb_url}" self.response["data"]["vid_thumb_url"] = new_thumb + if "subtitles" in all_keys: + all_subtitles = self.response["data"]["subtitles"] + for idx, _ in enumerate(all_subtitles): + url = self.response["data"]["subtitles"][idx]["media_url"] + new_url = f"/media/{url}" + self.response["data"]["subtitles"][idx]["media_url"] = new_url def get_paginate(self): """add pagination detail to response""" @@ -92,38 +98,6 @@ class VideoApiView(ApiBaseView): return Response(self.response, status=self.status_code) -class VideoApiPlayerView(ApiBaseView): - """resolves to /api/video//player - GET: returns dict of video to build player - """ - - search_base = "/ta_video/_doc/" - - def get(self, request, video_id): - # pylint: disable=unused-argument - """get request""" - self.config_builder() - self.get_document(video_id) - player = self.process_response() - return Response(player, status=self.status_code) - - def process_response(self): - """build all needed vars for player""" - vid_data = self.response["data"] - youtube_id = vid_data["youtube_id"] - vid_thumb_url = ThumbManager().vid_thumb_path(youtube_id) - player = { - "youtube_id": youtube_id, - "media_url": "/media/" + vid_data["media_url"], - "vid_thumb_url": "/cache/" + vid_thumb_url, - "title": vid_data["title"], - "channel_name": vid_data["channel"]["channel_name"], - "channel_id": vid_data["channel"]["channel_id"], - "is_watched": vid_data["player"]["watched"], - } - return player - - class ChannelApiView(ApiBaseView): """resolves to /api/channel// GET: returns metadata dict of channel diff --git a/tubearchivist/home/src/es/index_mapping.json b/tubearchivist/home/src/es/index_mapping.json index 3272ed3..f30a82d 100644 --- a/tubearchivist/home/src/es/index_mapping.json +++ b/tubearchivist/home/src/es/index_mapping.json @@ -156,6 +156,32 @@ "normalizer": "to_lower" } } + }, + "subtitles": { + "properties": { + "ext": { + "type": "keyword", + "index": false + }, + "lang": { + "type": "keyword", + "index": false + }, + "media_url": { + "type": "keyword", + "index": false + }, + "name": { + "type": "keyword" + }, + "source": { + "type": "keyword" + }, + "url": { + "type": "keyword", + "index": false + } + } } }, "expected_set": { @@ -277,6 +303,73 @@ }, "number_of_replicas": "0" } + }, + { + "index_name": "subtitle", + "expected_map": { + "youtube_id": { + "type": "keyword" + }, + "title": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256, + "normalizer": "to_lower" + } + } + }, + "subtitle_fragment_id": { + "type": "keyword" + }, + "subtitle_channel": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256, + "normalizer": "to_lower" + } + } + }, + "subtitle_channel_id": { + "type": "keyword" + }, + "subtitle_start": { + "type": "text" + }, + "subtitle_end": { + "type": "text" + }, + "subtitle_last_refresh": { + "type": "date" + }, + "subtitle_index": { + "type" : "long" + }, + "subtitle_lang": { + "type": "keyword" + }, + "subtitle_source": { + "type": "keyword" + }, + "subtitle_line": { + "type" : "text", + "analyzer": "english" + } + }, + "expected_set": { + "analysis": { + "normalizer": { + "to_lower": { + "type": "custom", + "filter": ["lowercase"] + } + } + }, + "number_of_replicas": "0" + } } ] } \ No newline at end of file diff --git a/tubearchivist/home/src/index/filesystem.py b/tubearchivist/home/src/index/filesystem.py index 5a33501..0354e83 100644 --- a/tubearchivist/home/src/index/filesystem.py +++ b/tubearchivist/home/src/index/filesystem.py @@ -46,8 +46,9 @@ class FilesystemScanner: all_downloaded = [] for channel_name in all_channels: channel_path = os.path.join(self.VIDEOS, channel_name) - videos = os.listdir(channel_path) - all_videos = ignore_filelist(videos) + channel_files = os.listdir(channel_path) + channel_files_clean = ignore_filelist(channel_files) + all_videos = [i for i in channel_files_clean if i.endswith(".mp4")] for video in all_videos: youtube_id = video[9:20] all_downloaded.append((channel_name, video, youtube_id)) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index 5b92198..e2695d4 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -4,10 +4,13 @@ functionality: - index and update in es """ +import json import os +import re from datetime import datetime import requests +from home.src.es.connect import ElasticWrap from home.src.index import channel as ta_channel from home.src.index.generic import YouTubeItem from home.src.ta.helper import DurationConverter, clean_string @@ -17,16 +20,13 @@ from ryd_client import ryd_client class YoutubeSubtitle: """handle video subtitle functionality""" - def __init__(self, config, youtube_meta, media_url, youtube_id): - self.config = config - self.youtube_meta = youtube_meta - self.media_url = media_url - self.youtube_id = youtube_id + def __init__(self, video): + self.video = video self.languages = False def sub_conf_parse(self): """add additional conf values to self""" - languages_raw = self.config["downloads"]["subtitle"] + languages_raw = self.video.config["downloads"]["subtitle"] self.languages = [i.strip() for i in languages_raw.split(",")] def get_subtitles(self): @@ -36,82 +36,226 @@ class YoutubeSubtitle: # no subtitles return False - relevant_subtitles = self.get_user_subtitles() - if relevant_subtitles: - return relevant_subtitles + relevant_subtitles = [] + for lang in self.languages: + user_sub = self.get_user_subtitles(lang) + if user_sub: + relevant_subtitles.append(user_sub) + continue - if self.config["downloads"]["subtitle_source"] == "auto": - relevant_auto = self.get_auto_caption() - return relevant_auto + if self.video.config["downloads"]["subtitle_source"] == "auto": + auto_cap = self.get_auto_caption(lang) + if auto_cap: + relevant_subtitles.append(auto_cap) - return False + return relevant_subtitles - def get_auto_caption(self): + def get_auto_caption(self, lang): """get auto_caption subtitles""" - print(f"{self.youtube_id}: get auto generated subtitles") - all_subtitles = self.youtube_meta.get("automatic_captions") + print(f"{self.video.youtube_id}-{lang}: get auto generated subtitles") + all_subtitles = self.video.youtube_meta.get("automatic_captions") if not all_subtitles: return False - relevant_subtitles = [] + video_media_url = self.video.json_data["media_url"] + media_url = video_media_url.replace(".mp4", f"-{lang}.vtt") + all_formats = all_subtitles.get(lang) + subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] + subtitle.update( + {"lang": lang, "source": "auto", "media_url": media_url} + ) - for lang in self.languages: - media_url = self.media_url.replace(".mp4", f"-{lang}.vtt") - all_formats = all_subtitles.get(lang) - subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] - subtitle.update( - {"lang": lang, "source": "auto", "media_url": media_url} - ) - relevant_subtitles.append(subtitle) - break - - return relevant_subtitles + return subtitle def _normalize_lang(self): """normalize country specific language keys""" - all_subtitles = self.youtube_meta.get("subtitles") + all_subtitles = self.video.youtube_meta.get("subtitles") + if not all_subtitles: + return False + all_keys = list(all_subtitles.keys()) for key in all_keys: lang = key.split("-")[0] old = all_subtitles.pop(key) + if lang == "live_chat": + continue all_subtitles[lang] = old return all_subtitles - def get_user_subtitles(self): + def get_user_subtitles(self, lang): """get subtitles uploaded from channel owner""" - print(f"{self.youtube_id}: get user uploaded subtitles") + print(f"{self.video.youtube_id}-{lang}: get user uploaded subtitles") all_subtitles = self._normalize_lang() if not all_subtitles: return False - relevant_subtitles = [] + video_media_url = self.video.json_data["media_url"] + media_url = video_media_url.replace(".mp4", f"-{lang}.vtt") + all_formats = all_subtitles.get(lang) + if not all_formats: + # no user subtitles found + return False - for lang in self.languages: - media_url = self.media_url.replace(".mp4", f"-{lang}.vtt") - all_formats = all_subtitles.get(lang) - subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] - subtitle.update( - {"lang": lang, "source": "user", "media_url": media_url} - ) - relevant_subtitles.append(subtitle) - break + subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] + subtitle.update( + {"lang": lang, "source": "user", "media_url": media_url} + ) - return relevant_subtitles + return subtitle def download_subtitles(self, relevant_subtitles): """download subtitle files to archive""" + videos_base = self.video.config["application"]["videos"] for subtitle in relevant_subtitles: - dest_path = os.path.join( - self.config["application"]["videos"], subtitle["media_url"] - ) + dest_path = os.path.join(videos_base, subtitle["media_url"]) + source = subtitle["source"] response = requests.get(subtitle["url"]) - if response.ok: - with open(dest_path, "w", encoding="utf-8") as subfile: - subfile.write(response.text) + if not response.ok: + print(f"{self.video.youtube_id}: failed to download subtitle") + continue + + parser = SubtitleParser(response.text, subtitle.get("lang")) + parser.process() + subtitle_str = parser.get_subtitle_str() + self._write_subtitle_file(dest_path, subtitle_str) + query_str = parser.create_bulk_import(self.video, source) + self._index_subtitle(query_str) + + @staticmethod + def _write_subtitle_file(dest_path, subtitle_str): + """write subtitle file to disk""" + # create folder here for first video of channel + os.makedirs(os.path.split(dest_path)[0], exist_ok=True) + with open(dest_path, "w", encoding="utf-8") as subfile: + subfile.write(subtitle_str) + + @staticmethod + def _index_subtitle(query_str): + """send subtitle to es for indexing""" + _, _ = ElasticWrap("_bulk").post(data=query_str, ndjson=True) + + +class SubtitleParser: + """parse subtitle str from youtube""" + + time_reg = r"^([0-9]{2}:?){3}\.[0-9]{3} --> ([0-9]{2}:?){3}\.[0-9]{3}" + stamp_reg = r"<([0-9]{2}:?){3}\.[0-9]{3}>" + tag_reg = r"" + + def __init__(self, subtitle_str, lang): + self.subtitle_str = subtitle_str + self.lang = lang + self.header = False + self.parsed_cue_list = False + self.all_text_lines = False + self.matched = False + + def process(self): + """collection to process subtitle string""" + self._parse_cues() + self._match_text_lines() + self._add_id() + + def _parse_cues(self): + """split into cues""" + all_cues = self.subtitle_str.replace("\n \n", "\n").split("\n\n") + self.header = all_cues[0] + self.all_text_lines = [] + self.parsed_cue_list = [self._cue_cleaner(i) for i in all_cues[1:]] + + def _cue_cleaner(self, cue): + """parse single cue""" + all_lines = cue.split("\n") + cue_dict = {"lines": []} + + for line in all_lines: + if re.match(self.time_reg, line): + clean = re.search(self.time_reg, line).group() + start, end = clean.split(" --> ") + cue_dict.update({"start": start, "end": end}) else: - print(f"{self.youtube_id}: failed to download subtitle") + clean = re.sub(self.stamp_reg, "", line) + clean = re.sub(self.tag_reg, "", clean) + cue_dict["lines"].append(clean) + if clean and clean not in self.all_text_lines: + self.all_text_lines.append(clean) + + return cue_dict + + def _match_text_lines(self): + """match unique text lines with timestamps""" + + self.matched = [] + + while self.all_text_lines: + check = self.all_text_lines[0] + matches = [i for i in self.parsed_cue_list if check in i["lines"]] + new_cue = matches[-1] + new_cue["start"] = matches[0]["start"] + + for line in new_cue["lines"]: + try: + self.all_text_lines.remove(line) + except ValueError: + print("failed to process:") + print(line) + + self.matched.append(new_cue) + + def _add_id(self): + """add id to matched cues""" + for idx, _ in enumerate(self.matched): + self.matched[idx]["id"] = idx + 1 + + def get_subtitle_str(self): + """stitch cues and return processed new string""" + new_subtitle_str = self.header + "\n\n" + + for cue in self.matched: + timestamp = f"{cue.get('start')} --> {cue.get('end')}" + lines = "\n".join(cue.get("lines")) + cue_text = f"{cue.get('id')}\n{timestamp}\n{lines}\n\n" + new_subtitle_str = new_subtitle_str + cue_text + + return new_subtitle_str + + def create_bulk_import(self, video, source): + """process matched for es import""" + bulk_list = [] + channel = video.json_data.get("channel") + + document = { + "youtube_id": video.youtube_id, + "title": video.json_data.get("title"), + "subtitle_channel": channel.get("channel_name"), + "subtitle_channel_id": channel.get("channel_id"), + "subtitle_last_refresh": int(datetime.now().strftime("%s")), + "subtitle_lang": self.lang, + "subtitle_source": source, + } + + for match in self.matched: + match_id = match.get("id") + document_id = f"{video.youtube_id}-{self.lang}-{match_id}" + action = {"index": {"_index": "ta_subtitle", "_id": document_id}} + document.update( + { + "subtitle_fragment_id": document_id, + "subtitle_start": match.get("start"), + "subtitle_end": match.get("end"), + "subtitle_index": match_id, + "subtitle_line": " ".join(match.get("lines")), + } + ) + bulk_list.append(json.dumps(action)) + bulk_list.append(json.dumps(document)) + + bulk_list.append("\n") + query_str = "\n".join(bulk_list) + + return query_str class YoutubeVideo(YouTubeItem, YoutubeSubtitle): @@ -204,10 +348,17 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): try: # when indexing from download task vid_path = self.build_dl_cache_path() - except FileNotFoundError: - # when reindexing - base = self.app_conf["videos"] - vid_path = os.path.join(base, self.json_data["media_url"]) + except FileNotFoundError as err: + # when reindexing needs to handle title rename + channel = os.path.split(self.json_data["media_url"])[0] + channel_dir = os.path.join(self.app_conf["videos"], channel) + all_files = os.listdir(channel_dir) + for file in all_files: + if self.youtube_id in file: + vid_path = os.path.join(channel_dir, file) + break + else: + raise FileNotFoundError("could not find video file") from err duration_handler = DurationConverter() duration = duration_handler.get_sec(vid_path) @@ -242,11 +393,18 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): """delete video file, meta data""" self.get_from_es() video_base = self.app_conf["videos"] - media_url = self.json_data["media_url"] - print(f"{self.youtube_id}: delete {media_url} from file system") - to_delete = os.path.join(video_base, media_url) - os.remove(to_delete) + to_del = [self.json_data.get("media_url")] + + all_subtitles = self.json_data.get("subtitles") + if all_subtitles: + to_del = to_del + [i.get("media_url") for i in all_subtitles] + + for media_url in to_del: + file_path = os.path.join(video_base, media_url) + os.remove(file_path) + self.del_in_es() + self._delete_subtitles() def _get_ryd_stats(self): """get optional stats from returnyoutubedislikeapi.com""" @@ -270,17 +428,17 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): def _check_subtitles(self): """optionally add subtitles""" - handler = YoutubeSubtitle( - self.config, - self.youtube_meta, - media_url=self.json_data["media_url"], - youtube_id=self.youtube_id, - ) + handler = YoutubeSubtitle(self) subtitles = handler.get_subtitles() if subtitles: self.json_data["subtitles"] = subtitles handler.download_subtitles(relevant_subtitles=subtitles) + def _delete_subtitles(self): + """delete indexed subtitles""" + data = {"query": {"term": {"youtube_id": {"value": self.youtube_id}}}} + _, _ = ElasticWrap("ta_subtitle/_delete_by_query").post(data=data) + def index_new_video(youtube_id): """combined classes to create new video in index""" diff --git a/tubearchivist/home/src/ta/helper.py b/tubearchivist/home/src/ta/helper.py index 4788636..d577dcd 100644 --- a/tubearchivist/home/src/ta/helper.py +++ b/tubearchivist/home/src/ta/helper.py @@ -169,7 +169,11 @@ class DurationConverter: capture_output=True, check=True, ) - duration_sec = int(float(duration.stdout.decode().strip())) + duration_raw = duration.stdout.decode().strip() + if duration_raw == "N/A": + return 0 + + duration_sec = int(float(duration_raw)) return duration_sec @staticmethod diff --git a/tubearchivist/home/templates/home/video.html b/tubearchivist/home/templates/home/video.html index 1b2e03d..bec823f 100644 --- a/tubearchivist/home/templates/home/video.html +++ b/tubearchivist/home/templates/home/video.html @@ -3,10 +3,14 @@ {% load static %} {% load humanize %}
-
@@ -57,10 +61,10 @@
-

Views: {{ video.stats.view_count|intcomma }}

+

views: {{ video.stats.view_count|intcomma }}

{% if video.stats.dislike_count %} -

thumbs-down: {{ video.stats.dislike_count|intcomma }}

+

thumbs-down: {{ video.stats.dislike_count|intcomma }}

{% endif %} {% if video.stats.average_rating %}

Rating: diff --git a/tubearchivist/requirements.txt b/tubearchivist/requirements.txt index 815d2c0..fc21392 100644 --- a/tubearchivist/requirements.txt +++ b/tubearchivist/requirements.txt @@ -4,9 +4,9 @@ Django==4.0.2 django-cors-headers==3.11.0 djangorestframework==3.13.1 Pillow==9.0.1 -redis==4.1.2 +redis==4.1.3 requests==2.27.1 ryd-client==0.0.3 uWSGI==2.0.20 -whitenoise==5.3.0 +whitenoise==6.0.0 yt_dlp==2022.2.4