From 91452b511476c82055961643bdda9325e0089426 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 5 Feb 2022 18:35:02 +0700 Subject: [PATCH 01/21] remove redundant video player api endpoint --- tubearchivist/api/README.md | 4 ---- tubearchivist/api/urls.py | 6 ------ tubearchivist/api/views.py | 32 -------------------------------- 3 files changed, 42 deletions(-) diff --git a/tubearchivist/api/README.md b/tubearchivist/api/README.md index 0e74a11..73dd4fc 100644 --- a/tubearchivist/api/README.md +++ b/tubearchivist/api/README.md @@ -23,10 +23,6 @@ response = requests.get(url, headers=headers) ## Video Item View /api/video/\/ -## Video Player View -returns all relevant information to create video player -/api/video/\/player - ## Channel List View /api/channel/ diff --git a/tubearchivist/api/urls.py b/tubearchivist/api/urls.py index a6c6801..d39dc30 100644 --- a/tubearchivist/api/urls.py +++ b/tubearchivist/api/urls.py @@ -6,7 +6,6 @@ from api.views import ( DownloadApiListView, DownloadApiView, PlaylistApiView, - VideoApiPlayerView, VideoApiView, ) from django.urls import path @@ -17,11 +16,6 @@ urlpatterns = [ VideoApiView.as_view(), name="api-video", ), - path( - "video//player/", - VideoApiPlayerView.as_view(), - name="api-video-player", - ), path( "channel/", ChannelApiListView.as_view(), diff --git a/tubearchivist/api/views.py b/tubearchivist/api/views.py index f0923aa..ec75370 100644 --- a/tubearchivist/api/views.py +++ b/tubearchivist/api/views.py @@ -92,38 +92,6 @@ class VideoApiView(ApiBaseView): return Response(self.response, status=self.status_code) -class VideoApiPlayerView(ApiBaseView): - """resolves to /api/video//player - GET: returns dict of video to build player - """ - - search_base = "/ta_video/_doc/" - - def get(self, request, video_id): - # pylint: disable=unused-argument - """get request""" - self.config_builder() - self.get_document(video_id) - player = self.process_response() - return Response(player, status=self.status_code) - - def process_response(self): - """build all needed vars for player""" - vid_data = self.response["data"] - youtube_id = vid_data["youtube_id"] - vid_thumb_url = ThumbManager().vid_thumb_path(youtube_id) - player = { - "youtube_id": youtube_id, - "media_url": "/media/" + vid_data["media_url"], - "vid_thumb_url": "/cache/" + vid_thumb_url, - "title": vid_data["title"], - "channel_name": vid_data["channel"]["channel_name"], - "channel_id": vid_data["channel"]["channel_id"], - "is_watched": vid_data["player"]["watched"], - } - return player - - class ChannelApiView(ApiBaseView): """resolves to /api/channel// GET: returns metadata dict of channel From 851fbae90045ac35c9ec8760278ec74fe0233d9d Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 5 Feb 2022 18:42:09 +0700 Subject: [PATCH 02/21] fix video template dislike icon and add watched icon --- tubearchivist/home/templates/home/video.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tubearchivist/home/templates/home/video.html b/tubearchivist/home/templates/home/video.html index 1b2e03d..e44aa18 100644 --- a/tubearchivist/home/templates/home/video.html +++ b/tubearchivist/home/templates/home/video.html @@ -57,10 +57,10 @@
-

Views: {{ video.stats.view_count|intcomma }}

+

views: {{ video.stats.view_count|intcomma }}

{% if video.stats.dislike_count %} -

thumbs-down: {{ video.stats.dislike_count|intcomma }}

+

thumbs-down: {{ video.stats.dislike_count|intcomma }}

{% endif %} {% if video.stats.average_rating %}

Rating: From f5f46349b2404dc629c8e231473e33587c327486 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 5 Feb 2022 22:38:59 +0700 Subject: [PATCH 03/21] handle rescan name change --- tubearchivist/home/src/index/video.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index 5b92198..49d0b6b 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -205,9 +205,16 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): # when indexing from download task vid_path = self.build_dl_cache_path() except FileNotFoundError: - # when reindexing - base = self.app_conf["videos"] - vid_path = os.path.join(base, self.json_data["media_url"]) + # when reindexing needs to handle title rename + channel = os.path.split(self.json_data["media_url"])[0] + channel_dir = os.path.join(self.app_conf["videos"], channel) + all_files = os.listdir(channel_dir) + for file in all_files: + if self.youtube_id in file: + vid_path = os.path.join(channel_dir, file) + break + else: + raise FileNotFoundError duration_handler = DurationConverter() duration = duration_handler.get_sec(vid_path) From 2bf9e9683b13dc49558df0302e7b8fbc85f72817 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 5 Feb 2022 22:51:38 +0700 Subject: [PATCH 04/21] error handeling in _normalize_lang to skip livechat and ignore missing --- tubearchivist/home/src/index/video.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index 49d0b6b..897371b 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -71,10 +71,15 @@ class YoutubeSubtitle: def _normalize_lang(self): """normalize country specific language keys""" all_subtitles = self.youtube_meta.get("subtitles") + if not all_subtitles: + return False + all_keys = list(all_subtitles.keys()) for key in all_keys: lang = key.split("-")[0] old = all_subtitles.pop(key) + if lang == "live_chat": + continue all_subtitles[lang] = old return all_subtitles From 44af78b7e30d623f486b007cbcf4b88140130d3d Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 5 Feb 2022 23:09:05 +0700 Subject: [PATCH 05/21] handle NA in ffprobe duration extractor --- tubearchivist/home/src/ta/helper.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tubearchivist/home/src/ta/helper.py b/tubearchivist/home/src/ta/helper.py index 4788636..d577dcd 100644 --- a/tubearchivist/home/src/ta/helper.py +++ b/tubearchivist/home/src/ta/helper.py @@ -169,7 +169,11 @@ class DurationConverter: capture_output=True, check=True, ) - duration_sec = int(float(duration.stdout.decode().strip())) + duration_raw = duration.stdout.decode().strip() + if duration_raw == "N/A": + return 0 + + duration_sec = int(float(duration_raw)) return duration_sec @staticmethod From 52013aff3f06382816e90cd19a1c34e4401b014e Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 5 Feb 2022 23:42:42 +0700 Subject: [PATCH 06/21] fix subtitle download of first video of channel without folder --- tubearchivist/home/src/index/video.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index 897371b..e22e52d 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -113,6 +113,8 @@ class YoutubeSubtitle: ) response = requests.get(subtitle["url"]) if response.ok: + # create folder here for first video of channel + os.makedirs(os.path.split(dest_path)[0], exist_ok=True) with open(dest_path, "w", encoding="utf-8") as subfile: subfile.write(response.text) else: From e98ffc00502226c2223339d6e5cbbaf9827bbf53 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 5 Feb 2022 23:50:47 +0700 Subject: [PATCH 07/21] add subtitles mapping to video index --- tubearchivist/home/src/es/index_mapping.json | 26 ++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tubearchivist/home/src/es/index_mapping.json b/tubearchivist/home/src/es/index_mapping.json index 3272ed3..0ad6494 100644 --- a/tubearchivist/home/src/es/index_mapping.json +++ b/tubearchivist/home/src/es/index_mapping.json @@ -156,6 +156,32 @@ "normalizer": "to_lower" } } + }, + "subtitles": { + "properties": { + "ext": { + "type": "keyword", + "index": false + }, + "lang": { + "type": "keyword", + "index": false + }, + "media_url": { + "type": "keyword", + "index": false + }, + "name": { + "type": "keyword" + }, + "source": { + "type": "keyword" + }, + "url": { + "type": "keyword", + "index": false + } + } } }, "expected_set": { From 1664b0d4fc33243ca92bf6a2787b5bbe1460a1f5 Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 6 Feb 2022 00:08:24 +0700 Subject: [PATCH 08/21] restructure video tag to add subtitle tracks --- tubearchivist/home/templates/home/video.html | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tubearchivist/home/templates/home/video.html b/tubearchivist/home/templates/home/video.html index e44aa18..bec823f 100644 --- a/tubearchivist/home/templates/home/video.html +++ b/tubearchivist/home/templates/home/video.html @@ -3,10 +3,14 @@ {% load static %} {% load humanize %}

-
From 5f6158243e018edf5da0c4edc4343609b3d9ee75 Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 7 Feb 2022 21:18:52 +0700 Subject: [PATCH 09/21] auto generated subtitle parser and cleaner --- tubearchivist/home/src/index/video.py | 85 +++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index e22e52d..ab5c22b 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -5,6 +5,7 @@ functionality: """ import os +import re from datetime import datetime import requests @@ -121,6 +122,90 @@ class YoutubeSubtitle: print(f"{self.youtube_id}: failed to download subtitle") +class SubtitleParser: + """parse subtitle str from youtube""" + + time_reg = r"^([0-9]{2}:?){3}\.[0-9]{3} --> ([0-9]{2}:?){3}\.[0-9]{3}" + stamp_reg = r"<([0-9]{2}:?){3}\.[0-9]{3}>" + tag_reg = r"" + + def __init__(self, subtitle_str): + self.subtitle_str = subtitle_str + self.header = False + self.parsed_cue_list = False + self.all_text_lines = False + self.matched = False + + def process(self): + """collection to process subtitle string""" + self._parse_cues() + self._match_text_lines() + self._add_id() + + def _parse_cues(self): + """split into cues""" + all_cues = self.subtitle_str.replace("\n \n", "\n").split("\n\n") + self.header = all_cues[0] + self.all_text_lines = [] + self.parsed_cue_list = [self._cue_cleaner(i) for i in all_cues[1:]] + + def _cue_cleaner(self, cue): + """parse single cue""" + all_lines = cue.split("\n") + cue_dict = {"lines": []} + + for line in all_lines: + if re.match(r"^([0-9]{2}:?){3}", line): + clean = re.search(self.time_reg, line).group() + start, end = clean.split(" --> ") + cue_dict.update({"start": start, "end": end}) + else: + clean = re.sub(self.stamp_reg, "", line) + clean = re.sub(self.tag_reg, "", clean) + cue_dict["lines"].append(clean) + if clean and clean not in self.all_text_lines: + self.all_text_lines.append(clean) + + return cue_dict + + def _match_text_lines(self): + """match unique text lines with timestamps""" + + self.matched = [] + + while self.all_text_lines: + check = self.all_text_lines[0] + matches = [i for i in self.parsed_cue_list if check in i["lines"]] + new_cue = matches[-1] + new_cue["start"] = matches[0]["start"] + + for line in new_cue["lines"]: + try: + self.all_text_lines.remove(line) + except ValueError: + print("failed to process:") + print(line) + + self.matched.append(new_cue) + + def _add_id(self): + """add id to matched cues""" + for idx, _ in enumerate(self.matched): + self.matched[idx]["id"] = idx + 1 + + def get_subtitle_str(self): + """stitch cues and return processed new string""" + new_subtitle_str = self.header + "\n\n" + + for cue in self.matched: + timestamp = f"{cue.get('start')} --> {cue.get('end')}" + lines = "\n".join(cue.get("lines")) + cue_text = f"{cue.get('id')}\n{timestamp}\n{lines}\n\n" + new_subtitle_str = new_subtitle_str + cue_text + + return new_subtitle_str + + class YoutubeVideo(YouTubeItem, YoutubeSubtitle): """represents a single youtube video""" From 6cb892a811fabea5bec08fb231759aa9c2ddac11 Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 9 Feb 2022 21:33:41 +0700 Subject: [PATCH 10/21] integrate auto generated subtitle cleaner --- tubearchivist/home/src/index/video.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index ab5c22b..4d0bfcb 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -113,11 +113,17 @@ class YoutubeSubtitle: self.config["application"]["videos"], subtitle["media_url"] ) response = requests.get(subtitle["url"]) + if subtitle["source"] == "auto": + parser = SubtitleParser(response.text) + parser.process() + subtitle_str_clean = parser.get_subtitle_str() + else: + subtitle_str_clean = response.text if response.ok: # create folder here for first video of channel os.makedirs(os.path.split(dest_path)[0], exist_ok=True) with open(dest_path, "w", encoding="utf-8") as subfile: - subfile.write(response.text) + subfile.write(subtitle_str_clean) else: print(f"{self.youtube_id}: failed to download subtitle") From 4e2d0fa46407bc7bf9230d7cb6317e894c62b999 Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 9 Feb 2022 23:38:18 +0700 Subject: [PATCH 11/21] bump es version --- docker-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index ce79327..b300949 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -33,7 +33,7 @@ services: depends_on: - archivist-es archivist-es: - image: docker.elastic.co/elasticsearch/elasticsearch:7.16.2 + image: docker.elastic.co/elasticsearch/elasticsearch:7.17.0 container_name: archivist-es restart: always environment: @@ -54,4 +54,4 @@ volumes: media: cache: redis: - es: \ No newline at end of file + es: From 4e4cfe333450c4c9f937508c49464e3d2cb7df33 Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 9 Feb 2022 23:40:15 +0700 Subject: [PATCH 12/21] pass whole video object into YoutubeSubtitle class --- tubearchivist/home/src/index/video.py | 39 +++++++++++---------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index 4d0bfcb..3f39674 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -18,16 +18,13 @@ from ryd_client import ryd_client class YoutubeSubtitle: """handle video subtitle functionality""" - def __init__(self, config, youtube_meta, media_url, youtube_id): - self.config = config - self.youtube_meta = youtube_meta - self.media_url = media_url - self.youtube_id = youtube_id + def __init__(self, video): + self.video = video self.languages = False def sub_conf_parse(self): """add additional conf values to self""" - languages_raw = self.config["downloads"]["subtitle"] + languages_raw = self.video.config["downloads"]["subtitle"] self.languages = [i.strip() for i in languages_raw.split(",")] def get_subtitles(self): @@ -41,7 +38,7 @@ class YoutubeSubtitle: if relevant_subtitles: return relevant_subtitles - if self.config["downloads"]["subtitle_source"] == "auto": + if self.video.config["downloads"]["subtitle_source"] == "auto": relevant_auto = self.get_auto_caption() return relevant_auto @@ -49,8 +46,8 @@ class YoutubeSubtitle: def get_auto_caption(self): """get auto_caption subtitles""" - print(f"{self.youtube_id}: get auto generated subtitles") - all_subtitles = self.youtube_meta.get("automatic_captions") + print(f"{self.video.youtube_id}: get auto generated subtitles") + all_subtitles = self.video.youtube_meta.get("automatic_captions") if not all_subtitles: return False @@ -58,7 +55,8 @@ class YoutubeSubtitle: relevant_subtitles = [] for lang in self.languages: - media_url = self.media_url.replace(".mp4", f"-{lang}.vtt") + video_media_url = self.video.json_data["media_url"] + media_url = video_media_url.replace(".mp4", f"-{lang}.vtt") all_formats = all_subtitles.get(lang) subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] subtitle.update( @@ -71,7 +69,7 @@ class YoutubeSubtitle: def _normalize_lang(self): """normalize country specific language keys""" - all_subtitles = self.youtube_meta.get("subtitles") + all_subtitles = self.video.youtube_meta.get("subtitles") if not all_subtitles: return False @@ -87,7 +85,7 @@ class YoutubeSubtitle: def get_user_subtitles(self): """get subtitles uploaded from channel owner""" - print(f"{self.youtube_id}: get user uploaded subtitles") + print(f"{self.video.youtube_id}: get user uploaded subtitles") all_subtitles = self._normalize_lang() if not all_subtitles: return False @@ -95,7 +93,8 @@ class YoutubeSubtitle: relevant_subtitles = [] for lang in self.languages: - media_url = self.media_url.replace(".mp4", f"-{lang}.vtt") + video_media_url = self.video.json_data["media_url"] + media_url = video_media_url.replace(".mp4", f"-{lang}.vtt") all_formats = all_subtitles.get(lang) subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] subtitle.update( @@ -108,10 +107,9 @@ class YoutubeSubtitle: def download_subtitles(self, relevant_subtitles): """download subtitle files to archive""" + videos_base = self.video.config["application"]["videos"] for subtitle in relevant_subtitles: - dest_path = os.path.join( - self.config["application"]["videos"], subtitle["media_url"] - ) + dest_path = os.path.join(videos_base, subtitle["media_url"]) response = requests.get(subtitle["url"]) if subtitle["source"] == "auto": parser = SubtitleParser(response.text) @@ -125,7 +123,7 @@ class YoutubeSubtitle: with open(dest_path, "w", encoding="utf-8") as subfile: subfile.write(subtitle_str_clean) else: - print(f"{self.youtube_id}: failed to download subtitle") + print(f"{self.video.youtube_id}: failed to download subtitle") class SubtitleParser: @@ -375,12 +373,7 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): def _check_subtitles(self): """optionally add subtitles""" - handler = YoutubeSubtitle( - self.config, - self.youtube_meta, - media_url=self.json_data["media_url"], - youtube_id=self.youtube_id, - ) + handler = YoutubeSubtitle(self) subtitles = handler.get_subtitles() if subtitles: self.json_data["subtitles"] = subtitles From 9f652802ae22bba180407914f19a83374eda92d5 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 10 Feb 2022 11:47:14 +0700 Subject: [PATCH 13/21] add new mapping for subtitle index --- tubearchivist/home/src/es/index_mapping.json | 67 ++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/tubearchivist/home/src/es/index_mapping.json b/tubearchivist/home/src/es/index_mapping.json index 0ad6494..f30a82d 100644 --- a/tubearchivist/home/src/es/index_mapping.json +++ b/tubearchivist/home/src/es/index_mapping.json @@ -303,6 +303,73 @@ }, "number_of_replicas": "0" } + }, + { + "index_name": "subtitle", + "expected_map": { + "youtube_id": { + "type": "keyword" + }, + "title": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256, + "normalizer": "to_lower" + } + } + }, + "subtitle_fragment_id": { + "type": "keyword" + }, + "subtitle_channel": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256, + "normalizer": "to_lower" + } + } + }, + "subtitle_channel_id": { + "type": "keyword" + }, + "subtitle_start": { + "type": "text" + }, + "subtitle_end": { + "type": "text" + }, + "subtitle_last_refresh": { + "type": "date" + }, + "subtitle_index": { + "type" : "long" + }, + "subtitle_lang": { + "type": "keyword" + }, + "subtitle_source": { + "type": "keyword" + }, + "subtitle_line": { + "type" : "text", + "analyzer": "english" + } + }, + "expected_set": { + "analysis": { + "normalizer": { + "to_lower": { + "type": "custom", + "filter": ["lowercase"] + } + } + }, + "number_of_replicas": "0" + } } ] } \ No newline at end of file From a2cae51f48a3ca962e7a3ff6c778d229618ad266 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 10 Feb 2022 17:02:19 +0700 Subject: [PATCH 14/21] bulk import subtitle lines into es --- tubearchivist/home/src/index/video.py | 76 ++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 13 deletions(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index 3f39674..1385e16 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -4,11 +4,13 @@ functionality: - index and update in es """ +import json import os import re from datetime import datetime import requests +from home.src.es.connect import ElasticWrap from home.src.index import channel as ta_channel from home.src.index.generic import YouTubeItem from home.src.ta.helper import DurationConverter, clean_string @@ -110,20 +112,31 @@ class YoutubeSubtitle: videos_base = self.video.config["application"]["videos"] for subtitle in relevant_subtitles: dest_path = os.path.join(videos_base, subtitle["media_url"]) + source = subtitle["media_url"] response = requests.get(subtitle["url"]) - if subtitle["source"] == "auto": - parser = SubtitleParser(response.text) - parser.process() - subtitle_str_clean = parser.get_subtitle_str() - else: - subtitle_str_clean = response.text - if response.ok: - # create folder here for first video of channel - os.makedirs(os.path.split(dest_path)[0], exist_ok=True) - with open(dest_path, "w", encoding="utf-8") as subfile: - subfile.write(subtitle_str_clean) - else: + if not response.ok: print(f"{self.video.youtube_id}: failed to download subtitle") + continue + + parser = SubtitleParser(response.text, subtitle.get("lang")) + parser.process() + subtitle_str = parser.get_subtitle_str() + self._write_subtitle_file(dest_path, subtitle_str) + query_str = parser.create_bulk_import(self.video, source) + self._index_subtitle(query_str) + + @staticmethod + def _write_subtitle_file(dest_path, subtitle_str): + """write subtitle file to disk""" + # create folder here for first video of channel + os.makedirs(os.path.split(dest_path)[0], exist_ok=True) + with open(dest_path, "w", encoding="utf-8") as subfile: + subfile.write(subtitle_str) + + @staticmethod + def _index_subtitle(query_str): + """send subtitle to es for indexing""" + _, _ = ElasticWrap("_bulk").post(data=query_str, ndjson=True) class SubtitleParser: @@ -133,8 +146,9 @@ class SubtitleParser: stamp_reg = r"<([0-9]{2}:?){3}\.[0-9]{3}>" tag_reg = r"" - def __init__(self, subtitle_str): + def __init__(self, subtitle_str, lang): self.subtitle_str = subtitle_str + self.lang = lang self.header = False self.parsed_cue_list = False self.all_text_lines = False @@ -209,6 +223,42 @@ class SubtitleParser: return new_subtitle_str + def create_bulk_import(self, video, source): + """process matched for es import""" + bulk_list = [] + channel = video.json_data.get("channel") + + document = { + "youtube_id": video.youtube_id, + "title": video.json_data.get("title"), + "subtitle_channel": channel.get("channel_name"), + "subtitle_channel_id": channel.get("channel_id"), + "subtitle_last_refresh": int(datetime.now().strftime("%s")), + "subtitle_lang": self.lang, + "subtitle_source": source, + } + + for match in self.matched: + match_id = match.get("id") + document_id = f"{video.youtube_id}-{self.lang}-{match_id}" + action = {"index": {"_index": "ta_subtitle", "_id": document_id}} + document.update( + { + "subtitle_fragment_id": document_id, + "subtitle_start": match.get("start"), + "subtitle_end": match.get("end"), + "subtitle_index": match_id, + "subtitle_line": " ".join(match.get("lines")), + } + ) + bulk_list.append(json.dumps(action)) + bulk_list.append(json.dumps(document)) + + bulk_list.append("\n") + query_str = "\n".join(bulk_list) + + return query_str + class YoutubeVideo(YouTubeItem, YoutubeSubtitle): """represents a single youtube video""" From 0414df0de087d027baa72c354ad53412e49fef45 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 10 Feb 2022 17:10:30 +0700 Subject: [PATCH 15/21] fix key error for subtitle source --- tubearchivist/home/src/index/video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index 1385e16..4ac4461 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -112,7 +112,7 @@ class YoutubeSubtitle: videos_base = self.video.config["application"]["videos"] for subtitle in relevant_subtitles: dest_path = os.path.join(videos_base, subtitle["media_url"]) - source = subtitle["media_url"] + source = subtitle["source"] response = requests.get(subtitle["url"]) if not response.ok: print(f"{self.video.youtube_id}: failed to download subtitle") From b071612038df25cf963d3c4e542e6e578cb18d61 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 10 Feb 2022 17:34:21 +0700 Subject: [PATCH 16/21] better error raising for add player info --- tubearchivist/home/src/index/video.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index 4ac4461..c9c57ce 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -350,7 +350,7 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): try: # when indexing from download task vid_path = self.build_dl_cache_path() - except FileNotFoundError: + except FileNotFoundError as err: # when reindexing needs to handle title rename channel = os.path.split(self.json_data["media_url"])[0] channel_dir = os.path.join(self.app_conf["videos"], channel) @@ -360,7 +360,7 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): vid_path = os.path.join(channel_dir, file) break else: - raise FileNotFoundError + raise FileNotFoundError("could not find video file") from err duration_handler = DurationConverter() duration = duration_handler.get_sec(vid_path) From 077692987bdbcd48bc71b6cb0d586c8de174becb Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 10 Feb 2022 18:32:23 +0700 Subject: [PATCH 17/21] fix multi language subtitle extractor, and better regex for timestamp matching --- tubearchivist/home/src/index/video.py | 72 +++++++++++++-------------- 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index c9c57ce..ca2c2c7 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -36,38 +36,37 @@ class YoutubeSubtitle: # no subtitles return False - relevant_subtitles = self.get_user_subtitles() - if relevant_subtitles: - return relevant_subtitles + relevant_subtitles = [] + for lang in self.languages: + user_sub = self.get_user_subtitles(lang) + if user_sub: + relevant_subtitles.append(user_sub) + continue - if self.video.config["downloads"]["subtitle_source"] == "auto": - relevant_auto = self.get_auto_caption() - return relevant_auto + if self.video.config["downloads"]["subtitle_source"] == "auto": + auto_cap = self.get_auto_caption(lang) + if auto_cap: + relevant_subtitles.append(auto_cap) - return False + return relevant_subtitles - def get_auto_caption(self): + def get_auto_caption(self, lang): """get auto_caption subtitles""" - print(f"{self.video.youtube_id}: get auto generated subtitles") + print(f"{self.video.youtube_id}-{lang}: get auto generated subtitles") all_subtitles = self.video.youtube_meta.get("automatic_captions") if not all_subtitles: return False - relevant_subtitles = [] + video_media_url = self.video.json_data["media_url"] + media_url = video_media_url.replace(".mp4", f"-{lang}.vtt") + all_formats = all_subtitles.get(lang) + subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] + subtitle.update( + {"lang": lang, "source": "auto", "media_url": media_url} + ) - for lang in self.languages: - video_media_url = self.video.json_data["media_url"] - media_url = video_media_url.replace(".mp4", f"-{lang}.vtt") - all_formats = all_subtitles.get(lang) - subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] - subtitle.update( - {"lang": lang, "source": "auto", "media_url": media_url} - ) - relevant_subtitles.append(subtitle) - break - - return relevant_subtitles + return subtitle def _normalize_lang(self): """normalize country specific language keys""" @@ -85,27 +84,26 @@ class YoutubeSubtitle: return all_subtitles - def get_user_subtitles(self): + def get_user_subtitles(self, lang): """get subtitles uploaded from channel owner""" - print(f"{self.video.youtube_id}: get user uploaded subtitles") + print(f"{self.video.youtube_id}-{lang}: get user uploaded subtitles") all_subtitles = self._normalize_lang() if not all_subtitles: return False - relevant_subtitles = [] + video_media_url = self.video.json_data["media_url"] + media_url = video_media_url.replace(".mp4", f"-{lang}.vtt") + all_formats = all_subtitles.get(lang) + if not all_formats: + # no user subtitles found + return False - for lang in self.languages: - video_media_url = self.video.json_data["media_url"] - media_url = video_media_url.replace(".mp4", f"-{lang}.vtt") - all_formats = all_subtitles.get(lang) - subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] - subtitle.update( - {"lang": lang, "source": "user", "media_url": media_url} - ) - relevant_subtitles.append(subtitle) - break + subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] + subtitle.update( + {"lang": lang, "source": "user", "media_url": media_url} + ) - return relevant_subtitles + return subtitle def download_subtitles(self, relevant_subtitles): """download subtitle files to archive""" @@ -173,7 +171,7 @@ class SubtitleParser: cue_dict = {"lines": []} for line in all_lines: - if re.match(r"^([0-9]{2}:?){3}", line): + if re.match(self.time_reg, line): clean = re.search(self.time_reg, line).group() start, end = clean.split(" --> ") cue_dict.update({"start": start, "end": end}) From 0e56efc428e3783433071e3b33217f89b09b6966 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 10 Feb 2022 18:48:35 +0700 Subject: [PATCH 18/21] limit filesystem scan to mp4 files only --- tubearchivist/home/src/index/filesystem.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tubearchivist/home/src/index/filesystem.py b/tubearchivist/home/src/index/filesystem.py index 5a33501..0354e83 100644 --- a/tubearchivist/home/src/index/filesystem.py +++ b/tubearchivist/home/src/index/filesystem.py @@ -46,8 +46,9 @@ class FilesystemScanner: all_downloaded = [] for channel_name in all_channels: channel_path = os.path.join(self.VIDEOS, channel_name) - videos = os.listdir(channel_path) - all_videos = ignore_filelist(videos) + channel_files = os.listdir(channel_path) + channel_files_clean = ignore_filelist(channel_files) + all_videos = [i for i in channel_files_clean if i.endswith(".mp4")] for video in all_videos: youtube_id = video[9:20] all_downloaded.append((channel_name, video, youtube_id)) From 4d30bed3ccc92a5dfc123480cacedead55ab7ad2 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 10 Feb 2022 19:09:07 +0700 Subject: [PATCH 19/21] extend delete video to also delete subtitles --- tubearchivist/home/src/index/video.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index ca2c2c7..e2695d4 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -393,11 +393,18 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): """delete video file, meta data""" self.get_from_es() video_base = self.app_conf["videos"] - media_url = self.json_data["media_url"] - print(f"{self.youtube_id}: delete {media_url} from file system") - to_delete = os.path.join(video_base, media_url) - os.remove(to_delete) + to_del = [self.json_data.get("media_url")] + + all_subtitles = self.json_data.get("subtitles") + if all_subtitles: + to_del = to_del + [i.get("media_url") for i in all_subtitles] + + for media_url in to_del: + file_path = os.path.join(video_base, media_url) + os.remove(file_path) + self.del_in_es() + self._delete_subtitles() def _get_ryd_stats(self): """get optional stats from returnyoutubedislikeapi.com""" @@ -427,6 +434,11 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): self.json_data["subtitles"] = subtitles handler.download_subtitles(relevant_subtitles=subtitles) + def _delete_subtitles(self): + """delete indexed subtitles""" + data = {"query": {"term": {"youtube_id": {"value": self.youtube_id}}}} + _, _ = ElasticWrap("ta_subtitle/_delete_by_query").post(data=data) + def index_new_video(youtube_id): """combined classes to create new video in index""" From 3ea5e9c53708b50aef48ff1008d55efea0aee5b0 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 10 Feb 2022 19:27:05 +0700 Subject: [PATCH 20/21] bump dependencies --- tubearchivist/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tubearchivist/requirements.txt b/tubearchivist/requirements.txt index 815d2c0..fc21392 100644 --- a/tubearchivist/requirements.txt +++ b/tubearchivist/requirements.txt @@ -4,9 +4,9 @@ Django==4.0.2 django-cors-headers==3.11.0 djangorestframework==3.13.1 Pillow==9.0.1 -redis==4.1.2 +redis==4.1.3 requests==2.27.1 ryd-client==0.0.3 uWSGI==2.0.20 -whitenoise==5.3.0 +whitenoise==6.0.0 yt_dlp==2022.2.4 From 16f33feda0dd1c65f73186d0fee2eb56795f0761 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 10 Feb 2022 19:45:22 +0700 Subject: [PATCH 21/21] process subtitle media url paths --- tubearchivist/api/views.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tubearchivist/api/views.py b/tubearchivist/api/views.py index ec75370..620c063 100644 --- a/tubearchivist/api/views.py +++ b/tubearchivist/api/views.py @@ -60,6 +60,12 @@ class ApiBaseView(APIView): cache_dir = self.default_conf["application"]["cache_dir"] new_thumb = f"{cache_dir}/{vid_thumb_url}" self.response["data"]["vid_thumb_url"] = new_thumb + if "subtitles" in all_keys: + all_subtitles = self.response["data"]["subtitles"] + for idx, _ in enumerate(all_subtitles): + url = self.response["data"]["subtitles"][idx]["media_url"] + new_url = f"/media/{url}" + self.response["data"]["subtitles"][idx]["media_url"] = new_url def get_paginate(self): """add pagination detail to response"""