From c186798e782a5c38e3714a6d358275ad1746fa08 Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 9 Mar 2022 00:25:44 +0700 Subject: [PATCH 1/8] rewrite SubtitleParser, #180 --- tubearchivist/home/src/index/video.py | 182 +++++++++++--------------- 1 file changed, 73 insertions(+), 109 deletions(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index bc6f272..461fb89 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -6,7 +6,6 @@ functionality: import json import os -import re from datetime import datetime import requests @@ -65,7 +64,7 @@ class YoutubeSubtitle: if not all_formats: return False - subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] + subtitle = [i for i in all_formats if i["ext"] == "json3"][0] subtitle.update( {"lang": lang, "source": "auto", "media_url": media_url} ) @@ -102,7 +101,7 @@ class YoutubeSubtitle: # no user subtitles found return False - subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] + subtitle = [i for i in all_formats if i["ext"] == "json3"][0] subtitle.update( {"lang": lang, "source": "user", "media_url": media_url} ) @@ -145,109 +144,65 @@ class YoutubeSubtitle: class SubtitleParser: """parse subtitle str from youtube""" - time_reg = r"^([0-9]{2}:?){3}\.[0-9]{3} --> ([0-9]{2}:?){3}\.[0-9]{3}" - stamp_reg = r"<([0-9]{2}:?){3}\.[0-9]{3}>" - tag_reg = r"" - def __init__(self, subtitle_str, lang): - self.subtitle_str = subtitle_str + self.subtitle_raw = json.loads(subtitle_str) self.lang = lang - self.header = False - self.parsed_cue_list = False - self.all_text_lines = False - self.matched = False + self.all_cues = False def process(self): - """collection to process subtitle string""" - self._parse_cues() - self._match_text_lines() - self._add_id() - self._timestamp_check() + """extract relevant que data""" + self.all_cues = [] + for idx, event in enumerate(self.subtitle_raw.get("events")): + cue = { + "start": self.ms_conv(event["tStartMs"]), + "end": self.ms_conv(event["tStartMs"] + event["dDurationMs"]), + "text": "".join([i.get("utf8") for i in event["segs"]]), + "idx": idx + 1, + } + self.all_cues.append(cue) - def _parse_cues(self): - """split into cues""" - all_cues = self.subtitle_str.replace("\n \n", "\n").split("\n\n") - self.header = all_cues[0] - self.all_text_lines = [] - self.parsed_cue_list = [self._cue_cleaner(i) for i in all_cues[1:]] + @staticmethod + def ms_conv(ms): + """convert ms to timestamp""" + hours = str((ms // (1000 * 60 * 60)) % 24).zfill(2) + minutes = str((ms // (1000 * 60)) % 60).zfill(2) + secs = str((ms // 1000) % 60).zfill(2) + millis = str(ms % 1000).zfill(3) - def _cue_cleaner(self, cue): - """parse single cue""" - all_lines = cue.split("\n") - cue_dict = {"lines": []} - - for line in all_lines: - if re.match(self.time_reg, line): - clean = re.search(self.time_reg, line).group() - start, end = clean.split(" --> ") - cue_dict.update({"start": start, "end": end}) - else: - clean = re.sub(self.stamp_reg, "", line) - clean = re.sub(self.tag_reg, "", clean) - cue_dict["lines"].append(clean) - if clean.strip() and clean not in self.all_text_lines[-4:]: - # remove immediate duplicates - self.all_text_lines.append(clean) - - return cue_dict - - def _match_text_lines(self): - """match unique text lines with timestamps""" - - self.matched = [] - - while self.all_text_lines: - check = self.all_text_lines[0] - matches = [i for i in self.parsed_cue_list if check in i["lines"]] - new_cue = matches[-1] - new_cue["start"] = matches[0]["start"] - - for line in new_cue["lines"]: - try: - self.all_text_lines.remove(line) - except ValueError: - continue - - self.matched.append(new_cue) - - def _timestamp_check(self): - """check if end timestamp is bigger than start timestamp""" - for idx, cue in enumerate(self.matched): - # this - end = int(re.sub("[^0-9]", "", cue.get("end"))) - # next - try: - next_cue = self.matched[idx + 1] - except IndexError: - continue - - start_next = int(re.sub("[^0-9]", "", next_cue.get("start"))) - if end > start_next: - self.matched[idx]["end"] = next_cue.get("start") - - def _add_id(self): - """add id to matched cues""" - for idx, _ in enumerate(self.matched): - self.matched[idx]["id"] = idx + 1 + return f"{hours}:{minutes}:{secs}.{millis}" def get_subtitle_str(self): - """stitch cues and return processed new string""" - new_subtitle_str = self.header + "\n\n" + """create vtt text str from cues""" + subtitle_str = f"WEBVTT\nKind: captions\nLanguage: {self.lang}" - for cue in self.matched: - timestamp = f"{cue.get('start')} --> {cue.get('end')}" - lines = "\n".join(cue.get("lines")) - cue_text = f"{cue.get('id')}\n{timestamp}\n{lines}\n\n" - new_subtitle_str = new_subtitle_str + cue_text + for cue in self.all_cues: + stamp = f"{cue.get('start')} --> {cue.get('end')}" + cue_text = f"\n\n{cue.get('idx')}\n{stamp}\n{cue.get('text')}" + subtitle_str = subtitle_str + cue_text - return new_subtitle_str + return subtitle_str def create_bulk_import(self, video, source): - """process matched for es import""" + """subtitle lines for es import""" + documents = self.create_documents(video, source) bulk_list = [] - channel = video.json_data.get("channel") - document = { + for document in documents: + document_id = document.get("subtitle_fragment_id") + action = {"index": {"_index": "ta_subtitle", "_id": document_id}} + bulk_list.append(json.dumps(action)) + bulk_list.append(json.dumps(document)) + + bulk_list.append("\n") + query_str = "\n".join(bulk_list) + + return query_str + + def create_documents(self, video, source): + """process documents""" + documents = self.chunk_list(video.youtube_id) + channel = video.json_data.get("channel") + meta_dict = { "youtube_id": video.youtube_id, "title": video.json_data.get("title"), "subtitle_channel": channel.get("channel_name"), @@ -257,26 +212,35 @@ class SubtitleParser: "subtitle_source": source, } - for match in self.matched: - match_id = match.get("id") - document_id = f"{video.youtube_id}-{self.lang}-{match_id}" - action = {"index": {"_index": "ta_subtitle", "_id": document_id}} - document.update( - { - "subtitle_fragment_id": document_id, - "subtitle_start": match.get("start"), - "subtitle_end": match.get("end"), - "subtitle_index": match_id, - "subtitle_line": " ".join(match.get("lines")), + _ = [i.update(meta_dict) for i in documents] + + return documents + + def chunk_list(self, youtube_id): + """join cues for bulk import""" + chunk_list = [] + + chunk = {} + for cue in self.all_cues: + if chunk: + text = f"{chunk.get('subtitle_line')} {cue.get('text')}\n" + chunk["subtitle_line"] = text + else: + idx = len(chunk_list) + 1 + chunk = { + "subtitle_index": idx, + "subtitle_line": cue.get("text"), + "subtitle_start": cue.get("start"), } - ) - bulk_list.append(json.dumps(action)) - bulk_list.append(json.dumps(document)) - bulk_list.append("\n") - query_str = "\n".join(bulk_list) + chunk["subtitle_fragment_id"] = f"{youtube_id}-{self.lang}-{idx}" - return query_str + if cue["idx"] % 5 == 0: + chunk["subtitle_end"] = cue.get("end") + chunk_list.append(chunk) + chunk = {} + + return chunk_list class YoutubeVideo(YouTubeItem, YoutubeSubtitle): From 6a6c8fa5d80b99ae4b10502ac53eec1da2b26466 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 10 Mar 2022 17:39:35 +0700 Subject: [PATCH 2/8] bump yt-dlp version --- tubearchivist/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tubearchivist/requirements.txt b/tubearchivist/requirements.txt index ecebf8c..3d10c3f 100644 --- a/tubearchivist/requirements.txt +++ b/tubearchivist/requirements.txt @@ -9,4 +9,4 @@ requests==2.27.1 ryd-client==0.0.3 uWSGI==2.0.20 whitenoise==6.0.0 -yt_dlp==2022.2.4 +yt_dlp==2022.3.8.2 From 1ce832b846e590a4f424716558d82129e0d30b17 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 10 Mar 2022 19:58:13 +0700 Subject: [PATCH 3/8] fix autocaption extraction, flatten words, #180 --- tubearchivist/home/src/index/video.py | 28 ++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index 461fb89..7fe20a0 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -114,12 +114,13 @@ class YoutubeSubtitle: for subtitle in relevant_subtitles: dest_path = os.path.join(videos_base, subtitle["media_url"]) source = subtitle["source"] + lang = subtitle.get("lang") response = requests.get(subtitle["url"]) if not response.ok: print(f"{self.video.youtube_id}: failed to download subtitle") continue - parser = SubtitleParser(response.text, subtitle.get("lang")) + parser = SubtitleParser(response.text, lang, source) parser.process() subtitle_str = parser.get_subtitle_str() self._write_subtitle_file(dest_path, subtitle_str) @@ -144,15 +145,20 @@ class YoutubeSubtitle: class SubtitleParser: """parse subtitle str from youtube""" - def __init__(self, subtitle_str, lang): + def __init__(self, subtitle_str, lang, source): self.subtitle_raw = json.loads(subtitle_str) self.lang = lang + self.source = source self.all_cues = False def process(self): """extract relevant que data""" + all_events = self.subtitle_raw.get("events") + if self.source == "auto": + all_events = self._flat_auto_caption(all_events) + self.all_cues = [] - for idx, event in enumerate(self.subtitle_raw.get("events")): + for idx, event in enumerate(all_events): cue = { "start": self.ms_conv(event["tStartMs"]), "end": self.ms_conv(event["tStartMs"] + event["dDurationMs"]), @@ -161,6 +167,22 @@ class SubtitleParser: } self.all_cues.append(cue) + @staticmethod + def _flat_auto_caption(all_events): + """flatten autocaption segments""" + flatten = [] + for event in all_events: + if "segs" not in event.keys(): + continue + text = "".join([i.get("utf8") for i in event.get("segs")]) + if not text.strip(): + continue + + event.update({"segs": [{"utf8": text}]}) + flatten.append(event) + + return flatten + @staticmethod def ms_conv(ms): """convert ms to timestamp""" From 6e3df21f8c959640a3c50687641a58fb12f3e557 Mon Sep 17 00:00:00 2001 From: Nathan DeTar Date: Thu, 10 Mar 2022 05:20:23 -0800 Subject: [PATCH 4/8] Continue Watching Section (#188) * Replaced isWatched() function. * Switched to `updateVideoWatchStatus()` function * Updated Onclick to `updateVideoWatchStatus(this)` * Handle `this` input in `updateVideoWatchStatus()` --- .../home/templates/home/channel_id.html | 4 +- tubearchivist/home/templates/home/home.html | 8 +- .../home/templates/home/playlist_id.html | 4 +- tubearchivist/home/templates/home/video.html | 4 +- tubearchivist/static/script.js | 142 ++++++++++++------ 5 files changed, 107 insertions(+), 55 deletions(-) diff --git a/tubearchivist/home/templates/home/channel_id.html b/tubearchivist/home/templates/home/channel_id.html index 1198a15..3a84037 100644 --- a/tubearchivist/home/templates/home/channel_id.html +++ b/tubearchivist/home/templates/home/channel_id.html @@ -124,9 +124,9 @@
{% if video.source.player.watched %} - seen-icon + seen-icon {% else %} - unseen-icon + unseen-icon {% endif %} {{ video.source.published }} | {{ video.source.player.duration_str }}
diff --git a/tubearchivist/home/templates/home/home.html b/tubearchivist/home/templates/home/home.html index 042ceed..0eb2926 100644 --- a/tubearchivist/home/templates/home/home.html +++ b/tubearchivist/home/templates/home/home.html @@ -27,9 +27,9 @@
{% if video.player.watched %} - seen-icon + seen-icon {% else %} - unseen-icon + unseen-icon {% endif %} {{ video.published }} | {{ video.player.duration_str }}
@@ -103,9 +103,9 @@
{% if video.source.player.watched %} - seen-icon + seen-icon {% else %} - unseen-icon + unseen-icon {% endif %} {{ video.source.published }} | {{ video.source.player.duration_str }}
diff --git a/tubearchivist/home/templates/home/playlist_id.html b/tubearchivist/home/templates/home/playlist_id.html index 635bdca..f9e987e 100644 --- a/tubearchivist/home/templates/home/playlist_id.html +++ b/tubearchivist/home/templates/home/playlist_id.html @@ -105,9 +105,9 @@
{% if video.source.player.watched %} - seen-icon + seen-icon {% else %} - unseen-icon + unseen-icon {% endif %} {{ video.source.published }} | {{ video.source.player.duration_str }}
diff --git a/tubearchivist/home/templates/home/video.html b/tubearchivist/home/templates/home/video.html index 7a8b522..324d293 100644 --- a/tubearchivist/home/templates/home/video.html +++ b/tubearchivist/home/templates/home/video.html @@ -32,9 +32,9 @@

Last refreshed: {{ video.vid_last_refresh }}

Watched: {% if video.player.watched %} - seen-icon + seen-icon {% else %} - unseen-icon + unseen-icon {% endif %}

{% if video.active %} diff --git a/tubearchivist/static/script.js b/tubearchivist/static/script.js index acc592a..6e5c818 100644 --- a/tubearchivist/static/script.js +++ b/tubearchivist/static/script.js @@ -8,21 +8,62 @@ function sortChange(sortValue) { }, 500); } -function isWatched(youtube_id) { - postVideoProgress(youtube_id, 0); // Reset video progress on watched; - removeProgressBar(youtube_id); - var payload = JSON.stringify({'watched': youtube_id}); - sendPost(payload); - var seenIcon = document.createElement('img'); - seenIcon.setAttribute('src', "/static/img/icon-seen.svg"); - seenIcon.setAttribute('alt', 'seen-icon'); - seenIcon.setAttribute('id', youtube_id); - seenIcon.setAttribute('title', "Mark as unwatched"); - seenIcon.setAttribute('onclick', "isUnwatched(this.id)"); - seenIcon.classList = 'seen-icon'; - document.getElementById(youtube_id).replaceWith(seenIcon); +// Updates video watch status when passed a video id and it's current state (ex if the video was unwatched but you want to mark it as watched you will pass "unwatched") +function updateVideoWatchStatus(input1, videoCurrentWatchStatus) { + if (videoCurrentWatchStatus) { + videoId = input1; + } else if (input1.getAttribute("data-id")) { + videoId = input1.getAttribute("data-id"); + videoCurrentWatchStatus = input1.getAttribute("data-status"); + } + + postVideoProgress(videoId, 0); // Reset video progress on watched/unwatched; + removeProgressBar(videoId); + + if (videoCurrentWatchStatus == "watched") { + var watchStatusIndicator = createWatchStatusIndicator(videoId, "unwatched"); + var payload = JSON.stringify({'un_watched': videoId}); + sendPost(payload); + } else if (videoCurrentWatchStatus == "unwatched") { + var watchStatusIndicator = createWatchStatusIndicator(videoId, "watched"); + var payload = JSON.stringify({'watched': videoId}); + sendPost(payload); + } + + var watchButtons = document.getElementsByClassName("watch-button"); + for (let i = 0; i < watchButtons.length; i++) { + if (watchButtons[i].getAttribute("data-id") == videoId) { + watchButtons[i].outerHTML = watchStatusIndicator; + } + } } +// Creates a watch status indicator when passed a video id and the videos watch status +function createWatchStatusIndicator(videoId, videoWatchStatus) { + if (videoWatchStatus == "watched") { + var seen = "seen"; + var title = "Mark as unwatched"; + } else if (videoWatchStatus == "unwatched") { + var seen = "unseen"; + var title = "Mark as watched"; + } + var watchStatusIndicator = `${seen}-icon`; + return watchStatusIndicator; +} + +// function isWatched(youtube_id) { +// var payload = JSON.stringify({'watched': youtube_id}); +// sendPost(payload); +// var seenIcon = document.createElement('img'); +// seenIcon.setAttribute('src', "/static/img/icon-seen.svg"); +// seenIcon.setAttribute('alt', 'seen-icon'); +// seenIcon.setAttribute('id', youtube_id); +// seenIcon.setAttribute('title', "Mark as unwatched"); +// seenIcon.setAttribute('onclick', "isUnwatched(this.id)"); +// seenIcon.classList = 'seen-icon'; +// document.getElementById(youtube_id).replaceWith(seenIcon); +// } + // Removes the progress bar when passed a video id function removeProgressBar(videoId) { setProgressBar(videoId, 0, 1); @@ -39,19 +80,19 @@ function isWatchedButton(button) { }, 1000); } -function isUnwatched(youtube_id) { - postVideoProgress(youtube_id, 0); // Reset video progress on unwatched; - var payload = JSON.stringify({'un_watched': youtube_id}); - sendPost(payload); - var unseenIcon = document.createElement('img'); - unseenIcon.setAttribute('src', "/static/img/icon-unseen.svg"); - unseenIcon.setAttribute('alt', 'unseen-icon'); - unseenIcon.setAttribute('id', youtube_id); - unseenIcon.setAttribute('title', "Mark as watched"); - unseenIcon.setAttribute('onclick', "isWatched(this.id)"); - unseenIcon.classList = 'unseen-icon'; - document.getElementById(youtube_id).replaceWith(unseenIcon); -} +// function isUnwatched(youtube_id) { +// postVideoProgress(youtube_id, 0); // Reset video progress on unwatched; +// var payload = JSON.stringify({'un_watched': youtube_id}); +// sendPost(payload); +// var unseenIcon = document.createElement('img'); +// unseenIcon.setAttribute('src', "/static/img/icon-unseen.svg"); +// unseenIcon.setAttribute('alt', 'unseen-icon'); +// unseenIcon.setAttribute('id', youtube_id); +// unseenIcon.setAttribute('title', "Mark as watched"); +// unseenIcon.setAttribute('onclick', "isWatched(this.id)"); +// unseenIcon.classList = 'unseen-icon'; +// document.getElementById(youtube_id).replaceWith(unseenIcon); +// } function unsubscribe(id_unsub) { var payload = JSON.stringify({'unsubscribe': id_unsub}); @@ -327,7 +368,7 @@ function createPlayer(button) { var channelName = videoData.data.channel.channel_name; removePlayer(); - document.getElementById(videoId).outerHTML = ''; // Remove watch indicator from video info + // document.getElementById(videoId).outerHTML = ''; // Remove watch indicator from video info // If cast integration is enabled create cast button var castButton = ''; @@ -337,12 +378,11 @@ function createPlayer(button) { // Watched indicator if (videoData.data.player.watched) { - var playerState = "seen"; - var watchedFunction = "Unwatched"; + var watchStatusIndicator = createWatchStatusIndicator(videoId, "watched"); } else { - var playerState = "unseen"; - var watchedFunction = "Watched"; + var watchStatusIndicator = createWatchStatusIndicator(videoId, "unwatched"); } + var playerStats = `
views icon${videoViews}`; if (videoData.data.stats.like_count) { @@ -360,7 +400,7 @@ function createPlayer(button) { ${videoTag}
close-icon - ${playerState}-icon + ${watchStatusIndicator} ${castButton} ${playerStats}
@@ -444,8 +484,12 @@ function getVideoPlayerDuration() { function getVideoPlayerWatchStatus() { var videoId = getVideoPlayerVideoId(); var watched = false; - if(document.getElementById(videoId) != null && document.getElementById(videoId).className != "unseen-icon") { - watched = true; + + var watchButtons = document.getElementsByClassName("watch-button"); + for (let i = 0; i < watchButtons.length; i++) { + if (watchButtons[i].getAttribute("data-id") == videoId && watchButtons[i].getAttribute("data-status") == "watched") { + watched = true; + } } return watched; } @@ -459,7 +503,7 @@ function onVideoProgress() { postVideoProgress(videoId, currentTime); if (!getVideoPlayerWatchStatus()) { // Check if video is already marked as watched if (watchedThreshold(currentTime, duration)) { - isWatched(videoId); + updateVideoWatchStatus(videoId, "unwatched"); } } } @@ -469,7 +513,7 @@ function onVideoProgress() { function onVideoEnded() { var videoId = getVideoPlayerVideoId(); if (!getVideoPlayerWatchStatus()) { // Check if video is already marked as watched - isWatched(videoId); + updateVideoWatchStatus(videoId, "unwatched"); } } @@ -606,13 +650,21 @@ function removePlayer() { // Sets the progress bar when passed a video id, video progress and video duration function setProgressBar(videoId, currentTime, duration) { - progressBar = document.getElementById("progress-" + videoId); - progressBarWidth = (currentTime / duration) * 100 + "%"; - if (progressBar && !getVideoPlayerWatchStatus()) { - progressBar.style.width = progressBarWidth; - } else if (progressBar) { - progressBar.style.width = "0%"; + var progressBarWidth = (currentTime / duration) * 100 + "%"; + var progressBars = document.getElementsByClassName("video-progress-bar"); + for (let i = 0; i < progressBars.length; i++) { + if (progressBars[i].id == "progress-" + videoId) { + if (!getVideoPlayerWatchStatus()) { + progressBars[i].style.width = progressBarWidth; + } else { + progressBars[i].style.width = "0%"; + } + } } + + // progressBar = document.getElementById("progress-" + videoId); + + } // multi search form @@ -681,9 +733,9 @@ function createVideo(video, viewStyle) { const videoPublished = video.published; const videoDuration = video.player.duration_str; if (video.player.watched) { - var playerState = "seen"; + var watchStatusIndicator = createWatchStatusIndicator(videoId, "watched"); } else { - var playerState = "unseen"; + var watchStatusIndicator = createWatchStatusIndicator(videoId, "unwatched"); }; const channelId = video.channel.channel_id; const channelName = video.channel.channel_name; @@ -701,7 +753,7 @@ function createVideo(video, viewStyle) {
- ${playerState}-icon + ${watchStatusIndicator} ${videoPublished} | ${videoDuration}
From d3e9646fb6daf3a5e01c0284639eaad655dd5871 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 10 Mar 2022 20:45:13 +0700 Subject: [PATCH 5/8] private methods for YoutubeSubtitle and SubtitleParser --- tubearchivist/home/src/index/video.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index 7fe20a0..a840ea8 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -23,7 +23,7 @@ class YoutubeSubtitle: self.video = video self.languages = False - def sub_conf_parse(self): + def _sub_conf_parse(self): """add additional conf values to self""" languages_raw = self.video.config["downloads"]["subtitle"] if languages_raw: @@ -31,26 +31,26 @@ class YoutubeSubtitle: def get_subtitles(self): """check what to do""" - self.sub_conf_parse() + self._sub_conf_parse() if not self.languages: # no subtitles return False relevant_subtitles = [] for lang in self.languages: - user_sub = self.get_user_subtitles(lang) + user_sub = self._get_user_subtitles(lang) if user_sub: relevant_subtitles.append(user_sub) continue if self.video.config["downloads"]["subtitle_source"] == "auto": - auto_cap = self.get_auto_caption(lang) + auto_cap = self._get_auto_caption(lang) if auto_cap: relevant_subtitles.append(auto_cap) return relevant_subtitles - def get_auto_caption(self, lang): + def _get_auto_caption(self, lang): """get auto_caption subtitles""" print(f"{self.video.youtube_id}-{lang}: get auto generated subtitles") all_subtitles = self.video.youtube_meta.get("automatic_captions") @@ -87,7 +87,7 @@ class YoutubeSubtitle: return all_subtitles - def get_user_subtitles(self, lang): + def _get_user_subtitles(self, lang): """get subtitles uploaded from channel owner""" print(f"{self.video.youtube_id}-{lang}: get user uploaded subtitles") all_subtitles = self._normalize_lang() @@ -160,8 +160,8 @@ class SubtitleParser: self.all_cues = [] for idx, event in enumerate(all_events): cue = { - "start": self.ms_conv(event["tStartMs"]), - "end": self.ms_conv(event["tStartMs"] + event["dDurationMs"]), + "start": self._ms_conv(event["tStartMs"]), + "end": self._ms_conv(event["tStartMs"] + event["dDurationMs"]), "text": "".join([i.get("utf8") for i in event["segs"]]), "idx": idx + 1, } @@ -184,7 +184,7 @@ class SubtitleParser: return flatten @staticmethod - def ms_conv(ms): + def _ms_conv(ms): """convert ms to timestamp""" hours = str((ms // (1000 * 60 * 60)) % 24).zfill(2) minutes = str((ms // (1000 * 60)) % 60).zfill(2) @@ -206,7 +206,7 @@ class SubtitleParser: def create_bulk_import(self, video, source): """subtitle lines for es import""" - documents = self.create_documents(video, source) + documents = self._create_documents(video, source) bulk_list = [] for document in documents: @@ -220,9 +220,9 @@ class SubtitleParser: return query_str - def create_documents(self, video, source): + def _create_documents(self, video, source): """process documents""" - documents = self.chunk_list(video.youtube_id) + documents = self._chunk_list(video.youtube_id) channel = video.json_data.get("channel") meta_dict = { "youtube_id": video.youtube_id, @@ -238,7 +238,7 @@ class SubtitleParser: return documents - def chunk_list(self, youtube_id): + def _chunk_list(self, youtube_id): """join cues for bulk import""" chunk_list = [] From aff0cfb794c09d900a18782e332de35787ae04b5 Mon Sep 17 00:00:00 2001 From: simon Date: Fri, 11 Mar 2022 17:47:04 +0700 Subject: [PATCH 6/8] fix retiming issue for auto subtitles --- tubearchivist/home/src/index/video.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index a840ea8..9968cc7 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -178,6 +178,14 @@ class SubtitleParser: if not text.strip(): continue + if flatten: + # fix overlapping retiming issue + last_end = flatten[-1]["tStartMs"] + flatten[-1]["dDurationMs"] + if event["tStartMs"] < last_end: + joined = flatten[-1]["segs"][0]["utf8"] + "\n" + text + flatten[-1]["segs"][0]["utf8"] = joined + continue + event.update({"segs": [{"utf8": text}]}) flatten.append(event) From f6950a2ca5005536c40fb467fef2ba16063a9157 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 12 Mar 2022 17:29:34 +0700 Subject: [PATCH 7/8] list all in progress videos --- tubearchivist/home/templates/home/home.html | 24 ++++++++--------- tubearchivist/home/views.py | 30 +++++++++++++++++---- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/tubearchivist/home/templates/home/home.html b/tubearchivist/home/templates/home/home.html index 0eb2926..445de5b 100644 --- a/tubearchivist/home/templates/home/home.html +++ b/tubearchivist/home/templates/home/home.html @@ -9,14 +9,14 @@
{% for video in continue_vids %}
- +
- video-thumb - {% if video.player.progress %} -
+ video-thumb + {% if video.source.player.progress %} +
{% else %} -
+
{% endif %}
@@ -25,17 +25,17 @@
-
- {% if video.player.watched %} - seen-icon +
+ {% if video.source.player.watched %} + seen-icon {% else %} - unseen-icon + unseen-icon {% endif %} - {{ video.published }} | {{ video.player.duration_str }} + {{ video.source.published }} | {{ video.source.player.duration_str }}
diff --git a/tubearchivist/home/views.py b/tubearchivist/home/views.py index 781b690..08e5df8 100644 --- a/tubearchivist/home/views.py +++ b/tubearchivist/home/views.py @@ -175,15 +175,35 @@ class ArchivistResultsView(ArchivistViewConfig): if not results or not self.context["results"]: return - self.context["continue_vids"] = [] - progress = {i["youtube_id"]: i["position"] for i in results} + self.context["continue_vids"] = self.get_in_progress(results) + + in_progress = {i["youtube_id"]: i["position"] for i in results} for hit in self.context["results"]: video = hit["source"] - if video["youtube_id"] in progress: - played_sec = progress.get(video["youtube_id"]) + if video["youtube_id"] in in_progress: + played_sec = in_progress.get(video["youtube_id"]) total = video["player"]["duration"] video["player"]["progress"] = 100 * (played_sec / total) - self.context["continue_vids"].append(video) + + def get_in_progress(self, results): + """get all videos in progress""" + ids = [{"match": {"youtube_id": i.get("youtube_id")}} for i in results] + data = { + "size": self.default_conf["archive"]["page_size"], + "query": {"bool": {"should": ids}}, + } + search = SearchHandler( + "ta_video/_search", self.default_conf, data=data + ) + videos = search.get_data() + for video in videos: + youtube_id = video["source"]["youtube_id"] + matched = [i for i in results if i["youtube_id"] == youtube_id] + played_sec = matched[0]["position"] + total = video["source"]["player"]["duration"] + video["source"]["player"]["progress"] = 100 * (played_sec / total) + + return videos def single_lookup(self, es_path): """retrieve a single item from url""" From 7595e7501f19b33d8cea828f80cab1f11838ac12 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 12 Mar 2022 20:29:26 +0700 Subject: [PATCH 8/8] sort continue watching videos --- tubearchivist/home/views.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tubearchivist/home/views.py b/tubearchivist/home/views.py index 08e5df8..0587704 100644 --- a/tubearchivist/home/views.py +++ b/tubearchivist/home/views.py @@ -191,6 +191,7 @@ class ArchivistResultsView(ArchivistViewConfig): data = { "size": self.default_conf["archive"]["page_size"], "query": {"bool": {"should": ids}}, + "sort": [{"published": {"order": "desc"}}], } search = SearchHandler( "ta_video/_search", self.default_conf, data=data