rewrite SubtitleParser, #180

2022-03-09 00:25:44 +07:00 · 2022-03-09 00:25:44 +07:00 · c186798e78
parent 40c8e6d146
commit c186798e78
1 changed files with 73 additions and 109 deletions
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@ -6,7 +6,6 @@ functionality:
 import json
 import os
 import re
 from datetime import datetime
 import requests
@ -65,7 +64,7 @@ class YoutubeSubtitle:
        if not all_formats:
            return False
-        subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
+        subtitle = [i for i in all_formats if i["ext"] == "json3"][0]
        subtitle.update(
            {"lang": lang, "source": "auto", "media_url": media_url}
        )
@ -102,7 +101,7 @@ class YoutubeSubtitle:
            # no user subtitles found
            return False
-        subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
+        subtitle = [i for i in all_formats if i["ext"] == "json3"][0]
        subtitle.update(
            {"lang": lang, "source": "user", "media_url": media_url}
        )
@ -145,109 +144,65 @@ class YoutubeSubtitle:
 class SubtitleParser:
    """parse subtitle str from youtube"""
    time_reg = r"^([0-9]{2}:?){3}\.[0-9]{3} --> ([0-9]{2}:?){3}\.[0-9]{3}"
    stamp_reg = r"<([0-9]{2}:?){3}\.[0-9]{3}>"
    tag_reg = r"</?c>"
    def __init__(self, subtitle_str, lang):
-        self.subtitle_str = subtitle_str
+        self.subtitle_raw = json.loads(subtitle_str)
        self.lang = lang
-        self.header = False
+        self.all_cues = False
        self.parsed_cue_list = False
        self.all_text_lines = False
        self.matched = False
    def process(self):
-        """collection to process subtitle string"""
+        """extract relevant cue data"""
-        self._parse_cues()
+        self.all_cues = []
-        self._match_text_lines()
+        for idx, event in enumerate(self.subtitle_raw.get("events")):
-        self._add_id()
+            cue = {
-        self._timestamp_check()
+                "start": self.ms_conv(event["tStartMs"]),
                "end": self.ms_conv(event["tStartMs"] + event["dDurationMs"]),
                "text": "".join([i.get("utf8") for i in event["segs"]]),
                "idx": idx + 1,
            }
            self.all_cues.append(cue)
-    def _parse_cues(self):
+    @staticmethod
-        """split into cues"""
+    def ms_conv(ms):
-        all_cues = self.subtitle_str.replace("\n \n", "\n").split("\n\n")
+        """convert ms to timestamp"""
-        self.header = all_cues[0]
+        hours = str((ms // (1000 * 60 * 60)) % 24).zfill(2)
-        self.all_text_lines = []
+        minutes = str((ms // (1000 * 60)) % 60).zfill(2)
-        self.parsed_cue_list = [self._cue_cleaner(i) for i in all_cues[1:]]
+        secs = str((ms // 1000) % 60).zfill(2)
        millis = str(ms % 1000).zfill(3)
-    def _cue_cleaner(self, cue):
+        return f"{hours}:{minutes}:{secs}.{millis}"
        """parse single cue"""
        all_lines = cue.split("\n")
        cue_dict = {"lines": []}
        for line in all_lines:
            if re.match(self.time_reg, line):
                clean = re.search(self.time_reg, line).group()
                start, end = clean.split(" --> ")
                cue_dict.update({"start": start, "end": end})
            else:
                clean = re.sub(self.stamp_reg, "", line)
                clean = re.sub(self.tag_reg, "", clean)
                cue_dict["lines"].append(clean)
                if clean.strip() and clean not in self.all_text_lines[-4:]:
                    # remove immediate duplicates
                    self.all_text_lines.append(clean)
        return cue_dict
    def _match_text_lines(self):
        """match unique text lines with timestamps"""
        self.matched = []
        while self.all_text_lines:
            check = self.all_text_lines[0]
            matches = [i for i in self.parsed_cue_list if check in i["lines"]]
            new_cue = matches[-1]
            new_cue["start"] = matches[0]["start"]
            for line in new_cue["lines"]:
                try:
                    self.all_text_lines.remove(line)
                except ValueError:
                    continue
            self.matched.append(new_cue)
    def _timestamp_check(self):
        """check if end timestamp is bigger than start timestamp"""
        for idx, cue in enumerate(self.matched):
            # this
            end = int(re.sub("[^0-9]", "", cue.get("end")))
            # next
            try:
                next_cue = self.matched[idx + 1]
            except IndexError:
                continue
            start_next = int(re.sub("[^0-9]", "", next_cue.get("start")))
            if end > start_next:
                self.matched[idx]["end"] = next_cue.get("start")
    def _add_id(self):
        """add id to matched cues"""
        for idx, _ in enumerate(self.matched):
            self.matched[idx]["id"] = idx + 1
    def get_subtitle_str(self):
-        """stitch cues and return processed new string"""
+        """create vtt text str from cues"""
-        new_subtitle_str = self.header + "\n\n"
+        subtitle_str = f"WEBVTT\nKind: captions\nLanguage: {self.lang}"
-        for cue in self.matched:
+        for cue in self.all_cues:
-            timestamp = f"{cue.get('start')} --> {cue.get('end')}"
+            stamp = f"{cue.get('start')} --> {cue.get('end')}"
-            lines = "\n".join(cue.get("lines"))
+            cue_text = f"\n\n{cue.get('idx')}\n{stamp}\n{cue.get('text')}"
-            cue_text = f"{cue.get('id')}\n{timestamp}\n{lines}\n\n"
+            subtitle_str = subtitle_str + cue_text
            new_subtitle_str = new_subtitle_str + cue_text
-        return new_subtitle_str
+        return subtitle_str
    def create_bulk_import(self, video, source):
-        """process matched for es import"""
+        """subtitle lines for es import"""
        documents = self.create_documents(video, source)
        bulk_list = []
        channel = video.json_data.get("channel")
-        document = {
+        for document in documents:
            document_id = document.get("subtitle_fragment_id")
            action = {"index": {"_index": "ta_subtitle", "_id": document_id}}
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(document))
        bulk_list.append("\n")
        query_str = "\n".join(bulk_list)
        return query_str
    def create_documents(self, video, source):
        """process documents"""
        documents = self.chunk_list(video.youtube_id)
        channel = video.json_data.get("channel")
        meta_dict = {
            "youtube_id": video.youtube_id,
            "title": video.json_data.get("title"),
            "subtitle_channel": channel.get("channel_name"),
@ -257,26 +212,35 @@ class SubtitleParser:
            "subtitle_source": source,
        }
-        for match in self.matched:
+        _ = [i.update(meta_dict) for i in documents]
-            match_id = match.get("id")
+
-            document_id = f"{video.youtube_id}-{self.lang}-{match_id}"
+        return documents
-            action = {"index": {"_index": "ta_subtitle", "_id": document_id}}
+
-            document.update(
+    def chunk_list(self, youtube_id):
-                {
+        """join cues for bulk import"""
-                    "subtitle_fragment_id": document_id,
+        chunk_list = []
-                    "subtitle_start": match.get("start"),
+
-                    "subtitle_end": match.get("end"),
+        chunk = {}
-                    "subtitle_index": match_id,
+        for cue in self.all_cues:
-                    "subtitle_line": " ".join(match.get("lines")),
+            if chunk:
                text = f"{chunk.get('subtitle_line')} {cue.get('text')}\n"
                chunk["subtitle_line"] = text
            else:
                idx = len(chunk_list) + 1
                chunk = {
                    "subtitle_index": idx,
                    "subtitle_line": cue.get("text"),
                    "subtitle_start": cue.get("start"),
                }
            )
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(document))
-        bulk_list.append("\n")
+            chunk["subtitle_fragment_id"] = f"{youtube_id}-{self.lang}-{idx}"
        query_str = "\n".join(bulk_list)
-        return query_str
+            if cue["idx"] % 5 == 0:
                chunk["subtitle_end"] = cue.get("end")
                chunk_list.append(chunk)
                chunk = {}
        return chunk_list
 class YoutubeVideo(YouTubeItem, YoutubeSubtitle):