auto generated subtitle parser and cleaner

2025-07-18 23:18:14 +00:00 · 2022-02-07 21:18:52 +07:00 · 2022-02-07 21:18:52 +07:00 · 5f6158243e
commit 5f6158243e
parent 1664b0d4fc
1 changed files with 85 additions and 0 deletions
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@ -5,6 +5,7 @@ functionality:
 """

 import os
+import re
 from datetime import datetime

 import requests
@ -121,6 +122,90 @@ class YoutubeSubtitle:
                print(f"{self.youtube_id}: failed to download subtitle")


+class SubtitleParser:
+    """parse subtitle str from youtube"""
+
+    time_reg = r"^([0-9]{2}:?){3}\.[0-9]{3} --> ([0-9]{2}:?){3}\.[0-9]{3}"
+    stamp_reg = r"<([0-9]{2}:?){3}\.[0-9]{3}>"
+    tag_reg = r"</?c>"
+
+    def __init__(self, subtitle_str):
+        self.subtitle_str = subtitle_str
+        self.header = False
+        self.parsed_cue_list = False
+        self.all_text_lines = False
+        self.matched = False
+
+    def process(self):
+        """collection to process subtitle string"""
+        self._parse_cues()
+        self._match_text_lines()
+        self._add_id()
+
+    def _parse_cues(self):
+        """split into cues"""
+        all_cues = self.subtitle_str.replace("\n \n", "\n").split("\n\n")
+        self.header = all_cues[0]
+        self.all_text_lines = []
+        self.parsed_cue_list = [self._cue_cleaner(i) for i in all_cues[1:]]
+
+    def _cue_cleaner(self, cue):
+        """parse single cue"""
+        all_lines = cue.split("\n")
+        cue_dict = {"lines": []}
+
+        for line in all_lines:
+            if re.match(r"^([0-9]{2}:?){3}", line):
+                clean = re.search(self.time_reg, line).group()
+                start, end = clean.split(" --> ")
+                cue_dict.update({"start": start, "end": end})
+            else:
+                clean = re.sub(self.stamp_reg, "", line)
+                clean = re.sub(self.tag_reg, "", clean)
+                cue_dict["lines"].append(clean)
+                if clean and clean not in self.all_text_lines:
+                    self.all_text_lines.append(clean)
+
+        return cue_dict
+
+    def _match_text_lines(self):
+        """match unique text lines with timestamps"""
+
+        self.matched = []
+
+        while self.all_text_lines:
+            check = self.all_text_lines[0]
+            matches = [i for i in self.parsed_cue_list if check in i["lines"]]
+            new_cue = matches[-1]
+            new_cue["start"] = matches[0]["start"]
+
+            for line in new_cue["lines"]:
+                try:
+                    self.all_text_lines.remove(line)
+                except ValueError:
+                    print("failed to process:")
+                    print(line)
+
+            self.matched.append(new_cue)
+
+    def _add_id(self):
+        """add id to matched cues"""
+        for idx, _ in enumerate(self.matched):
+            self.matched[idx]["id"] = idx + 1
+
+    def get_subtitle_str(self):
+        """stitch cues and return processed new string"""
+        new_subtitle_str = self.header + "\n\n"
+
+        for cue in self.matched:
+            timestamp = f"{cue.get('start')} --> {cue.get('end')}"
+            lines = "\n".join(cue.get("lines"))
+            cue_text = f"{cue.get('id')}\n{timestamp}\n{lines}\n\n"
+            new_subtitle_str = new_subtitle_str + cue_text
+
+        return new_subtitle_str
+
+
 class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
    """represents a single youtube video"""