cover edge cases where a cue's end timestamp is after the start timestamp of the next cue

2022-02-12 16:03:04 +07:00 · 2022-02-12 16:03:04 +07:00 · b494fc10af
parent d99ce0d98e
commit b494fc10af
1 changed files with 18 additions and 1 deletions
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@ -159,6 +159,7 @@ class SubtitleParser:
        self._parse_cues()
        self._match_text_lines()
        self._add_id()
+        self._timestamp_check()

    def _parse_cues(self):
        """split into cues"""
@ -181,7 +182,8 @@ class SubtitleParser:
                clean = re.sub(self.stamp_reg, "", line)
                clean = re.sub(self.tag_reg, "", clean)
                cue_dict["lines"].append(clean)
-                if clean and clean not in self.all_text_lines:
+                if clean.strip() and clean not in self.all_text_lines[-4:]:
+                    # remove immediate duplicates
                    self.all_text_lines.append(clean)

        return cue_dict
@ -205,6 +207,21 @@ class SubtitleParser:

            self.matched.append(new_cue)

+    def _timestamp_check(self):
+        """clip end timestamp of a cue if it is after start of the next cue"""
+        for idx, cue in enumerate(self.matched):
+            # this cue: digits only, as int (assumes fixed width stamps)
+            end = int(re.sub("[^0-9]", "", cue.get("end")))
+            # next cue, the last cue has no next one and is left unchanged
+            try:
+                next_cue = self.matched[idx + 1]
+            except IndexError:
+                continue
+
+            start_next = int(re.sub("[^0-9]", "", next_cue.get("start")))
+            if end > start_next:
+                self.matched[idx]["end"] = next_cue.get("start")
+
    def _add_id(self):
        """add id to matched cues"""
        for idx, _ in enumerate(self.matched):