From b494fc10afbe7b1b58727e7e82410f8ccc48507d Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Sat, 12 Feb 2022 16:03:04 +0700
Subject: [PATCH] cover edge cases where end timestamp is after start timestamp
 of new cue

---
 tubearchivist/home/src/index/video.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py
index 52735b0..1d7bf15 100644
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@@ -159,6 +159,7 @@ class SubtitleParser:
         self._parse_cues()
         self._match_text_lines()
         self._add_id()
+        self._timestamp_check()
 
     def _parse_cues(self):
         """split into cues"""
@@ -181,7 +182,8 @@ class SubtitleParser:
                 clean = re.sub(self.stamp_reg, "", line)
                 clean = re.sub(self.tag_reg, "", clean)
                 cue_dict["lines"].append(clean)
-                if clean and clean not in self.all_text_lines:
+                if clean.strip() and clean not in self.all_text_lines[-4:]:
+                    # remove immediate duplicates
                     self.all_text_lines.append(clean)
 
         return cue_dict
@@ -205,6 +207,21 @@ class SubtitleParser:
 
             self.matched.append(new_cue)
 
+    def _timestamp_check(self):
+        """clip a cue's end to the next cue's start when the two overlap"""
+        for idx, cue in enumerate(self.matched):
+            # this cue: strip every non-digit from "end", compare as int
+            end = int(re.sub("[^0-9]", "", cue.get("end")))
+            # next cue: the last cue has no successor, so it is left as is
+            try:
+                next_cue = self.matched[idx + 1]
+            except IndexError:
+                continue
+
+            start_next = int(re.sub("[^0-9]", "", next_cue.get("start")))
+            if end > start_next:
+                self.matched[idx]["end"] = next_cue.get("start")
+
     def _add_id(self):
         """add id to matched cues"""
         for idx, _ in enumerate(self.matched):