mirror of
https://github.com/tubearchivist/tubearchivist-frontend.git
synced 2025-01-23 01:00:18 +00:00
auto generated subtitle parser and cleaner
This commit is contained in:
parent
1664b0d4fc
commit
5f6158243e
@ -5,6 +5,7 @@ functionality:
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
@ -121,6 +122,90 @@ class YoutubeSubtitle:
|
||||
print(f"{self.youtube_id}: failed to download subtitle")
|
||||
|
||||
|
||||
class SubtitleParser:
    """parse and clean an auto generated subtitle str from youtube

    The subtitle string is split on blank lines: the first block is kept
    verbatim as header, every following block is treated as a cue.
    Inline "<hh:mm:ss.mmm>" stamps and "<c>"/"</c>" tags are stripped from
    the text lines, and cues repeating the same text line are merged into
    one cue spanning from the first to the last occurrence.

    Usage: call process(), then get_subtitle_str().
    """

    # cue timing line, any trailing cue settings are ignored
    time_reg = r"^([0-9]{2}:?){3}\.[0-9]{3} --> ([0-9]{2}:?){3}\.[0-9]{3}"
    # inline word level timestamp
    stamp_reg = r"<([0-9]{2}:?){3}\.[0-9]{3}>"
    # inline <c> and </c> tags
    tag_reg = r"</?c>"

    def __init__(self, subtitle_str):
        self.subtitle_str = subtitle_str
        # all set to False until process() has filled them
        self.header = False
        self.parsed_cue_list = False
        self.all_text_lines = False
        self.matched = False

    def process(self):
        """collection to process subtitle string"""
        self._parse_cues()
        self._match_text_lines()
        self._add_id()

    def _parse_cues(self):
        """split subtitle_str into header and cleaned cues"""
        # a line holding a single space is collapsed first, so it does
        # not end up as a text line of its cue
        all_cues = self.subtitle_str.replace("\n \n", "\n").split("\n\n")
        self.header = all_cues[0]
        self.all_text_lines = []
        parsed = [self._cue_cleaner(i) for i in all_cues[1:]]
        # blocks without a timing line can not be stitched back into a
        # cue, keeping them would break the timestamp lookup later on
        self.parsed_cue_list = [i for i in parsed if "start" in i]

    def _cue_cleaner(self, cue):
        """parse single cue

        returns a dict with the cleaned text in "lines" and, if the cue
        has a timing line, the "start" and "end" timestamps. New non empty
        text lines of timed cues are added to self.all_text_lines.
        """
        cue_dict = {"lines": []}

        for line in cue.split("\n"):
            # only a complete timing line counts as timestamp, a text line
            # merely starting with digits like "123456 views" is text
            timing = re.match(self.time_reg, line)
            if timing:
                start, end = timing.group().split(" --> ")
                cue_dict.update({"start": start, "end": end})
                continue

            clean = re.sub(self.stamp_reg, "", line)
            clean = re.sub(self.tag_reg, "", clean)
            cue_dict["lines"].append(clean)

        if "start" not in cue_dict:
            # untimed block, don't register its text for matching
            return cue_dict

        for clean in cue_dict["lines"]:
            if clean and clean not in self.all_text_lines:
                self.all_text_lines.append(clean)

        return cue_dict

    def _match_text_lines(self):
        """match unique text lines with timestamps

        For the first pending text line, the last cue containing it is
        kept and its start is moved back to the start of the first cue
        containing it. All lines of the kept cue are then removed from
        the pending list.
        """
        self.matched = []

        while self.all_text_lines:
            check = self.all_text_lines[0]
            matches = [i for i in self.parsed_cue_list if check in i["lines"]]
            new_cue = matches[-1]
            new_cue["start"] = matches[0]["start"]

            for line in new_cue["lines"]:
                if not line:
                    # empty lines are never registered as pending
                    continue
                try:
                    self.all_text_lines.remove(line)
                except ValueError:
                    # line is no longer pending, e.g. already consumed
                    # by a previously matched cue
                    print("failed to process:")
                    print(line)

            self.matched.append(new_cue)

    def _add_id(self):
        """add id to matched cues, counting from 1"""
        for idx, cue in enumerate(self.matched, start=1):
            cue["id"] = idx

    def get_subtitle_str(self):
        """stitch cues and return processed new string

        runs process() first if that has not happened yet
        """
        if self.matched is False:
            self.process()

        parts = [self.header + "\n\n"]

        for cue in self.matched:
            timestamp = f"{cue.get('start')} --> {cue.get('end')}"
            lines = "\n".join(cue.get("lines"))
            parts.append(f"{cue.get('id')}\n{timestamp}\n{lines}\n\n")

        return "".join(parts)
|
||||
|
||||
|
||||
class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
|
||||
"""represents a single youtube video"""
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user