mirror of
https://github.com/tubearchivist/tubearchivist-frontend.git
synced 2024-11-22 11:50:14 +00:00
fix autocaption extraction, flatten words, #180
This commit is contained in:
parent
6a6c8fa5d8
commit
1ce832b846
@ -114,12 +114,13 @@ class YoutubeSubtitle:
|
|||||||
for subtitle in relevant_subtitles:
|
for subtitle in relevant_subtitles:
|
||||||
dest_path = os.path.join(videos_base, subtitle["media_url"])
|
dest_path = os.path.join(videos_base, subtitle["media_url"])
|
||||||
source = subtitle["source"]
|
source = subtitle["source"]
|
||||||
|
lang = subtitle.get("lang")
|
||||||
response = requests.get(subtitle["url"])
|
response = requests.get(subtitle["url"])
|
||||||
if not response.ok:
|
if not response.ok:
|
||||||
print(f"{self.video.youtube_id}: failed to download subtitle")
|
print(f"{self.video.youtube_id}: failed to download subtitle")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
parser = SubtitleParser(response.text, subtitle.get("lang"))
|
parser = SubtitleParser(response.text, lang, source)
|
||||||
parser.process()
|
parser.process()
|
||||||
subtitle_str = parser.get_subtitle_str()
|
subtitle_str = parser.get_subtitle_str()
|
||||||
self._write_subtitle_file(dest_path, subtitle_str)
|
self._write_subtitle_file(dest_path, subtitle_str)
|
||||||
@ -144,15 +145,20 @@ class YoutubeSubtitle:
|
|||||||
class SubtitleParser:
|
class SubtitleParser:
|
||||||
"""parse subtitle str from youtube"""
|
"""parse subtitle str from youtube"""
|
||||||
|
|
||||||
def __init__(self, subtitle_str, lang):
|
def __init__(self, subtitle_str, lang, source):
|
||||||
self.subtitle_raw = json.loads(subtitle_str)
|
self.subtitle_raw = json.loads(subtitle_str)
|
||||||
self.lang = lang
|
self.lang = lang
|
||||||
|
self.source = source
|
||||||
self.all_cues = False
|
self.all_cues = False
|
||||||
|
|
||||||
def process(self):
|
def process(self):
|
||||||
"""extract relevant que data"""
|
"""extract relevant que data"""
|
||||||
|
all_events = self.subtitle_raw.get("events")
|
||||||
|
if self.source == "auto":
|
||||||
|
all_events = self._flat_auto_caption(all_events)
|
||||||
|
|
||||||
self.all_cues = []
|
self.all_cues = []
|
||||||
for idx, event in enumerate(self.subtitle_raw.get("events")):
|
for idx, event in enumerate(all_events):
|
||||||
cue = {
|
cue = {
|
||||||
"start": self.ms_conv(event["tStartMs"]),
|
"start": self.ms_conv(event["tStartMs"]),
|
||||||
"end": self.ms_conv(event["tStartMs"] + event["dDurationMs"]),
|
"end": self.ms_conv(event["tStartMs"] + event["dDurationMs"]),
|
||||||
@ -161,6 +167,22 @@ class SubtitleParser:
|
|||||||
}
|
}
|
||||||
self.all_cues.append(cue)
|
self.all_cues.append(cue)
|
||||||
|
|
||||||
|
@staticmethod
def _flat_auto_caption(all_events):
    """flatten autocaption segments

    Joins the "utf8" text of every entry in an event's "segs" list into a
    single string, so each returned event carries exactly one segment.

    all_events: iterable of event dicts, None is treated as empty
    returns: new list of events, events without "segs" or whose joined
        text is blank are left out
    """
    flatten = []
    for event in all_events or []:
        segments = event.get("segs")
        if not segments:
            # no "segs" key, or nothing in it to join
            continue

        # a segment without "utf8" contributes no text instead of
        # breaking the join with a None
        text = "".join(seg.get("utf8") or "" for seg in segments)
        if not text.strip():
            continue

        # shallow copy instead of event.update() so the raw events
        # handed in by the caller are not modified
        flatten.append({**event, "segs": [{"utf8": text}]})

    return flatten
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def ms_conv(ms):
|
def ms_conv(ms):
|
||||||
"""convert ms to timestamp"""
|
"""convert ms to timestamp"""
|
||||||
|
Loading…
Reference in New Issue
Block a user