fix multi-language subtitle extractor and use a better regex for timestamp matching

This commit is contained in:
simon 2022-02-10 18:32:23 +07:00
parent b071612038
commit 077692987b
No known key found for this signature in database
GPG Key ID: 2C15AA5E89985DD4
1 changed file with 35 additions and 37 deletions

View File

@@ -36,38 +36,37 @@ class YoutubeSubtitle:
# no subtitles # no subtitles
return False return False
relevant_subtitles = self.get_user_subtitles() relevant_subtitles = []
if relevant_subtitles: for lang in self.languages:
return relevant_subtitles user_sub = self.get_user_subtitles(lang)
if user_sub:
relevant_subtitles.append(user_sub)
continue
if self.video.config["downloads"]["subtitle_source"] == "auto": if self.video.config["downloads"]["subtitle_source"] == "auto":
relevant_auto = self.get_auto_caption() auto_cap = self.get_auto_caption(lang)
return relevant_auto if auto_cap:
relevant_subtitles.append(auto_cap)
return False return relevant_subtitles
def get_auto_caption(self): def get_auto_caption(self, lang):
"""get auto_caption subtitles""" """get auto_caption subtitles"""
print(f"{self.video.youtube_id}: get auto generated subtitles") print(f"{self.video.youtube_id}-{lang}: get auto generated subtitles")
all_subtitles = self.video.youtube_meta.get("automatic_captions") all_subtitles = self.video.youtube_meta.get("automatic_captions")
if not all_subtitles: if not all_subtitles:
return False return False
relevant_subtitles = [] video_media_url = self.video.json_data["media_url"]
media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
all_formats = all_subtitles.get(lang)
subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
subtitle.update(
{"lang": lang, "source": "auto", "media_url": media_url}
)
for lang in self.languages: return subtitle
video_media_url = self.video.json_data["media_url"]
media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
all_formats = all_subtitles.get(lang)
subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
subtitle.update(
{"lang": lang, "source": "auto", "media_url": media_url}
)
relevant_subtitles.append(subtitle)
break
return relevant_subtitles
def _normalize_lang(self): def _normalize_lang(self):
"""normalize country specific language keys""" """normalize country specific language keys"""
@@ -85,27 +84,26 @@ class YoutubeSubtitle:
return all_subtitles return all_subtitles
def get_user_subtitles(self): def get_user_subtitles(self, lang):
"""get subtitles uploaded from channel owner""" """get subtitles uploaded from channel owner"""
print(f"{self.video.youtube_id}: get user uploaded subtitles") print(f"{self.video.youtube_id}-{lang}: get user uploaded subtitles")
all_subtitles = self._normalize_lang() all_subtitles = self._normalize_lang()
if not all_subtitles: if not all_subtitles:
return False return False
relevant_subtitles = [] video_media_url = self.video.json_data["media_url"]
media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
all_formats = all_subtitles.get(lang)
if not all_formats:
# no user subtitles found
return False
for lang in self.languages: subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
video_media_url = self.video.json_data["media_url"] subtitle.update(
media_url = video_media_url.replace(".mp4", f"-{lang}.vtt") {"lang": lang, "source": "user", "media_url": media_url}
all_formats = all_subtitles.get(lang) )
subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
subtitle.update(
{"lang": lang, "source": "user", "media_url": media_url}
)
relevant_subtitles.append(subtitle)
break
return relevant_subtitles return subtitle
def download_subtitles(self, relevant_subtitles): def download_subtitles(self, relevant_subtitles):
"""download subtitle files to archive""" """download subtitle files to archive"""
@@ -173,7 +171,7 @@ class SubtitleParser:
cue_dict = {"lines": []} cue_dict = {"lines": []}
for line in all_lines: for line in all_lines:
if re.match(r"^([0-9]{2}:?){3}", line): if re.match(self.time_reg, line):
clean = re.search(self.time_reg, line).group() clean = re.search(self.time_reg, line).group()
start, end = clean.split(" --> ") start, end = clean.split(" --> ")
cue_dict.update({"start": start, "end": end}) cue_dict.update({"start": start, "end": end})