mirror of
https://github.com/tubearchivist/tubearchivist-frontend.git
synced 2024-11-22 20:00:15 +00:00
fix multi-language subtitle extraction, and use a better regex for timestamp matching
This commit is contained in:
parent
b071612038
commit
077692987b
@ -36,38 +36,37 @@ class YoutubeSubtitle:
|
|||||||
# no subtitles
|
# no subtitles
|
||||||
return False
|
return False
|
||||||
|
|
||||||
relevant_subtitles = self.get_user_subtitles()
|
relevant_subtitles = []
|
||||||
if relevant_subtitles:
|
for lang in self.languages:
|
||||||
return relevant_subtitles
|
user_sub = self.get_user_subtitles(lang)
|
||||||
|
if user_sub:
|
||||||
|
relevant_subtitles.append(user_sub)
|
||||||
|
continue
|
||||||
|
|
||||||
if self.video.config["downloads"]["subtitle_source"] == "auto":
|
if self.video.config["downloads"]["subtitle_source"] == "auto":
|
||||||
relevant_auto = self.get_auto_caption()
|
auto_cap = self.get_auto_caption(lang)
|
||||||
return relevant_auto
|
if auto_cap:
|
||||||
|
relevant_subtitles.append(auto_cap)
|
||||||
|
|
||||||
return False
|
return relevant_subtitles
|
||||||
|
|
||||||
def get_auto_caption(self):
|
def get_auto_caption(self, lang):
|
||||||
"""get auto_caption subtitles"""
|
"""get auto_caption subtitles"""
|
||||||
print(f"{self.video.youtube_id}: get auto generated subtitles")
|
print(f"{self.video.youtube_id}-{lang}: get auto generated subtitles")
|
||||||
all_subtitles = self.video.youtube_meta.get("automatic_captions")
|
all_subtitles = self.video.youtube_meta.get("automatic_captions")
|
||||||
|
|
||||||
if not all_subtitles:
|
if not all_subtitles:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
relevant_subtitles = []
|
video_media_url = self.video.json_data["media_url"]
|
||||||
|
media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
|
||||||
|
all_formats = all_subtitles.get(lang)
|
||||||
|
subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
|
||||||
|
subtitle.update(
|
||||||
|
{"lang": lang, "source": "auto", "media_url": media_url}
|
||||||
|
)
|
||||||
|
|
||||||
for lang in self.languages:
|
return subtitle
|
||||||
video_media_url = self.video.json_data["media_url"]
|
|
||||||
media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
|
|
||||||
all_formats = all_subtitles.get(lang)
|
|
||||||
subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
|
|
||||||
subtitle.update(
|
|
||||||
{"lang": lang, "source": "auto", "media_url": media_url}
|
|
||||||
)
|
|
||||||
relevant_subtitles.append(subtitle)
|
|
||||||
break
|
|
||||||
|
|
||||||
return relevant_subtitles
|
|
||||||
|
|
||||||
def _normalize_lang(self):
|
def _normalize_lang(self):
|
||||||
"""normalize country specific language keys"""
|
"""normalize country specific language keys"""
|
||||||
@ -85,27 +84,26 @@ class YoutubeSubtitle:
|
|||||||
|
|
||||||
return all_subtitles
|
return all_subtitles
|
||||||
|
|
||||||
def get_user_subtitles(self):
|
def get_user_subtitles(self, lang):
|
||||||
"""get subtitles uploaded from channel owner"""
|
"""get subtitles uploaded from channel owner"""
|
||||||
print(f"{self.video.youtube_id}: get user uploaded subtitles")
|
print(f"{self.video.youtube_id}-{lang}: get user uploaded subtitles")
|
||||||
all_subtitles = self._normalize_lang()
|
all_subtitles = self._normalize_lang()
|
||||||
if not all_subtitles:
|
if not all_subtitles:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
relevant_subtitles = []
|
video_media_url = self.video.json_data["media_url"]
|
||||||
|
media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
|
||||||
|
all_formats = all_subtitles.get(lang)
|
||||||
|
if not all_formats:
|
||||||
|
# no user subtitles found
|
||||||
|
return False
|
||||||
|
|
||||||
for lang in self.languages:
|
subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
|
||||||
video_media_url = self.video.json_data["media_url"]
|
subtitle.update(
|
||||||
media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
|
{"lang": lang, "source": "user", "media_url": media_url}
|
||||||
all_formats = all_subtitles.get(lang)
|
)
|
||||||
subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
|
|
||||||
subtitle.update(
|
|
||||||
{"lang": lang, "source": "user", "media_url": media_url}
|
|
||||||
)
|
|
||||||
relevant_subtitles.append(subtitle)
|
|
||||||
break
|
|
||||||
|
|
||||||
return relevant_subtitles
|
return subtitle
|
||||||
|
|
||||||
def download_subtitles(self, relevant_subtitles):
|
def download_subtitles(self, relevant_subtitles):
|
||||||
"""download subtitle files to archive"""
|
"""download subtitle files to archive"""
|
||||||
@ -173,7 +171,7 @@ class SubtitleParser:
|
|||||||
cue_dict = {"lines": []}
|
cue_dict = {"lines": []}
|
||||||
|
|
||||||
for line in all_lines:
|
for line in all_lines:
|
||||||
if re.match(r"^([0-9]{2}:?){3}", line):
|
if re.match(self.time_reg, line):
|
||||||
clean = re.search(self.time_reg, line).group()
|
clean = re.search(self.time_reg, line).group()
|
||||||
start, end = clean.split(" --> ")
|
start, end = clean.split(" --> ")
|
||||||
cue_dict.update({"start": start, "end": end})
|
cue_dict.update({"start": start, "end": end})
|
||||||
|
Loading…
Reference in New Issue
Block a user