add subtitle functionality, #build

Changes:
- merges new subtitle download and index functionality
- merges player improvements and api integrations from @n8detar
- merges fix for non-ASCII channel names
- merges fix for pagination error with 10k+ videos
This commit is contained in:
simon 2022-02-10 19:48:39 +07:00
commit 3efa388b5a
No known key found for this signature in database
GPG Key ID: 2C15AA5E89985DD4
10 changed files with 342 additions and 118 deletions

View File

@ -33,7 +33,7 @@ services:
depends_on: depends_on:
- archivist-es - archivist-es
archivist-es: archivist-es:
image: docker.elastic.co/elasticsearch/elasticsearch:7.16.2 image: docker.elastic.co/elasticsearch/elasticsearch:7.17.0
container_name: archivist-es container_name: archivist-es
restart: always restart: always
environment: environment:
@ -54,4 +54,4 @@ volumes:
media: media:
cache: cache:
redis: redis:
es: es:

View File

@ -23,10 +23,6 @@ response = requests.get(url, headers=headers)
## Video Item View ## Video Item View
/api/video/\<video_id>/ /api/video/\<video_id>/
## Video Player View
returns all relevant information to create video player
/api/video/\<video_id>/player
## Channel List View ## Channel List View
/api/channel/ /api/channel/

View File

@ -6,7 +6,6 @@ from api.views import (
DownloadApiListView, DownloadApiListView,
DownloadApiView, DownloadApiView,
PlaylistApiView, PlaylistApiView,
VideoApiPlayerView,
VideoApiView, VideoApiView,
) )
from django.urls import path from django.urls import path
@ -17,11 +16,6 @@ urlpatterns = [
VideoApiView.as_view(), VideoApiView.as_view(),
name="api-video", name="api-video",
), ),
path(
"video/<slug:video_id>/player/",
VideoApiPlayerView.as_view(),
name="api-video-player",
),
path( path(
"channel/", "channel/",
ChannelApiListView.as_view(), ChannelApiListView.as_view(),

View File

@ -60,6 +60,12 @@ class ApiBaseView(APIView):
cache_dir = self.default_conf["application"]["cache_dir"] cache_dir = self.default_conf["application"]["cache_dir"]
new_thumb = f"{cache_dir}/{vid_thumb_url}" new_thumb = f"{cache_dir}/{vid_thumb_url}"
self.response["data"]["vid_thumb_url"] = new_thumb self.response["data"]["vid_thumb_url"] = new_thumb
if "subtitles" in all_keys:
all_subtitles = self.response["data"]["subtitles"]
for idx, _ in enumerate(all_subtitles):
url = self.response["data"]["subtitles"][idx]["media_url"]
new_url = f"/media/{url}"
self.response["data"]["subtitles"][idx]["media_url"] = new_url
def get_paginate(self): def get_paginate(self):
"""add pagination detail to response""" """add pagination detail to response"""
@ -92,38 +98,6 @@ class VideoApiView(ApiBaseView):
return Response(self.response, status=self.status_code) return Response(self.response, status=self.status_code)
class VideoApiPlayerView(ApiBaseView):
    """resolves to /api/video/<video_id>/player
    GET: returns dict of video to build player
    """

    search_base = "/ta_video/_doc/"

    def get(self, request, video_id):
        # pylint: disable=unused-argument
        """load the video document and respond with the player dict"""
        self.config_builder()
        self.get_document(video_id)
        return Response(self.process_response(), status=self.status_code)

    def process_response(self):
        """collect the values the player needs from the loaded document"""
        # NOTE(review): assumes get_document() stored a complete video
        # document under self.response["data"] - confirm for missing ids
        document = self.response["data"]
        video_id = document["youtube_id"]
        channel = document["channel"]
        thumb_path = ThumbManager().vid_thumb_path(video_id)

        return {
            "youtube_id": video_id,
            "media_url": "/media/" + document["media_url"],
            "vid_thumb_url": "/cache/" + thumb_path,
            "title": document["title"],
            "channel_name": channel["channel_name"],
            "channel_id": channel["channel_id"],
            "is_watched": document["player"]["watched"],
        }
class ChannelApiView(ApiBaseView): class ChannelApiView(ApiBaseView):
"""resolves to /api/channel/<channel_id>/ """resolves to /api/channel/<channel_id>/
GET: returns metadata dict of channel GET: returns metadata dict of channel

View File

@ -156,6 +156,32 @@
"normalizer": "to_lower" "normalizer": "to_lower"
} }
} }
},
"subtitles": {
"properties": {
"ext": {
"type": "keyword",
"index": false
},
"lang": {
"type": "keyword",
"index": false
},
"media_url": {
"type": "keyword",
"index": false
},
"name": {
"type": "keyword"
},
"source": {
"type": "keyword"
},
"url": {
"type": "keyword",
"index": false
}
}
} }
}, },
"expected_set": { "expected_set": {
@ -277,6 +303,73 @@
}, },
"number_of_replicas": "0" "number_of_replicas": "0"
} }
},
{
"index_name": "subtitle",
"expected_map": {
"youtube_id": {
"type": "keyword"
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower"
}
}
},
"subtitle_fragment_id": {
"type": "keyword"
},
"subtitle_channel": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower"
}
}
},
"subtitle_channel_id": {
"type": "keyword"
},
"subtitle_start": {
"type": "text"
},
"subtitle_end": {
"type": "text"
},
"subtitle_last_refresh": {
"type": "date"
},
"subtitle_index": {
"type" : "long"
},
"subtitle_lang": {
"type": "keyword"
},
"subtitle_source": {
"type": "keyword"
},
"subtitle_line": {
"type" : "text",
"analyzer": "english"
}
},
"expected_set": {
"analysis": {
"normalizer": {
"to_lower": {
"type": "custom",
"filter": ["lowercase"]
}
}
},
"number_of_replicas": "0"
}
} }
] ]
} }

View File

@ -46,8 +46,9 @@ class FilesystemScanner:
all_downloaded = [] all_downloaded = []
for channel_name in all_channels: for channel_name in all_channels:
channel_path = os.path.join(self.VIDEOS, channel_name) channel_path = os.path.join(self.VIDEOS, channel_name)
videos = os.listdir(channel_path) channel_files = os.listdir(channel_path)
all_videos = ignore_filelist(videos) channel_files_clean = ignore_filelist(channel_files)
all_videos = [i for i in channel_files_clean if i.endswith(".mp4")]
for video in all_videos: for video in all_videos:
youtube_id = video[9:20] youtube_id = video[9:20]
all_downloaded.append((channel_name, video, youtube_id)) all_downloaded.append((channel_name, video, youtube_id))

View File

@ -4,10 +4,13 @@ functionality:
- index and update in es - index and update in es
""" """
import json
import os import os
import re
from datetime import datetime from datetime import datetime
import requests import requests
from home.src.es.connect import ElasticWrap
from home.src.index import channel as ta_channel from home.src.index import channel as ta_channel
from home.src.index.generic import YouTubeItem from home.src.index.generic import YouTubeItem
from home.src.ta.helper import DurationConverter, clean_string from home.src.ta.helper import DurationConverter, clean_string
@ -17,16 +20,13 @@ from ryd_client import ryd_client
class YoutubeSubtitle: class YoutubeSubtitle:
"""handle video subtitle functionality""" """handle video subtitle functionality"""
def __init__(self, config, youtube_meta, media_url, youtube_id): def __init__(self, video):
self.config = config self.video = video
self.youtube_meta = youtube_meta
self.media_url = media_url
self.youtube_id = youtube_id
self.languages = False self.languages = False
def sub_conf_parse(self): def sub_conf_parse(self):
"""add additional conf values to self""" """add additional conf values to self"""
languages_raw = self.config["downloads"]["subtitle"] languages_raw = self.video.config["downloads"]["subtitle"]
self.languages = [i.strip() for i in languages_raw.split(",")] self.languages = [i.strip() for i in languages_raw.split(",")]
def get_subtitles(self): def get_subtitles(self):
@ -36,82 +36,226 @@ class YoutubeSubtitle:
# no subtitles # no subtitles
return False return False
relevant_subtitles = self.get_user_subtitles() relevant_subtitles = []
if relevant_subtitles: for lang in self.languages:
return relevant_subtitles user_sub = self.get_user_subtitles(lang)
if user_sub:
relevant_subtitles.append(user_sub)
continue
if self.config["downloads"]["subtitle_source"] == "auto": if self.video.config["downloads"]["subtitle_source"] == "auto":
relevant_auto = self.get_auto_caption() auto_cap = self.get_auto_caption(lang)
return relevant_auto if auto_cap:
relevant_subtitles.append(auto_cap)
return False return relevant_subtitles
def get_auto_caption(self): def get_auto_caption(self, lang):
"""get auto_caption subtitles""" """get auto_caption subtitles"""
print(f"{self.youtube_id}: get auto generated subtitles") print(f"{self.video.youtube_id}-{lang}: get auto generated subtitles")
all_subtitles = self.youtube_meta.get("automatic_captions") all_subtitles = self.video.youtube_meta.get("automatic_captions")
if not all_subtitles: if not all_subtitles:
return False return False
relevant_subtitles = [] video_media_url = self.video.json_data["media_url"]
media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
all_formats = all_subtitles.get(lang)
subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
subtitle.update(
{"lang": lang, "source": "auto", "media_url": media_url}
)
for lang in self.languages: return subtitle
media_url = self.media_url.replace(".mp4", f"-{lang}.vtt")
all_formats = all_subtitles.get(lang)
subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
subtitle.update(
{"lang": lang, "source": "auto", "media_url": media_url}
)
relevant_subtitles.append(subtitle)
break
return relevant_subtitles
def _normalize_lang(self): def _normalize_lang(self):
"""normalize country specific language keys""" """normalize country specific language keys"""
all_subtitles = self.youtube_meta.get("subtitles") all_subtitles = self.video.youtube_meta.get("subtitles")
if not all_subtitles:
return False
all_keys = list(all_subtitles.keys()) all_keys = list(all_subtitles.keys())
for key in all_keys: for key in all_keys:
lang = key.split("-")[0] lang = key.split("-")[0]
old = all_subtitles.pop(key) old = all_subtitles.pop(key)
if lang == "live_chat":
continue
all_subtitles[lang] = old all_subtitles[lang] = old
return all_subtitles return all_subtitles
def get_user_subtitles(self): def get_user_subtitles(self, lang):
"""get subtitles uploaded from channel owner""" """get subtitles uploaded from channel owner"""
print(f"{self.youtube_id}: get user uploaded subtitles") print(f"{self.video.youtube_id}-{lang}: get user uploaded subtitles")
all_subtitles = self._normalize_lang() all_subtitles = self._normalize_lang()
if not all_subtitles: if not all_subtitles:
return False return False
relevant_subtitles = [] video_media_url = self.video.json_data["media_url"]
media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
all_formats = all_subtitles.get(lang)
if not all_formats:
# no user subtitles found
return False
for lang in self.languages: subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
media_url = self.media_url.replace(".mp4", f"-{lang}.vtt") subtitle.update(
all_formats = all_subtitles.get(lang) {"lang": lang, "source": "user", "media_url": media_url}
subtitle = [i for i in all_formats if i["ext"] == "vtt"][0] )
subtitle.update(
{"lang": lang, "source": "user", "media_url": media_url}
)
relevant_subtitles.append(subtitle)
break
return relevant_subtitles return subtitle
def download_subtitles(self, relevant_subtitles):
    """download subtitle files to archive

    for every subtitle dict: fetch its url, rewrite the text with
    SubtitleParser, write the result below the videos folder and send the
    parsed cues to the index. A subtitle that fails to download is logged
    and skipped, the remaining subtitles are still processed.
    """
    timeout = 30  # seconds, without it a stalled connection blocks forever
    videos_base = self.video.config["application"]["videos"]
    for subtitle in relevant_subtitles:
        dest_path = os.path.join(videos_base, subtitle["media_url"])
        source = subtitle["source"]
        try:
            response = requests.get(subtitle["url"], timeout=timeout)
        except requests.RequestException as err:
            # connection problems are handled like a failed response
            print(f"{self.video.youtube_id}: failed to download subtitle")
            print(err)
            continue

        if not response.ok:
            print(f"{self.video.youtube_id}: failed to download subtitle")
            continue

        parser = SubtitleParser(response.text, subtitle.get("lang"))
        parser.process()
        subtitle_str = parser.get_subtitle_str()
        self._write_subtitle_file(dest_path, subtitle_str)
        query_str = parser.create_bulk_import(self.video, source)
        self._index_subtitle(query_str)
@staticmethod
def _write_subtitle_file(dest_path, subtitle_str):
"""write subtitle file to disk"""
# create folder here for first video of channel
os.makedirs(os.path.split(dest_path)[0], exist_ok=True)
with open(dest_path, "w", encoding="utf-8") as subfile:
subfile.write(subtitle_str)
@staticmethod
def _index_subtitle(query_str):
    """post the ndjson bulk string to the _bulk endpoint"""
    bulk_request = ElasticWrap("_bulk")
    # NOTE(review): both returned values are discarded, a rejected bulk
    # import goes unnoticed here - confirm that is intended
    _, _ = bulk_request.post(data=query_str, ndjson=True)
class SubtitleParser:
    """parse subtitle str from youtube

    call process() first, then get_subtitle_str() for the rewritten
    subtitle text and create_bulk_import() for the es bulk request body.
    """

    # cue timing at line start: hh:mm:ss.mmm --> hh:mm:ss.mmm
    time_reg = r"^([0-9]{2}:?){3}\.[0-9]{3} --> ([0-9]{2}:?){3}\.[0-9]{3}"
    # inline timestamp like <hh:mm:ss.mmm>
    stamp_reg = r"<([0-9]{2}:?){3}\.[0-9]{3}>"
    # opening or closing <c> tag
    tag_reg = r"</?c>"

    def __init__(self, subtitle_str, lang):
        self.subtitle_str = subtitle_str
        self.lang = lang
        # all False until process() has run
        self.header = False
        self.parsed_cue_list = False
        self.all_text_lines = False
        self.matched = False

    def process(self):
        """collection to process subtitle string"""
        self._parse_cues()
        self._match_text_lines()
        self._add_id()

    def _parse_cues(self):
        """split into cues

        the first block is kept as header. Blocks without a timing line
        are dropped, they have no start to match against and would fail
        in _match_text_lines.
        """
        # also accept windows line endings
        subtitle_str = self.subtitle_str.replace("\r\n", "\n")
        all_cues = subtitle_str.replace("\n \n", "\n").split("\n\n")
        self.header = all_cues[0]
        self.all_text_lines = []
        self.parsed_cue_list = []

        seen = set()
        for raw_cue in all_cues[1:]:
            cue_dict = self._cue_cleaner(raw_cue)
            if "start" not in cue_dict:
                continue

            self.parsed_cue_list.append(cue_dict)
            for line in cue_dict["lines"]:
                # unique non empty lines in order of first appearance
                if line and line not in seen:
                    seen.add(line)
                    self.all_text_lines.append(line)

    def _cue_cleaner(self, cue):
        """parse single cue

        returns a dict with the cleaned text lines and, when the cue has a
        timing line, its start and end timestamp.
        """
        cue_dict = {"lines": []}

        for line in cue.split("\n"):
            timing = re.match(self.time_reg, line)
            if timing:
                # anything after the end timestamp is cut off
                start, end = timing.group().split(" --> ")
                cue_dict.update({"start": start, "end": end})
                continue

            clean = re.sub(self.stamp_reg, "", line)
            clean = re.sub(self.tag_reg, "", clean)
            cue_dict["lines"].append(clean)

        return cue_dict

    def _match_text_lines(self):
        """match unique text lines with timestamps

        for the first open text line take the last cue containing it and
        let it start where the first cue containing it starts, then drop
        all lines of that cue from the open lines.
        """
        self.matched = []

        while self.all_text_lines:
            check = self.all_text_lines[0]
            matches = [i for i in self.parsed_cue_list if check in i["lines"]]
            new_cue = matches[-1]
            new_cue["start"] = matches[0]["start"]

            for line in new_cue["lines"]:
                try:
                    self.all_text_lines.remove(line)
                except ValueError:
                    # line is empty or was consumed by an earlier cue
                    print("failed to process:")
                    print(line)

            self.matched.append(new_cue)

    def _add_id(self):
        """add id to matched cues"""
        for idx, cue in enumerate(self.matched, start=1):
            cue["id"] = idx

    def get_subtitle_str(self):
        """stitch cues and return processed new string"""
        parts = [self.header + "\n\n"]

        for cue in self.matched:
            timestamp = f"{cue.get('start')} --> {cue.get('end')}"
            lines = "\n".join(cue.get("lines"))
            parts.append(f"{cue.get('id')}\n{timestamp}\n{lines}\n\n")

        return "".join(parts)

    def create_bulk_import(self, video, source):
        """process matched for es import

        returns the bulk body as string: one action line and one document
        line per matched cue, followed by a closing newline entry.
        """
        channel = video.json_data.get("channel")
        base_document = {
            "youtube_id": video.youtube_id,
            "title": video.json_data.get("title"),
            "subtitle_channel": channel.get("channel_name"),
            "subtitle_channel_id": channel.get("channel_id"),
            # timestamp() instead of the platform specific strftime("%s")
            "subtitle_last_refresh": int(datetime.now().timestamp()),
            "subtitle_lang": self.lang,
            "subtitle_source": source,
        }

        bulk_list = []
        for match in self.matched:
            match_id = match.get("id")
            document_id = f"{video.youtube_id}-{self.lang}-{match_id}"
            action = {"index": {"_index": "ta_subtitle", "_id": document_id}}
            document = {
                **base_document,
                "subtitle_fragment_id": document_id,
                "subtitle_start": match.get("start"),
                "subtitle_end": match.get("end"),
                "subtitle_index": match_id,
                "subtitle_line": " ".join(match.get("lines")),
            }
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(document))

        bulk_list.append("\n")

        return "\n".join(bulk_list)
class YoutubeVideo(YouTubeItem, YoutubeSubtitle): class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
@ -204,10 +348,17 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
try: try:
# when indexing from download task # when indexing from download task
vid_path = self.build_dl_cache_path() vid_path = self.build_dl_cache_path()
except FileNotFoundError: except FileNotFoundError as err:
# when reindexing # when reindexing needs to handle title rename
base = self.app_conf["videos"] channel = os.path.split(self.json_data["media_url"])[0]
vid_path = os.path.join(base, self.json_data["media_url"]) channel_dir = os.path.join(self.app_conf["videos"], channel)
all_files = os.listdir(channel_dir)
for file in all_files:
if self.youtube_id in file:
vid_path = os.path.join(channel_dir, file)
break
else:
raise FileNotFoundError("could not find video file") from err
duration_handler = DurationConverter() duration_handler = DurationConverter()
duration = duration_handler.get_sec(vid_path) duration = duration_handler.get_sec(vid_path)
@ -242,11 +393,18 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
"""delete video file, meta data""" """delete video file, meta data"""
self.get_from_es() self.get_from_es()
video_base = self.app_conf["videos"] video_base = self.app_conf["videos"]
media_url = self.json_data["media_url"] to_del = [self.json_data.get("media_url")]
print(f"{self.youtube_id}: delete {media_url} from file system")
to_delete = os.path.join(video_base, media_url) all_subtitles = self.json_data.get("subtitles")
os.remove(to_delete) if all_subtitles:
to_del = to_del + [i.get("media_url") for i in all_subtitles]
for media_url in to_del:
file_path = os.path.join(video_base, media_url)
os.remove(file_path)
self.del_in_es() self.del_in_es()
self._delete_subtitles()
def _get_ryd_stats(self): def _get_ryd_stats(self):
"""get optional stats from returnyoutubedislikeapi.com""" """get optional stats from returnyoutubedislikeapi.com"""
@ -270,17 +428,17 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
def _check_subtitles(self):
    """look up subtitles for this video, store and download any found"""
    subtitle_handler = YoutubeSubtitle(self)
    found = subtitle_handler.get_subtitles()
    if not found:
        # nothing to add for this video
        return

    self.json_data["subtitles"] = found
    subtitle_handler.download_subtitles(relevant_subtitles=found)
def _delete_subtitles(self):
    """remove all ta_subtitle documents matching this youtube_id"""
    id_filter = {"youtube_id": {"value": self.youtube_id}}
    data = {"query": {"term": id_filter}}
    # the returned pair is unpacked and discarded
    _, _ = ElasticWrap("ta_subtitle/_delete_by_query").post(data=data)
def index_new_video(youtube_id): def index_new_video(youtube_id):
"""combined classes to create new video in index""" """combined classes to create new video in index"""

View File

@ -169,7 +169,11 @@ class DurationConverter:
capture_output=True, capture_output=True,
check=True, check=True,
) )
duration_sec = int(float(duration.stdout.decode().strip())) duration_raw = duration.stdout.decode().strip()
if duration_raw == "N/A":
return 0
duration_sec = int(float(duration_raw))
return duration_sec return duration_sec
@staticmethod @staticmethod

View File

@ -3,10 +3,14 @@
{% load static %} {% load static %}
{% load humanize %} {% load humanize %}
<div class="video-main"> <div class="video-main">
<video <video poster="/cache/{{ video.vid_thumb_url }}" controls preload="false" width="100%" playsinline
src="/media/{{ video.media_url }}" id="video-item" ontimeupdate="onVideoProgress('{{ video.youtube_id }}')" onloadedmetadata="setVideoProgress(0)">
poster="/cache/{{ video.vid_thumb_url }}" controls preload="false" <source src="/media/{{ video.media_url }}" type="video/mp4">
type='video/mp4' width="100%" playsinline id="video-item" ontimeupdate="onVideoProgress('{{ video.youtube_id }}')" onloadedmetadata="setVideoProgress(0)"> {% if video.subtitles %}
{% for subtitle in video.subtitles %}
<track label="{{subtitle.name}}" kind="subtitles" srclang="{{subtitle.lang}}" src="/media/{{subtitle.media_url}}">
{% endfor %}
{% endif %}
</video> </video>
</div> </div>
<div class="boxed-content"> <div class="boxed-content">
@ -57,10 +61,10 @@
</div> </div>
<div class="info-box-item"> <div class="info-box-item">
<div> <div>
<p>Views: {{ video.stats.view_count|intcomma }}</p> <p class="thumb-icon"><img src="{% static 'img/icon-eye.svg' %}" alt="views">: {{ video.stats.view_count|intcomma }}</p>
<p class="thumb-icon like"><img src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-up">: {{ video.stats.like_count|intcomma }}</p> <p class="thumb-icon like"><img src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-up">: {{ video.stats.like_count|intcomma }}</p>
{% if video.stats.dislike_count %} {% if video.stats.dislike_count %}
<p class="thumb-icon dislike"><img src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-down">: {{ video.stats.dislike_count|intcomma }}</p> <p class="thumb-icon"><img class="dislike" src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-down">: {{ video.stats.dislike_count|intcomma }}</p>
{% endif %} {% endif %}
{% if video.stats.average_rating %} {% if video.stats.average_rating %}
<p class="rating-stars">Rating: <p class="rating-stars">Rating:

View File

@ -4,9 +4,9 @@ Django==4.0.2
django-cors-headers==3.11.0 django-cors-headers==3.11.0
djangorestframework==3.13.1 djangorestframework==3.13.1
Pillow==9.0.1 Pillow==9.0.1
redis==4.1.2 redis==4.1.3
requests==2.27.1 requests==2.27.1
ryd-client==0.0.3 ryd-client==0.0.3
uWSGI==2.0.20 uWSGI==2.0.20
whitenoise==5.3.0 whitenoise==6.0.0
yt_dlp==2022.2.4 yt_dlp==2022.2.4