add subtitle functionality, #build

Changes:
- merges new subtitle download and index functionality
- merges player improvements and api integrations from @n8detar
- merges fix for non-ASCII channel names
- merges fix for pagination error with 10k+ videos
This commit is contained in:
simon 2022-02-10 19:48:39 +07:00
commit 3efa388b5a
No known key found for this signature in database
GPG Key ID: 2C15AA5E89985DD4
10 changed files with 342 additions and 118 deletions

View File

@ -33,7 +33,7 @@ services:
depends_on:
- archivist-es
archivist-es:
image: docker.elastic.co/elasticsearch/elasticsearch:7.16.2
image: docker.elastic.co/elasticsearch/elasticsearch:7.17.0
container_name: archivist-es
restart: always
environment:
@ -54,4 +54,4 @@ volumes:
media:
cache:
redis:
es:
es:

View File

@ -23,10 +23,6 @@ response = requests.get(url, headers=headers)
## Video Item View
/api/video/\<video_id>/
## Video Player View
returns all relevant information to create video player
/api/video/\<video_id>/player
## Channel List View
/api/channel/

View File

@ -6,7 +6,6 @@ from api.views import (
DownloadApiListView,
DownloadApiView,
PlaylistApiView,
VideoApiPlayerView,
VideoApiView,
)
from django.urls import path
@ -17,11 +16,6 @@ urlpatterns = [
VideoApiView.as_view(),
name="api-video",
),
path(
"video/<slug:video_id>/player/",
VideoApiPlayerView.as_view(),
name="api-video-player",
),
path(
"channel/",
ChannelApiListView.as_view(),

View File

@ -60,6 +60,12 @@ class ApiBaseView(APIView):
cache_dir = self.default_conf["application"]["cache_dir"]
new_thumb = f"{cache_dir}/{vid_thumb_url}"
self.response["data"]["vid_thumb_url"] = new_thumb
if "subtitles" in all_keys:
all_subtitles = self.response["data"]["subtitles"]
for idx, _ in enumerate(all_subtitles):
url = self.response["data"]["subtitles"][idx]["media_url"]
new_url = f"/media/{url}"
self.response["data"]["subtitles"][idx]["media_url"] = new_url
def get_paginate(self):
"""add pagination detail to response"""
@ -92,38 +98,6 @@ class VideoApiView(ApiBaseView):
return Response(self.response, status=self.status_code)
class VideoApiPlayerView(ApiBaseView):
    """resolves to /api/video/<video_id>/player
    GET: returns dict of video to build player
    """

    search_base = "/ta_video/_doc/"

    def get(self, request, video_id):
        # pylint: disable=unused-argument
        """handle GET: look up the video document and return the player dict"""
        self.config_builder()
        self.get_document(video_id)
        return Response(self.process_response(), status=self.status_code)

    def process_response(self):
        """assemble the minimal field set the frontend player needs"""
        data = self.response["data"]
        video_id = data["youtube_id"]
        thumb_path = ThumbManager().vid_thumb_path(video_id)
        channel = data["channel"]
        return {
            "youtube_id": video_id,
            "media_url": "/media/" + data["media_url"],
            "vid_thumb_url": "/cache/" + thumb_path,
            "title": data["title"],
            "channel_name": channel["channel_name"],
            "channel_id": channel["channel_id"],
            "is_watched": data["player"]["watched"],
        }
class ChannelApiView(ApiBaseView):
"""resolves to /api/channel/<channel_id>/
GET: returns metadata dict of channel

View File

@ -156,6 +156,32 @@
"normalizer": "to_lower"
}
}
},
"subtitles": {
"properties": {
"ext": {
"type": "keyword",
"index": false
},
"lang": {
"type": "keyword",
"index": false
},
"media_url": {
"type": "keyword",
"index": false
},
"name": {
"type": "keyword"
},
"source": {
"type": "keyword"
},
"url": {
"type": "keyword",
"index": false
}
}
}
},
"expected_set": {
@ -277,6 +303,73 @@
},
"number_of_replicas": "0"
}
},
{
"index_name": "subtitle",
"expected_map": {
"youtube_id": {
"type": "keyword"
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower"
}
}
},
"subtitle_fragment_id": {
"type": "keyword"
},
"subtitle_channel": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower"
}
}
},
"subtitle_channel_id": {
"type": "keyword"
},
"subtitle_start": {
"type": "text"
},
"subtitle_end": {
"type": "text"
},
"subtitle_last_refresh": {
"type": "date"
},
"subtitle_index": {
"type" : "long"
},
"subtitle_lang": {
"type": "keyword"
},
"subtitle_source": {
"type": "keyword"
},
"subtitle_line": {
"type" : "text",
"analyzer": "english"
}
},
"expected_set": {
"analysis": {
"normalizer": {
"to_lower": {
"type": "custom",
"filter": ["lowercase"]
}
}
},
"number_of_replicas": "0"
}
}
]
}

View File

@ -46,8 +46,9 @@ class FilesystemScanner:
all_downloaded = []
for channel_name in all_channels:
channel_path = os.path.join(self.VIDEOS, channel_name)
videos = os.listdir(channel_path)
all_videos = ignore_filelist(videos)
channel_files = os.listdir(channel_path)
channel_files_clean = ignore_filelist(channel_files)
all_videos = [i for i in channel_files_clean if i.endswith(".mp4")]
for video in all_videos:
youtube_id = video[9:20]
all_downloaded.append((channel_name, video, youtube_id))

View File

@ -4,10 +4,13 @@ functionality:
- index and update in es
"""
import json
import os
import re
from datetime import datetime
import requests
from home.src.es.connect import ElasticWrap
from home.src.index import channel as ta_channel
from home.src.index.generic import YouTubeItem
from home.src.ta.helper import DurationConverter, clean_string
@ -17,16 +20,13 @@ from ryd_client import ryd_client
class YoutubeSubtitle:
"""handle video subtitle functionality"""
def __init__(self, config, youtube_meta, media_url, youtube_id):
self.config = config
self.youtube_meta = youtube_meta
self.media_url = media_url
self.youtube_id = youtube_id
def __init__(self, video):
self.video = video
self.languages = False
def sub_conf_parse(self):
"""add additional conf values to self"""
languages_raw = self.config["downloads"]["subtitle"]
languages_raw = self.video.config["downloads"]["subtitle"]
self.languages = [i.strip() for i in languages_raw.split(",")]
def get_subtitles(self):
@ -36,82 +36,226 @@ class YoutubeSubtitle:
# no subtitles
return False
relevant_subtitles = self.get_user_subtitles()
if relevant_subtitles:
return relevant_subtitles
relevant_subtitles = []
for lang in self.languages:
user_sub = self.get_user_subtitles(lang)
if user_sub:
relevant_subtitles.append(user_sub)
continue
if self.config["downloads"]["subtitle_source"] == "auto":
relevant_auto = self.get_auto_caption()
return relevant_auto
if self.video.config["downloads"]["subtitle_source"] == "auto":
auto_cap = self.get_auto_caption(lang)
if auto_cap:
relevant_subtitles.append(auto_cap)
return False
return relevant_subtitles
def get_auto_caption(self):
def get_auto_caption(self, lang):
"""get auto_caption subtitles"""
print(f"{self.youtube_id}: get auto generated subtitles")
all_subtitles = self.youtube_meta.get("automatic_captions")
print(f"{self.video.youtube_id}-{lang}: get auto generated subtitles")
all_subtitles = self.video.youtube_meta.get("automatic_captions")
if not all_subtitles:
return False
relevant_subtitles = []
video_media_url = self.video.json_data["media_url"]
media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
all_formats = all_subtitles.get(lang)
subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
subtitle.update(
{"lang": lang, "source": "auto", "media_url": media_url}
)
for lang in self.languages:
media_url = self.media_url.replace(".mp4", f"-{lang}.vtt")
all_formats = all_subtitles.get(lang)
subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
subtitle.update(
{"lang": lang, "source": "auto", "media_url": media_url}
)
relevant_subtitles.append(subtitle)
break
return relevant_subtitles
return subtitle
def _normalize_lang(self):
"""normalize country specific language keys"""
all_subtitles = self.youtube_meta.get("subtitles")
all_subtitles = self.video.youtube_meta.get("subtitles")
if not all_subtitles:
return False
all_keys = list(all_subtitles.keys())
for key in all_keys:
lang = key.split("-")[0]
old = all_subtitles.pop(key)
if lang == "live_chat":
continue
all_subtitles[lang] = old
return all_subtitles
def get_user_subtitles(self):
def get_user_subtitles(self, lang):
"""get subtitles uploaded from channel owner"""
print(f"{self.youtube_id}: get user uploaded subtitles")
print(f"{self.video.youtube_id}-{lang}: get user uploaded subtitles")
all_subtitles = self._normalize_lang()
if not all_subtitles:
return False
relevant_subtitles = []
video_media_url = self.video.json_data["media_url"]
media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
all_formats = all_subtitles.get(lang)
if not all_formats:
# no user subtitles found
return False
for lang in self.languages:
media_url = self.media_url.replace(".mp4", f"-{lang}.vtt")
all_formats = all_subtitles.get(lang)
subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
subtitle.update(
{"lang": lang, "source": "user", "media_url": media_url}
)
relevant_subtitles.append(subtitle)
break
subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
subtitle.update(
{"lang": lang, "source": "user", "media_url": media_url}
)
return relevant_subtitles
return subtitle
def download_subtitles(self, relevant_subtitles):
"""download subtitle files to archive"""
videos_base = self.video.config["application"]["videos"]
for subtitle in relevant_subtitles:
dest_path = os.path.join(
self.config["application"]["videos"], subtitle["media_url"]
)
dest_path = os.path.join(videos_base, subtitle["media_url"])
source = subtitle["source"]
response = requests.get(subtitle["url"])
if response.ok:
with open(dest_path, "w", encoding="utf-8") as subfile:
subfile.write(response.text)
if not response.ok:
print(f"{self.video.youtube_id}: failed to download subtitle")
continue
parser = SubtitleParser(response.text, subtitle.get("lang"))
parser.process()
subtitle_str = parser.get_subtitle_str()
self._write_subtitle_file(dest_path, subtitle_str)
query_str = parser.create_bulk_import(self.video, source)
self._index_subtitle(query_str)
@staticmethod
def _write_subtitle_file(dest_path, subtitle_str):
"""write subtitle file to disk"""
# create folder here for first video of channel
os.makedirs(os.path.split(dest_path)[0], exist_ok=True)
with open(dest_path, "w", encoding="utf-8") as subfile:
subfile.write(subtitle_str)
@staticmethod
def _index_subtitle(query_str):
"""send subtitle to es for indexing"""
_, _ = ElasticWrap("_bulk").post(data=query_str, ndjson=True)
class SubtitleParser:
    """Parse a WebVTT subtitle string as returned by YouTube.

    Auto-generated captions repeat each text line across overlapping cues;
    this class deduplicates the lines, merges their timestamps, rebuilds a
    clean VTT string for disk and prepares an Elasticsearch bulk import.
    """

    # cue timing line, e.g. "00:00:01.000 --> 00:00:03.000"
    time_reg = r"^([0-9]{2}:?){3}\.[0-9]{3} --> ([0-9]{2}:?){3}\.[0-9]{3}"
    # inline word-level timestamps, e.g. "<00:00:01.500>"
    stamp_reg = r"<([0-9]{2}:?){3}\.[0-9]{3}>"
    # inline <c>...</c> styling tags
    tag_reg = r"</?c>"

    def __init__(self, subtitle_str, lang):
        self.subtitle_str = subtitle_str
        self.lang = lang
        # populated by process(); False marks "not processed yet"
        self.header = False
        self.parsed_cue_list = False
        self.all_text_lines = False
        self.matched = False

    def process(self):
        """collection to process subtitle string"""
        self._parse_cues()
        self._match_text_lines()
        self._add_id()

    def _parse_cues(self):
        """split raw VTT into header and individual cues"""
        # some exports contain "\n \n" which would break the "\n\n" split
        all_cues = self.subtitle_str.replace("\n \n", "\n").split("\n\n")
        self.header = all_cues[0]
        self.all_text_lines = []
        self.parsed_cue_list = [self._cue_cleaner(i) for i in all_cues[1:]]

    def _cue_cleaner(self, cue):
        """parse a single cue into {"start", "end", "lines"}"""
        # NOTE(review): removed a stray diff-artifact print() here that
        # referenced self.youtube_id, an attribute this class never defines
        cue_dict = {"lines": []}
        for line in cue.split("\n"):
            if re.match(self.time_reg, line):
                timing = re.search(self.time_reg, line).group()
                start, end = timing.split(" --> ")
                cue_dict.update({"start": start, "end": end})
            else:
                clean = re.sub(self.stamp_reg, "", line)
                clean = re.sub(self.tag_reg, "", clean)
                cue_dict["lines"].append(clean)
                if clean and clean not in self.all_text_lines:
                    # collect unique text lines for dedup matching later
                    self.all_text_lines.append(clean)
        return cue_dict

    def _match_text_lines(self):
        """match unique text lines with timestamps"""
        self.matched = []
        while self.all_text_lines:
            check = self.all_text_lines[0]
            matches = [i for i in self.parsed_cue_list if check in i["lines"]]
            # keep the last cue showing the line, but widen its start to the
            # first occurrence so the merged cue spans the full display time
            new_cue = matches[-1]
            new_cue["start"] = matches[0]["start"]
            for line in new_cue["lines"]:
                try:
                    self.all_text_lines.remove(line)
                except ValueError:
                    print("failed to process:")
                    print(line)
            self.matched.append(new_cue)

    def _add_id(self):
        """number matched cues sequentially starting at 1"""
        for idx, _ in enumerate(self.matched):
            self.matched[idx]["id"] = idx + 1

    def get_subtitle_str(self):
        """stitch cues and return processed new string"""
        new_subtitle_str = self.header + "\n\n"
        for cue in self.matched:
            timestamp = f"{cue.get('start')} --> {cue.get('end')}"
            lines = "\n".join(cue.get("lines"))
            cue_text = f"{cue.get('id')}\n{timestamp}\n{lines}\n\n"
            new_subtitle_str = new_subtitle_str + cue_text
        return new_subtitle_str

    def create_bulk_import(self, video, source):
        """build NDJSON action/document pairs for the ES _bulk endpoint

        video: indexed video object providing youtube_id and json_data
        source: origin of the subtitle track, e.g. "user" or "auto"
        """
        bulk_list = []
        channel = video.json_data.get("channel")
        document = {
            "youtube_id": video.youtube_id,
            "title": video.json_data.get("title"),
            "subtitle_channel": channel.get("channel_name"),
            "subtitle_channel_id": channel.get("channel_id"),
            # timestamp() is portable; strftime("%s") is a glibc extension
            # that fails on non-glibc platforms
            "subtitle_last_refresh": int(datetime.now().timestamp()),
            "subtitle_lang": self.lang,
            "subtitle_source": source,
        }
        for match in self.matched:
            match_id = match.get("id")
            document_id = f"{video.youtube_id}-{self.lang}-{match_id}"
            action = {"index": {"_index": "ta_subtitle", "_id": document_id}}
            document.update(
                {
                    "subtitle_fragment_id": document_id,
                    "subtitle_start": match.get("start"),
                    "subtitle_end": match.get("end"),
                    "subtitle_index": match_id,
                    "subtitle_line": " ".join(match.get("lines")),
                }
            )
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(document))
        # the _bulk endpoint requires a trailing newline after the last doc
        bulk_list.append("\n")
        query_str = "\n".join(bulk_list)
        return query_str
class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
@ -204,10 +348,17 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
try:
# when indexing from download task
vid_path = self.build_dl_cache_path()
except FileNotFoundError:
# when reindexing
base = self.app_conf["videos"]
vid_path = os.path.join(base, self.json_data["media_url"])
except FileNotFoundError as err:
# when reindexing needs to handle title rename
channel = os.path.split(self.json_data["media_url"])[0]
channel_dir = os.path.join(self.app_conf["videos"], channel)
all_files = os.listdir(channel_dir)
for file in all_files:
if self.youtube_id in file:
vid_path = os.path.join(channel_dir, file)
break
else:
raise FileNotFoundError("could not find video file") from err
duration_handler = DurationConverter()
duration = duration_handler.get_sec(vid_path)
@ -242,11 +393,18 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
"""delete video file, meta data"""
self.get_from_es()
video_base = self.app_conf["videos"]
media_url = self.json_data["media_url"]
print(f"{self.youtube_id}: delete {media_url} from file system")
to_delete = os.path.join(video_base, media_url)
os.remove(to_delete)
to_del = [self.json_data.get("media_url")]
all_subtitles = self.json_data.get("subtitles")
if all_subtitles:
to_del = to_del + [i.get("media_url") for i in all_subtitles]
for media_url in to_del:
file_path = os.path.join(video_base, media_url)
os.remove(file_path)
self.del_in_es()
self._delete_subtitles()
def _get_ryd_stats(self):
"""get optional stats from returnyoutubedislikeapi.com"""
@ -270,17 +428,17 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
def _check_subtitles(self):
"""optionally add subtitles"""
handler = YoutubeSubtitle(
self.config,
self.youtube_meta,
media_url=self.json_data["media_url"],
youtube_id=self.youtube_id,
)
handler = YoutubeSubtitle(self)
subtitles = handler.get_subtitles()
if subtitles:
self.json_data["subtitles"] = subtitles
handler.download_subtitles(relevant_subtitles=subtitles)
def _delete_subtitles(self):
"""delete indexed subtitles"""
data = {"query": {"term": {"youtube_id": {"value": self.youtube_id}}}}
_, _ = ElasticWrap("ta_subtitle/_delete_by_query").post(data=data)
def index_new_video(youtube_id):
"""combined classes to create new video in index"""

View File

@ -169,7 +169,11 @@ class DurationConverter:
capture_output=True,
check=True,
)
duration_sec = int(float(duration.stdout.decode().strip()))
duration_raw = duration.stdout.decode().strip()
if duration_raw == "N/A":
return 0
duration_sec = int(float(duration_raw))
return duration_sec
@staticmethod

View File

@ -3,10 +3,14 @@
{% load static %}
{% load humanize %}
<div class="video-main">
<video
src="/media/{{ video.media_url }}"
poster="/cache/{{ video.vid_thumb_url }}" controls preload="false"
type='video/mp4' width="100%" playsinline id="video-item" ontimeupdate="onVideoProgress('{{ video.youtube_id }}')" onloadedmetadata="setVideoProgress(0)">
<video poster="/cache/{{ video.vid_thumb_url }}" controls preload="false" width="100%" playsinline
id="video-item" ontimeupdate="onVideoProgress('{{ video.youtube_id }}')" onloadedmetadata="setVideoProgress(0)">
<source src="/media/{{ video.media_url }}" type="video/mp4">
{% if video.subtitles %}
{% for subtitle in video.subtitles %}
<track label="{{subtitle.name}}" kind="subtitles" srclang="{{subtitle.lang}}" src="/media/{{subtitle.media_url}}">
{% endfor %}
{% endif %}
</video>
</div>
<div class="boxed-content">
@ -57,10 +61,10 @@
</div>
<div class="info-box-item">
<div>
<p>Views: {{ video.stats.view_count|intcomma }}</p>
<p class="thumb-icon"><img src="{% static 'img/icon-eye.svg' %}" alt="views">: {{ video.stats.view_count|intcomma }}</p>
<p class="thumb-icon like"><img src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-up">: {{ video.stats.like_count|intcomma }}</p>
{% if video.stats.dislike_count %}
<p class="thumb-icon dislike"><img src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-down">: {{ video.stats.dislike_count|intcomma }}</p>
<p class="thumb-icon"><img class="dislike" src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-down">: {{ video.stats.dislike_count|intcomma }}</p>
{% endif %}
{% if video.stats.average_rating %}
<p class="rating-stars">Rating:

View File

@ -4,9 +4,9 @@ Django==4.0.2
django-cors-headers==3.11.0
djangorestframework==3.13.1
Pillow==9.0.1
redis==4.1.2
redis==4.1.3
requests==2.27.1
ryd-client==0.0.3
uWSGI==2.0.20
whitenoise==5.3.0
whitenoise==6.0.0
yt_dlp==2022.2.4