add subtitle functionality, #build

Changes:
- merges new subtitle download and index functionality
- merges player improvements and API integrations from @n8detar
- merges fix for non-ASCII channel names
- merges fix for pagination error with 10k+ videos
2025-07-18 23:18:14 +00:00 · 2022-02-10 19:48:39 +07:00 · 2022-02-10 19:48:39 +07:00 · 3efa388b5a
commit 3efa388b5a
parent 5b37bd059c 16f33feda0
10 changed files with 342 additions and 118 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -33,7 +33,7 @@ services:
    depends_on:
      - archivist-es
  archivist-es:
-    image: docker.elastic.co/elasticsearch/elasticsearch:7.16.2
+    image: docker.elastic.co/elasticsearch/elasticsearch:7.17.0
    container_name: archivist-es
    restart: always
    environment:
--- a/tubearchivist/api/README.md
+++ b/tubearchivist/api/README.md
@ -23,10 +23,6 @@ response = requests.get(url, headers=headers)
 ## Video Item View
 /api/video/\<video_id>/

-## Video Player View
-returns all relevant information to create video player
-/api/video/\<video_id>/player
-
 ## Channel List View
 /api/channel/

--- a/tubearchivist/api/urls.py
+++ b/tubearchivist/api/urls.py
@ -6,7 +6,6 @@ from api.views import (
    DownloadApiListView,
    DownloadApiView,
    PlaylistApiView,
-    VideoApiPlayerView,
    VideoApiView,
 )
 from django.urls import path
@ -17,11 +16,6 @@ urlpatterns = [
        VideoApiView.as_view(),
        name="api-video",
    ),
-    path(
-        "video/<slug:video_id>/player/",
-        VideoApiPlayerView.as_view(),
-        name="api-video-player",
-    ),
    path(
        "channel/",
        ChannelApiListView.as_view(),
--- a/tubearchivist/api/views.py
+++ b/tubearchivist/api/views.py
@ -60,6 +60,12 @@ class ApiBaseView(APIView):
            cache_dir = self.default_conf["application"]["cache_dir"]
            new_thumb = f"{cache_dir}/{vid_thumb_url}"
            self.response["data"]["vid_thumb_url"] = new_thumb
+        if "subtitles" in all_keys:
+            all_subtitles = self.response["data"]["subtitles"]
+            for idx, _ in enumerate(all_subtitles):
+                url = self.response["data"]["subtitles"][idx]["media_url"]
+                new_url = f"/media/{url}"
+                self.response["data"]["subtitles"][idx]["media_url"] = new_url

    def get_paginate(self):
        """add pagination detail to response"""
@ -92,38 +98,6 @@ class VideoApiView(ApiBaseView):
        return Response(self.response, status=self.status_code)


-class VideoApiPlayerView(ApiBaseView):
-    """resolves to /api/video/<video_id>/player
-    GET: returns dict of video to build player
-    """
-
-    search_base = "/ta_video/_doc/"
-
-    def get(self, request, video_id):
-        # pylint: disable=unused-argument
-        """get request"""
-        self.config_builder()
-        self.get_document(video_id)
-        player = self.process_response()
-        return Response(player, status=self.status_code)
-
-    def process_response(self):
-        """build all needed vars for player"""
-        vid_data = self.response["data"]
-        youtube_id = vid_data["youtube_id"]
-        vid_thumb_url = ThumbManager().vid_thumb_path(youtube_id)
-        player = {
-            "youtube_id": youtube_id,
-            "media_url": "/media/" + vid_data["media_url"],
-            "vid_thumb_url": "/cache/" + vid_thumb_url,
-            "title": vid_data["title"],
-            "channel_name": vid_data["channel"]["channel_name"],
-            "channel_id": vid_data["channel"]["channel_id"],
-            "is_watched": vid_data["player"]["watched"],
-        }
-        return player
-
-
 class ChannelApiView(ApiBaseView):
    """resolves to /api/channel/<channel_id>/
    GET: returns metadata dict of channel
--- a/tubearchivist/home/src/es/index_mapping.json
+++ b/tubearchivist/home/src/es/index_mapping.json
@ -156,6 +156,32 @@
                            "normalizer": "to_lower"
                        }
                    }
+                },
+                "subtitles": {
+                    "properties": {
+                        "ext": {
+                            "type": "keyword",
+                            "index": false
+                        },
+                        "lang": {
+                            "type": "keyword",
+                            "index": false
+                        },
+                        "media_url": {
+                            "type": "keyword",
+                            "index": false
+                        },
+                        "name": {
+                            "type": "keyword"
+                        },
+                        "source": {
+                            "type": "keyword"
+                        },
+                        "url": {
+                            "type": "keyword",
+                            "index": false
+                        }
+                    }
                }
            },
            "expected_set": {
@ -277,6 +303,73 @@
                },
                "number_of_replicas": "0"
            }
+        },
+        {
+            "index_name": "subtitle",
+            "expected_map": {
+                "youtube_id": {
+                    "type": "keyword"
+                },
+                "title": {
+                    "type": "text",
+                    "fields": {
+                        "keyword": {
+                            "type": "keyword",
+                            "ignore_above": 256,
+                            "normalizer": "to_lower"
+                        }
+                    }
+                },
+                "subtitle_fragment_id": {
+                    "type": "keyword"
+                },
+                "subtitle_channel": {
+                    "type": "text",
+                    "fields": {
+                        "keyword": {
+                            "type": "keyword",
+                            "ignore_above": 256,
+                            "normalizer": "to_lower"
+                        }
+                    }
+                },
+                "subtitle_channel_id": {
+                    "type": "keyword"
+                },
+                "subtitle_start": {
+                    "type": "text"
+                },
+                "subtitle_end": {
+                    "type": "text"
+                },
+                "subtitle_last_refresh": {
+                    "type": "date"
+                },
+                "subtitle_index": {
+                    "type" : "long"
+                },
+                "subtitle_lang": {
+                    "type": "keyword"
+                },
+                "subtitle_source": {
+                    "type": "keyword"
+                },
+                "subtitle_line": {
+                    "type" : "text",
+                    "analyzer": "english"
+                }
+            },
+            "expected_set": {
+                "analysis": {
+                    "normalizer": {
+                        "to_lower": {
+                            "type": "custom",
+                            "filter": ["lowercase"]
+                        }
+                    }
+                },
+                "number_of_replicas": "0"
+            }
        }
    ]
 }
--- a/tubearchivist/home/src/index/filesystem.py
+++ b/tubearchivist/home/src/index/filesystem.py
@ -46,8 +46,9 @@ class FilesystemScanner:
        all_downloaded = []
        for channel_name in all_channels:
            channel_path = os.path.join(self.VIDEOS, channel_name)
-            videos = os.listdir(channel_path)
-            all_videos = ignore_filelist(videos)
+            channel_files = os.listdir(channel_path)
+            channel_files_clean = ignore_filelist(channel_files)
+            all_videos = [i for i in channel_files_clean if i.endswith(".mp4")]
            for video in all_videos:
                youtube_id = video[9:20]
                all_downloaded.append((channel_name, video, youtube_id))
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@ -4,10 +4,13 @@ functionality:
 - index and update in es
 """

+import json
 import os
+import re
 from datetime import datetime

 import requests
+from home.src.es.connect import ElasticWrap
 from home.src.index import channel as ta_channel
 from home.src.index.generic import YouTubeItem
 from home.src.ta.helper import DurationConverter, clean_string
@ -17,16 +20,13 @@ from ryd_client import ryd_client
 class YoutubeSubtitle:
    """handle video subtitle functionality"""

-    def __init__(self, config, youtube_meta, media_url, youtube_id):
-        self.config = config
-        self.youtube_meta = youtube_meta
-        self.media_url = media_url
-        self.youtube_id = youtube_id
+    def __init__(self, video):
+        self.video = video
        self.languages = False

    def sub_conf_parse(self):
        """add additional conf values to self"""
-        languages_raw = self.config["downloads"]["subtitle"]
+        languages_raw = self.video.config["downloads"]["subtitle"]
        self.languages = [i.strip() for i in languages_raw.split(",")]

    def get_subtitles(self):
@ -36,82 +36,226 @@ class YoutubeSubtitle:
            # no subtitles
            return False

-        relevant_subtitles = self.get_user_subtitles()
-        if relevant_subtitles:
+        relevant_subtitles = []
+        for lang in self.languages:
+            user_sub = self.get_user_subtitles(lang)
+            if user_sub:
+                relevant_subtitles.append(user_sub)
+                continue
+
+            if self.video.config["downloads"]["subtitle_source"] == "auto":
+                auto_cap = self.get_auto_caption(lang)
+                if auto_cap:
+                    relevant_subtitles.append(auto_cap)
+
        return relevant_subtitles

-        if self.config["downloads"]["subtitle_source"] == "auto":
-            relevant_auto = self.get_auto_caption()
-            return relevant_auto
-
-        return False
-
-    def get_auto_caption(self):
+    def get_auto_caption(self, lang):
        """get auto_caption subtitles"""
-        print(f"{self.youtube_id}: get auto generated subtitles")
-        all_subtitles = self.youtube_meta.get("automatic_captions")
+        print(f"{self.video.youtube_id}-{lang}: get auto generated subtitles")
+        all_subtitles = self.video.youtube_meta.get("automatic_captions")

        if not all_subtitles:
            return False

-        relevant_subtitles = []
-
-        for lang in self.languages:
-            media_url = self.media_url.replace(".mp4", f"-{lang}.vtt")
+        video_media_url = self.video.json_data["media_url"]
+        media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
        all_formats = all_subtitles.get(lang)
        subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
        subtitle.update(
            {"lang": lang, "source": "auto", "media_url": media_url}
        )
-            relevant_subtitles.append(subtitle)
-            break

-        return relevant_subtitles
+        return subtitle

    def _normalize_lang(self):
        """normalize country specific language keys"""
-        all_subtitles = self.youtube_meta.get("subtitles")
+        all_subtitles = self.video.youtube_meta.get("subtitles")
+        if not all_subtitles:
+            return False
+
        all_keys = list(all_subtitles.keys())
        for key in all_keys:
            lang = key.split("-")[0]
            old = all_subtitles.pop(key)
+            if lang == "live_chat":
+                continue
            all_subtitles[lang] = old

        return all_subtitles

-    def get_user_subtitles(self):
+    def get_user_subtitles(self, lang):
        """get subtitles uploaded from channel owner"""
-        print(f"{self.youtube_id}: get user uploaded subtitles")
+        print(f"{self.video.youtube_id}-{lang}: get user uploaded subtitles")
        all_subtitles = self._normalize_lang()
        if not all_subtitles:
            return False

-        relevant_subtitles = []
-
-        for lang in self.languages:
-            media_url = self.media_url.replace(".mp4", f"-{lang}.vtt")
+        video_media_url = self.video.json_data["media_url"]
+        media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
        all_formats = all_subtitles.get(lang)
+        if not all_formats:
+            # no user subtitles found
+            return False
+
        subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
        subtitle.update(
            {"lang": lang, "source": "user", "media_url": media_url}
        )
-            relevant_subtitles.append(subtitle)
-            break

-        return relevant_subtitles
+        return subtitle

    def download_subtitles(self, relevant_subtitles):
        """download subtitle files to archive"""
+        videos_base = self.video.config["application"]["videos"]
        for subtitle in relevant_subtitles:
-            dest_path = os.path.join(
-                self.config["application"]["videos"], subtitle["media_url"]
-            )
+            dest_path = os.path.join(videos_base, subtitle["media_url"])
+            source = subtitle["source"]
            response = requests.get(subtitle["url"])
-            if response.ok:
+            if not response.ok:
+                print(f"{self.video.youtube_id}: failed to download subtitle")
+                continue
+
+            parser = SubtitleParser(response.text, subtitle.get("lang"))
+            parser.process()
+            subtitle_str = parser.get_subtitle_str()
+            self._write_subtitle_file(dest_path, subtitle_str)
+            query_str = parser.create_bulk_import(self.video, source)
+            self._index_subtitle(query_str)
+
+    @staticmethod
+    def _write_subtitle_file(dest_path, subtitle_str):
+        """write subtitle file to disk"""
+        # create folder here for first video of channel
+        os.makedirs(os.path.split(dest_path)[0], exist_ok=True)
        with open(dest_path, "w", encoding="utf-8") as subfile:
-                    subfile.write(response.text)
+            subfile.write(subtitle_str)
+
+    @staticmethod
+    def _index_subtitle(query_str):
+        """send subtitle to es for indexing"""
+        _, _ = ElasticWrap("_bulk").post(data=query_str, ndjson=True)
+
+
+class SubtitleParser:
+    """parse subtitle str from youtube"""
+
+    time_reg = r"^([0-9]{2}:?){3}\.[0-9]{3} --> ([0-9]{2}:?){3}\.[0-9]{3}"
+    stamp_reg = r"<([0-9]{2}:?){3}\.[0-9]{3}>"
+    tag_reg = r"</?c>"
+
+    def __init__(self, subtitle_str, lang):
+        self.subtitle_str = subtitle_str
+        self.lang = lang
+        self.header = False
+        self.parsed_cue_list = False
+        self.all_text_lines = False
+        self.matched = False
+
+    def process(self):
+        """collection to process subtitle string"""
+        self._parse_cues()
+        self._match_text_lines()
+        self._add_id()
+
+    def _parse_cues(self):
+        """split into cues"""
+        all_cues = self.subtitle_str.replace("\n \n", "\n").split("\n\n")
+        self.header = all_cues[0]
+        self.all_text_lines = []
+        self.parsed_cue_list = [self._cue_cleaner(i) for i in all_cues[1:]]
+
+    def _cue_cleaner(self, cue):
+        """parse single cue"""
+        all_lines = cue.split("\n")
+        cue_dict = {"lines": []}
+
+        for line in all_lines:
+            if re.match(self.time_reg, line):
+                clean = re.search(self.time_reg, line).group()
+                start, end = clean.split(" --> ")
+                cue_dict.update({"start": start, "end": end})
            else:
-                print(f"{self.youtube_id}: failed to download subtitle")
+                clean = re.sub(self.stamp_reg, "", line)
+                clean = re.sub(self.tag_reg, "", clean)
+                cue_dict["lines"].append(clean)
+                if clean and clean not in self.all_text_lines:
+                    self.all_text_lines.append(clean)
+
+        return cue_dict
+
+    def _match_text_lines(self):
+        """match unique text lines with timestamps"""
+
+        self.matched = []
+
+        while self.all_text_lines:
+            check = self.all_text_lines[0]
+            matches = [i for i in self.parsed_cue_list if check in i["lines"]]
+            new_cue = matches[-1]
+            new_cue["start"] = matches[0]["start"]
+
+            for line in new_cue["lines"]:
+                try:
+                    self.all_text_lines.remove(line)
+                except ValueError:
+                    print("failed to process:")
+                    print(line)
+
+            self.matched.append(new_cue)
+
+    def _add_id(self):
+        """add id to matched cues"""
+        for idx, _ in enumerate(self.matched):
+            self.matched[idx]["id"] = idx + 1
+
+    def get_subtitle_str(self):
+        """stitch cues and return processed new string"""
+        new_subtitle_str = self.header + "\n\n"
+
+        for cue in self.matched:
+            timestamp = f"{cue.get('start')} --> {cue.get('end')}"
+            lines = "\n".join(cue.get("lines"))
+            cue_text = f"{cue.get('id')}\n{timestamp}\n{lines}\n\n"
+            new_subtitle_str = new_subtitle_str + cue_text
+
+        return new_subtitle_str
+
+    def create_bulk_import(self, video, source):
+        """process matched for es import"""
+        bulk_list = []
+        channel = video.json_data.get("channel")
+
+        document = {
+            "youtube_id": video.youtube_id,
+            "title": video.json_data.get("title"),
+            "subtitle_channel": channel.get("channel_name"),
+            "subtitle_channel_id": channel.get("channel_id"),
+            "subtitle_last_refresh": int(datetime.now().strftime("%s")),
+            "subtitle_lang": self.lang,
+            "subtitle_source": source,
+        }
+
+        for match in self.matched:
+            match_id = match.get("id")
+            document_id = f"{video.youtube_id}-{self.lang}-{match_id}"
+            action = {"index": {"_index": "ta_subtitle", "_id": document_id}}
+            document.update(
+                {
+                    "subtitle_fragment_id": document_id,
+                    "subtitle_start": match.get("start"),
+                    "subtitle_end": match.get("end"),
+                    "subtitle_index": match_id,
+                    "subtitle_line": " ".join(match.get("lines")),
+                }
+            )
+            bulk_list.append(json.dumps(action))
+            bulk_list.append(json.dumps(document))
+
+        bulk_list.append("\n")
+        query_str = "\n".join(bulk_list)
+
+        return query_str


 class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
@ -204,10 +348,17 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
        try:
            # when indexing from download task
            vid_path = self.build_dl_cache_path()
-        except FileNotFoundError:
-            # when reindexing
-            base = self.app_conf["videos"]
-            vid_path = os.path.join(base, self.json_data["media_url"])
+        except FileNotFoundError as err:
+            # when reindexing needs to handle title rename
+            channel = os.path.split(self.json_data["media_url"])[0]
+            channel_dir = os.path.join(self.app_conf["videos"], channel)
+            all_files = os.listdir(channel_dir)
+            for file in all_files:
+                if self.youtube_id in file:
+                    vid_path = os.path.join(channel_dir, file)
+                    break
+            else:
+                raise FileNotFoundError("could not find video file") from err

        duration_handler = DurationConverter()
        duration = duration_handler.get_sec(vid_path)
@ -242,11 +393,18 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
        """delete video file, meta data"""
        self.get_from_es()
        video_base = self.app_conf["videos"]
-        media_url = self.json_data["media_url"]
-        print(f"{self.youtube_id}: delete {media_url} from file system")
-        to_delete = os.path.join(video_base, media_url)
-        os.remove(to_delete)
+        to_del = [self.json_data.get("media_url")]
+
+        all_subtitles = self.json_data.get("subtitles")
+        if all_subtitles:
+            to_del = to_del + [i.get("media_url") for i in all_subtitles]
+
+        for media_url in to_del:
+            file_path = os.path.join(video_base, media_url)
+            os.remove(file_path)
+
        self.del_in_es()
+        self._delete_subtitles()

    def _get_ryd_stats(self):
        """get optional stats from returnyoutubedislikeapi.com"""
@ -270,17 +428,17 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):

    def _check_subtitles(self):
        """optionally add subtitles"""
-        handler = YoutubeSubtitle(
-            self.config,
-            self.youtube_meta,
-            media_url=self.json_data["media_url"],
-            youtube_id=self.youtube_id,
-        )
+        handler = YoutubeSubtitle(self)
        subtitles = handler.get_subtitles()
        if subtitles:
            self.json_data["subtitles"] = subtitles
            handler.download_subtitles(relevant_subtitles=subtitles)

+    def _delete_subtitles(self):
+        """delete indexed subtitles"""
+        data = {"query": {"term": {"youtube_id": {"value": self.youtube_id}}}}
+        _, _ = ElasticWrap("ta_subtitle/_delete_by_query").post(data=data)
+

 def index_new_video(youtube_id):
    """combined classes to create new video in index"""
--- a/tubearchivist/home/src/ta/helper.py
+++ b/tubearchivist/home/src/ta/helper.py
@ -169,7 +169,11 @@ class DurationConverter:
            capture_output=True,
            check=True,
        )
-        duration_sec = int(float(duration.stdout.decode().strip()))
+        duration_raw = duration.stdout.decode().strip()
+        if duration_raw == "N/A":
+            return 0
+
+        duration_sec = int(float(duration_raw))
        return duration_sec

    @staticmethod
--- a/tubearchivist/home/templates/home/video.html
+++ b/tubearchivist/home/templates/home/video.html
@ -3,10 +3,14 @@
 {% load static %}
 {% load humanize %}
 <div class="video-main">
-    <video 
-        src="/media/{{ video.media_url }}" 
-        poster="/cache/{{ video.vid_thumb_url }}" controls preload="false" 
-        type='video/mp4' width="100%" playsinline id="video-item" ontimeupdate="onVideoProgress('{{ video.youtube_id }}')" onloadedmetadata="setVideoProgress(0)">
+    <video poster="/cache/{{ video.vid_thumb_url }}" controls preload="false" width="100%" playsinline 
+    id="video-item" ontimeupdate="onVideoProgress('{{ video.youtube_id }}')" onloadedmetadata="setVideoProgress(0)">
+        <source src="/media/{{ video.media_url }}" type="video/mp4">
+        {% if video.subtitles %}
+            {% for subtitle in video.subtitles %}
+                <track label="{{subtitle.name}}" kind="subtitles" srclang="{{subtitle.lang}}" src="/media/{{subtitle.media_url}}">
+            {% endfor %}
+        {% endif %}
    </video>
 </div>
 <div class="boxed-content">
@ -57,10 +61,10 @@
        </div>
        <div class="info-box-item">
            <div>
-                <p>Views: {{ video.stats.view_count|intcomma }}</p>
+                <p class="thumb-icon"><img src="{% static 'img/icon-eye.svg' %}" alt="views">: {{ video.stats.view_count|intcomma }}</p>
                <p class="thumb-icon like"><img src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-up">: {{ video.stats.like_count|intcomma }}</p>
                {% if video.stats.dislike_count %}
-                    <p class="thumb-icon dislike"><img src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-down">: {{ video.stats.dislike_count|intcomma }}</p>
+                    <p class="thumb-icon"><img class="dislike" src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-down">: {{ video.stats.dislike_count|intcomma }}</p>
                {% endif %}
                {% if video.stats.average_rating %}
                    <p class="rating-stars">Rating: 
--- a/tubearchivist/requirements.txt
+++ b/tubearchivist/requirements.txt
@ -4,9 +4,9 @@ Django==4.0.2
 django-cors-headers==3.11.0
 djangorestframework==3.13.1
 Pillow==9.0.1
-redis==4.1.2
+redis==4.1.3
 requests==2.27.1
 ryd-client==0.0.3
 uWSGI==2.0.20
-whitenoise==5.3.0
+whitenoise==6.0.0
 yt_dlp==2022.2.4