add subtitle functionality, #build

Changes: - merges new subtitle download and index functionality - merges player improvements and API integrations from @n8detar - merges fix for non-ASCII channel names - merges fix for pagination error with 10k+ videos
2025-08-25 00:48:17 +00:00 · 2022-02-10 19:48:39 +07:00 · 2022-02-10 19:48:39 +07:00 · 3efa388b5a
commit 3efa388b5a
parent 5b37bd059c 16f33feda0
10 changed files with 342 additions and 118 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -33,7 +33,7 @@ services:
    depends_on:
      - archivist-es
  archivist-es:
-    image: docker.elastic.co/elasticsearch/elasticsearch:7.16.2
+    image: docker.elastic.co/elasticsearch/elasticsearch:7.17.0
    container_name: archivist-es
    restart: always
    environment:
--- a/tubearchivist/api/README.md
+++ b/tubearchivist/api/README.md
@ -23,10 +23,6 @@ response = requests.get(url, headers=headers)
 ## Video Item View
 /api/video/\<video_id>/
 ## Video Player View
 returns all relevant information to create video player
 /api/video/\<video_id>/player
 ## Channel List View
 /api/channel/
--- a/tubearchivist/api/urls.py
+++ b/tubearchivist/api/urls.py
@ -6,7 +6,6 @@ from api.views import (
    DownloadApiListView,
    DownloadApiView,
    PlaylistApiView,
    VideoApiPlayerView,
    VideoApiView,
 )
 from django.urls import path
@ -17,11 +16,6 @@ urlpatterns = [
        VideoApiView.as_view(),
        name="api-video",
    ),
    path(
        "video/<slug:video_id>/player/",
        VideoApiPlayerView.as_view(),
        name="api-video-player",
    ),
    path(
        "channel/",
        ChannelApiListView.as_view(),
--- a/tubearchivist/api/views.py
+++ b/tubearchivist/api/views.py
@ -60,6 +60,12 @@ class ApiBaseView(APIView):
            cache_dir = self.default_conf["application"]["cache_dir"]
            new_thumb = f"{cache_dir}/{vid_thumb_url}"
            self.response["data"]["vid_thumb_url"] = new_thumb
        if "subtitles" in all_keys:
            all_subtitles = self.response["data"]["subtitles"]
            for idx, _ in enumerate(all_subtitles):
                url = self.response["data"]["subtitles"][idx]["media_url"]
                new_url = f"/media/{url}"
                self.response["data"]["subtitles"][idx]["media_url"] = new_url
    def get_paginate(self):
        """add pagination detail to response"""
@ -92,38 +98,6 @@ class VideoApiView(ApiBaseView):
        return Response(self.response, status=self.status_code)
 class VideoApiPlayerView(ApiBaseView):
    """resolves to /api/video/<video_id>/player
    GET: returns dict of video to build player
    """
    search_base = "/ta_video/_doc/"
    def get(self, request, video_id):
        # pylint: disable=unused-argument
        """get request"""
        self.config_builder()
        self.get_document(video_id)
        player = self.process_response()
        return Response(player, status=self.status_code)
    def process_response(self):
        """build all needed vars for player"""
        vid_data = self.response["data"]
        youtube_id = vid_data["youtube_id"]
        vid_thumb_url = ThumbManager().vid_thumb_path(youtube_id)
        player = {
            "youtube_id": youtube_id,
            "media_url": "/media/" + vid_data["media_url"],
            "vid_thumb_url": "/cache/" + vid_thumb_url,
            "title": vid_data["title"],
            "channel_name": vid_data["channel"]["channel_name"],
            "channel_id": vid_data["channel"]["channel_id"],
            "is_watched": vid_data["player"]["watched"],
        }
        return player
 class ChannelApiView(ApiBaseView):
    """resolves to /api/channel/<channel_id>/
    GET: returns metadata dict of channel
--- a/tubearchivist/home/src/es/index_mapping.json
+++ b/tubearchivist/home/src/es/index_mapping.json
@ -156,6 +156,32 @@
                            "normalizer": "to_lower"
                        }
                    }
                },
                "subtitles": {
                    "properties": {
                        "ext": {
                            "type": "keyword",
                            "index": false
                        },
                        "lang": {
                            "type": "keyword",
                            "index": false
                        },
                        "media_url": {
                            "type": "keyword",
                            "index": false
                        },
                        "name": {
                            "type": "keyword"
                        },
                        "source": {
                            "type": "keyword"
                        },
                        "url": {
                            "type": "keyword",
                            "index": false
                        }
                    }
                }
            },
            "expected_set": {
@ -277,6 +303,73 @@
                },
                "number_of_replicas": "0"
            }
        },
        {
            "index_name": "subtitle",
            "expected_map": {
                "youtube_id": {
                    "type": "keyword"
                },
                "title": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256,
                            "normalizer": "to_lower"
                        }
                    }
                },
                "subtitle_fragment_id": {
                    "type": "keyword"
                },
                "subtitle_channel": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256,
                            "normalizer": "to_lower"
                        }
                    }
                },
                "subtitle_channel_id": {
                    "type": "keyword"
                },
                "subtitle_start": {
                    "type": "text"
                },
                "subtitle_end": {
                    "type": "text"
                },
                "subtitle_last_refresh": {
                    "type": "date"
                },
                "subtitle_index": {
                    "type" : "long"
                },
                "subtitle_lang": {
                    "type": "keyword"
                },
                "subtitle_source": {
                    "type": "keyword"
                },
                "subtitle_line": {
                    "type" : "text",
                    "analyzer": "english"
                }
            },
            "expected_set": {
                "analysis": {
                    "normalizer": {
                        "to_lower": {
                            "type": "custom",
                            "filter": ["lowercase"]
                        }
                    }
                },
                "number_of_replicas": "0"
            }
        }
    ]
 }
--- a/tubearchivist/home/src/index/filesystem.py
+++ b/tubearchivist/home/src/index/filesystem.py
@ -46,8 +46,9 @@ class FilesystemScanner:
        all_downloaded = []
        for channel_name in all_channels:
            channel_path = os.path.join(self.VIDEOS, channel_name)
-            videos = os.listdir(channel_path)
+            channel_files = os.listdir(channel_path)
-            all_videos = ignore_filelist(videos)
+            channel_files_clean = ignore_filelist(channel_files)
            all_videos = [i for i in channel_files_clean if i.endswith(".mp4")]
            for video in all_videos:
                youtube_id = video[9:20]
                all_downloaded.append((channel_name, video, youtube_id))
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@ -4,10 +4,13 @@ functionality:
 - index and update in es
 """
 import json
 import os
 import re
 from datetime import datetime
 import requests
 from home.src.es.connect import ElasticWrap
 from home.src.index import channel as ta_channel
 from home.src.index.generic import YouTubeItem
 from home.src.ta.helper import DurationConverter, clean_string
@ -17,16 +20,13 @@ from ryd_client import ryd_client
 class YoutubeSubtitle:
    """handle video subtitle functionality"""
-    def __init__(self, config, youtube_meta, media_url, youtube_id):
+    def __init__(self, video):
-        self.config = config
+        self.video = video
        self.youtube_meta = youtube_meta
        self.media_url = media_url
        self.youtube_id = youtube_id
        self.languages = False
    def sub_conf_parse(self):
        """add additional conf values to self"""
-        languages_raw = self.config["downloads"]["subtitle"]
+        languages_raw = self.video.config["downloads"]["subtitle"]
        self.languages = [i.strip() for i in languages_raw.split(",")]
    def get_subtitles(self):
@ -36,82 +36,226 @@ class YoutubeSubtitle:
            # no subtitles
            return False
-        relevant_subtitles = self.get_user_subtitles()
+        relevant_subtitles = []
-        if relevant_subtitles:
+        for lang in self.languages:
            user_sub = self.get_user_subtitles(lang)
            if user_sub:
                relevant_subtitles.append(user_sub)
                continue
            if self.video.config["downloads"]["subtitle_source"] == "auto":
                auto_cap = self.get_auto_caption(lang)
                if auto_cap:
                    relevant_subtitles.append(auto_cap)
        return relevant_subtitles
-        if self.config["downloads"]["subtitle_source"] == "auto":
+    def get_auto_caption(self, lang):
            relevant_auto = self.get_auto_caption()
            return relevant_auto
        return False
    def get_auto_caption(self):
        """get auto_caption subtitles"""
-        print(f"{self.youtube_id}: get auto generated subtitles")
+        print(f"{self.video.youtube_id}-{lang}: get auto generated subtitles")
-        all_subtitles = self.youtube_meta.get("automatic_captions")
+        all_subtitles = self.video.youtube_meta.get("automatic_captions")
        if not all_subtitles:
            return False
-        relevant_subtitles = []
+        video_media_url = self.video.json_data["media_url"]
-
+        media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
        for lang in self.languages:
            media_url = self.media_url.replace(".mp4", f"-{lang}.vtt")
        all_formats = all_subtitles.get(lang)
        subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
        subtitle.update(
            {"lang": lang, "source": "auto", "media_url": media_url}
        )
            relevant_subtitles.append(subtitle)
            break
-        return relevant_subtitles
+        return subtitle
    def _normalize_lang(self):
        """normalize country specific language keys"""
-        all_subtitles = self.youtube_meta.get("subtitles")
+        all_subtitles = self.video.youtube_meta.get("subtitles")
        if not all_subtitles:
            return False
        all_keys = list(all_subtitles.keys())
        for key in all_keys:
            lang = key.split("-")[0]
            old = all_subtitles.pop(key)
            if lang == "live_chat":
                continue
            all_subtitles[lang] = old
        return all_subtitles
-    def get_user_subtitles(self):
+    def get_user_subtitles(self, lang):
        """get subtitles uploaded from channel owner"""
-        print(f"{self.youtube_id}: get user uploaded subtitles")
+        print(f"{self.video.youtube_id}-{lang}: get user uploaded subtitles")
        all_subtitles = self._normalize_lang()
        if not all_subtitles:
            return False
-        relevant_subtitles = []
+        video_media_url = self.video.json_data["media_url"]
-
+        media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
        for lang in self.languages:
            media_url = self.media_url.replace(".mp4", f"-{lang}.vtt")
        all_formats = all_subtitles.get(lang)
        if not all_formats:
            # no user subtitles found
            return False
        subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
        subtitle.update(
            {"lang": lang, "source": "user", "media_url": media_url}
        )
            relevant_subtitles.append(subtitle)
            break
-        return relevant_subtitles
+        return subtitle
    def download_subtitles(self, relevant_subtitles):
        """download subtitle files to archive"""
        videos_base = self.video.config["application"]["videos"]
        for subtitle in relevant_subtitles:
-            dest_path = os.path.join(
+            dest_path = os.path.join(videos_base, subtitle["media_url"])
-                self.config["application"]["videos"], subtitle["media_url"]
+            source = subtitle["source"]
            )
            response = requests.get(subtitle["url"])
-            if response.ok:
+            if not response.ok:
                print(f"{self.video.youtube_id}: failed to download subtitle")
                continue
            parser = SubtitleParser(response.text, subtitle.get("lang"))
            parser.process()
            subtitle_str = parser.get_subtitle_str()
            self._write_subtitle_file(dest_path, subtitle_str)
            query_str = parser.create_bulk_import(self.video, source)
            self._index_subtitle(query_str)
    @staticmethod
    def _write_subtitle_file(dest_path, subtitle_str):
        """write subtitle file to disk"""
        # create folder here for first video of channel
        os.makedirs(os.path.split(dest_path)[0], exist_ok=True)
        with open(dest_path, "w", encoding="utf-8") as subfile:
-                    subfile.write(response.text)
+            subfile.write(subtitle_str)
    @staticmethod
    def _index_subtitle(query_str):
        """send subtitle to es for indexing"""
        _, _ = ElasticWrap("_bulk").post(data=query_str, ndjson=True)
 class SubtitleParser:
    """parse subtitle str from youtube"""
    time_reg = r"^([0-9]{2}:?){3}\.[0-9]{3} --> ([0-9]{2}:?){3}\.[0-9]{3}"
    stamp_reg = r"<([0-9]{2}:?){3}\.[0-9]{3}>"
    tag_reg = r"</?c>"
    def __init__(self, subtitle_str, lang):
        self.subtitle_str = subtitle_str
        self.lang = lang
        self.header = False
        self.parsed_cue_list = False
        self.all_text_lines = False
        self.matched = False
    def process(self):
        """collection to process subtitle string"""
        self._parse_cues()
        self._match_text_lines()
        self._add_id()
    def _parse_cues(self):
        """split into cues"""
        all_cues = self.subtitle_str.replace("\n \n", "\n").split("\n\n")
        self.header = all_cues[0]
        self.all_text_lines = []
        self.parsed_cue_list = [self._cue_cleaner(i) for i in all_cues[1:]]
    def _cue_cleaner(self, cue):
        """parse single cue"""
        all_lines = cue.split("\n")
        cue_dict = {"lines": []}
        for line in all_lines:
            if re.match(self.time_reg, line):
                clean = re.search(self.time_reg, line).group()
                start, end = clean.split(" --> ")
                cue_dict.update({"start": start, "end": end})
            else:
-                print(f"{self.youtube_id}: failed to download subtitle")
+                clean = re.sub(self.stamp_reg, "", line)
                clean = re.sub(self.tag_reg, "", clean)
                cue_dict["lines"].append(clean)
                if clean and clean not in self.all_text_lines:
                    self.all_text_lines.append(clean)
        return cue_dict
    def _match_text_lines(self):
        """match unique text lines with timestamps"""
        self.matched = []
        while self.all_text_lines:
            check = self.all_text_lines[0]
            matches = [i for i in self.parsed_cue_list if check in i["lines"]]
            new_cue = matches[-1]
            new_cue["start"] = matches[0]["start"]
            for line in new_cue["lines"]:
                try:
                    self.all_text_lines.remove(line)
                except ValueError:
                    print("failed to process:")
                    print(line)
            self.matched.append(new_cue)
    def _add_id(self):
        """add id to matched cues"""
        for idx, _ in enumerate(self.matched):
            self.matched[idx]["id"] = idx + 1
    def get_subtitle_str(self):
        """stitch cues and return processed new string"""
        new_subtitle_str = self.header + "\n\n"
        for cue in self.matched:
            timestamp = f"{cue.get('start')} --> {cue.get('end')}"
            lines = "\n".join(cue.get("lines"))
            cue_text = f"{cue.get('id')}\n{timestamp}\n{lines}\n\n"
            new_subtitle_str = new_subtitle_str + cue_text
        return new_subtitle_str
    def create_bulk_import(self, video, source):
        """process matched for es import"""
        bulk_list = []
        channel = video.json_data.get("channel")
        document = {
            "youtube_id": video.youtube_id,
            "title": video.json_data.get("title"),
            "subtitle_channel": channel.get("channel_name"),
            "subtitle_channel_id": channel.get("channel_id"),
            "subtitle_last_refresh": int(datetime.now().strftime("%s")),
            "subtitle_lang": self.lang,
            "subtitle_source": source,
        }
        for match in self.matched:
            match_id = match.get("id")
            document_id = f"{video.youtube_id}-{self.lang}-{match_id}"
            action = {"index": {"_index": "ta_subtitle", "_id": document_id}}
            document.update(
                {
                    "subtitle_fragment_id": document_id,
                    "subtitle_start": match.get("start"),
                    "subtitle_end": match.get("end"),
                    "subtitle_index": match_id,
                    "subtitle_line": " ".join(match.get("lines")),
                }
            )
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(document))
        bulk_list.append("\n")
        query_str = "\n".join(bulk_list)
        return query_str
 class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
@ -204,10 +348,17 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
        try:
            # when indexing from download task
            vid_path = self.build_dl_cache_path()
-        except FileNotFoundError:
+        except FileNotFoundError as err:
-            # when reindexing
+            # when reindexing needs to handle title rename
-            base = self.app_conf["videos"]
+            channel = os.path.split(self.json_data["media_url"])[0]
-            vid_path = os.path.join(base, self.json_data["media_url"])
+            channel_dir = os.path.join(self.app_conf["videos"], channel)
            all_files = os.listdir(channel_dir)
            for file in all_files:
                if self.youtube_id in file:
                    vid_path = os.path.join(channel_dir, file)
                    break
            else:
                raise FileNotFoundError("could not find video file") from err
        duration_handler = DurationConverter()
        duration = duration_handler.get_sec(vid_path)
@ -242,11 +393,18 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
        """delete video file, meta data"""
        self.get_from_es()
        video_base = self.app_conf["videos"]
-        media_url = self.json_data["media_url"]
+        to_del = [self.json_data.get("media_url")]
-        print(f"{self.youtube_id}: delete {media_url} from file system")
+
-        to_delete = os.path.join(video_base, media_url)
+        all_subtitles = self.json_data.get("subtitles")
-        os.remove(to_delete)
+        if all_subtitles:
            to_del = to_del + [i.get("media_url") for i in all_subtitles]
        for media_url in to_del:
            file_path = os.path.join(video_base, media_url)
            os.remove(file_path)
        self.del_in_es()
        self._delete_subtitles()
    def _get_ryd_stats(self):
        """get optional stats from returnyoutubedislikeapi.com"""
@ -270,17 +428,17 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
    def _check_subtitles(self):
        """optionally add subtitles"""
-        handler = YoutubeSubtitle(
+        handler = YoutubeSubtitle(self)
            self.config,
            self.youtube_meta,
            media_url=self.json_data["media_url"],
            youtube_id=self.youtube_id,
        )
        subtitles = handler.get_subtitles()
        if subtitles:
            self.json_data["subtitles"] = subtitles
            handler.download_subtitles(relevant_subtitles=subtitles)
    def _delete_subtitles(self):
        """delete indexed subtitles"""
        data = {"query": {"term": {"youtube_id": {"value": self.youtube_id}}}}
        _, _ = ElasticWrap("ta_subtitle/_delete_by_query").post(data=data)
 def index_new_video(youtube_id):
    """combined classes to create new video in index"""
--- a/tubearchivist/home/src/ta/helper.py
+++ b/tubearchivist/home/src/ta/helper.py
@ -169,7 +169,11 @@ class DurationConverter:
            capture_output=True,
            check=True,
        )
-        duration_sec = int(float(duration.stdout.decode().strip()))
+        duration_raw = duration.stdout.decode().strip()
        if duration_raw == "N/A":
            return 0
        duration_sec = int(float(duration_raw))
        return duration_sec
    @staticmethod
--- a/tubearchivist/home/templates/home/video.html
+++ b/tubearchivist/home/templates/home/video.html
@ -3,10 +3,14 @@
 {% load static %}
 {% load humanize %}
 <div class="video-main">
-    <video 
+    <video poster="/cache/{{ video.vid_thumb_url }}" controls preload="false" width="100%" playsinline 
-        src="/media/{{ video.media_url }}" 
+    id="video-item" ontimeupdate="onVideoProgress('{{ video.youtube_id }}')" onloadedmetadata="setVideoProgress(0)">
-        poster="/cache/{{ video.vid_thumb_url }}" controls preload="false" 
+        <source src="/media/{{ video.media_url }}" type="video/mp4">
-        type='video/mp4' width="100%" playsinline id="video-item" ontimeupdate="onVideoProgress('{{ video.youtube_id }}')" onloadedmetadata="setVideoProgress(0)">
+        {% if video.subtitles %}
            {% for subtitle in video.subtitles %}
                <track label="{{subtitle.name}}" kind="subtitles" srclang="{{subtitle.lang}}" src="/media/{{subtitle.media_url}}">
            {% endfor %}
        {% endif %}
    </video>
 </div>
 <div class="boxed-content">
@ -57,10 +61,10 @@
        </div>
        <div class="info-box-item">
            <div>
-                <p>Views: {{ video.stats.view_count|intcomma }}</p>
+                <p class="thumb-icon"><img src="{% static 'img/icon-eye.svg' %}" alt="views">: {{ video.stats.view_count|intcomma }}</p>
                <p class="thumb-icon like"><img src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-up">: {{ video.stats.like_count|intcomma }}</p>
                {% if video.stats.dislike_count %}
-                    <p class="thumb-icon dislike"><img src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-down">: {{ video.stats.dislike_count|intcomma }}</p>
+                    <p class="thumb-icon"><img class="dislike" src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-down">: {{ video.stats.dislike_count|intcomma }}</p>
                {% endif %}
                {% if video.stats.average_rating %}
                    <p class="rating-stars">Rating: 
--- a/tubearchivist/requirements.txt
+++ b/tubearchivist/requirements.txt
@ -4,9 +4,9 @@ Django==4.0.2
 django-cors-headers==3.11.0
 djangorestframework==3.13.1
 Pillow==9.0.1
-redis==4.1.2
+redis==4.1.3
 requests==2.27.1
 ryd-client==0.0.3
 uWSGI==2.0.20
-whitenoise==5.3.0
+whitenoise==6.0.0
 yt_dlp==2022.2.4