From 91452b511476c82055961643bdda9325e0089426 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Sat, 5 Feb 2022 18:35:02 +0700
Subject: [PATCH 01/21] remove redundant video player api endpoint

---
 tubearchivist/api/README.md |  4 ----
 tubearchivist/api/urls.py   |  6 ------
 tubearchivist/api/views.py  | 32 --------------------------------
 3 files changed, 42 deletions(-)
diff --git a/tubearchivist/api/README.md b/tubearchivist/api/README.md
index 0e74a11..73dd4fc 100644
--- a/tubearchivist/api/README.md
+++ b/tubearchivist/api/README.md
@@ -23,10 +23,6 @@ response = requests.get(url, headers=headers)
 ## Video Item View
 /api/video/\<video_id>/
 
-## Video Player View
-returns all relevant information to create video player
-/api/video/\<video_id>/player
-
 ## Channel List View
 /api/channel/
 
diff --git a/tubearchivist/api/urls.py b/tubearchivist/api/urls.py
index a6c6801..d39dc30 100644
--- a/tubearchivist/api/urls.py
+++ b/tubearchivist/api/urls.py
@@ -6,7 +6,6 @@ from api.views import (
     DownloadApiListView,
     DownloadApiView,
     PlaylistApiView,
-    VideoApiPlayerView,
     VideoApiView,
 )
 from django.urls import path
@@ -17,11 +16,6 @@ urlpatterns = [
         VideoApiView.as_view(),
         name="api-video",
     ),
-    path(
-        "video/<slug:video_id>/player/",
-        VideoApiPlayerView.as_view(),
-        name="api-video-player",
-    ),
     path(
         "channel/",
         ChannelApiListView.as_view(),
diff --git a/tubearchivist/api/views.py b/tubearchivist/api/views.py
index f0923aa..ec75370 100644
--- a/tubearchivist/api/views.py
+++ b/tubearchivist/api/views.py
@@ -92,38 +92,6 @@ class VideoApiView(ApiBaseView):
         return Response(self.response, status=self.status_code)
 
 
-class VideoApiPlayerView(ApiBaseView):
-    """resolves to /api/video/<video_id>/player
-    GET: returns dict of video to build player
-    """
-
-    search_base = "/ta_video/_doc/"
-
-    def get(self, request, video_id):
-        # pylint: disable=unused-argument
-        """get request"""
-        self.config_builder()
-        self.get_document(video_id)
-        player = self.process_response()
-        return Response(player, status=self.status_code)
-
-    def process_response(self):
-        """build all needed vars for player"""
-        vid_data = self.response["data"]
-        youtube_id = vid_data["youtube_id"]
-        vid_thumb_url = ThumbManager().vid_thumb_path(youtube_id)
-        player = {
-            "youtube_id": youtube_id,
-            "media_url": "/media/" + vid_data["media_url"],
-            "vid_thumb_url": "/cache/" + vid_thumb_url,
-            "title": vid_data["title"],
-            "channel_name": vid_data["channel"]["channel_name"],
-            "channel_id": vid_data["channel"]["channel_id"],
-            "is_watched": vid_data["player"]["watched"],
-        }
-        return player
-
-
 class ChannelApiView(ApiBaseView):
     """resolves to /api/channel/<channel_id>/
     GET: returns metadata dict of channel

From 851fbae90045ac35c9ec8760278ec74fe0233d9d Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Sat, 5 Feb 2022 18:42:09 +0700
Subject: [PATCH 02/21] fix video template dislike icon and add watched icon

---
 tubearchivist/home/templates/home/video.html | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tubearchivist/home/templates/home/video.html b/tubearchivist/home/templates/home/video.html
index 1b2e03d..e44aa18 100644
--- a/tubearchivist/home/templates/home/video.html
+++ b/tubearchivist/home/templates/home/video.html
@@ -57,10 +57,10 @@
         </div>
         <div class="info-box-item">
             <div>
-                <p>Views: {{ video.stats.view_count|intcomma }}</p>
+                <p class="thumb-icon"><img src="{% static 'img/icon-eye.svg' %}" alt="views">: {{ video.stats.view_count|intcomma }}</p>
                 <p class="thumb-icon like"><img src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-up">: {{ video.stats.like_count|intcomma }}</p>
                 {% if video.stats.dislike_count %}
-                    <p class="thumb-icon dislike"><img src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-down">: {{ video.stats.dislike_count|intcomma }}</p>
+                    <p class="thumb-icon"><img class="dislike" src="{% static 'img/icon-thumb.svg' %}" alt="thumbs-down">: {{ video.stats.dislike_count|intcomma }}</p>
                 {% endif %}
                 {% if video.stats.average_rating %}
                     <p class="rating-stars">Rating: 

From f5f46349b2404dc629c8e231473e33587c327486 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Sat, 5 Feb 2022 22:38:59 +0700
Subject: [PATCH 03/21] handle rescan name change

---
 tubearchivist/home/src/index/video.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py
index 5b92198..49d0b6b 100644
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@@ -205,9 +205,16 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
             # when indexing from download task
             vid_path = self.build_dl_cache_path()
         except FileNotFoundError:
-            # when reindexing
-            base = self.app_conf["videos"]
-            vid_path = os.path.join(base, self.json_data["media_url"])
+            # when reindexing needs to handle title rename
+            channel = os.path.split(self.json_data["media_url"])[0]
+            channel_dir = os.path.join(self.app_conf["videos"], channel)
+            all_files = os.listdir(channel_dir)
+            for file in all_files:
+                if self.youtube_id in file:
+                    vid_path = os.path.join(channel_dir, file)
+                    break
+            else:
+                raise FileNotFoundError
 
         duration_handler = DurationConverter()
         duration = duration_handler.get_sec(vid_path)

From 2bf9e9683b13dc49558df0302e7b8fbc85f72817 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Sat, 5 Feb 2022 22:51:38 +0700
Subject: [PATCH 04/21] error handling in _normalize_lang to skip livechat and
 ignore missing

---
 tubearchivist/home/src/index/video.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py
index 49d0b6b..897371b 100644
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@@ -71,10 +71,15 @@ class YoutubeSubtitle:
     def _normalize_lang(self):
         """normalize country specific language keys"""
         all_subtitles = self.youtube_meta.get("subtitles")
+        if not all_subtitles:
+            return False
+
         all_keys = list(all_subtitles.keys())
         for key in all_keys:
             lang = key.split("-")[0]
             old = all_subtitles.pop(key)
+            if lang == "live_chat":
+                continue
             all_subtitles[lang] = old
 
         return all_subtitles

From 44af78b7e30d623f486b007cbcf4b88140130d3d Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Sat, 5 Feb 2022 23:09:05 +0700
Subject: [PATCH 05/21] handle NA in ffprobe duration extractor

---
 tubearchivist/home/src/ta/helper.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tubearchivist/home/src/ta/helper.py b/tubearchivist/home/src/ta/helper.py
index 4788636..d577dcd 100644
--- a/tubearchivist/home/src/ta/helper.py
+++ b/tubearchivist/home/src/ta/helper.py
@@ -169,7 +169,11 @@ class DurationConverter:
             capture_output=True,
             check=True,
         )
-        duration_sec = int(float(duration.stdout.decode().strip()))
+        duration_raw = duration.stdout.decode().strip()
+        if duration_raw == "N/A":
+            return 0
+
+        duration_sec = int(float(duration_raw))
         return duration_sec
 
     @staticmethod

From 52013aff3f06382816e90cd19a1c34e4401b014e Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Sat, 5 Feb 2022 23:42:42 +0700
Subject: [PATCH 06/21] fix subtitle download of first video of channel without
 folder

---
 tubearchivist/home/src/index/video.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py
index 897371b..e22e52d 100644
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@@ -113,6 +113,8 @@ class YoutubeSubtitle:
             )
             response = requests.get(subtitle["url"])
             if response.ok:
+                # create folder here for first video of channel
+                os.makedirs(os.path.split(dest_path)[0], exist_ok=True)
                 with open(dest_path, "w", encoding="utf-8") as subfile:
                     subfile.write(response.text)
             else:

From e98ffc00502226c2223339d6e5cbbaf9827bbf53 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Sat, 5 Feb 2022 23:50:47 +0700
Subject: [PATCH 07/21] add subtitles mapping to video index

---
 tubearchivist/home/src/es/index_mapping.json | 26 ++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tubearchivist/home/src/es/index_mapping.json b/tubearchivist/home/src/es/index_mapping.json
index 3272ed3..0ad6494 100644
--- a/tubearchivist/home/src/es/index_mapping.json
+++ b/tubearchivist/home/src/es/index_mapping.json
@@ -156,6 +156,32 @@
                             "normalizer": "to_lower"
                         }
                     }
+                },
+                "subtitles": {
+                    "properties": {
+                        "ext": {
+                            "type": "keyword",
+                            "index": false
+                        },
+                        "lang": {
+                            "type": "keyword",
+                            "index": false
+                        },
+                        "media_url": {
+                            "type": "keyword",
+                            "index": false
+                        },
+                        "name": {
+                            "type": "keyword"
+                        },
+                        "source": {
+                            "type": "keyword"
+                        },
+                        "url": {
+                            "type": "keyword",
+                            "index": false
+                        }
+                    }
                 }
             },
             "expected_set": {

From 1664b0d4fc33243ca92bf6a2787b5bbe1460a1f5 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Sun, 6 Feb 2022 00:08:24 +0700
Subject: [PATCH 08/21] restructure video tag to add subtitle tracks

---
 tubearchivist/home/templates/home/video.html | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tubearchivist/home/templates/home/video.html b/tubearchivist/home/templates/home/video.html
index e44aa18..bec823f 100644
--- a/tubearchivist/home/templates/home/video.html
+++ b/tubearchivist/home/templates/home/video.html
@@ -3,10 +3,14 @@
 {% load static %}
 {% load humanize %}
 <div class="video-main">
-    <video 
-        src="/media/{{ video.media_url }}" 
-        poster="/cache/{{ video.vid_thumb_url }}" controls preload="false" 
-        type='video/mp4' width="100%" playsinline id="video-item" ontimeupdate="onVideoProgress('{{ video.youtube_id }}')" onloadedmetadata="setVideoProgress(0)">
+    <video poster="/cache/{{ video.vid_thumb_url }}" controls preload="false" width="100%" playsinline 
+    id="video-item" ontimeupdate="onVideoProgress('{{ video.youtube_id }}')" onloadedmetadata="setVideoProgress(0)">
+        <source src="/media/{{ video.media_url }}" type="video/mp4">
+        {% if video.subtitles %}
+            {% for subtitle in video.subtitles %}
+                <track label="{{subtitle.name}}" kind="subtitles" srclang="{{subtitle.lang}}" src="/media/{{subtitle.media_url}}">
+            {% endfor %}
+        {% endif %}
     </video>
 </div>
 <div class="boxed-content">

From 5f6158243e018edf5da0c4edc4343609b3d9ee75 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Mon, 7 Feb 2022 21:18:52 +0700
Subject: [PATCH 09/21] auto generated subtitle parser and cleaner

---
 tubearchivist/home/src/index/video.py | 85 +++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py
index e22e52d..ab5c22b 100644
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@@ -5,6 +5,7 @@ functionality:
 """
 
 import os
+import re
 from datetime import datetime
 
 import requests
@@ -121,6 +122,90 @@ class YoutubeSubtitle:
                 print(f"{self.youtube_id}: failed to download subtitle")
 
 
+class SubtitleParser:
+    """parse subtitle str from youtube"""
+
+    time_reg = r"^([0-9]{2}:?){3}\.[0-9]{3} --> ([0-9]{2}:?){3}\.[0-9]{3}"
+    stamp_reg = r"<([0-9]{2}:?){3}\.[0-9]{3}>"
+    tag_reg = r"</?c>"
+
+    def __init__(self, subtitle_str):
+        self.subtitle_str = subtitle_str
+        self.header = False
+        self.parsed_cue_list = False
+        self.all_text_lines = False
+        self.matched = False
+
+    def process(self):
+        """collection to process subtitle string"""
+        self._parse_cues()
+        self._match_text_lines()
+        self._add_id()
+
+    def _parse_cues(self):
+        """split into cues"""
+        all_cues = self.subtitle_str.replace("\n \n", "\n").split("\n\n")
+        self.header = all_cues[0]
+        self.all_text_lines = []
+        self.parsed_cue_list = [self._cue_cleaner(i) for i in all_cues[1:]]
+
+    def _cue_cleaner(self, cue):
+        """parse single cue"""
+        all_lines = cue.split("\n")
+        cue_dict = {"lines": []}
+
+        for line in all_lines:
+            if re.match(r"^([0-9]{2}:?){3}", line):
+                clean = re.search(self.time_reg, line).group()
+                start, end = clean.split(" --> ")
+                cue_dict.update({"start": start, "end": end})
+            else:
+                clean = re.sub(self.stamp_reg, "", line)
+                clean = re.sub(self.tag_reg, "", clean)
+                cue_dict["lines"].append(clean)
+                if clean and clean not in self.all_text_lines:
+                    self.all_text_lines.append(clean)
+
+        return cue_dict
+
+    def _match_text_lines(self):
+        """match unique text lines with timestamps"""
+
+        self.matched = []
+
+        while self.all_text_lines:
+            check = self.all_text_lines[0]
+            matches = [i for i in self.parsed_cue_list if check in i["lines"]]
+            new_cue = matches[-1]
+            new_cue["start"] = matches[0]["start"]
+
+            for line in new_cue["lines"]:
+                try:
+                    self.all_text_lines.remove(line)
+                except ValueError:
+                    print("failed to process:")
+                    print(line)
+
+            self.matched.append(new_cue)
+
+    def _add_id(self):
+        """add id to matched cues"""
+        for idx, _ in enumerate(self.matched):
+            self.matched[idx]["id"] = idx + 1
+
+    def get_subtitle_str(self):
+        """stitch cues and return processed new string"""
+        new_subtitle_str = self.header + "\n\n"
+
+        for cue in self.matched:
+            timestamp = f"{cue.get('start')} --> {cue.get('end')}"
+            lines = "\n".join(cue.get("lines"))
+            cue_text = f"{cue.get('id')}\n{timestamp}\n{lines}\n\n"
+            new_subtitle_str = new_subtitle_str + cue_text
+
+        return new_subtitle_str
+
+
 class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
     """represents a single youtube video"""
 

From 6cb892a811fabea5bec08fb231759aa9c2ddac11 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Wed, 9 Feb 2022 21:33:41 +0700
Subject: [PATCH 10/21] integrate auto generated subtitle cleaner

---
 tubearchivist/home/src/index/video.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py
index ab5c22b..4d0bfcb 100644
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@@ -113,11 +113,17 @@ class YoutubeSubtitle:
                 self.config["application"]["videos"], subtitle["media_url"]
             )
             response = requests.get(subtitle["url"])
+            if subtitle["source"] == "auto":
+                parser = SubtitleParser(response.text)
+                parser.process()
+                subtitle_str_clean = parser.get_subtitle_str()
+            else:
+                subtitle_str_clean = response.text
             if response.ok:
                 # create folder here for first video of channel
                 os.makedirs(os.path.split(dest_path)[0], exist_ok=True)
                 with open(dest_path, "w", encoding="utf-8") as subfile:
-                    subfile.write(response.text)
+                    subfile.write(subtitle_str_clean)
             else:
                 print(f"{self.youtube_id}: failed to download subtitle")
 

From 4e2d0fa46407bc7bf9230d7cb6317e894c62b999 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Wed, 9 Feb 2022 23:38:18 +0700
Subject: [PATCH 11/21] bump es version

---
 docker-compose.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index ce79327..b300949 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -33,7 +33,7 @@ services:
     depends_on:
       - archivist-es
   archivist-es:
-    image: docker.elastic.co/elasticsearch/elasticsearch:7.16.2
+    image: docker.elastic.co/elasticsearch/elasticsearch:7.17.0
     container_name: archivist-es
     restart: always
     environment:
@@ -54,4 +54,4 @@ volumes:
   media:
   cache:
   redis:
-  es:
\ No newline at end of file
+  es:

From 4e4cfe333450c4c9f937508c49464e3d2cb7df33 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Wed, 9 Feb 2022 23:40:15 +0700
Subject: [PATCH 12/21] pass whole video object into YoutubeSubtitle class

---
 tubearchivist/home/src/index/video.py | 39 +++++++++++----------------
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py
index 4d0bfcb..3f39674 100644
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@@ -18,16 +18,13 @@ from ryd_client import ryd_client
 class YoutubeSubtitle:
     """handle video subtitle functionality"""
 
-    def __init__(self, config, youtube_meta, media_url, youtube_id):
-        self.config = config
-        self.youtube_meta = youtube_meta
-        self.media_url = media_url
-        self.youtube_id = youtube_id
+    def __init__(self, video):
+        self.video = video
         self.languages = False
 
     def sub_conf_parse(self):
         """add additional conf values to self"""
-        languages_raw = self.config["downloads"]["subtitle"]
+        languages_raw = self.video.config["downloads"]["subtitle"]
         self.languages = [i.strip() for i in languages_raw.split(",")]
 
     def get_subtitles(self):
@@ -41,7 +38,7 @@ class YoutubeSubtitle:
         if relevant_subtitles:
             return relevant_subtitles
 
-        if self.config["downloads"]["subtitle_source"] == "auto":
+        if self.video.config["downloads"]["subtitle_source"] == "auto":
             relevant_auto = self.get_auto_caption()
             return relevant_auto
 
@@ -49,8 +46,8 @@ class YoutubeSubtitle:
 
     def get_auto_caption(self):
         """get auto_caption subtitles"""
-        print(f"{self.youtube_id}: get auto generated subtitles")
-        all_subtitles = self.youtube_meta.get("automatic_captions")
+        print(f"{self.video.youtube_id}: get auto generated subtitles")
+        all_subtitles = self.video.youtube_meta.get("automatic_captions")
 
         if not all_subtitles:
             return False
@@ -58,7 +55,8 @@ class YoutubeSubtitle:
         relevant_subtitles = []
 
         for lang in self.languages:
-            media_url = self.media_url.replace(".mp4", f"-{lang}.vtt")
+            video_media_url = self.video.json_data["media_url"]
+            media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
             all_formats = all_subtitles.get(lang)
             subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
             subtitle.update(
@@ -71,7 +69,7 @@ class YoutubeSubtitle:
 
     def _normalize_lang(self):
         """normalize country specific language keys"""
-        all_subtitles = self.youtube_meta.get("subtitles")
+        all_subtitles = self.video.youtube_meta.get("subtitles")
         if not all_subtitles:
             return False
 
@@ -87,7 +85,7 @@ class YoutubeSubtitle:
 
     def get_user_subtitles(self):
         """get subtitles uploaded from channel owner"""
-        print(f"{self.youtube_id}: get user uploaded subtitles")
+        print(f"{self.video.youtube_id}: get user uploaded subtitles")
         all_subtitles = self._normalize_lang()
         if not all_subtitles:
             return False
@@ -95,7 +93,8 @@ class YoutubeSubtitle:
         relevant_subtitles = []
 
         for lang in self.languages:
-            media_url = self.media_url.replace(".mp4", f"-{lang}.vtt")
+            video_media_url = self.video.json_data["media_url"]
+            media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
             all_formats = all_subtitles.get(lang)
             subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
             subtitle.update(
@@ -108,10 +107,9 @@ class YoutubeSubtitle:
 
     def download_subtitles(self, relevant_subtitles):
         """download subtitle files to archive"""
+        videos_base = self.video.config["application"]["videos"]
         for subtitle in relevant_subtitles:
-            dest_path = os.path.join(
-                self.config["application"]["videos"], subtitle["media_url"]
-            )
+            dest_path = os.path.join(videos_base, subtitle["media_url"])
             response = requests.get(subtitle["url"])
             if subtitle["source"] == "auto":
                 parser = SubtitleParser(response.text)
@@ -125,7 +123,7 @@ class YoutubeSubtitle:
                 with open(dest_path, "w", encoding="utf-8") as subfile:
                     subfile.write(subtitle_str_clean)
             else:
-                print(f"{self.youtube_id}: failed to download subtitle")
+                print(f"{self.video.youtube_id}: failed to download subtitle")
 
 
 class SubtitleParser:
@@ -375,12 +373,7 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
 
     def _check_subtitles(self):
         """optionally add subtitles"""
-        handler = YoutubeSubtitle(
-            self.config,
-            self.youtube_meta,
-            media_url=self.json_data["media_url"],
-            youtube_id=self.youtube_id,
-        )
+        handler = YoutubeSubtitle(self)
         subtitles = handler.get_subtitles()
         if subtitles:
             self.json_data["subtitles"] = subtitles

From 9f652802ae22bba180407914f19a83374eda92d5 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Thu, 10 Feb 2022 11:47:14 +0700
Subject: [PATCH 13/21] add new mapping for subtitle index

---
 tubearchivist/home/src/es/index_mapping.json | 67 ++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/tubearchivist/home/src/es/index_mapping.json b/tubearchivist/home/src/es/index_mapping.json
index 0ad6494..f30a82d 100644
--- a/tubearchivist/home/src/es/index_mapping.json
+++ b/tubearchivist/home/src/es/index_mapping.json
@@ -303,6 +303,73 @@
                 },
                 "number_of_replicas": "0"
             }
+        },
+        {
+            "index_name": "subtitle",
+            "expected_map": {
+                "youtube_id": {
+                    "type": "keyword"
+                },
+                "title": {
+                    "type": "text",
+                    "fields": {
+                        "keyword": {
+                            "type": "keyword",
+                            "ignore_above": 256,
+                            "normalizer": "to_lower"
+                        }
+                    }
+                },
+                "subtitle_fragment_id": {
+                    "type": "keyword"
+                },
+                "subtitle_channel": {
+                    "type": "text",
+                    "fields": {
+                        "keyword": {
+                            "type": "keyword",
+                            "ignore_above": 256,
+                            "normalizer": "to_lower"
+                        }
+                    }
+                },
+                "subtitle_channel_id": {
+                    "type": "keyword"
+                },
+                "subtitle_start": {
+                    "type": "text"
+                },
+                "subtitle_end": {
+                    "type": "text"
+                },
+                "subtitle_last_refresh": {
+                    "type": "date"
+                },
+                "subtitle_index": {
+                    "type" : "long"
+                },
+                "subtitle_lang": {
+                    "type": "keyword"
+                },
+                "subtitle_source": {
+                    "type": "keyword"
+                },
+                "subtitle_line": {
+                    "type" : "text",
+                    "analyzer": "english"
+                }
+            },
+            "expected_set": {
+                "analysis": {
+                    "normalizer": {
+                        "to_lower": {
+                            "type": "custom",
+                            "filter": ["lowercase"]
+                        }
+                    }
+                },
+                "number_of_replicas": "0"
+            }
         }
     ]
 }
\ No newline at end of file

From a2cae51f48a3ca962e7a3ff6c778d229618ad266 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Thu, 10 Feb 2022 17:02:19 +0700
Subject: [PATCH 14/21] bulk import subtitle lines into es

---
 tubearchivist/home/src/index/video.py | 76 ++++++++++++++++++++++-----
 1 file changed, 63 insertions(+), 13 deletions(-)

diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py
index 3f39674..1385e16 100644
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@@ -4,11 +4,13 @@ functionality:
 - index and update in es
 """
 
+import json
 import os
 import re
 from datetime import datetime
 
 import requests
+from home.src.es.connect import ElasticWrap
 from home.src.index import channel as ta_channel
 from home.src.index.generic import YouTubeItem
 from home.src.ta.helper import DurationConverter, clean_string
@@ -110,20 +112,31 @@ class YoutubeSubtitle:
         videos_base = self.video.config["application"]["videos"]
         for subtitle in relevant_subtitles:
             dest_path = os.path.join(videos_base, subtitle["media_url"])
+            source = subtitle["media_url"]
             response = requests.get(subtitle["url"])
-            if subtitle["source"] == "auto":
-                parser = SubtitleParser(response.text)
-                parser.process()
-                subtitle_str_clean = parser.get_subtitle_str()
-            else:
-                subtitle_str_clean = response.text
-            if response.ok:
-                # create folder here for first video of channel
-                os.makedirs(os.path.split(dest_path)[0], exist_ok=True)
-                with open(dest_path, "w", encoding="utf-8") as subfile:
-                    subfile.write(subtitle_str_clean)
-            else:
+            if not response.ok:
                 print(f"{self.video.youtube_id}: failed to download subtitle")
+                continue
+
+            parser = SubtitleParser(response.text, subtitle.get("lang"))
+            parser.process()
+            subtitle_str = parser.get_subtitle_str()
+            self._write_subtitle_file(dest_path, subtitle_str)
+            query_str = parser.create_bulk_import(self.video, source)
+            self._index_subtitle(query_str)
+
+    @staticmethod
+    def _write_subtitle_file(dest_path, subtitle_str):
+        """write subtitle file to disk"""
+        # create folder here for first video of channel
+        os.makedirs(os.path.split(dest_path)[0], exist_ok=True)
+        with open(dest_path, "w", encoding="utf-8") as subfile:
+            subfile.write(subtitle_str)
+
+    @staticmethod
+    def _index_subtitle(query_str):
+        """send subtitle to es for indexing"""
+        _, _ = ElasticWrap("_bulk").post(data=query_str, ndjson=True)
 
 
 class SubtitleParser:
@@ -133,8 +146,9 @@ class SubtitleParser:
     stamp_reg = r"<([0-9]{2}:?){3}\.[0-9]{3}>"
     tag_reg = r"</?c>"
 
-    def __init__(self, subtitle_str):
+    def __init__(self, subtitle_str, lang):
         self.subtitle_str = subtitle_str
+        self.lang = lang
         self.header = False
         self.parsed_cue_list = False
         self.all_text_lines = False
@@ -209,6 +223,42 @@ class SubtitleParser:
 
         return new_subtitle_str
 
+    def create_bulk_import(self, video, source):
+        """process matched for es import"""
+        bulk_list = []
+        channel = video.json_data.get("channel")
+
+        document = {
+            "youtube_id": video.youtube_id,
+            "title": video.json_data.get("title"),
+            "subtitle_channel": channel.get("channel_name"),
+            "subtitle_channel_id": channel.get("channel_id"),
+            "subtitle_last_refresh": int(datetime.now().strftime("%s")),
+            "subtitle_lang": self.lang,
+            "subtitle_source": source,
+        }
+
+        for match in self.matched:
+            match_id = match.get("id")
+            document_id = f"{video.youtube_id}-{self.lang}-{match_id}"
+            action = {"index": {"_index": "ta_subtitle", "_id": document_id}}
+            document.update(
+                {
+                    "subtitle_fragment_id": document_id,
+                    "subtitle_start": match.get("start"),
+                    "subtitle_end": match.get("end"),
+                    "subtitle_index": match_id,
+                    "subtitle_line": " ".join(match.get("lines")),
+                }
+            )
+            bulk_list.append(json.dumps(action))
+            bulk_list.append(json.dumps(document))
+
+        bulk_list.append("\n")
+        query_str = "\n".join(bulk_list)
+
+        return query_str
+
 
 class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
     """represents a single youtube video"""

From 0414df0de087d027baa72c354ad53412e49fef45 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Thu, 10 Feb 2022 17:10:30 +0700
Subject: [PATCH 15/21] fix key error for subtitle source

---
 tubearchivist/home/src/index/video.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py
index 1385e16..4ac4461 100644
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@@ -112,7 +112,7 @@ class YoutubeSubtitle:
         videos_base = self.video.config["application"]["videos"]
         for subtitle in relevant_subtitles:
             dest_path = os.path.join(videos_base, subtitle["media_url"])
-            source = subtitle["media_url"]
+            source = subtitle["source"]
             response = requests.get(subtitle["url"])
             if not response.ok:
                 print(f"{self.video.youtube_id}: failed to download subtitle")

From b071612038df25cf963d3c4e542e6e578cb18d61 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Thu, 10 Feb 2022 17:34:21 +0700
Subject: [PATCH 16/21] better error raising for add player info

---
 tubearchivist/home/src/index/video.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py
index 4ac4461..c9c57ce 100644
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@@ -350,7 +350,7 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
         try:
             # when indexing from download task
             vid_path = self.build_dl_cache_path()
-        except FileNotFoundError:
+        except FileNotFoundError as err:
             # when reindexing needs to handle title rename
             channel = os.path.split(self.json_data["media_url"])[0]
             channel_dir = os.path.join(self.app_conf["videos"], channel)
@@ -360,7 +360,7 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
                     vid_path = os.path.join(channel_dir, file)
                     break
             else:
-                raise FileNotFoundError
+                raise FileNotFoundError("could not find video file") from err
 
         duration_handler = DurationConverter()
         duration = duration_handler.get_sec(vid_path)

From 077692987bdbcd48bc71b6cb0d586c8de174becb Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Thu, 10 Feb 2022 18:32:23 +0700
Subject: [PATCH 17/21] fix multi language subtitle extractor, and better regex
 for timestamp matching

---
 tubearchivist/home/src/index/video.py | 72 +++++++++++++--------------
 1 file changed, 35 insertions(+), 37 deletions(-)

diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py
index c9c57ce..ca2c2c7 100644
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@@ -36,38 +36,37 @@ class YoutubeSubtitle:
             # no subtitles
             return False
 
-        relevant_subtitles = self.get_user_subtitles()
-        if relevant_subtitles:
-            return relevant_subtitles
+        relevant_subtitles = []
+        for lang in self.languages:
+            user_sub = self.get_user_subtitles(lang)
+            if user_sub:
+                relevant_subtitles.append(user_sub)
+                continue
 
-        if self.video.config["downloads"]["subtitle_source"] == "auto":
-            relevant_auto = self.get_auto_caption()
-            return relevant_auto
+            if self.video.config["downloads"]["subtitle_source"] == "auto":
+                auto_cap = self.get_auto_caption(lang)
+                if auto_cap:
+                    relevant_subtitles.append(auto_cap)
 
-        return False
+        return relevant_subtitles
 
-    def get_auto_caption(self):
+    def get_auto_caption(self, lang):
         """get auto_caption subtitles"""
-        print(f"{self.video.youtube_id}: get auto generated subtitles")
+        print(f"{self.video.youtube_id}-{lang}: get auto generated subtitles")
         all_subtitles = self.video.youtube_meta.get("automatic_captions")
 
         if not all_subtitles:
             return False
 
-        relevant_subtitles = []
+        video_media_url = self.video.json_data["media_url"]
+        media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
+        all_formats = all_subtitles.get(lang)
+        subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
+        subtitle.update(
+            {"lang": lang, "source": "auto", "media_url": media_url}
+        )
 
-        for lang in self.languages:
-            video_media_url = self.video.json_data["media_url"]
-            media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
-            all_formats = all_subtitles.get(lang)
-            subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
-            subtitle.update(
-                {"lang": lang, "source": "auto", "media_url": media_url}
-            )
-            relevant_subtitles.append(subtitle)
-            break
-
-        return relevant_subtitles
+        return subtitle
 
     def _normalize_lang(self):
         """normalize country specific language keys"""
@@ -85,27 +84,26 @@ class YoutubeSubtitle:
 
         return all_subtitles
 
-    def get_user_subtitles(self):
+    def get_user_subtitles(self, lang):
         """get subtitles uploaded from channel owner"""
-        print(f"{self.video.youtube_id}: get user uploaded subtitles")
+        print(f"{self.video.youtube_id}-{lang}: get user uploaded subtitles")
         all_subtitles = self._normalize_lang()
         if not all_subtitles:
             return False
 
-        relevant_subtitles = []
+        video_media_url = self.video.json_data["media_url"]
+        media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
+        all_formats = all_subtitles.get(lang)
+        if not all_formats:
+            # no user subtitles found
+            return False
 
-        for lang in self.languages:
-            video_media_url = self.video.json_data["media_url"]
-            media_url = video_media_url.replace(".mp4", f"-{lang}.vtt")
-            all_formats = all_subtitles.get(lang)
-            subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
-            subtitle.update(
-                {"lang": lang, "source": "user", "media_url": media_url}
-            )
-            relevant_subtitles.append(subtitle)
-            break
+        subtitle = [i for i in all_formats if i["ext"] == "vtt"][0]
+        subtitle.update(
+            {"lang": lang, "source": "user", "media_url": media_url}
+        )
 
-        return relevant_subtitles
+        return subtitle
 
     def download_subtitles(self, relevant_subtitles):
         """download subtitle files to archive"""
@@ -173,7 +171,7 @@ class SubtitleParser:
         cue_dict = {"lines": []}
 
         for line in all_lines:
-            if re.match(r"^([0-9]{2}:?){3}", line):
+            if re.match(self.time_reg, line):
                 clean = re.search(self.time_reg, line).group()
                 start, end = clean.split(" --> ")
                 cue_dict.update({"start": start, "end": end})

From 0e56efc428e3783433071e3b33217f89b09b6966 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Thu, 10 Feb 2022 18:48:35 +0700
Subject: [PATCH 18/21] limit filesystem scan to mp4 files only

---
 tubearchivist/home/src/index/filesystem.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tubearchivist/home/src/index/filesystem.py b/tubearchivist/home/src/index/filesystem.py
index 5a33501..0354e83 100644
--- a/tubearchivist/home/src/index/filesystem.py
+++ b/tubearchivist/home/src/index/filesystem.py
@@ -46,8 +46,9 @@ class FilesystemScanner:
         all_downloaded = []
         for channel_name in all_channels:
             channel_path = os.path.join(self.VIDEOS, channel_name)
-            videos = os.listdir(channel_path)
-            all_videos = ignore_filelist(videos)
+            channel_files = os.listdir(channel_path)
+            channel_files_clean = ignore_filelist(channel_files)
+            all_videos = [i for i in channel_files_clean if i.endswith(".mp4")]
             for video in all_videos:
                 youtube_id = video[9:20]
                 all_downloaded.append((channel_name, video, youtube_id))

From 4d30bed3ccc92a5dfc123480cacedead55ab7ad2 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Thu, 10 Feb 2022 19:09:07 +0700
Subject: [PATCH 19/21] extend delete video to also delete subtitles

---
 tubearchivist/home/src/index/video.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py
index ca2c2c7..e2695d4 100644
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@@ -393,11 +393,18 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
         """delete video file, meta data"""
         self.get_from_es()
         video_base = self.app_conf["videos"]
-        media_url = self.json_data["media_url"]
-        print(f"{self.youtube_id}: delete {media_url} from file system")
-        to_delete = os.path.join(video_base, media_url)
-        os.remove(to_delete)
+        to_del = [self.json_data.get("media_url")]
+
+        all_subtitles = self.json_data.get("subtitles")
+        if all_subtitles:
+            to_del = to_del + [i.get("media_url") for i in all_subtitles]
+
+        for media_url in to_del:
+            file_path = os.path.join(video_base, media_url)
+            os.remove(file_path)
+
         self.del_in_es()
+        self._delete_subtitles()
 
     def _get_ryd_stats(self):
         """get optional stats from returnyoutubedislikeapi.com"""
@@ -427,6 +434,11 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
             self.json_data["subtitles"] = subtitles
             handler.download_subtitles(relevant_subtitles=subtitles)
 
+    def _delete_subtitles(self):
+        """delete all ta_subtitle documents matching this youtube_id"""
+        data = {"query": {"term": {"youtube_id": {"value": self.youtube_id}}}}
+        _, _ = ElasticWrap("ta_subtitle/_delete_by_query").post(data=data)
+
 
 def index_new_video(youtube_id):
     """combined classes to create new video in index"""

From 3ea5e9c53708b50aef48ff1008d55efea0aee5b0 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Thu, 10 Feb 2022 19:27:05 +0700
Subject: [PATCH 20/21] bump dependencies

---
 tubearchivist/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tubearchivist/requirements.txt b/tubearchivist/requirements.txt
index 815d2c0..fc21392 100644
--- a/tubearchivist/requirements.txt
+++ b/tubearchivist/requirements.txt
@@ -4,9 +4,9 @@ Django==4.0.2
 django-cors-headers==3.11.0
 djangorestframework==3.13.1
 Pillow==9.0.1
-redis==4.1.2
+redis==4.1.3
 requests==2.27.1
 ryd-client==0.0.3
 uWSGI==2.0.20
-whitenoise==5.3.0
+whitenoise==6.0.0
 yt_dlp==2022.2.4

From 16f33feda0dd1c65f73186d0fee2eb56795f0761 Mon Sep 17 00:00:00 2001
From: simon <simobilleter@gmail.com>
Date: Thu, 10 Feb 2022 19:45:22 +0700
Subject: [PATCH 21/21] process subtitle media url paths

---
 tubearchivist/api/views.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tubearchivist/api/views.py b/tubearchivist/api/views.py
index ec75370..620c063 100644
--- a/tubearchivist/api/views.py
+++ b/tubearchivist/api/views.py
@@ -60,6 +60,12 @@ class ApiBaseView(APIView):
             cache_dir = self.default_conf["application"]["cache_dir"]
             new_thumb = f"{cache_dir}/{vid_thumb_url}"
             self.response["data"]["vid_thumb_url"] = new_thumb
+        if "subtitles" in all_keys:
+            all_subtitles = self.response["data"]["subtitles"]
+            for idx, _ in enumerate(all_subtitles):
+                url = self.response["data"]["subtitles"][idx]["media_url"]
+                new_url = f"/media/{url}"
+                self.response["data"]["subtitles"][idx]["media_url"] = new_url
 
     def get_paginate(self):
         """add pagination detail to response"""