Filesystem organization, #build

Changed:
- Changed filesystem to static IDs
- Improved error handling for download process
- Lots of fixes and improvements
commit cd25eadd1c
Simon 2023-07-25 00:08:59 +07:00
28 changed files with 547 additions and 1306 deletions

.gitignore (vendored)

@ -1,5 +1,6 @@
# python testing cache
__pycache__
.venv
# django testing db
db.sqlite3

Dockerfile

@ -3,7 +3,7 @@
# First stage to build python wheel
FROM python:3.10.9-slim-bullseye AS builder
FROM python:3.11.3-slim-bullseye AS builder
ARG TARGETPLATFORM
RUN apt-get update && apt-get install -y --no-install-recommends \
@ -14,7 +14,7 @@ COPY ./tubearchivist/requirements.txt /requirements.txt
RUN pip install --user -r requirements.txt
# build final image
FROM python:3.10.9-slim-bullseye as tubearchivist
FROM python:3.11.3-slim-bullseye as tubearchivist
ARG TARGETPLATFORM
ARG INSTALL_DEBUG

deploy.sh

@ -49,6 +49,7 @@ function sync_test {
--exclude ".gitignore" \
--exclude "**/cache" \
--exclude "**/__pycache__/" \
--exclude ".venv" \
--exclude "db.sqlite3" \
--exclude ".mypy_cache" \
. -e ssh "$host":tubearchivist
@ -87,14 +88,14 @@ function validate {
# note: this logic is duplicated in the `./github/workflows/lint_python.yml` config
# if you update this file, you should update that as well
echo "running black"
black --exclude "migrations/*" --diff --color --check -l 79 "$check_path"
black --force-exclude "migrations/*" --diff --color --check -l 79 "$check_path"
echo "running codespell"
codespell --skip="./.git,./package.json,./package-lock.json,./node_modules,./.mypy_cache" "$check_path"
codespell --skip="./.git,./.venv,./package.json,./package-lock.json,./node_modules,./.mypy_cache" "$check_path"
echo "running flake8"
flake8 "$check_path" --exclude "migrations" --count --max-complexity=10 \
flake8 "$check_path" --exclude "migrations,.venv" --count --max-complexity=10 \
--max-line-length=79 --show-source --statistics
echo "running isort"
isort --skip "migrations" --check-only --diff --profile black -l 79 "$check_path"
isort --skip "migrations" --skip ".venv" --check-only --diff --profile black -l 79 "$check_path"
printf " \n> all validations passed\n"
}

run.sh

@ -14,6 +14,7 @@ fi
python manage.py ta_envcheck
python manage.py ta_connection
python manage.py ta_startup
python manage.py ta_migpath
# start all tasks
nginx &

package-lock.json (generated; 1048 lines changed; diff suppressed because it is too large)

api/views.py

@ -11,7 +11,7 @@ from home.src.index.channel import YoutubeChannel
from home.src.index.generic import Pagination
from home.src.index.reindex import ReindexProgress
from home.src.index.video import SponsorBlock, YoutubeVideo
from home.src.ta.config import AppConfig
from home.src.ta.config import AppConfig, ReleaseVersion
from home.src.ta.ta_redis import RedisArchivist
from home.src.ta.task_manager import TaskCommand, TaskManager
from home.src.ta.urlparser import Parser
@ -189,7 +189,7 @@ class VideoCommentView(ApiBaseView):
class VideoSimilarView(ApiBaseView):
"""resolves to /api/video/<video-id>/similar/
GET: return max 3 videos similar to this
GET: return max 6 videos similar to this
"""
search_base = "ta_video/_search/"
@ -535,7 +535,11 @@ class PingView(ApiBaseView):
@staticmethod
def get(request):
"""get pong"""
data = {"response": "pong", "user": request.user.id}
data = {
"response": "pong",
"user": request.user.id,
"version": ReleaseVersion().get_local_version(),
}
return Response(data)
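
With this change, the ping endpoint doubles as a lightweight version probe for clients. A sketch of the JSON it now returns (user id and version string are illustrative):

# illustrative response from GET /api/ping/ after this change
expected = {
    "response": "pong",
    "user": 1,  # request.user.id of the authenticated user
    "version": "v0.3.7-unstable",  # ReleaseVersion().get_local_version()
}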

home/management/commands/ta_connection.py

@ -86,6 +86,8 @@ class Command(BaseCommand):
continue
if status_code and status_code == 200:
path = "_cluster/health?wait_for_status=yellow&timeout=30s"
_, _ = ElasticWrap(path).get()
self.stdout.write(
self.style.SUCCESS(" ✓ ES connection established")
)
@ -116,7 +118,7 @@ class Command(BaseCommand):
return
message = (
" 🗙 ES connection failed. "
" 🗙 ES version check failed. "
+ f"Expected {self.MIN_MAJOR}.{self.MIN_MINOR} but got {version}"
)
self.stdout.write(self.style.ERROR(f"{message}"))

home/management/commands/ta_migpath.py

@ -0,0 +1,171 @@
"""filepath migration from v0.3.6 to v0.3.7"""
import json
import os
from django.core.management.base import BaseCommand
from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.ta.config import AppConfig
from home.src.ta.helper import ignore_filelist
TOPIC = """
########################
# Filesystem Migration #
########################
"""
class Command(BaseCommand):
"""command framework"""
# pylint: disable=no-member
def handle(self, *args, **options):
"""run commands"""
self.stdout.write(TOPIC)
handler = FolderMigration()
to_migrate = handler.get_to_migrate()
if not to_migrate:
self.stdout.write(
self.style.SUCCESS(" no channel migration needed\n")
)
return
self.stdout.write(self.style.SUCCESS(" migrating channels"))
total_channels = handler.create_folders(to_migrate)
self.stdout.write(
self.style.SUCCESS(f" created {total_channels} channels")
)
self.stdout.write(
self.style.SUCCESS(f" migrating {len(to_migrate)} videos")
)
handler.migrate_videos(to_migrate)
self.stdout.write(self.style.SUCCESS(" update videos in index"))
handler.send_bulk()
self.stdout.write(self.style.SUCCESS(" cleanup old folders"))
handler.delete_old()
self.stdout.write(self.style.SUCCESS(" ✓ migration completed\n"))
class FolderMigration:
"""migrate video archive folder"""
def __init__(self):
self.config = AppConfig().config
self.videos = self.config["application"]["videos"]
self.bulk_list = []
def get_to_migrate(self):
"""get videos to migrate"""
script = (
"doc['media_url'].value == "
+ "doc['channel.channel_id'].value + '/'"
+ " + doc['youtube_id'].value + '.mp4'"
)
data = {
"query": {"bool": {"must_not": [{"script": {"script": script}}]}},
"_source": [
"youtube_id",
"media_url",
"channel.channel_id",
"subtitles",
],
}
response = IndexPaginate("ta_video", data).get_results()
return response
def create_folders(self, to_migrate):
"""create required channel folders"""
host_uid = self.config["application"]["HOST_UID"]
host_gid = self.config["application"]["HOST_GID"]
all_channel_ids = {i["channel"]["channel_id"] for i in to_migrate}
for channel_id in all_channel_ids:
new_folder = os.path.join(self.videos, channel_id)
os.makedirs(new_folder, exist_ok=True)
if host_uid and host_gid:
os.chown(new_folder, host_uid, host_gid)
return len(all_channel_ids)
def migrate_videos(self, to_migrate):
"""migrate all videos of channel"""
for video in to_migrate:
new_media_url = self._move_video_file(video)
if not new_media_url:
continue
all_subtitles = self._move_subtitles(video)
action = {
"update": {"_id": video["youtube_id"], "_index": "ta_video"}
}
source = {"doc": {"media_url": new_media_url}}
if all_subtitles:
source["doc"].update({"subtitles": all_subtitles})
self.bulk_list.append(json.dumps(action))
self.bulk_list.append(json.dumps(source))
def _move_video_file(self, video):
"""move video file to new location"""
old_path = os.path.join(self.videos, video["media_url"])
if not os.path.exists(old_path):
print(f"did not find expected video at {old_path}")
return False
new_media_url = os.path.join(
video["channel"]["channel_id"], video["youtube_id"] + ".mp4"
)
new_path = os.path.join(self.videos, new_media_url)
os.rename(old_path, new_path)
return new_media_url
def _move_subtitles(self, video):
"""move subtitle files to new location"""
all_subtitles = video.get("subtitles")
if not all_subtitles:
return False
for subtitle in all_subtitles:
old_path = os.path.join(self.videos, subtitle["media_url"])
if not os.path.exists(old_path):
print(f"did not find expected subtitle at {old_path}")
continue
new_media_url = os.path.join(
video["channel"]["channel_id"],
f"{video.get('youtube_id')}.{subtitle.get('lang')}.vtt",
)
new_path = os.path.join(self.videos, new_media_url)
os.rename(old_path, new_path)
subtitle["media_url"] = new_media_url
return all_subtitles
def send_bulk(self):
"""send bulk request to update index with new urls"""
if not self.bulk_list:
print("nothing to update")
return
self.bulk_list.append("\n")
data = "\n".join(self.bulk_list)
response, status = ElasticWrap("_bulk").post(data=data, ndjson=True)
if not status == 200:
print(response)
def delete_old(self):
"""delete old empty folders"""
all_folders = ignore_filelist(os.listdir(self.videos))
for folder in all_folders:
folder_path = os.path.join(self.videos, folder)
if not ignore_filelist(os.listdir(folder_path)):
os.rmdir(folder_path)
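
For reference, the _bulk body assembled by send_bulk() above is ndjson: one action line plus one partial-doc line per video, with a trailing newline. A minimal sketch with hypothetical ids:

import json

# hypothetical action/source pair for a single migrated video
action = {"update": {"_id": "abc123XYZ_0", "_index": "ta_video"}}
source = {"doc": {"media_url": "UC1234567890abcdefghij/abc123XYZ_0.mp4"}}
payload = "\n".join([json.dumps(action), json.dumps(source), "\n"])
# POSTed to /_bulk as ndjson, as in ElasticWrap("_bulk").post(data=payload, ndjson=True)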

settings.py

@ -256,4 +256,4 @@ CORS_ALLOW_HEADERS = list(default_headers) + [
# TA application settings
TA_UPSTREAM = "https://github.com/tubearchivist/tubearchivist"
TA_VERSION = "v0.3.6"
TA_VERSION = "v0.3.7-unstable"

home/src/download/queue.py

@ -114,7 +114,13 @@ class PendingInteract:
def update_status(self):
"""update status of pending item"""
if self.status == "priority":
data = {"doc": {"status": "pending", "auto_start": True}}
data = {
"doc": {
"status": "pending",
"auto_start": True,
"message": None,
}
}
else:
data = {"doc": {"status": self.status}}

home/src/download/thumbnails.py

@ -270,7 +270,7 @@ class ValidatorCallback:
urls = (
channel["_source"]["channel_thumb_url"],
channel["_source"]["channel_banner_url"],
channel["_source"]["channel_tvart_url"],
channel["_source"].get("channel_tvart_url", False),
)
handler = ThumbManager(channel["_source"]["channel_id"])
handler.download_channel_art(urls, skip_existing=True)

home/src/download/yt_dlp_base.py

@ -48,11 +48,14 @@ class YtWrap:
with yt_dlp.YoutubeDL(self.obs) as ydl:
try:
ydl.download([url])
except yt_dlp.utils.DownloadError:
print(f"{url}: failed to download.")
return False
except yt_dlp.utils.DownloadError as err:
print(f"{url}: failed to download with message {err}")
if "Temporary failure in name resolution" in str(err):
raise ConnectionError("lost the internet, abort!") from err
return True
return False, str(err)
return True, True
def extract(self, url):
"""make extract request"""
@ -61,8 +64,17 @@ class YtWrap:
except cookiejar.LoadError:
print("cookie file is invalid")
return False
except (yt_dlp.utils.ExtractorError, yt_dlp.utils.DownloadError):
print(f"{url}: failed to get info from youtube")
except yt_dlp.utils.ExtractorError as err:
print(f"{url}: failed to extract with message: {err}, continue...")
return False
except yt_dlp.utils.DownloadError as err:
if "This channel does not have a" in str(err):
return False
print(f"{url}: failed to get info from youtube with message {err}")
if "Temporary failure in name resolution" in str(err):
raise ConnectionError("lost the internet, abort!") from err
return False
return response
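
Note the changed contract here: download() now returns a (success, message) tuple instead of a bare boolean, and a DNS failure raises ConnectionError so the surrounding task aborts rather than looping. A hedged caller sketch, assuming obs and config dicts as used in the handler hunk further down:

# sketch: callers must unpack the new tuple return
success, message = YtWrap(obs, config).download(youtube_id)
if not success:
    # on failure, message carries str(err) from yt-dlp
    print(f"{youtube_id}: {message}")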

home/src/download/yt_dlp_handler.py

@ -20,7 +20,7 @@ from home.src.index.playlist import YoutubePlaylist
from home.src.index.video import YoutubeVideo, index_new_video
from home.src.index.video_constants import VideoTypeEnum
from home.src.ta.config import AppConfig
from home.src.ta.helper import clean_string, ignore_filelist
from home.src.ta.helper import ignore_filelist
class DownloadPostProcess:
@ -203,12 +203,13 @@ class VideoDownloader:
def _get_next(self, auto_only):
"""get next item in queue"""
must_list = [{"term": {"status": {"value": "pending"}}}]
must_not_list = [{"exists": {"field": "message"}}]
if auto_only:
must_list.append({"term": {"auto_start": {"value": True}}})
data = {
"size": 1,
"query": {"bool": {"must": must_list}},
"query": {"bool": {"must": must_list, "must_not": must_not_list}},
"sort": [
{"auto_start": {"order": "desc"}},
{"timestamp": {"order": "asc"}},
@ -344,7 +345,9 @@ class VideoDownloader:
if youtube_id in file_name:
obs["outtmpl"] = os.path.join(dl_cache, file_name)
success = YtWrap(obs, self.config).download(youtube_id)
success, message = YtWrap(obs, self.config).download(youtube_id)
if not success:
self._handle_error(youtube_id, message)
if self.obs["writethumbnail"]:
# webp files don't get cleaned up automatically
@ -356,28 +359,27 @@ class VideoDownloader:
return success
@staticmethod
def _handle_error(youtube_id, message):
"""store error message"""
data = {"doc": {"message": message}}
_, _ = ElasticWrap(f"ta_download/_update/{youtube_id}").post(data=data)
def move_to_archive(self, vid_dict):
"""move downloaded video from cache to archive"""
videos = self.config["application"]["videos"]
host_uid = self.config["application"]["HOST_UID"]
host_gid = self.config["application"]["HOST_GID"]
channel_name = clean_string(vid_dict["channel"]["channel_name"])
if len(channel_name) <= 3:
# fall back to channel id
channel_name = vid_dict["channel"]["channel_id"]
# make archive folder with correct permissions
new_folder = os.path.join(videos, channel_name)
if not os.path.exists(new_folder):
os.makedirs(new_folder)
if host_uid and host_gid:
os.chown(new_folder, host_uid, host_gid)
# find real filename
# make folder
folder = os.path.join(videos, vid_dict["channel"]["channel_id"])
if not os.path.exists(folder):
os.makedirs(folder)
if host_uid and host_gid:
os.chown(folder, host_uid, host_gid)
# move media file
media_file = vid_dict["youtube_id"] + ".mp4"
cache_dir = self.config["application"]["cache_dir"]
all_cached = ignore_filelist(os.listdir(cache_dir + "/download/"))
for file_str in all_cached:
if vid_dict["youtube_id"] in file_str:
old_file = file_str
old_path = os.path.join(cache_dir, "download", old_file)
old_path = os.path.join(cache_dir, "download", media_file)
new_path = os.path.join(videos, vid_dict["media_url"])
# move media file and fix permission
shutil.move(old_path, new_path, copy_function=shutil.copyfile)
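
Taken together with the mapping change below, a failed download keeps status pending but gains a message field, and _get_next skips it instead of retrying. The effective query body after this change (auto_only False), assembled from the hunk above against the download queue index:

# sketch of the query _get_next() now sends
data = {
    "size": 1,
    "query": {"bool": {
        "must": [{"term": {"status": {"value": "pending"}}}],
        "must_not": [{"exists": {"field": "message"}}],
    }},
    "sort": [
        {"auto_start": {"order": "desc"}},
        {"timestamp": {"order": "asc"}},
    ],
}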

home/src/es/connect.py

@ -127,6 +127,12 @@ class IndexPaginate:
def validate_data(self):
"""add pit and size to data"""
if not self.data:
self.data = {}
if "query" not in self.data.keys():
self.data.update({"query": {"match_all": {}}})
if "sort" not in self.data.keys():
self.data.update({"sort": [{"_doc": {"order": "desc"}}]})

home/src/es/index_mapping.json

@ -380,6 +380,9 @@
},
"auto_start": {
"type": "boolean"
},
"message": {
"type": "text"
}
},
"expected_set": {

home/src/index/channel.py

@ -14,7 +14,6 @@ from home.src.download.yt_dlp_base import YtWrap
from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.index.generic import YouTubeItem
from home.src.index.playlist import YoutubePlaylist
from home.src.ta.helper import clean_string
class YoutubeChannel(YouTubeItem):
@ -177,12 +176,10 @@ class YoutubeChannel(YouTubeItem):
def get_folder_path(self):
"""get folder where media files get stored"""
channel_name = self.json_data["channel_name"]
folder_name = clean_string(channel_name)
if len(folder_name) <= 3:
# fall back to channel id
folder_name = self.json_data["channel_id"]
folder_path = os.path.join(self.app_conf["videos"], folder_name)
folder_path = os.path.join(
self.app_conf["videos"],
self.json_data["channel_id"],
)
return folder_path
def delete_es_videos(self):

home/src/index/comments.py

@ -120,7 +120,9 @@ class Comments:
"comment_timestamp": comment["timestamp"],
"comment_time_text": time_text,
"comment_likecount": comment["like_count"],
"comment_is_favorited": comment["is_favorited"],
"comment_is_favorited": comment.get(
"is_favorited"
), # temporary fix for yt-dlp upstream issue 7389
"comment_author": comment["author"],
"comment_author_id": comment["author_id"],
"comment_author_thumbnail": comment["author_thumbnail"],

home/src/index/filesystem.py

@ -1,198 +1,85 @@
"""
Functionality:
- reindexing old documents
- syncing updated values between indexes
- scan the filesystem to delete or index
"""
import json
import os
from home.src.download.queue import PendingList
from home.src.es.connect import ElasticWrap
from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.index.comments import CommentList
from home.src.index.video import index_new_video
from home.src.index.video import YoutubeVideo, index_new_video
from home.src.ta.config import AppConfig
from home.src.ta.helper import clean_string, ignore_filelist
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
from home.src.ta.helper import ignore_filelist
class ScannerBase:
"""scan the filesystem base class"""
class Scanner:
"""scan index and filesystem"""
CONFIG = AppConfig().config
VIDEOS = CONFIG["application"]["videos"]
def __init__(self):
self.to_index = False
self.to_delete = False
self.mismatch = False
self.to_rename = False
def scan(self):
"""entry point, scan and compare"""
all_downloaded = self._get_all_downloaded()
all_indexed = self._get_all_indexed()
self.list_comarison(all_downloaded, all_indexed)
def _get_all_downloaded(self):
"""get a list of all video files downloaded"""
channels = os.listdir(self.VIDEOS)
all_channels = ignore_filelist(channels)
all_channels.sort()
all_downloaded = []
for channel_name in all_channels:
channel_path = os.path.join(self.VIDEOS, channel_name)
channel_files = os.listdir(channel_path)
channel_files_clean = ignore_filelist(channel_files)
all_videos = [i for i in channel_files_clean if i.endswith(".mp4")]
for video in all_videos:
youtube_id = video[9:20]
all_downloaded.append((channel_name, video, youtube_id))
return all_downloaded
@staticmethod
def _get_all_indexed():
"""get a list of all indexed videos"""
index_handler = PendingList()
index_handler.get_download()
index_handler.get_indexed()
all_indexed = []
for video in index_handler.all_videos:
youtube_id = video["youtube_id"]
media_url = video["media_url"]
published = video["published"]
title = video["title"]
all_indexed.append((youtube_id, media_url, published, title))
return all_indexed
def list_comarison(self, all_downloaded, all_indexed):
"""compare the lists to figure out what to do"""
self._find_unindexed(all_downloaded, all_indexed)
self._find_missing(all_downloaded, all_indexed)
self._find_bad_media_url(all_downloaded, all_indexed)
def _find_unindexed(self, all_downloaded, all_indexed):
"""find video files without a matching document indexed"""
all_indexed_ids = [i[0] for i in all_indexed]
self.to_index = []
for downloaded in all_downloaded:
if downloaded[2] not in all_indexed_ids:
self.to_index.append(downloaded)
def _find_missing(self, all_downloaded, all_indexed):
"""find indexed videos without matching media file"""
all_downloaded_ids = [i[2] for i in all_downloaded]
self.to_delete = []
for video in all_indexed:
youtube_id = video[0]
if youtube_id not in all_downloaded_ids:
self.to_delete.append(video)
def _find_bad_media_url(self, all_downloaded, all_indexed):
"""rename media files not matching the indexed title"""
self.mismatch = []
self.to_rename = []
for downloaded in all_downloaded:
channel, filename, downloaded_id = downloaded
# find in indexed
for indexed in all_indexed:
indexed_id, media_url, published, title = indexed
if indexed_id == downloaded_id:
# found it
pub = published.replace("-", "")
expected = f"{pub}_{indexed_id}_{clean_string(title)}.mp4"
new_url = os.path.join(channel, expected)
if expected != filename:
# file to rename
self.to_rename.append((channel, filename, expected))
if media_url != new_url:
# media_url to update in es
self.mismatch.append((indexed_id, new_url))
break
class Filesystem(ScannerBase):
"""handle scanning and fixing from filesystem"""
VIDEOS = AppConfig().config["application"]["videos"]
def __init__(self, task=False):
super().__init__()
self.task = task
self.to_delete = False
self.to_index = False
def process(self):
"""entry point"""
def scan(self):
"""scan the filesystem"""
downloaded = self._get_downloaded()
indexed = self._get_indexed()
self.to_index = downloaded - indexed
self.to_delete = indexed - downloaded
def _get_downloaded(self):
"""get downloaded ids"""
if self.task:
self.task.send_progress(["Scanning your archive and index."])
self.scan()
self.rename_files()
self.send_mismatch_bulk()
self.delete_from_index()
self.add_missing()
self.task.send_progress(["Scan your filesystem for videos."])
def rename_files(self):
"""rename media files as identified by find_bad_media_url"""
if not self.to_rename:
return
downloaded = set()
channels = ignore_filelist(os.listdir(self.VIDEOS))
for channel in channels:
folder = os.path.join(self.VIDEOS, channel)
files = ignore_filelist(os.listdir(folder))
downloaded.update({i.split(".")[0] for i in files})
total = len(self.to_rename)
return downloaded
def _get_indexed(self):
"""get all indexed ids"""
if self.task:
self.task.send_progress([f"Rename {total} media files."])
for bad_filename in self.to_rename:
channel, filename, expected_filename = bad_filename
print(f"renaming [{filename}] to [{expected_filename}]")
old_path = os.path.join(self.VIDEOS, channel, filename)
new_path = os.path.join(self.VIDEOS, channel, expected_filename)
os.rename(old_path, new_path)
self.task.send_progress(["Get all videos indexed."])
def send_mismatch_bulk(self):
"""build bulk update"""
if not self.mismatch:
return
data = {"query": {"match_all": {}}, "_source": ["youtube_id"]}
response = IndexPaginate("ta_video", data).get_results()
return {i["youtube_id"] for i in response}
total = len(self.mismatch)
if self.task:
self.task.send_progress([f"Fix media urls for {total} files"])
bulk_list = []
for video_mismatch in self.mismatch:
youtube_id, media_url = video_mismatch
print(f"{youtube_id}: fixing media url {media_url}")
action = {"update": {"_id": youtube_id, "_index": "ta_video"}}
source = {"doc": {"media_url": media_url}}
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(source))
# add last newline
bulk_list.append("\n")
data = "\n".join(bulk_list)
_, _ = ElasticWrap("_bulk").post(data=data, ndjson=True)
def apply(self):
"""apply all changes"""
self.delete()
self.index()
self.url_fix()
def delete_from_index(self):
"""find indexed but deleted mediafile"""
def delete(self):
"""delete videos from index"""
if not self.to_delete:
print("nothing to delete")
return
total = len(self.to_delete)
if self.task:
self.task.send_progress([f"Clean up {total} items from index."])
for indexed in self.to_delete:
youtube_id = indexed[0]
print(f"deleting {youtube_id} from index")
path = f"ta_video/_doc/{youtube_id}"
_, _ = ElasticWrap(path).delete()
self.task.send_progress(
[f"Remove {len(self.to_delete)} videos from index."]
)
def add_missing(self):
"""add missing videos to index"""
video_ids = [i[2] for i in self.to_index]
if not video_ids:
for youtube_id in self.to_delete:
YoutubeVideo(youtube_id).delete_media_file()
def index(self):
"""index new"""
if not self.to_index:
print("nothing to index")
return
total = len(video_ids)
for idx, youtube_id in enumerate(video_ids):
total = len(self.to_index)
for idx, youtube_id in enumerate(self.to_index):
if self.task:
self.task.send_progress(
message_lines=[
@ -202,4 +89,36 @@ class Filesystem(ScannerBase):
)
index_new_video(youtube_id)
CommentList(video_ids, task=self.task).index()
CommentList(self.to_index, task=self.task).index()
def url_fix(self):
"""
update path v0.3.6 to v0.3.7
fix url not matching channel-videoid pattern
"""
bool_must = (
"doc['media_url'].value == "
+ "(doc['channel.channel_id'].value + '/' + "
+ "doc['youtube_id'].value) + '.mp4'"
)
to_update = (
"ctx._source['media_url'] = "
+ "ctx._source.channel['channel_id'] + '/' + "
+ "ctx._source['youtube_id'] + '.mp4'"
)
data = {
"query": {
"bool": {
"must_not": [{"script": {"script": {"source": bool_must}}}]
}
},
"script": {"source": to_update},
}
response, _ = ElasticWrap("ta_video/_update_by_query").post(data=data)
updated = response.get("updates")
if updated:
print(f"updated {updated} bad media_url")
if self.task:
self.task.send_progress(
[f"Updated {updated} wrong media urls."]
)
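
The new Scanner reduces the old tuple-comparison logic to set arithmetic over youtube ids, now that filenames are static: on-disk but unindexed gets indexed, indexed without a file gets deleted, and url_fix rewrites stale media_url values server-side. A small worked example with made-up ids:

# set-difference logic behind Scanner.scan()
downloaded = {"vid_a", "vid_b", "vid_c"}  # filename stems found on disk
indexed = {"vid_b", "vid_c", "vid_d"}     # youtube_ids from ta_video

to_index = downloaded - indexed   # {"vid_a"}: file exists, not indexed
to_delete = indexed - downloaded  # {"vid_d"}: indexed, file missing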

home/src/index/reindex.py

@ -6,7 +6,6 @@ functionality:
import json
import os
import shutil
from datetime import datetime
from time import sleep
@ -14,7 +13,6 @@ from home.src.download.queue import PendingList
from home.src.download.subscriptions import ChannelSubscription
from home.src.download.thumbnails import ThumbManager
from home.src.download.yt_dlp_base import CookieHandler
from home.src.download.yt_dlp_handler import VideoDownloader
from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.index.channel import YoutubeChannel
from home.src.index.comments import Comments
@ -54,6 +52,7 @@ class ReindexBase:
def __init__(self):
self.config = AppConfig().config
self.now = int(datetime.now().timestamp())
self.total = None
def populate(self, all_ids, reindex_config):
"""add all to reindex ids to redis queue"""
@ -61,6 +60,7 @@ class ReindexBase:
return
RedisQueue(queue_name=reindex_config["queue_name"]).add_list(all_ids)
self.total = None
class ReindexPopulate(ReindexBase):
@ -238,18 +238,19 @@ class Reindex(ReindexBase):
if not RedisQueue(index_config["queue_name"]).has_item():
continue
total = RedisQueue(index_config["queue_name"]).length()
self.total = RedisQueue(index_config["queue_name"]).length()
while True:
has_next = self.reindex_index(name, index_config, total)
has_next = self.reindex_index(name, index_config)
if not has_next:
break
def reindex_index(self, name, index_config, total):
def reindex_index(self, name, index_config):
"""reindex all of a single index"""
reindex = self.get_reindex_map(index_config["index_name"])
youtube_id = RedisQueue(index_config["queue_name"]).get_next()
if youtube_id:
self._notify(name, index_config, total)
if self.task:
self._notify(name, index_config)
reindex(youtube_id)
sleep_interval = self.config["downloads"].get("sleep_interval", 0)
sleep(sleep_interval)
@ -266,23 +267,18 @@ class Reindex(ReindexBase):
return def_map.get(index_name)
def _notify(self, name, index_config, total):
def _notify(self, name, index_config):
"""send notification back to task"""
if self.total is None:
self.total = RedisQueue(index_config["queue_name"]).length()
remaining = RedisQueue(index_config["queue_name"]).length()
idx = total - remaining
message = [f"Reindexing {name.title()}s {idx}/{total}"]
progress = idx / total
idx = self.total - remaining
message = [f"Reindexing {name.title()}s {idx}/{self.total}"]
progress = idx / self.total
self.task.send_progress(message, progress=progress)
def _reindex_single_video(self, youtube_id):
"""wrapper to handle channel name changes"""
try:
self._reindex_single_video_call(youtube_id)
except FileNotFoundError:
ChannelUrlFixer(youtube_id, self.config).run()
self._reindex_single_video_call(youtube_id)
def _reindex_single_video_call(self, youtube_id):
"""refresh data for single video"""
video = YoutubeVideo(youtube_id)
@ -291,7 +287,10 @@ class Reindex(ReindexBase):
es_meta = video.json_data.copy()
# get new
video.build_json()
media_url = os.path.join(
self.config["application"]["videos"], es_meta["media_url"]
)
video.build_json(media_path=media_url)
if not video.youtube_meta:
video.deactivate()
return
@ -466,65 +465,6 @@ class ReindexProgress(ReindexBase):
return state_dict
class ChannelUrlFixer:
"""fix not matching channel names in reindex"""
def __init__(self, youtube_id, config):
self.youtube_id = youtube_id
self.config = config
self.video = False
def run(self):
"""check and run if needed"""
print(f"{self.youtube_id}: failed to build channel path, try to fix.")
video_path_is, video_folder_is = self.get_as_is()
if not os.path.exists(video_path_is):
print(f"giving up reindex, video in video: {self.video.json_data}")
raise ValueError
_, video_folder_should = self.get_as_should()
if video_folder_is != video_folder_should:
self.process(video_path_is)
else:
print(f"{self.youtube_id}: skip channel url fixer")
def get_as_is(self):
"""get video object as is"""
self.video = YoutubeVideo(self.youtube_id)
self.video.get_from_es()
video_path_is = os.path.join(
self.config["application"]["videos"],
self.video.json_data["media_url"],
)
video_folder_is = os.path.split(video_path_is)[0]
return video_path_is, video_folder_is
def get_as_should(self):
"""add fresh metadata from remote"""
self.video.get_from_youtube()
self.video.add_file_path()
video_path_should = os.path.join(
self.config["application"]["videos"],
self.video.json_data["media_url"],
)
video_folder_should = os.path.split(video_path_should)[0]
return video_path_should, video_folder_should
def process(self, video_path_is):
"""fix filepath"""
print(f"{self.youtube_id}: fixing channel rename.")
cache_dir = self.config["application"]["cache_dir"]
new_path = os.path.join(
cache_dir, "download", self.youtube_id + ".mp4"
)
shutil.move(video_path_is, new_path, copy_function=shutil.copyfile)
VideoDownloader().move_to_archive(self.video.json_data)
self.video.update_media_url()
class ChannelFullScan:
"""
update from v0.3.0 to v0.3.1
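
Caching the queue length once in self.total keeps the progress denominator stable while the queue drains. The arithmetic in _notify, with hypothetical numbers:

# progress calculation in Reindex._notify()
total = 40       # self.total, captured when the queue is first seen
remaining = 30   # RedisQueue(queue_name).length() right now
idx = total - remaining  # 10 items processed so far
progress = idx / total   # 0.25, surfaced as e.g. "Reindexing Videos 10/40"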

home/src/index/subtitle.py

@ -62,7 +62,12 @@ class YoutubeSubtitle:
if not all_formats:
return False
subtitle = [i for i in all_formats if i["ext"] == "json3"][0]
subtitle_json3 = [i for i in all_formats if i["ext"] == "json3"]
if not subtitle_json3:
print(f"{self.video.youtube_id}-{lang}: json3 not processed")
return False
subtitle = subtitle_json3[0]
subtitle.update(
{"lang": lang, "source": "auto", "media_url": media_url}
)

home/src/index/video.py

@ -20,7 +20,7 @@ from home.src.index.video_streams import (
DurationConverter,
MediaStreamExtractor,
)
from home.src.ta.helper import clean_string, randomizor
from home.src.ta.helper import randomizor
from home.src.ta.ta_redis import RedisArchivist
from ryd_client import ryd_client
@ -231,18 +231,24 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
def build_dl_cache_path(self):
"""find video path in dl cache"""
cache_dir = self.app_conf["cache_dir"]
cache_path = f"{cache_dir}/download/"
all_cached = os.listdir(cache_path)
for file_cached in all_cached:
if self.youtube_id in file_cached:
vid_path = os.path.join(cache_path, file_cached)
return vid_path
video_id = self.json_data["youtube_id"]
cache_path = f"{cache_dir}/download/{video_id}.mp4"
if os.path.exists(cache_path):
return cache_path
channel_path = os.path.join(
self.app_conf["videos"],
self.json_data["channel"]["channel_id"],
f"{video_id}.mp4",
)
if os.path.exists(channel_path):
return channel_path
raise FileNotFoundError
def add_player(self, media_path=False):
"""add player information for new videos"""
vid_path = self._get_vid_path(media_path)
vid_path = media_path or self.build_dl_cache_path()
duration_handler = DurationConverter()
duration = duration_handler.get_sec(vid_path)
@ -259,7 +265,7 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
def add_streams(self, media_path=False):
"""add stream metadata"""
vid_path = self._get_vid_path(media_path)
vid_path = media_path or self.build_dl_cache_path()
media = MediaStreamExtractor(vid_path)
self.json_data.update(
{
@ -268,43 +274,12 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
}
)
def _get_vid_path(self, media_path=False):
"""get path of media file"""
if media_path:
return media_path
try:
# when indexing from download task
vid_path = self.build_dl_cache_path()
except FileNotFoundError as err:
# when reindexing needs to handle title rename
channel = os.path.split(self.json_data["media_url"])[0]
channel_dir = os.path.join(self.app_conf["videos"], channel)
all_files = os.listdir(channel_dir)
for file in all_files:
if self.youtube_id in file and file.endswith(".mp4"):
vid_path = os.path.join(channel_dir, file)
break
else:
raise FileNotFoundError("could not find video file") from err
return vid_path
def add_file_path(self):
"""build media_url for where file will be located"""
channel_name = self.json_data["channel"]["channel_name"]
clean_channel_name = clean_string(channel_name)
if len(clean_channel_name) <= 3:
# fall back to channel id
clean_channel_name = self.json_data["channel"]["channel_id"]
timestamp = self.json_data["published"].replace("-", "")
youtube_id = self.json_data["youtube_id"]
title = self.json_data["title"]
clean_title = clean_string(title)
filename = f"{timestamp}_{youtube_id}_{clean_title}.mp4"
media_url = os.path.join(clean_channel_name, filename)
self.json_data["media_url"] = media_url
self.json_data["media_url"] = os.path.join(
self.json_data["channel"]["channel_id"],
self.json_data["youtube_id"] + ".mp4",
)
def delete_media_file(self):
"""delete video file, meta data"""

home/src/index/video_streams.py

@ -35,24 +35,27 @@ class DurationConverter:
return duration_sec
@staticmethod
def get_str(duration_sec):
def get_str(seconds):
"""takes duration in sec and returns clean string"""
if not duration_sec:
if not seconds:
# failed to extract
return "NA"
hours = int(duration_sec // 3600)
minutes = int((duration_sec - (hours * 3600)) // 60)
secs = int(duration_sec - (hours * 3600) - (minutes * 60))
days = int(seconds // (24 * 3600))
hours = int((seconds % (24 * 3600)) // 3600)
minutes = int((seconds % 3600) // 60)
seconds = int(seconds % 60)
duration_str = str()
if days:
duration_str = f"{days}d "
if hours:
duration_str = str(hours).zfill(2) + ":"
duration_str = duration_str + str(hours).zfill(2) + ":"
if minutes:
duration_str = duration_str + str(minutes).zfill(2) + ":"
else:
duration_str = duration_str + "00:"
duration_str = duration_str + str(secs).zfill(2)
duration_str = duration_str + str(seconds).zfill(2)
return duration_str
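
The rewritten formatter handles runtimes over 24 hours. A few sanity checks of the new behavior (values hypothetical):

# expected outputs of the day-aware get_str()
assert DurationConverter.get_str(59) == "00:59"
assert DurationConverter.get_str(3661) == "01:01:01"
assert DurationConverter.get_str(90061) == "1d 01:01:01"  # 1 day, 1 h, 1 min, 1 s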

home/src/ta/config.py

@ -8,6 +8,7 @@ import json
import os
import re
from random import randint
from time import sleep
import requests
from celery.schedules import crontab
@ -67,11 +68,19 @@ class AppConfig:
@staticmethod
def get_config_redis():
"""read config json set from redis to overwrite defaults"""
config = RedisArchivist().get_message("config")
if not list(config.values())[0]:
return False
for i in range(10):
try:
config = RedisArchivist().get_message("config")
if not list(config.values())[0]:
return False
return config
return config
except Exception: # pylint: disable=broad-except
print(f"... Redis connection failed, retry [{i}/10]")
sleep(3)
raise ConnectionError("failed to connect to redis")
def update_config(self, form_post):
"""update config values from settings form"""
@ -317,6 +326,10 @@ class ReleaseVersion:
RedisArchivist().set_message(self.NEW_KEY, message)
print(f"[{self.local_version}]: found new version {new_version}")
def get_local_version(self):
"""read version from local"""
return self.local_version
def get_remote_version(self):
"""read version from remote"""
self.response = requests.get(self.REMOTE_URL, timeout=20).json()

home/src/ta/helper.py

@ -6,25 +6,13 @@ Loose collection of helper functions
import json
import os
import random
import re
import string
import unicodedata
from datetime import datetime
from urllib.parse import urlparse
import requests
def clean_string(file_name: str) -> str:
"""clean string to only asci characters"""
whitelist = "-_.() " + string.ascii_letters + string.digits
normalized = unicodedata.normalize("NFKD", file_name)
ascii_only = normalized.encode("ASCII", "ignore").decode().strip()
white_listed: str = "".join(c for c in ascii_only if c in whitelist)
cleaned: str = re.sub(r"[ ]{2,}", " ", white_listed)
return cleaned
def ignore_filelist(filelist: list[str]) -> list[str]:
"""ignore temp files for os.listdir sanitizer"""
to_ignore = ["Icon\r\r", "Temporary Items", "Network Trash Folder"]

home/tasks.py

@ -19,7 +19,7 @@ from home.src.download.yt_dlp_handler import VideoDownloader
from home.src.es.backup import ElasticBackup
from home.src.es.index_setup import ElasitIndexWrap
from home.src.index.channel import YoutubeChannel
from home.src.index.filesystem import Filesystem
from home.src.index.filesystem import Scanner
from home.src.index.manual import ImportFolderScanner
from home.src.index.reindex import Reindex, ReindexManual, ReindexPopulate
from home.src.ta.config import AppConfig, ReleaseVersion, ScheduleBuilder
@ -113,7 +113,7 @@ class BaseTask(Task):
"""callback for task failure"""
print(f"{task_id} Failed callback")
message, key = self._build_message(level="error")
message.update({"messages": ["Task failed"]})
message.update({"messages": [f"Task failed: {exc}"]})
RedisArchivist().set_message(key, message, expire=20)
def on_success(self, retval, task_id, args, kwargs):
@ -290,7 +290,9 @@ def rescan_filesystem(self):
return
manager.init(self)
Filesystem(task=self).process()
handler = Scanner(task=self)
handler.scan()
handler.apply()
ThumbValidator(task=self).validate()

home/templates/home/downloads.html

@ -97,6 +97,9 @@
<a href="https://www.youtube.com/watch?v={{ video.source.youtube_id }}" target="_blank"><h3>{{ video.source.title }}</h3></a>
</div>
<p>Published: {{ video.source.published }} | Duration: {{ video.source.duration }} | {{ video.source.youtube_id }}</p>
{% if video.source.message %}
<p class="danger-zone">{{ video.source.message }}</p>
{% endif %}
<div>
{% if show_ignored_only %}
<button data-id="{{ video.source.youtube_id }}" onclick="forgetIgnore(this)">Forget</button>
@ -105,6 +108,9 @@
<button data-id="{{ video.source.youtube_id }}" onclick="toIgnore(this)">Ignore</button>
<button id="{{ video.source.youtube_id }}" data-id="{{ video.source.youtube_id }}" onclick="downloadNow(this)">Download now</button>
{% endif %}
{% if video.source.message %}
<button class="danger-button" data-id="{{ video.source.youtube_id }}" onclick="forgetIgnore(this)">Delete</button>
{% endif %}
</div>
</div>
</div>

tubearchivist/requirements.txt

@ -1,12 +1,12 @@
celery==5.2.7
Django==4.2.1
django-auth-ldap==4.3.0
django-cors-headers==3.14.0
celery==5.3.1
Django==4.2.3
django-auth-ldap==4.4.0
django-cors-headers==4.2.0
djangorestframework==3.14.0
Pillow==9.5.0
redis==4.5.4
requests==2.30.0
Pillow==10.0.0
redis==4.6.0
requests==2.31.0
ryd-client==0.0.6
uWSGI==2.0.21
whitenoise==6.4.0
yt_dlp==2023.3.4
whitenoise==6.5.0
yt_dlp==2023.7.6

static/script.js

@ -160,12 +160,12 @@ function dlPending() {
}, 500);
}
function addToQueue(autostart=false) {
function addToQueue(autostart = false) {
let textArea = document.getElementById('id_vid_url');
if (textArea.value === '') {
return
return;
}
let toPost = {data: [{youtube_id: textArea.value, status: 'pending'}]};
let toPost = { data: [{ youtube_id: textArea.value, status: 'pending' }] };
let apiEndpoint = '/api/download/';
if (autostart) {
apiEndpoint = `${apiEndpoint}?autostart=true`;