tubearchivist/tubearchivist/home/src/index/filesystem.py

206 lines
7.2 KiB
Python
Raw Normal View History

2021-09-05 17:10:14 +00:00
"""
Functionality:
- reindexing old documents
- syncing updated values between indexes
- scan the filesystem to delete or index
"""
import json
import os
2022-01-22 15:13:37 +00:00
from home.src.download.queue import PendingList
2022-03-23 08:56:53 +00:00
from home.src.es.connect import ElasticWrap
from home.src.index.comments import CommentList
from home.src.index.video import index_new_video
2022-01-22 15:13:37 +00:00
from home.src.ta.config import AppConfig
from home.src.ta.helper import clean_string, ignore_filelist
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
2021-09-05 17:10:14 +00:00
2023-03-16 06:18:26 +00:00
class ScannerBase:
    """scan the filesystem base class

    Compare media files on disk against documents in the index and
    record what needs indexing, deleting, renaming or url-fixing.
    Results are stored on the instance in to_index, to_delete,
    mismatch and to_rename.
    """

    CONFIG = AppConfig().config
    VIDEOS = CONFIG["application"]["videos"]

    def __init__(self):
        # each attribute becomes a list after scan();
        # False means "scan() has not run yet"
        self.to_index = False
        self.to_delete = False
        self.mismatch = False
        self.to_rename = False

    def scan(self):
        """entry point, scan and compare"""
        all_downloaded = self._get_all_downloaded()
        all_indexed = self._get_all_indexed()
        self.list_comparison(all_downloaded, all_indexed)

    def _get_all_downloaded(self):
        """get a list of all video files downloaded

        Returns a list of (channel_name, filename, youtube_id) tuples.
        """
        all_channels = ignore_filelist(os.listdir(self.VIDEOS))
        all_channels.sort()
        all_downloaded = []
        for channel_name in all_channels:
            channel_path = os.path.join(self.VIDEOS, channel_name)
            channel_files_clean = ignore_filelist(os.listdir(channel_path))
            for video in channel_files_clean:
                if not video.endswith(".mp4"):
                    continue
                # filename convention: <8 char date>_<11 char youtube_id>_...
                youtube_id = video[9:20]
                all_downloaded.append((channel_name, video, youtube_id))

        return all_downloaded

    @staticmethod
    def _get_all_indexed():
        """get a list of all indexed videos

        Returns a list of (youtube_id, media_url, published, title) tuples.
        """
        index_handler = PendingList()
        index_handler.get_download()
        index_handler.get_indexed()

        all_indexed = []
        for video in index_handler.all_videos:
            all_indexed.append(
                (
                    video["youtube_id"],
                    video["media_url"],
                    video["published"],
                    video["title"],
                )
            )

        return all_indexed

    def list_comparison(self, all_downloaded, all_indexed):
        """compare the lists to figure out what to do"""
        self._find_unindexed(all_downloaded, all_indexed)
        self._find_missing(all_downloaded, all_indexed)
        self._find_bad_media_url(all_downloaded, all_indexed)

    # keep the original misspelled name as an alias so existing
    # callers of list_comarison() keep working
    list_comarison = list_comparison

    def _find_unindexed(self, all_downloaded, all_indexed):
        """find video files without a matching document indexed"""
        # set for O(1) membership instead of scanning a list per file
        all_indexed_ids = {i[0] for i in all_indexed}
        self.to_index = [
            i for i in all_downloaded if i[2] not in all_indexed_ids
        ]

    def _find_missing(self, all_downloaded, all_indexed):
        """find indexed videos without matching media file"""
        all_downloaded_ids = {i[2] for i in all_downloaded}
        self.to_delete = [
            i for i in all_indexed if i[0] not in all_downloaded_ids
        ]

    def _find_bad_media_url(self, all_downloaded, all_indexed):
        """rename media files not matching the indexed title"""
        self.mismatch = []
        self.to_rename = []
        # build the lookup once instead of rescanning all_indexed for
        # every downloaded file; setdefault keeps the first occurrence,
        # matching the original first-match-then-break behavior
        indexed_by_id = {}
        for indexed_id, media_url, published, title in all_indexed:
            indexed_by_id.setdefault(indexed_id, (media_url, published, title))

        for channel, filename, downloaded_id in all_downloaded:
            if downloaded_id not in indexed_by_id:
                continue
            media_url, published, title = indexed_by_id[downloaded_id]
            pub = published.replace("-", "")
            expected = f"{pub}_{downloaded_id}_{clean_string(title)}.mp4"
            new_url = os.path.join(channel, expected)
            if expected != filename:
                # file to rename on disk
                self.to_rename.append((channel, filename, expected))
            if media_url != new_url:
                # media_url to update in es
                self.mismatch.append((downloaded_id, new_url))
2023-03-16 06:18:26 +00:00
class Filesystem(ScannerBase):
    """handle scanning and fixing from filesystem

    Runs the base-class scan, then applies all four fix-ups:
    renaming files, fixing media_url in es, deleting orphaned
    documents and indexing missing videos.
    """

    def __init__(self, task=False):
        super().__init__()
        # optional task object used only to send progress messages
        self.task = task

    def process(self):
        """entry point: scan, then apply all fixes"""
        if self.task:
            self.task.send_progress(["Scanning your archive and index."])
        self.scan()
        self.rename_files()
        self.send_mismatch_bulk()
        self.delete_from_index()
        self.add_missing()

    def rename_files(self):
        """rename media files as identified by find_bad_media_url"""
        if not self.to_rename:
            return

        total = len(self.to_rename)
        if self.task:
            self.task.send_progress([f"Rename {total} media files."])
        for channel, filename, expected_filename in self.to_rename:
            # fixed: log the actual old filename instead of a placeholder
            print(f"renaming [{filename}] to [{expected_filename}]")
            old_path = os.path.join(self.VIDEOS, channel, filename)
            new_path = os.path.join(self.VIDEOS, channel, expected_filename)
            os.rename(old_path, new_path)

    def send_mismatch_bulk(self):
        """build bulk update to fix media_url in es"""
        if not self.mismatch:
            return

        total = len(self.mismatch)
        if self.task:
            self.task.send_progress([f"Fix media urls for {total} files"])
        bulk_list = []
        for youtube_id, media_url in self.mismatch:
            print(f"{youtube_id}: fixing media url {media_url}")
            action = {"update": {"_id": youtube_id, "_index": "ta_video"}}
            source = {"doc": {"media_url": media_url}}
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(source))
        # add last newline, required by the es bulk api
        bulk_list.append("\n")
        data = "\n".join(bulk_list)
        _, _ = ElasticWrap("_bulk").post(data=data, ndjson=True)

    def delete_from_index(self):
        """find indexed but deleted mediafile and remove the document"""
        if not self.to_delete:
            return

        total = len(self.to_delete)
        if self.task:
            self.task.send_progress([f"Clean up {total} items from index."])
        for indexed in self.to_delete:
            youtube_id = indexed[0]
            print(f"deleting {youtube_id} from index")
            path = f"ta_video/_doc/{youtube_id}"
            _, _ = ElasticWrap(path).delete()

    def add_missing(self):
        """add missing videos to index, with per-video progress"""
        video_ids = [i[2] for i in self.to_index]
        if not video_ids:
            return

        total = len(video_ids)
        for idx, youtube_id in enumerate(video_ids):
            if self.task:
                self.task.send_progress(
                    message_lines=[
                        f"Index missing video {youtube_id}, {idx}/{total}"
                    ],
                    progress=(idx + 1) / total,
                )
            index_new_video(youtube_id)

        CommentList(video_ids, task=self.task).index()