tubearchivist/tubearchivist/home/src/index/filesystem.py

"""
Functionality:
- reindexing old documents
- syncing updated values between indexes
- scan the filesystem to delete or index
"""
import json
import os
import re
import shutil
import subprocess

from home.src.download.queue import PendingList
from home.src.download.yt_dlp_handler import VideoDownloader
from home.src.es.connect import ElasticWrap
from home.src.index.reindex import Reindex
from home.src.index.video import index_new_video
from home.src.ta.config import AppConfig
from home.src.ta.helper import clean_string, ignore_filelist
from home.src.ta.ta_redis import RedisArchivist


class FilesystemScanner:
    """handle scanning and fixing from filesystem"""

    CONFIG = AppConfig().config
    VIDEOS = CONFIG["application"]["videos"]

    def __init__(self):
        self.all_downloaded = self.get_all_downloaded()
        self.all_indexed = self.get_all_indexed()
        self.mismatch = None
        self.to_rename = None
        self.to_index = None
        self.to_delete = None

    def get_all_downloaded(self):
        """get a list of all video files downloaded"""
        channels = os.listdir(self.VIDEOS)
        all_channels = ignore_filelist(channels)
        all_channels.sort()
        all_downloaded = []
        for channel_name in all_channels:
            channel_path = os.path.join(self.VIDEOS, channel_name)
            channel_files = os.listdir(channel_path)
            channel_files_clean = ignore_filelist(channel_files)
            all_videos = [i for i in channel_files_clean if i.endswith(".mp4")]
            for video in all_videos:
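                # assumed layout: the downloader names files
                # <date>_<youtube_id>_<title>.mp4, e.g.
                # "20210905_dQw4w9WgXcQ_Some_Title.mp4", so the eleven
                # character id starts at offset 9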
                youtube_id = video[9:20]
                all_downloaded.append((channel_name, video, youtube_id))

        return all_downloaded

    @staticmethod
    def get_all_indexed():
        """get a list of all indexed videos"""
        index_handler = PendingList()
        index_handler.get_download()
        index_handler.get_indexed()

        all_indexed = []
        for video in index_handler.all_videos:
            youtube_id = video["youtube_id"]
            media_url = video["media_url"]
            published = video["published"]
            title = video["title"]
            all_indexed.append((youtube_id, media_url, published, title))

        return all_indexed
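    # each tuple mirrors the indexed es fields, e.g. (illustrative values)
    # ("dQw4w9WgXcQ", "Channel_Name/20210905_dQw4w9WgXcQ_Title.mp4",
    #  "2021-09-05", "Title"); media_url is relative to the videos folder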

    def list_comparison(self):
        """compare the lists to figure out what to do"""
        self.find_unindexed()
        self.find_missing()
        self.find_bad_media_url()

    def find_unindexed(self):
        """find video files without a matching document indexed"""
        all_indexed_ids = [i[0] for i in self.all_indexed]
        to_index = []
        for downloaded in self.all_downloaded:
            if downloaded[2] not in all_indexed_ids:
                to_index.append(downloaded)

        self.to_index = to_index

    def find_missing(self):
        """find indexed videos without matching media file"""
        all_downloaded_ids = [i[2] for i in self.all_downloaded]
        to_delete = []
        for video in self.all_indexed:
            youtube_id = video[0]
            if youtube_id not in all_downloaded_ids:
                to_delete.append(video)

        self.to_delete = to_delete

    def find_bad_media_url(self):
        """rename media files not matching the indexed title"""
        to_fix = []
        to_rename = []
        for downloaded in self.all_downloaded:
            channel, filename, downloaded_id = downloaded
            # find in indexed
            for indexed in self.all_indexed:
                indexed_id, media_url, published, title = indexed
                if indexed_id == downloaded_id:
                    # found it
                    title_c = clean_string(title)
                    pub = published.replace("-", "")
                    expected_filename = f"{pub}_{indexed_id}_{title_c}.mp4"
                    new_url = os.path.join(channel, expected_filename)
                    if expected_filename != filename:
                        # file to rename
                        to_rename.append(
                            (channel, filename, expected_filename)
                        )
                    if media_url != new_url:
                        # media_url to update in es
                        to_fix.append((indexed_id, new_url))

                    break

        self.mismatch = to_fix
        self.to_rename = to_rename
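    # example: a doc published "2021-09-05" expects the on-disk name
    # "20210905_<id>_<cleaned_title>.mp4"; any other filename lands in
    # to_rename, and a stale media_url in es lands in mismatch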

    def rename_files(self):
        """rename media files as identified by find_bad_media_url"""
        for bad_filename in self.to_rename:
            channel, filename, expected_filename = bad_filename
            print(f"renaming [{filename}] to [{expected_filename}]")
            old_path = os.path.join(self.VIDEOS, channel, filename)
            new_path = os.path.join(self.VIDEOS, channel, expected_filename)
            os.rename(old_path, new_path)

    def send_mismatch_bulk(self):
        """build bulk update"""
        bulk_list = []
        for video_mismatch in self.mismatch:
            youtube_id, media_url = video_mismatch
            print(f"{youtube_id}: fixing media url {media_url}")
            action = {"update": {"_id": youtube_id, "_index": "ta_video"}}
            source = {"doc": {"media_url": media_url}}
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(source))

        # add last newline
        bulk_list.append("\n")
        data = "\n".join(bulk_list)
        _, _ = ElasticWrap("_bulk").post(data=data, ndjson=True)
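    # the posted payload is ndjson, two lines per fixed document plus the
    # closing newline:
    # {"update": {"_id": "<youtube_id>", "_index": "ta_video"}}
    # {"doc": {"media_url": "<channel>/<filename>.mp4"}}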

    def delete_from_index(self):
        """find indexed but deleted mediafile"""
        for indexed in self.to_delete:
            youtube_id = indexed[0]
            print(f"deleting {youtube_id} from index")
            path = f"ta_video/_doc/{youtube_id}"
            _, _ = ElasticWrap(path).delete()


class ManualImport:
    """import and indexing existing video files"""

    CONFIG = AppConfig().config
    CACHE_DIR = CONFIG["application"]["cache_dir"]
    IMPORT_DIR = os.path.join(CACHE_DIR, "import")

    def __init__(self):
        self.identified = self.import_folder_parser()

    def import_folder_parser(self):
        """detect files in import folder"""
        import_files = os.listdir(self.IMPORT_DIR)
        to_import = ignore_filelist(import_files)
        to_import.sort()

        video_files = [i for i in to_import if not i.endswith(".json")]

        identified = []
        for file_path in video_files:
            file_dict = {"video_file": file_path}
            file_name, _ = os.path.splitext(file_path)

            matching_json = [
                i
                for i in to_import
                if i.startswith(file_name) and i.endswith(".json")
            ]
            if matching_json:
                json_file = matching_json[0]
                youtube_id = self.extract_id_from_json(json_file)
                file_dict.update({"json_file": json_file})
            else:
                youtube_id = self.extract_id_from_filename(file_name)
                file_dict.update({"json_file": False})

            file_dict.update({"youtube_id": youtube_id})
            identified.append(file_dict)

        return identified
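    # pairing example: "my_video.mkv" picks up "my_video.info.json",
    # assuming yt-dlp style metadata files that share the media file's stem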

    @staticmethod
    def extract_id_from_filename(file_name):
        """
        look at the file name for the youtube id
        expects filename ending in [<youtube_id>].<ext>
        """
        id_search = re.search(r"\[([a-zA-Z0-9_-]{11})\]$", file_name)
        if id_search:
            youtube_id = id_search.group(1)
            return youtube_id

        print(f"failed to extract youtube id for: {file_name}")
        raise ValueError("no youtube id found in filename")
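    # matches yt-dlp's default "%(title)s [%(id)s].%(ext)s" naming, e.g.
    # "Some Title [dQw4w9WgXcQ]" (extension already stripped) -> "dQw4w9WgXcQ"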

    def extract_id_from_json(self, json_file):
        """open json file and extract id"""
        json_path = os.path.join(self.CACHE_DIR, "import", json_file)
        with open(json_path, "r", encoding="utf-8") as f:
            json_content = f.read()

        youtube_id = json.loads(json_content)["id"]
        return youtube_id

    def process_import(self):
        """go through identified media files"""
        all_videos_added = []
        for media_file in self.identified:
            json_file = media_file["json_file"]
            video_file = media_file["video_file"]
            youtube_id = media_file["youtube_id"]

            video_path = os.path.join(self.CACHE_DIR, "import", video_file)
            self.move_to_cache(video_path, youtube_id)

            # identify and archive
            vid_dict = index_new_video(youtube_id)
            VideoDownloader([youtube_id]).move_to_archive(vid_dict)
            youtube_id = vid_dict["youtube_id"]
            thumb_url = vid_dict["vid_thumb_url"]
            all_videos_added.append((youtube_id, thumb_url))

            # cleanup
            if os.path.exists(video_path):
                os.remove(video_path)
            if json_file:
                json_path = os.path.join(self.CACHE_DIR, "import", json_file)
                os.remove(json_path)

        return all_videos_added

    def move_to_cache(self, video_path, youtube_id):
        """move identified video file to cache, convert to mp4"""
        file_name = os.path.split(video_path)[-1]
        video_file, ext = os.path.splitext(file_name)

        # make sure youtube_id is in filename
        if youtube_id not in video_file:
            video_file = f"{video_file}_{youtube_id}"

        # move, convert if needed
        if ext == ".mp4":
            new_file = video_file + ext
            dest_path = os.path.join(self.CACHE_DIR, "download", new_file)
            shutil.move(video_path, dest_path)
        else:
            print(f"processing with ffmpeg: {video_file}")
            new_file = video_file + ".mp4"
            dest_path = os.path.join(self.CACHE_DIR, "download", new_file)
            subprocess.run(
                [
                    "ffmpeg",
                    "-loglevel",
                    "warning",
                    "-stats",
                    "-i",
                    video_path,
                    dest_path,
                ],
                check=True,
            )
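        # ffmpeg applies options left to right, so globals like -loglevel
        # must come before the output path to take effect, hence the order
        # above; output defaults re-encode the source to mp4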


def scan_filesystem():
    """grouped function to delete and update index"""
    filesystem_handler = FilesystemScanner()
    filesystem_handler.list_comparison()
    if filesystem_handler.to_rename:
        print("renaming files")
        filesystem_handler.rename_files()
    if filesystem_handler.mismatch:
        print("fixing media urls in index")
        filesystem_handler.send_mismatch_bulk()
    if filesystem_handler.to_delete:
        print("delete metadata from index")
        filesystem_handler.delete_from_index()
    if filesystem_handler.to_index:
        print("index new videos")
        for missing_vid in filesystem_handler.to_index:
            youtube_id = missing_vid[2]
            index_new_video(youtube_id)


def reindex_old_documents():
    """daily refresh of old documents"""
    handler = Reindex()
    handler.check_outdated()
    handler.reindex()
    RedisArchivist().set_message("last_reindex", handler.now)
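

# minimal smoke test sketch, not called anywhere in the app: assumes a fully
# configured TubeArchivist environment with elasticsearch and redis reachable
if __name__ == "__main__":
    scan_filesystem()
    reindex_old_documents()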