WIP: rewrite ManualImport, identify and archive

simon 2022-08-08 19:18:27 +07:00
parent 8f711d359b
commit b76fa69396
No known key found for this signature in database
GPG Key ID: 2C15AA5E89985DD4
2 changed files with 51 additions and 120 deletions

home/src/index/filesystem.py

@@ -12,10 +12,9 @@
 import shutil
 import subprocess
 from home.src.download.queue import PendingList
 from home.src.download.yt_dlp_handler import VideoDownloader
-from home.src.es.connect import ElasticWrap
 from home.src.index.reindex import Reindex
-from home.src.index.video import index_new_video
+from home.src.index.video import YoutubeVideo, index_new_video
 from home.src.ta.config import AppConfig
 from home.src.ta.helper import clean_string, ignore_filelist
 from home.src.ta.ta_redis import RedisArchivist
@@ -255,6 +254,8 @@ class ImportFolderScanner:
             self._convert_thumb(current_video)
             self._convert_video(current_video)
 
+            ManualImport(current_video, self.CONFIG).run()
+
     def _detect_youtube_id(self, current_video):
         """find video id from filename or json"""
         print(current_video)
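
Note on the new call: `current_video` is the dict that `ImportFolderScanner` builds per identified file. Judging from the keys the new `ManualImport` class reads in the next hunk, its shape is roughly the following sketch (key names taken from the diff, example values hypothetical):

    current_video = {
        "video_id": "dQw4w9WgXcQ",                   # ID found by _detect_youtube_id
        "media": "cache/import/video.mp4",           # media file, converted to mp4
        "metadata": "cache/import/video.info.json",  # info.json path, or False
        "thumb": "cache/import/video.jpg",           # thumbnail to clean up, if any
        "subtitle": ["cache/import/video.en.vtt"],   # subtitle files to clean up
    }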
@@ -450,132 +451,65 @@ class ImportFolderScanner:
         os.remove(current_path)
 
 
-class ManualImportOld:
-    """import and indexing existing video files"""
+class ManualImport:
+    """import single identified video"""
 
-    CONFIG = AppConfig().config
-    CACHE_DIR = CONFIG["application"]["cache_dir"]
-    IMPORT_DIR = os.path.join(CACHE_DIR, "import")
+    def __init__(self, current_video, config):
+        self.current_video = current_video
+        self.config = config
 
-    def __init__(self):
-        self.identified = self.import_folder_parser()
+    def run(self):
+        """run all"""
+        json_data = self.index_metadata()
+        self._move_to_archive(json_data)
+        self._cleanup()
 
-    def import_folder_parser(self):
-        """detect files in import folder"""
-        import_files = os.listdir(self.IMPORT_DIR)
-        to_import = ignore_filelist(import_files)
-        to_import.sort()
-        video_files = [i for i in to_import if not i.endswith(".json")]
+    def index_metadata(self):
+        """get metadata from yt or json"""
+        video = YoutubeVideo(self.current_video["video_id"])
+        video.build_json(
+            youtube_meta_overwrite=self._get_info_json(),
+            media_path=self.current_video["media"],
+        )
+        video.check_subtitles()
+        video.upload_to_es()
 
-        identified = []
+        return video.json_data
 
-        for file_path in video_files:
+    def _get_info_json(self):
+        """read info_json from file"""
+        if not self.current_video["metadata"]:
+            return False
 
-            file_dict = {"video_file": file_path}
-            file_name, _ = os.path.splitext(file_path)
+        with open(self.current_video["metadata"], "r", encoding="utf-8") as f:
+            info_json = json.loads(f.read())
 
-            matching_json = [
-                i
-                for i in to_import
-                if i.startswith(file_name) and i.endswith(".json")
-            ]
-            if matching_json:
-                json_file = matching_json[0]
-                youtube_id = self.extract_id_from_json(json_file)
-                file_dict.update({"json_file": json_file})
-            else:
-                youtube_id = self.extract_id_from_filename(file_name)
-                file_dict.update({"json_file": False})
+        return info_json
 
-            file_dict.update({"youtube_id": youtube_id})
-            identified.append(file_dict)
+    def _move_to_archive(self, json_data):
+        """move identified media file to archive"""
+        videos = self.config["application"]["videos"]
 
-        return identified
+        channel, file = os.path.split(json_data["media_url"])
+        channel_folder = os.path.join(videos, channel)
+        if not os.path.exists(channel_folder):
+            os.makedirs(channel_folder)
 
-    @staticmethod
-    def extract_id_from_filename(file_name):
-        """
-        look at the file name for the youtube id
-        expects filename ending in [<youtube_id>].<ext>
-        """
-        id_search = re.search(r"\[([a-zA-Z0-9_-]{11})\]$", file_name)
-        if id_search:
-            youtube_id = id_search.group(1)
-            return youtube_id
+        old_path = self.current_video["media"]
+        new_path = os.path.join(channel_folder, file)
+        shutil.move(old_path, new_path, copy_function=shutil.copyfile)
 
-        print("failed to extract youtube id for: " + file_name)
-        raise Exception
+    def _cleanup(self):
+        """cleanup leftover files"""
+        if os.path.exists(self.current_video["metadata"]):
+            os.remove(self.current_video["metadata"])
 
-    def extract_id_from_json(self, json_file):
-        """open json file and extract id"""
-        json_path = os.path.join(self.CACHE_DIR, "import", json_file)
-        with open(json_path, "r", encoding="utf-8") as f:
-            json_content = f.read()
+        if os.path.exists(self.current_video["thumb"]):
+            os.remove(self.current_video["thumb"])
 
-        youtube_id = json.loads(json_content)["id"]
-        return youtube_id
-
-    def process_import(self):
-        """go through identified media files"""
-        all_videos_added = []
-
-        for media_file in self.identified:
-            json_file = media_file["json_file"]
-            video_file = media_file["video_file"]
-            youtube_id = media_file["youtube_id"]
-
-            video_path = os.path.join(self.CACHE_DIR, "import", video_file)
-
-            self.move_to_cache(video_path, youtube_id)
-
-            # identify and archive
-            vid_dict = index_new_video(youtube_id)
-            VideoDownloader([youtube_id]).move_to_archive(vid_dict)
-            youtube_id = vid_dict["youtube_id"]
-            thumb_url = vid_dict["vid_thumb_url"]
-            all_videos_added.append((youtube_id, thumb_url))
-
-            # cleanup
-            if os.path.exists(video_path):
-                os.remove(video_path)
-            if json_file:
-                json_path = os.path.join(self.CACHE_DIR, "import", json_file)
-                os.remove(json_path)
-
-        return all_videos_added
-
-    def move_to_cache(self, video_path, youtube_id):
-        """move identified video file to cache, convert to mp4"""
-        file_name = os.path.split(video_path)[-1]
-        video_file, ext = os.path.splitext(file_name)
-
-        # make sure youtube_id is in filename
-        if youtube_id not in video_file:
-            video_file = f"{video_file}_{youtube_id}"
-
-        # move, convert if needed
-        if ext == ".mp4":
-            new_file = video_file + ext
-            dest_path = os.path.join(self.CACHE_DIR, "download", new_file)
-            shutil.move(video_path, dest_path, copy_function=shutil.copyfile)
-        else:
-            print(f"processing with ffmpeg: {video_file}")
-            new_file = video_file + ".mp4"
-            dest_path = os.path.join(self.CACHE_DIR, "download", new_file)
-            subprocess.run(
-                [
-                    "ffmpeg",
-                    "-i",
-                    video_path,
-                    dest_path,
-                    "-loglevel",
-                    "warning",
-                    "-stats",
-                ],
-                check=True,
-            )
+        for subtitle_file in self.current_video["subtitle"]:
+            if os.path.exists(subtitle_file):
+                os.remove(subtitle_file)
 
 
 def scan_filesystem():
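
Two details of the new class worth spelling out. `index_metadata()` builds the video document through `YoutubeVideo.build_json()`, passing any local info.json as `youtube_meta_overwrite`, and uploads it to Elasticsearch. `_move_to_archive()` then relies on `json_data["media_url"]` having the form `<channel>/<filename>`. A minimal standalone sketch of that path handling, with the config value and both paths assumed for illustration:

    import os
    import shutil

    videos = "/youtube"                          # self.config["application"]["videos"]
    media_url = "UC-abc123/example_video.mp4"    # hypothetical json_data["media_url"]
    old_path = "cache/import/example_video.mp4"  # hypothetical current_video["media"]

    channel, file = os.path.split(media_url)
    channel_folder = os.path.join(videos, channel)
    os.makedirs(channel_folder, exist_ok=True)   # same effect as the exists check above

    # copyfile skips metadata copying, which can fail across mounted volumes
    if os.path.exists(old_path):
        shutil.move(old_path, os.path.join(channel_folder, file),
                    copy_function=shutil.copyfile)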

home/tasks.py

@@ -20,7 +20,7 @@ from home.src.download.yt_dlp_handler import VideoDownloader
 from home.src.es.index_setup import backup_all_indexes, restore_from_backup
 from home.src.index.channel import YoutubeChannel
 from home.src.index.filesystem import (
-    ManualImport,
+    ImportFolderScanner,
     reindex_old_documents,
     scan_filesystem,
 )
@@ -150,10 +150,7 @@ def run_manual_import():
     try:
         have_lock = my_lock.acquire(blocking=False)
         if have_lock:
-            import_handler = ManualImport()
-            if import_handler.identified:
-                all_videos_added = import_handler.process_import()
-                ThumbManager().download_vid(all_videos_added)
+            ImportFolderScanner().scan()
         else:
             print("Did not acquire lock for import.")