From b76fa693968a8d28e21f08a46ea7993b49768173 Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 8 Aug 2022 19:18:27 +0700 Subject: [PATCH] WIP: rewrite ManualImport, identify and archive --- tubearchivist/home/src/index/filesystem.py | 164 ++++++--------------- tubearchivist/home/tasks.py | 7 +- 2 files changed, 51 insertions(+), 120 deletions(-) diff --git a/tubearchivist/home/src/index/filesystem.py b/tubearchivist/home/src/index/filesystem.py index f4ec8fed..5e80ebc8 100644 --- a/tubearchivist/home/src/index/filesystem.py +++ b/tubearchivist/home/src/index/filesystem.py @@ -12,10 +12,9 @@ import shutil import subprocess from home.src.download.queue import PendingList -from home.src.download.yt_dlp_handler import VideoDownloader from home.src.es.connect import ElasticWrap from home.src.index.reindex import Reindex -from home.src.index.video import index_new_video +from home.src.index.video import YoutubeVideo, index_new_video from home.src.ta.config import AppConfig from home.src.ta.helper import clean_string, ignore_filelist from home.src.ta.ta_redis import RedisArchivist @@ -255,6 +254,8 @@ class ImportFolderScanner: self._convert_thumb(current_video) self._convert_video(current_video) + ManualImport(current_video, self.CONFIG).run() + def _detect_youtube_id(self, current_video): """find video id from filename or json""" print(current_video) @@ -450,132 +451,65 @@ class ImportFolderScanner: os.remove(current_path) -class ManualImportOld: - """import and indexing existing video files""" +class ManualImport: + """import single identified video""" - CONFIG = AppConfig().config - CACHE_DIR = CONFIG["application"]["cache_dir"] - IMPORT_DIR = os.path.join(CACHE_DIR, "import") + def __init__(self, current_video, config): + self.current_video = current_video + self.config = config - def __init__(self): - self.identified = self.import_folder_parser() + def run(self): + """run all""" + json_data = self.index_metadata() + self._move_to_archive(json_data) + self._cleanup() - def import_folder_parser(self): - """detect files in import folder""" - import_files = os.listdir(self.IMPORT_DIR) - to_import = ignore_filelist(import_files) - to_import.sort() - video_files = [i for i in to_import if not i.endswith(".json")] + def index_metadata(self): + """get metadata from yt or json""" + video = YoutubeVideo(self.current_video["video_id"]) + video.build_json( + youtube_meta_overwrite=self._get_info_json(), + media_path=self.current_video["media"], + ) + video.check_subtitles() + video.upload_to_es() - identified = [] + return video.json_data - for file_path in video_files: + def _get_info_json(self): + """read info_json from file""" + if not self.current_video["metadata"]: + return False - file_dict = {"video_file": file_path} - file_name, _ = os.path.splitext(file_path) + with open(self.current_video["metadata"], "r", encoding="utf-8") as f: + info_json = json.loads(f.read()) - matching_json = [ - i - for i in to_import - if i.startswith(file_name) and i.endswith(".json") - ] - if matching_json: - json_file = matching_json[0] - youtube_id = self.extract_id_from_json(json_file) - file_dict.update({"json_file": json_file}) - else: - youtube_id = self.extract_id_from_filename(file_name) - file_dict.update({"json_file": False}) + return info_json - file_dict.update({"youtube_id": youtube_id}) - identified.append(file_dict) + def _move_to_archive(self, json_data): + """move identified media file to archive""" + videos = self.config["application"]["videos"] - return identified + channel, file = os.path.split(json_data["media_url"]) + channel_folder = os.path.join(videos, channel) + if not os.path.exists(channel_folder): + os.makedirs(channel_folder) - @staticmethod - def extract_id_from_filename(file_name): - """ - look at the file name for the youtube id - expects filename ending in []. - """ - id_search = re.search(r"\[([a-zA-Z0-9_-]{11})\]$", file_name) - if id_search: - youtube_id = id_search.group(1) - return youtube_id + old_path = self.current_video["media"] + new_path = os.path.join(channel_folder, file) + shutil.move(old_path, new_path, copy_function=shutil.copyfile) - print("failed to extract youtube id for: " + file_name) - raise Exception + def _cleanup(self): + """cleanup leftover files""" + if os.path.exists(self.current_video["metadata"]): + os.remove(self.current_video["metadata"]) - def extract_id_from_json(self, json_file): - """open json file and extract id""" - json_path = os.path.join(self.CACHE_DIR, "import", json_file) - with open(json_path, "r", encoding="utf-8") as f: - json_content = f.read() + if os.path.exists(self.current_video["thumb"]): + os.remove(self.current_video["thumb"]) - youtube_id = json.loads(json_content)["id"] - - return youtube_id - - def process_import(self): - """go through identified media files""" - - all_videos_added = [] - - for media_file in self.identified: - json_file = media_file["json_file"] - video_file = media_file["video_file"] - youtube_id = media_file["youtube_id"] - - video_path = os.path.join(self.CACHE_DIR, "import", video_file) - - self.move_to_cache(video_path, youtube_id) - - # identify and archive - vid_dict = index_new_video(youtube_id) - VideoDownloader([youtube_id]).move_to_archive(vid_dict) - youtube_id = vid_dict["youtube_id"] - thumb_url = vid_dict["vid_thumb_url"] - all_videos_added.append((youtube_id, thumb_url)) - - # cleanup - if os.path.exists(video_path): - os.remove(video_path) - if json_file: - json_path = os.path.join(self.CACHE_DIR, "import", json_file) - os.remove(json_path) - - return all_videos_added - - def move_to_cache(self, video_path, youtube_id): - """move identified video file to cache, convert to mp4""" - file_name = os.path.split(video_path)[-1] - video_file, ext = os.path.splitext(file_name) - - # make sure youtube_id is in filename - if youtube_id not in video_file: - video_file = f"{video_file}_{youtube_id}" - - # move, convert if needed - if ext == ".mp4": - new_file = video_file + ext - dest_path = os.path.join(self.CACHE_DIR, "download", new_file) - shutil.move(video_path, dest_path, copy_function=shutil.copyfile) - else: - print(f"processing with ffmpeg: {video_file}") - new_file = video_file + ".mp4" - dest_path = os.path.join(self.CACHE_DIR, "download", new_file) - subprocess.run( - [ - "ffmpeg", - "-i", - video_path, - dest_path, - "-loglevel", - "warning", - "-stats", - ], - check=True, - ) + for subtitle_file in self.current_video["subtitle"]: + if os.path.exists(subtitle_file): + os.remove(subtitle_file) def scan_filesystem(): diff --git a/tubearchivist/home/tasks.py b/tubearchivist/home/tasks.py index 8f78d4dd..ede17f26 100644 --- a/tubearchivist/home/tasks.py +++ b/tubearchivist/home/tasks.py @@ -20,7 +20,7 @@ from home.src.download.yt_dlp_handler import VideoDownloader from home.src.es.index_setup import backup_all_indexes, restore_from_backup from home.src.index.channel import YoutubeChannel from home.src.index.filesystem import ( - ManualImport, + ImportFolderScanner, reindex_old_documents, scan_filesystem, ) @@ -150,10 +150,7 @@ def run_manual_import(): try: have_lock = my_lock.acquire(blocking=False) if have_lock: - import_handler = ManualImport() - if import_handler.identified: - all_videos_added = import_handler.process_import() - ThumbManager().download_vid(all_videos_added) + ImportFolderScanner().scan() else: print("Did not acquire lock form import.")