From 11067094b22d25d1153c1ffc33bc209f50d50cf7 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 25 Sep 2021 18:59:54 +0700 Subject: [PATCH] implement os.listdir sanitizer for hidden files, #30 --- tubearchivist/home/src/download.py | 14 ++++++++++---- tubearchivist/home/src/helper.py | 16 +++++++++++++++- tubearchivist/home/src/index_management.py | 5 ++++- tubearchivist/home/src/reindex.py | 12 ++++++++---- tubearchivist/home/src/searching.py | 7 +++++-- 5 files changed, 42 insertions(+), 12 deletions(-) diff --git a/tubearchivist/home/src/download.py b/tubearchivist/home/src/download.py index f8cb72a..134d891 100644 --- a/tubearchivist/home/src/download.py +++ b/tubearchivist/home/src/download.py @@ -18,6 +18,7 @@ from home.src.helper import ( DurationConverter, RedisQueue, clean_string, + ignore_filelist, set_message, ) from home.src.index import YoutubeChannel, index_new_video @@ -219,11 +220,13 @@ class PendingList: def get_all_downloaded(self): """get a list of all videos in archive""" - all_channel_folders = os.listdir(self.VIDEOS) + channel_folders = os.listdir(self.VIDEOS) + all_channel_folders = ignore_filelist(channel_folders) all_downloaded = [] for channel_folder in all_channel_folders: channel_path = os.path.join(self.VIDEOS, channel_folder) - all_videos = os.listdir(channel_path) + videos = os.listdir(channel_path) + all_videos = ignore_filelist(videos) youtube_vids = [i[9:20] for i in all_videos] for youtube_id in youtube_vids: all_downloaded.append(youtube_id) @@ -506,7 +509,8 @@ class VideoDownloader: # check if already in cache to continue from there cache_dir = self.config["application"]["cache_dir"] - all_cached = os.listdir(cache_dir + "/download/") + cached = os.listdir(cache_dir + "/download/") + all_cached = ignore_filelist(cached) for file_name in all_cached: if youtube_id in file_name: obs["outtmpl"] = cache_dir + "/download/" + file_name @@ -531,7 +535,9 @@ class VideoDownloader: os.makedirs(new_folder, exist_ok=True) # find real filename cache_dir = self.config["application"]["cache_dir"] - for file_str in os.listdir(cache_dir + "/download"): + cached = os.listdir(cache_dir + "/download/") + all_cached = ignore_filelist(cached) + for file_str in all_cached: if youtube_id in file_str: old_file = file_str old_file_path = os.path.join(cache_dir, "download", old_file) diff --git a/tubearchivist/home/src/helper.py b/tubearchivist/home/src/helper.py index 2f27d55..423cde3 100644 --- a/tubearchivist/home/src/helper.py +++ b/tubearchivist/home/src/helper.py @@ -40,6 +40,19 @@ def clean_string(file_name): return cleaned +def ignore_filelist(filelist): + """ignore temp files for os.listdir sanitizer""" + to_ignore = ["Icon\r\r", "Temporary Items", "Network Trash Folder"] + cleaned = [] + for file_name in filelist: + if file_name.startswith(".") or file_name in to_ignore: + continue + + cleaned.append(file_name) + + return cleaned + + def process_url_list(url_str): """parse url_list to find valid youtube video or channel ids""" to_replace = ["watch?v=", "playlist?list="] @@ -118,7 +131,8 @@ def monitor_cache_dir(cache_dir): look at download cache dir directly as alternative progress info """ dl_cache = os.path.join(cache_dir, "download") - cache_file = os.listdir(dl_cache) + all_cache_file = os.listdir(dl_cache) + cache_file = ignore_filelist(all_cache_file) if cache_file: filename = cache_file[0][12:].replace("_", " ").split(".")[0] mess_dict = { diff --git a/tubearchivist/home/src/index_management.py b/tubearchivist/home/src/index_management.py index af26cc9..9ebd59e 100644 --- a/tubearchivist/home/src/index_management.py +++ b/tubearchivist/home/src/index_management.py @@ -13,6 +13,7 @@ from datetime import datetime import requests from home.src.config import AppConfig +from home.src.helper import ignore_filelist # expected mapping and settings INDEX_CONFIG = [ @@ -433,9 +434,11 @@ class ElasticBackup: """extract backup zip and return filelist""" cache_dir = self.config["application"]["cache_dir"] backup_dir = os.path.join(cache_dir, "backup") + backup_files = os.listdir(backup_dir) + all_backup_files = ignore_filelist(backup_files) all_available_backups = [ i - for i in os.listdir(backup_dir) + for i in all_backup_files if i.startswith("ta_") and i.endswith(".zip") ] all_available_backups.sort() diff --git a/tubearchivist/home/src/reindex.py b/tubearchivist/home/src/reindex.py index 3a8a1cc..edc0a9d 100644 --- a/tubearchivist/home/src/reindex.py +++ b/tubearchivist/home/src/reindex.py @@ -21,6 +21,7 @@ from home.src.helper import ( clean_string, get_message, get_total_hits, + ignore_filelist, set_message, ) from home.src.index import YoutubeChannel, YoutubeVideo, index_new_video @@ -209,12 +210,15 @@ class FilesystemScanner: def get_all_downloaded(self): """get a list of all video files downloaded""" - all_channels = os.listdir(self.VIDEOS) + channels = os.listdir(self.VIDEOS) + all_channels = ignore_filelist(channels) all_channels.sort() all_downloaded = [] for channel_name in all_channels: channel_path = os.path.join(self.VIDEOS, channel_name) - for video in os.listdir(channel_path): + videos = os.listdir(channel_path) + all_videos = ignore_filelist(videos) + for video in all_videos: youtube_id = video[9:20] all_downloaded.append((channel_name, video, youtube_id)) @@ -339,8 +343,8 @@ class ManualImport: def import_folder_parser(self): """detect files in import folder""" - - to_import = os.listdir(self.IMPORT_DIR) + import_files = os.listdir(self.IMPORT_DIR) + to_import = ignore_filelist(import_files) to_import.sort() video_files = [i for i in to_import if not i.endswith(".json")] diff --git a/tubearchivist/home/src/searching.py b/tubearchivist/home/src/searching.py index 2a1f953..b63f3f5 100644 --- a/tubearchivist/home/src/searching.py +++ b/tubearchivist/home/src/searching.py @@ -13,6 +13,7 @@ from datetime import datetime import requests from home.src.config import AppConfig +from home.src.helper import ignore_filelist from PIL import Image @@ -105,7 +106,8 @@ class SearchHandler: def cache_dl_vids(self, all_videos): """video thumbs links for cache""" vid_cache = os.path.join(self.CACHE_DIR, "videos") - all_vid_cached = os.listdir(vid_cache) + vid_cached = os.listdir(vid_cache) + all_vid_cached = ignore_filelist(vid_cached) # videos for video_dict in all_videos: youtube_id = video_dict["youtube_id"] @@ -124,7 +126,8 @@ class SearchHandler: def cache_dl_chan(self, all_channels): """download channel thumbs""" chan_cache = os.path.join(self.CACHE_DIR, "channels") - all_chan_cached = os.listdir(chan_cache) + chan_cached = os.listdir(chan_cache) + all_chan_cached = ignore_filelist(chan_cached) for channel_dict in all_channels: channel_id_cache = channel_dict["channel_id"] channel_banner_url = channel_dict["chan_banner"]