diff --git a/tubearchivist/config/asgi.py b/tubearchivist/config/asgi.py
index 8dad602..c31b0ec 100644
--- a/tubearchivist/config/asgi.py
+++ b/tubearchivist/config/asgi.py
@@ -11,6 +11,6 @@ import os
 
 from django.core.asgi import get_asgi_application
 
-os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings')
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
 
 application = get_asgi_application()
diff --git a/tubearchivist/config/settings.py b/tubearchivist/config/settings.py
index 15b1c8c..655a385 100644
--- a/tubearchivist/config/settings.py
+++ b/tubearchivist/config/settings.py
@@ -21,67 +21,67 @@ BASE_DIR = Path(__file__).resolve().parent.parent
 # See https://docs.djangoproject.com/en/3.2/howto/deployment/checklist/
 
 # SECURITY WARNING: keep the secret key used in production secret!
-SECRET_KEY = 'Fvid^aUL6LohRZz*kZFvq85B&JW&kB9o*#jdzWsdWE8*XkCLR8'
+SECRET_KEY = "Fvid^aUL6LohRZz*kZFvq85B&JW&kB9o*#jdzWsdWE8*XkCLR8"
 
 # SECURITY WARNING: don't run with debug turned on in production!
-DEBUG = bool(environ.get('DJANGO_DEBUG'))
+DEBUG = bool(environ.get("DJANGO_DEBUG"))
 
-ALLOWED_HOSTS = ['*']
+ALLOWED_HOSTS = ["*"]
 
 
 # Application definition
 
 INSTALLED_APPS = [
-    'home.apps.HomeConfig',
-    'django.contrib.admin',
-    'django.contrib.auth',
-    'django.contrib.contenttypes',
-    'django.contrib.sessions',
-    'django.contrib.messages',
-    'whitenoise.runserver_nostatic',
-    'django.contrib.staticfiles',
-    'django.contrib.humanize'
+    "home.apps.HomeConfig",
+    "django.contrib.admin",
+    "django.contrib.auth",
+    "django.contrib.contenttypes",
+    "django.contrib.sessions",
+    "django.contrib.messages",
+    "whitenoise.runserver_nostatic",
+    "django.contrib.staticfiles",
+    "django.contrib.humanize",
 ]
 
 MIDDLEWARE = [
-    'django.middleware.security.SecurityMiddleware',
-    'django.contrib.sessions.middleware.SessionMiddleware',
-    'whitenoise.middleware.WhiteNoiseMiddleware',
-    'django.middleware.common.CommonMiddleware',
-    'django.middleware.csrf.CsrfViewMiddleware',
-    'django.contrib.auth.middleware.AuthenticationMiddleware',
-    'django.contrib.messages.middleware.MessageMiddleware',
-    'django.middleware.clickjacking.XFrameOptionsMiddleware',
+    "django.middleware.security.SecurityMiddleware",
+    "django.contrib.sessions.middleware.SessionMiddleware",
+    "whitenoise.middleware.WhiteNoiseMiddleware",
+    "django.middleware.common.CommonMiddleware",
+    "django.middleware.csrf.CsrfViewMiddleware",
+    "django.contrib.auth.middleware.AuthenticationMiddleware",
+    "django.contrib.messages.middleware.MessageMiddleware",
+    "django.middleware.clickjacking.XFrameOptionsMiddleware",
 ]
 
-ROOT_URLCONF = 'config.urls'
+ROOT_URLCONF = "config.urls"
 
 TEMPLATES = [
     {
-        'BACKEND': 'django.template.backends.django.DjangoTemplates',
-        'DIRS': [],
-        'APP_DIRS': True,
-        'OPTIONS': {
-            'context_processors': [
-                'django.template.context_processors.debug',
-                'django.template.context_processors.request',
-                'django.contrib.auth.context_processors.auth',
-                'django.contrib.messages.context_processors.messages',
+        "BACKEND": "django.template.backends.django.DjangoTemplates",
+        "DIRS": [],
+        "APP_DIRS": True,
+        "OPTIONS": {
+            "context_processors": [
+                "django.template.context_processors.debug",
+                "django.template.context_processors.request",
+                "django.contrib.auth.context_processors.auth",
+                "django.contrib.messages.context_processors.messages",
             ],
         },
     },
 ]
 
-WSGI_APPLICATION = 'config.wsgi.application'
+WSGI_APPLICATION = "config.wsgi.application"
 
 
 # Database
 # https://docs.djangoproject.com/en/3.2/ref/settings/#databases
 
 DATABASES = {
-    'default': {
-        'ENGINE': 'django.db.backends.sqlite3',
-        'NAME': BASE_DIR / 'db.sqlite3',
+    "default": {
+        "ENGINE": "django.db.backends.sqlite3",
+        "NAME": BASE_DIR / "db.sqlite3",
     }
 }
 
@@ -91,16 +91,16 @@ DATABASES = {
 
 AUTH_PASSWORD_VALIDATORS = [
     {
-        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',  # noqa: E501
+        "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",  # noqa: E501
    },
     {
-        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',  # noqa: E501
+        "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",  # noqa: E501
    },
     {
-        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',  # noqa: E501
+        "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",  # noqa: E501
    },
     {
-        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',  # noqa: E501
+        "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",  # noqa: E501
    },
 ]
 
@@ -108,9 +108,9 @@ AUTH_PASSWORD_VALIDATORS = [
 # Internationalization
 # https://docs.djangoproject.com/en/3.2/topics/i18n/
 
-LANGUAGE_CODE = 'en-us'
+LANGUAGE_CODE = "en-us"
 
-TIME_ZONE = 'UTC'
+TIME_ZONE = "UTC"
 
 USE_I18N = True
 
@@ -122,7 +122,7 @@ USE_TZ = True
 # Static files (CSS, JavaScript, Images)
 # https://docs.djangoproject.com/en/3.2/howto/static-files/
 
-STATIC_URL = '/static/'
+STATIC_URL = "/static/"
 
 # STATICFILES_DIRS = [
 #     str(BASE_DIR.joinpath('static')),
@@ -130,15 +130,15 @@ STATIC_URL = '/static/'
 # ]
 # STATIC_URL = '/static/'
 
-STATICFILES_DIRS = (str(BASE_DIR.joinpath('static')),)
+STATICFILES_DIRS = (str(BASE_DIR.joinpath("static")),)
 # MEDIA_ROOT = str(BASE_DIR.joinpath('media'))
 # MEDIA_URL = '/media/'
-STATIC_ROOT = str(BASE_DIR.joinpath('staticfiles'))
+STATIC_ROOT = str(BASE_DIR.joinpath("staticfiles"))
 
-STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage'
+STATICFILES_STORAGE = "whitenoise.storage.CompressedManifestStaticFilesStorage"
 
 
 # Default primary key field type
 # https://docs.djangoproject.com/en/3.2/ref/settings/#default-auto-field
 
-DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
+DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
diff --git a/tubearchivist/config/urls.py b/tubearchivist/config/urls.py
index ed6423e..b857f01 100644
--- a/tubearchivist/config/urls.py
+++ b/tubearchivist/config/urls.py
@@ -17,6 +17,6 @@ from django.contrib import admin
 from django.urls import include, path
 
 urlpatterns = [
-    path('', include('home.urls')),
-    path('admin/', admin.site.urls),
+    path("", include("home.urls")),
+    path("admin/", admin.site.urls),
 ]
diff --git a/tubearchivist/config/wsgi.py b/tubearchivist/config/wsgi.py
index 8915671..3d6fa1b 100644
--- a/tubearchivist/config/wsgi.py
+++ b/tubearchivist/config/wsgi.py
@@ -11,6 +11,6 @@ import os
 
 from django.core.wsgi import get_wsgi_application
 
-os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings')
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
 
 application = get_wsgi_application()
diff --git a/tubearchivist/home/__init__.py b/tubearchivist/home/__init__.py
index fd57cb2..bc4b3d6 100644
--- a/tubearchivist/home/__init__.py
+++ b/tubearchivist/home/__init__.py
@@ -10,24 +10,24 @@ from .tasks import app as celery_app
 
 
 def sync_redis_state():
-    """ make sure redis gets the config.json values """
-    print('sync redis')
+    """make sure redis gets the config.json values"""
+    print("sync redis")
     config_handler = AppConfig()
     config_handler.load_new_defaults()
     config = config_handler.config
-    sort_order = config['archive']['sort']
-    set_message('sort_order', sort_order, expire=False)
-    hide_watched = bool(int(config['archive']['hide_watched']))
-    set_message('hide_watched', hide_watched, expire=False)
-    show_subed_only = bool(int(config['archive']['show_subed_only']))
-    set_message('show_subed_only', show_subed_only, expire=False)
+    sort_order = config["archive"]["sort"]
+    set_message("sort_order", sort_order, expire=False)
+    hide_watched = bool(int(config["archive"]["hide_watched"]))
+    set_message("hide_watched", hide_watched, expire=False)
+    show_subed_only = bool(int(config["archive"]["show_subed_only"]))
+    set_message("show_subed_only", show_subed_only, expire=False)
 
 
 def make_folders():
-    """ make needed cache folders here so docker doesn't mess it up """
-    folders = ['download', 'channels', 'videos', 'import', 'backup']
+    """make needed cache folders here so docker doesn't mess it up"""
+    folders = ["download", "channels", "videos", "import", "backup"]
     config = AppConfig().config
-    cache_dir = config['application']['cache_dir']
+    cache_dir = config["application"]["cache_dir"]
     for folder in folders:
         folder_path = os.path.join(cache_dir, folder)
         try:
@@ -36,7 +36,7 @@ def make_folders():
             continue
 
 
-__all__ = ('celery_app',)
+__all__ = ("celery_app",)
 make_folders()
 sync_redis_state()
 index_check()
diff --git a/tubearchivist/home/apps.py b/tubearchivist/home/apps.py
index e5ea0af..e7d1c7e 100644
--- a/tubearchivist/home/apps.py
+++ b/tubearchivist/home/apps.py
@@ -2,5 +2,5 @@ from django.apps import AppConfig
 
 
 class HomeConfig(AppConfig):
-    default_auto_field = 'django.db.models.BigAutoField'
-    name = 'home'
+    default_auto_field = "django.db.models.BigAutoField"
+    name = "home"
diff --git a/tubearchivist/home/src/config.py b/tubearchivist/home/src/config.py
index fbd5f58..4485882 100644
--- a/tubearchivist/home/src/config.py
+++ b/tubearchivist/home/src/config.py
@@ -12,71 +12,71 @@ from home.src.helper import get_message, set_message
 
 
 class AppConfig:
-    """ handle user settings and application variables """
+    """handle user settings and application variables"""
 
     def __init__(self):
         self.config = self.get_config()
 
     def get_config(self):
-        """ get config from default file or redis if changed """
+        """get config from default file or redis if changed"""
         config = self.get_config_redis()
         if not config:
             config = self.get_config_file()
 
-        config['application'].update(self.get_config_env())
+        config["application"].update(self.get_config_env())
 
         return config
 
     def get_config_file(self):
-        """ read the defaults from config.json """
-        with open('home/config.json', 'r', encoding="utf-8") as f:
+        """read the defaults from config.json"""
+        with open("home/config.json", "r", encoding="utf-8") as f:
             config_str = f.read()
             config_file = json.loads(config_str)
 
-        config_file['application'].update(self.get_config_env())
+        config_file["application"].update(self.get_config_env())
 
         return config_file
 
     @staticmethod
     def get_config_env():
-        """ read environment application variables """
+        """read environment application variables"""
         application = {
-            'REDIS_HOST': os.environ.get('REDIS_HOST'),
-            'es_url': os.environ.get('ES_URL'),
-            'HOST_UID': int(os.environ.get('HOST_UID')),
-            'HOST_GID': int(os.environ.get('HOST_GID'))
+            "REDIS_HOST": os.environ.get("REDIS_HOST"),
+            "es_url": os.environ.get("ES_URL"),
+            "HOST_UID": int(os.environ.get("HOST_UID")),
+            "HOST_GID": int(os.environ.get("HOST_GID")),
         }
 
         return application
 
     @staticmethod
     def get_config_redis():
-        """ read config json set from redis to overwrite defaults """
-        config = get_message('config')
+        """read config json set from redis to overwrite defaults"""
+        config = get_message("config")
         if not list(config.values())[0]:
             return False
 
         return config
 
     def update_config(self, form_post):
-        """ update config values from settings form """
+        """update config values from settings form"""
         config = self.config
         for key, value in form_post.items():
             to_write = value[0]
             if len(to_write):
-                if to_write == '0':
+                if to_write == "0":
                     to_write = False
-                elif to_write == '1':
+                elif to_write == "1":
                     to_write = True
                 elif to_write.isdigit():
                     to_write = int(to_write)
 
-                config_dict, config_value = key.split('.')
+                config_dict, config_value = key.split(".")
                 config[config_dict][config_value] = to_write
 
-        set_message('config', config, expire=False)
+        set_message("config", config, expire=False)
 
     def load_new_defaults(self):
-        """ check config.json for missing defaults """
+        """check config.json for missing defaults"""
         default_config = self.get_config_file()
         redis_config = self.get_config_redis()
 
@@ -100,4 +100,4 @@ class AppConfig:
                     needs_update = True
 
         if needs_update:
-            set_message('config', redis_config, expire=False)
+            set_message("config", redis_config, expire=False)
diff --git a/tubearchivist/home/src/download.py b/tubearchivist/home/src/download.py
index f035f94..4c61546 100644
--- a/tubearchivist/home/src/download.py
+++ b/tubearchivist/home/src/download.py
@@ -19,15 +19,15 @@ from home.src.index import YoutubeChannel, index_new_video
 
 
 class PendingList:
-    """ manage the pending videos list """
+    """manage the pending videos list"""
 
     CONFIG = AppConfig().config
-    ES_URL = CONFIG['application']['es_url']
-    VIDEOS = CONFIG['application']['videos']
+    ES_URL = CONFIG["application"]["es_url"]
+    VIDEOS = CONFIG["application"]["videos"]
 
     @staticmethod
     def parse_url_list(youtube_ids):
-        """ extract youtube ids from list """
+        """extract youtube ids from list"""
         missing_videos = []
         for entry in youtube_ids:
             # notify
@@ -35,31 +35,31 @@ class PendingList:
                 "status": "pending",
                 "level": "info",
                 "title": "Adding to download queue.",
-                "message": 'Extracting lists'
+                "message": "Extracting lists",
             }
-            set_message('progress:download', mess_dict)
+            set_message("progress:download", mess_dict)
             # extract
-            url = entry['url']
-            url_type = entry['type']
-            if url_type == 'video':
+            url = entry["url"]
+            url_type = entry["type"]
+            if url_type == "video":
                 missing_videos.append(url)
-            elif url_type == 'channel':
+            elif url_type == "channel":
                 youtube_ids = ChannelSubscription().get_last_youtube_videos(
                     url, limit=False
                 )
                 missing_videos = missing_videos + youtube_ids
-            elif url_type == 'playlist':
+            elif url_type == "playlist":
                 youtube_ids = playlist_extractor(url)
                 missing_videos = missing_videos + youtube_ids
 
         return missing_videos
 
     def add_to_pending(self, missing_videos):
-        """ build the bulk json data from pending """
+        """build the bulk json data from pending"""
         # check if channel is indexed
         channel_handler = ChannelSubscription()
         all_indexed = channel_handler.get_channels(subscribed_only=False)
-        all_channel_ids = [i['channel_id'] for i in all_indexed]
+        all_channel_ids = [i["channel_id"] for i in all_indexed]
         # check if already there
         all_downloaded = self.get_all_downloaded()
         # loop
@@ -77,11 +77,11 @@ class PendingList:
             if not video:
                 continue
 
-            if video['channel_id'] in all_channel_ids:
-                video['channel_indexed'] = True
+            if video["channel_id"] in all_channel_ids:
+                video["channel_indexed"] = True
             else:
-                video['channel_indexed'] = False
-            video['status'] = "pending"
+                video["channel_indexed"] = False
+            video["status"] = "pending"
             action = {"create": {"_id": youtube_id, "_index": "ta_download"}}
             bulk_list.append(json.dumps(action))
             bulk_list.append(json.dumps(video))
@@ -90,128 +90,130 @@ class PendingList:
                 "status": "pending",
                 "level": "info",
                 "title": "Adding to download queue.",
-                "message": 'Processing IDs...'
+                "message": "Processing IDs...",
             }
-            set_message('progress:download', mess_dict)
+            set_message("progress:download", mess_dict)
         # add last newline
-        bulk_list.append('\n')
-        query_str = '\n'.join(bulk_list)
-        headers = {'Content-type': 'application/x-ndjson'}
-        url = self.ES_URL + '/_bulk'
+        bulk_list.append("\n")
+        query_str = "\n".join(bulk_list)
+        headers = {"Content-type": "application/x-ndjson"}
+        url = self.ES_URL + "/_bulk"
         request = requests.post(url, data=query_str, headers=headers)
         if not request.ok:
             print(request)
 
     @staticmethod
     def get_youtube_details(youtube_id):
-        """ get details from youtubedl for single pending video """
+        """get details from youtubedl for single pending video"""
         obs = {
-            'default_search': 'ytsearch',
-            'quiet': True,
-            'skip_download': True,
+            "default_search": "ytsearch",
+            "quiet": True,
+            "skip_download": True,
         }
         try:
             vid = youtube_dl.YoutubeDL(obs).extract_info(youtube_id)
         except youtube_dl.utils.DownloadError:
-            print('failed to extract info for: ' + youtube_id)
+            print("failed to extract info for: " + youtube_id)
            return False
        # parse response
-        seconds = vid['duration']
+        seconds = vid["duration"]
         duration_str = DurationConverter.get_str(seconds)
-        upload_date = vid['upload_date']
+        upload_date = vid["upload_date"]
         upload_dt = datetime.strptime(upload_date, "%Y%m%d")
         published = upload_dt.strftime("%Y-%m-%d")
         # build dict
         youtube_details = {
             "youtube_id": youtube_id,
-            "channel_name": vid['channel'],
-            "vid_thumb_url": vid['thumbnail'],
-            "title": vid['title'],
-            "channel_id": vid['channel_id'],
+            "channel_name": vid["channel"],
+            "vid_thumb_url": vid["thumbnail"],
+            "title": vid["title"],
+            "channel_id": vid["channel_id"],
             "duration": duration_str,
             "published": published,
-            "timestamp": int(datetime.now().strftime("%s"))
+            "timestamp": int(datetime.now().strftime("%s")),
         }
         return youtube_details
 
     def get_all_pending(self):
-        """ get a list of all pending videos in ta_download """
-        headers = {'Content-type': 'application/json'}
+        """get a list of all pending videos in ta_download"""
+        headers = {"Content-type": "application/json"}
         # get PIT ID
-        url = self.ES_URL + '/ta_download/_pit?keep_alive=1m'
+        url = self.ES_URL + "/ta_download/_pit?keep_alive=1m"
         response = requests.post(url)
         json_data = json.loads(response.text)
-        pit_id = json_data['id']
+        pit_id = json_data["id"]
         # query
         data = {
-            "size": 50, "query": {"match_all": {}},
+            "size": 50,
+            "query": {"match_all": {}},
             "pit": {"id": pit_id, "keep_alive": "1m"},
-            "sort": [{"timestamp": {"order": "desc"}}]
+            "sort": [{"timestamp": {"order": "desc"}}],
         }
         query_str = json.dumps(data)
-        url = self.ES_URL + '/_search'
+        url = self.ES_URL + "/_search"
         all_pending = []
         all_ignore = []
         while True:
             response = requests.get(url, data=query_str, headers=headers)
             json_data = json.loads(response.text)
-            all_hits = json_data['hits']['hits']
+            all_hits = json_data["hits"]["hits"]
             if all_hits:
                 for hit in all_hits:
-                    youtube_id = hit['_source']['youtube_id']
-                    status = hit['_source']['status']
-                    if status == 'pending':
-                        all_pending.append(hit['_source'])
-                    elif status == 'ignore':
+                    youtube_id = hit["_source"]["youtube_id"]
+                    status = hit["_source"]["status"]
+                    if status == "pending":
+                        all_pending.append(hit["_source"])
+                    elif status == "ignore":
                         all_ignore.append(youtube_id)
-                    search_after = hit['sort']
+                    search_after = hit["sort"]
                 # update search_after with last hit data
-                data['search_after'] = search_after
+                data["search_after"] = search_after
                 query_str = json.dumps(data)
             else:
                 break
         # clean up PIT
         query_str = json.dumps({"id": pit_id})
-        requests.delete(self.ES_URL + '/_pit', data=query_str, headers=headers)
+        requests.delete(self.ES_URL + "/_pit", data=query_str, headers=headers)
         return all_pending, all_ignore
 
     def get_all_indexed(self):
-        """ get a list of all videos indexed """
-        headers = {'Content-type': 'application/json'}
+        """get a list of all videos indexed"""
+        headers = {"Content-type": "application/json"}
         # get PIT ID
-        url = self.ES_URL + '/ta_video/_pit?keep_alive=1m'
+        url = self.ES_URL + "/ta_video/_pit?keep_alive=1m"
         response = requests.post(url)
         json_data = json.loads(response.text)
-        pit_id = json_data['id']
+        pit_id = json_data["id"]
         # query
         data = {
-            "size": 500, "query": {"match_all": {}},
+            "size": 500,
+            "query": {"match_all": {}},
             "pit": {"id": pit_id, "keep_alive": "1m"},
-            "sort": [{"published": {"order": "desc"}}]
+            "sort": [{"published": {"order": "desc"}}],
         }
         query_str = json.dumps(data)
-        url = self.ES_URL + '/_search'
+        url = self.ES_URL + "/_search"
         all_indexed = []
         while True:
             response = requests.get(url, data=query_str, headers=headers)
             json_data = json.loads(response.text)
-            all_hits = json_data['hits']['hits']
+            all_hits = json_data["hits"]["hits"]
             if all_hits:
                 for hit in all_hits:
                     all_indexed.append(hit)
-                    search_after = hit['sort']
+                    search_after = hit["sort"]
                 # update search_after with last hit data
-                data['search_after'] = search_after
+                data["search_after"] = search_after
                 query_str = json.dumps(data)
             else:
                 break
         # clean up PIT
         query_str = json.dumps({"id": pit_id})
-        requests.delete(self.ES_URL + '/_pit', data=query_str, headers=headers)
+        requests.delete(self.ES_URL + "/_pit", data=query_str, headers=headers)
         return all_indexed
 
     def get_all_downloaded(self):
-        """ get a list of all videos in archive """
+        """get a list of all videos in archive"""
         all_channel_folders = os.listdir(self.VIDEOS)
         all_downloaded = []
         for channel_folder in all_channel_folders:
@@ -223,125 +225,131 @@ class PendingList:
         return all_downloaded
 
     def delete_from_pending(self, youtube_id):
-        """ delete the youtube_id from ta_download """
-        url = f'{self.ES_URL}/ta_download/_doc/{youtube_id}'
+        """delete the youtube_id from ta_download"""
+        url = f"{self.ES_URL}/ta_download/_doc/{youtube_id}"
         response = requests.delete(url)
         if not response.ok:
             print(response.text)
 
     def ignore_from_pending(self, ignore_list):
-        """ build the bulk query string """
+        """build the bulk query string"""
         stamp = int(datetime.now().strftime("%s"))
         bulk_list = []
 
         for youtube_id in ignore_list:
             action = {"update": {"_id": youtube_id, "_index": "ta_download"}}
-            source = {"doc": {"status": 'ignore', "timestamp": stamp}}
+            source = {"doc": {"status": "ignore", "timestamp": stamp}}
             bulk_list.append(json.dumps(action))
             bulk_list.append(json.dumps(source))
         # add last newline
-        bulk_list.append('\n')
-        query_str = '\n'.join(bulk_list)
+        bulk_list.append("\n")
+        query_str = "\n".join(bulk_list)
 
-        headers = {'Content-type': 'application/x-ndjson'}
-        url = self.ES_URL + '/_bulk'
+        headers = {"Content-type": "application/x-ndjson"}
+        url = self.ES_URL + "/_bulk"
         request = requests.post(url, data=query_str, headers=headers)
         mess_dict = {
             "status": "ignore",
             "level": "info",
             "title": "Added to ignore list",
-            "message": ''
+            "message": "",
         }
-        set_message('progress:download', mess_dict)
+        set_message("progress:download", mess_dict)
         if not request.ok:
             print(request)
 
 
 class ChannelSubscription:
-    """ manage the list of channels subscribed """
+    """manage the list of channels subscribed"""
 
     def __init__(self):
         config = AppConfig().config
-        self.es_url = config['application']['es_url']
-        self.channel_size = config['subscriptions']['channel_size']
+        self.es_url = config["application"]["es_url"]
+        self.channel_size = config["subscriptions"]["channel_size"]
 
     def get_channels(self, subscribed_only=True):
-        """ get a list of all channels subscribed to """
-        headers = {'Content-type': 'application/json'}
+        """get a list of all channels subscribed to"""
+        headers = {"Content-type": "application/json"}
         # get PIT ID
-        url = self.es_url + '/ta_channel/_pit?keep_alive=1m'
+        url = self.es_url + "/ta_channel/_pit?keep_alive=1m"
         response = requests.post(url)
         json_data = json.loads(response.text)
-        pit_id = json_data['id']
+        pit_id = json_data["id"]
         # query
         if subscribed_only:
             data = {
                 "query": {"term": {"channel_subscribed": {"value": True}}},
-                "size": 50, "pit": {"id": pit_id, "keep_alive": "1m"},
-                "sort": [{"channel_name.keyword": {"order": "asc"}}]
+                "size": 50,
+                "pit": {"id": pit_id, "keep_alive": "1m"},
+                "sort": [{"channel_name.keyword": {"order": "asc"}}],
             }
         else:
             data = {
                 "query": {"match_all": {}},
-                "size": 50, "pit": {"id": pit_id, "keep_alive": "1m"},
-                "sort": [{"channel_name.keyword": {"order": "asc"}}]
+                "size": 50,
+                "pit": {"id": pit_id, "keep_alive": "1m"},
+                "sort": [{"channel_name.keyword": {"order": "asc"}}],
             }
         query_str = json.dumps(data)
-        url = self.es_url + '/_search'
+        url = self.es_url + "/_search"
         all_channels = []
         while True:
             response = requests.get(url, data=query_str, headers=headers)
             json_data = json.loads(response.text)
-            all_hits = json_data['hits']['hits']
+            all_hits = json_data["hits"]["hits"]
             if all_hits:
                 for hit in all_hits:
-                    source = hit['_source']
-                    search_after = hit['sort']
+                    source = hit["_source"]
+                    search_after = hit["sort"]
                     all_channels.append(source)
                 # update search_after with last hit data
-                data['search_after'] = search_after
+                data["search_after"] = search_after
                 query_str = json.dumps(data)
             else:
                 break
         # clean up PIT
         query_str = json.dumps({"id": pit_id})
-        requests.delete(self.es_url + '/_pit', data=query_str, headers=headers)
+        requests.delete(self.es_url + "/_pit", data=query_str, headers=headers)
         return all_channels
 
     def get_last_youtube_videos(self, channel_id, limit=True):
-        """ get a list of last videos from channel """
-        url = f'https://www.youtube.com/channel/{channel_id}/videos'
+        """get a list of last videos from channel"""
+        url = f"https://www.youtube.com/channel/{channel_id}/videos"
         obs = {
-            'default_search': 'ytsearch', 'quiet': True,
-            'skip_download': True, 'extract_flat': True
+            "default_search": "ytsearch",
+            "quiet": True,
+            "skip_download": True,
+            "extract_flat": True,
         }
         if limit:
-            obs['playlistend'] = self.channel_size
+            obs["playlistend"] = self.channel_size
         chan = youtube_dl.YoutubeDL(obs).extract_info(url, download=False)
-        last_videos = [(i['id'], i['title']) for i in chan['entries']]
+        last_videos = [(i["id"], i["title"]) for i in chan["entries"]]
         return last_videos
 
     def find_missing(self):
-        """ add missing videos from subscribed channels to pending """
+        """add missing videos from subscribed channels to pending"""
         all_channels = self.get_channels()
         pending_handler = PendingList()
         all_pending, all_ignore = pending_handler.get_all_pending()
-        all_pending_ids = [i['youtube_id'] for i in all_pending]
+        all_pending_ids = [i["youtube_id"] for i in all_pending]
         all_downloaded = pending_handler.get_all_downloaded()
         to_ignore = all_pending_ids + all_ignore + all_downloaded
         missing_videos = []
         counter = 1
         for channel in all_channels:
-            channel_id = channel['channel_id']
+            channel_id = channel["channel_id"]
             last_videos = self.get_last_youtube_videos(channel_id)
-            set_message('progress:download', {
-                "status": "rescan",
-                "level": "info",
-                "title": "Rescanning: Looking for new videos.",
-                "message": f'Progress: {counter}/{len(all_channels)}'
-                }
+            set_message(
+                "progress:download",
+                {
+                    "status": "rescan",
+                    "level": "info",
+                    "title": "Rescanning: Looking for new videos.",
+                    "message": f"Progress: {counter}/{len(all_channels)}",
+                },
             )
             for video in last_videos:
                 youtube_id = video[0]
@@ -352,22 +360,22 @@ class ChannelSubscription:
         return missing_videos
 
     def change_subscribe(self, channel_id, channel_subscribed):
-        """ subscribe or unsubscribe from channel and update """
+        """subscribe or unsubscribe from channel and update"""
         if not isinstance(channel_subscribed, bool):
-            print('invalid status, should be bool')
+            print("invalid status, should be bool")
             return
-        headers = {'Content-type': 'application/json'}
+        headers = {"Content-type": "application/json"}
         channel_handler = YoutubeChannel(channel_id)
         channel_dict = channel_handler.channel_dict
-        channel_dict['channel_subscribed'] = channel_subscribed
+        channel_dict["channel_subscribed"] = channel_subscribed
         if channel_subscribed:
             # handle subscribe
-            url = self.es_url + '/ta_channel/_doc/' + channel_id
+            url = self.es_url + "/ta_channel/_doc/" + channel_id
             payload = json.dumps(channel_dict)
             print(channel_dict)
         else:
-            url = self.es_url + '/ta_channel/_update/' + channel_id
-            payload = json.dumps({'doc': channel_dict})
+            url = self.es_url + "/ta_channel/_update/" + channel_id
+            payload = json.dumps({"doc": channel_dict})
         # update channel
         request = requests.post(url, data=payload, headers=headers)
         if not request.ok:
@@ -377,27 +385,30 @@ class ChannelSubscription:
 
 
 def playlist_extractor(playlist_id):
-    """ return youtube_ids from a playlist_id """
-    url = 'https://www.youtube.com/playlist?list=' + playlist_id
+    """return youtube_ids from a playlist_id"""
+    url = "https://www.youtube.com/playlist?list=" + playlist_id
     obs = {
-        'default_search': 'ytsearch', 'quiet': True, 'ignoreerrors': True,
-        'skip_download': True, 'extract_flat': True
+        "default_search": "ytsearch",
+        "quiet": True,
+        "ignoreerrors": True,
+        "skip_download": True,
+        "extract_flat": True,
     }
     playlist = youtube_dl.YoutubeDL(obs).extract_info(url, download=False)
-    playlist_vids = [(i['id'], i['title']) for i in playlist['entries']]
+    playlist_vids = [(i["id"], i["title"]) for i in playlist["entries"]]
     return playlist_vids
 
 
 class VideoDownloader:
-    """ handle the video download functionality """
+    """handle the video download functionality"""
 
     def __init__(self, youtube_id_list):
        self.youtube_id_list = youtube_id_list
        self.config = AppConfig().config
 
     def download_list(self):
-        """ download the list of youtube_ids """
-        limit_count = self.config['downloads']['limit_count']
+        """download the list of youtube_ids"""
+        limit_count = self.config["downloads"]["limit_count"]
         if limit_count:
             self.youtube_id_list = self.youtube_id_list[:limit_count]
 
@@ -405,112 +416,118 @@ class VideoDownloader:
             try:
                 self.dl_single_vid(youtube_id)
             except youtube_dl.utils.DownloadError:
-                print('failed to download ' + youtube_id)
+                print("failed to download " + youtube_id)
                 continue
             vid_dict = index_new_video(youtube_id)
             self.move_to_archive(vid_dict)
             self.delete_from_pending(youtube_id)
-            if self.config['downloads']['sleep_interval']:
-                sleep(self.config['downloads']['sleep_interval'])
+            if self.config["downloads"]["sleep_interval"]:
+                sleep(self.config["downloads"]["sleep_interval"])
 
     @staticmethod
     def progress_hook(response):
-        """ process the progress_hooks from youtube_dl """
+        """process the progress_hooks from youtube_dl"""
         # title
-        filename = response['filename'][12:].replace('_', ' ')
+        filename = response["filename"][12:].replace("_", " ")
         title = "Downloading: " + os.path.split(filename)[-1]
         # message
         try:
-            percent = response['_percent_str']
-            size = response['_total_bytes_str']
-            speed = response['_speed_str']
-            eta = response['_eta_str']
-            message = f'{percent} of {size} at {speed} - time left: {eta}'
+            percent = response["_percent_str"]
+            size = response["_total_bytes_str"]
+            speed = response["_speed_str"]
+            eta = response["_eta_str"]
+            message = f"{percent} of {size} at {speed} - time left: {eta}"
         except KeyError:
-            message = ''
+            message = ""
         mess_dict = {
             "status": "downloading",
             "level": "info",
             "title": title,
-            "message": message
+            "message": message,
         }
-        set_message('progress:download', mess_dict)
+        set_message("progress:download", mess_dict)
 
     def dl_single_vid(self, youtube_id):
-        """ download single video """
+        """download single video"""
         obs = {
-            'default_search': 'ytsearch',
-            'merge_output_format': 'mp4', 'restrictfilenames': True,
-            'outtmpl': (self.config['application']['cache_dir'] +
-                        '/download/' +
-                        self.config['application']['file_template']),
-            'progress_hooks': [self.progress_hook],
-            'quiet': True, 'continuedl': True, 'retries': 3
+            "default_search": "ytsearch",
+            "merge_output_format": "mp4",
+            "restrictfilenames": True,
+            "outtmpl": (
+                self.config["application"]["cache_dir"]
+                + "/download/"
+                + self.config["application"]["file_template"]
+            ),
+            "progress_hooks": [self.progress_hook],
+            "quiet": True,
+            "continuedl": True,
+            "retries": 3,
         }
-        if self.config['downloads']['format']:
-            obs['format'] = self.config['downloads']['format']
-        if self.config['downloads']['limit_speed']:
-            obs['ratelimit'] = self.config['downloads']['limit_speed'] * 1024
+        if self.config["downloads"]["format"]:
+            obs["format"] = self.config["downloads"]["format"]
+        if self.config["downloads"]["limit_speed"]:
+            obs["ratelimit"] = self.config["downloads"]["limit_speed"] * 1024
         external = False
         if external:
-            obs['external_downloader'] = 'aria2c'
-
+            obs["external_downloader"] = "aria2c"
 
         postprocessors = []
 
-        if self.config['downloads']['add_metadata']:
-            postprocessors.append({
-                'key': 'FFmpegMetadata',
-                'add_chapters': True,
-                'add_metadata': True,
-            })
+        if self.config["downloads"]["add_metadata"]:
+            postprocessors.append(
+                {
+                    "key": "FFmpegMetadata",
+                    "add_chapters": True,
+                    "add_metadata": True,
+                }
            )
 
-        obs['postprocessors'] = postprocessors
+        obs["postprocessors"] = postprocessors
 
         # check if already in cache to continue from there
-        cache_dir = self.config['application']['cache_dir']
-        all_cached = os.listdir(cache_dir + '/download/')
+        cache_dir = self.config["application"]["cache_dir"]
+        all_cached = os.listdir(cache_dir + "/download/")
         for file_name in all_cached:
             if youtube_id in file_name:
-                obs['outtmpl'] = cache_dir + '/download/' + file_name
+                obs["outtmpl"] = cache_dir + "/download/" + file_name
         with youtube_dl.YoutubeDL(obs) as ydl:
             try:
                 ydl.download([youtube_id])
             except youtube_dl.utils.DownloadError:
-                print('retry failed download: ' + youtube_id)
+                print("retry failed download: " + youtube_id)
                 sleep(10)
                 ydl.download([youtube_id])
 
     def move_to_archive(self, vid_dict):
-        """ move downloaded video from cache to archive """
-        videos = self.config['application']['videos']
-        channel_name = vid_dict['channel']['channel_name']
+        """move downloaded video from cache to archive"""
+        videos = self.config["application"]["videos"]
+        channel_name = vid_dict["channel"]["channel_name"]
         channel_name_clean = clean_string(channel_name)
-        media_url = vid_dict['media_url']
-        youtube_id = vid_dict['youtube_id']
+        media_url = vid_dict["media_url"]
+        youtube_id = vid_dict["youtube_id"]
         # make archive folder
-        videos = self.config['application']['videos']
+        videos = self.config["application"]["videos"]
         new_folder = os.path.join(videos, channel_name_clean)
         os.makedirs(new_folder, exist_ok=True)
         # find real filename
-        cache_dir = self.config['application']['cache_dir']
-        for file_str in os.listdir(cache_dir + '/download'):
+        cache_dir = self.config["application"]["cache_dir"]
+        for file_str in os.listdir(cache_dir + "/download"):
             if youtube_id in file_str:
                 old_file = file_str
-        old_file_path = os.path.join(cache_dir, 'download', old_file)
+        old_file_path = os.path.join(cache_dir, "download", old_file)
         new_file_path = os.path.join(videos, media_url)
         # move and fix permission
         shutil.move(old_file_path, new_file_path)
         os.chown(
             new_file_path,
-            self.config['application']['HOST_UID'],
-            self.config['application']['HOST_GID']
+            self.config["application"]["HOST_UID"],
+            self.config["application"]["HOST_GID"],
         )
 
     def delete_from_pending(self, youtube_id):
-        """ delete downloaded video from pending index if its there """
-        es_url = self.config['application']['es_url']
-        url = f'{es_url}/ta_download/_doc/{youtube_id}'
+        """delete downloaded video from pending index if its there"""
+        es_url = self.config["application"]["es_url"]
+        url = f"{es_url}/ta_download/_doc/{youtube_id}"
         response = requests.delete(url)
         if not response.ok and not response.status_code == 404:
             print(response.text)
diff --git a/tubearchivist/home/src/helper.py b/tubearchivist/home/src/helper.py
index 195507f..80ff11f 100644
--- a/tubearchivist/home/src/helper.py
+++ b/tubearchivist/home/src/helper.py
@@ -13,53 +13,53 @@ import unicodedata
 import redis
 import requests
 
-REDIS_HOST = os.environ.get('REDIS_HOST')
+REDIS_HOST = os.environ.get("REDIS_HOST")
 
 
 def get_total_hits(index, es_url, match_field):
-    """ get total hits from index """
-    headers = {'Content-type': 'application/json'}
+    """get total hits from index"""
+    headers = {"Content-type": "application/json"}
     data = {"query": {"match": {match_field: True}}}
     payload = json.dumps(data)
-    url = f'{es_url}/{index}/_search?filter_path=hits.total'
+    url = f"{es_url}/{index}/_search?filter_path=hits.total"
     request = requests.post(url, data=payload, headers=headers)
     if not request.ok:
         print(request.text)
     total_json = json.loads(request.text)
-    total_hits = total_json['hits']['total']['value']
+    total_hits = total_json["hits"]["total"]["value"]
     return total_hits
 
 
 def clean_string(file_name):
-    """ clean string to only asci characters """
+    """clean string to only asci characters"""
     whitelist = "-_.() " + string.ascii_letters + string.digits
-    normalized = unicodedata.normalize('NFKD', file_name)
-    ascii_only = normalized.encode('ASCII', 'ignore').decode().strip()
-    white_listed = ''.join(c for c in ascii_only if c in whitelist)
-    cleaned = re.sub(r'[ ]{2,}', ' ', white_listed)
+    normalized = unicodedata.normalize("NFKD", file_name)
+    ascii_only = normalized.encode("ASCII", "ignore").decode().strip()
+    white_listed = "".join(c for c in ascii_only if c in whitelist)
+    cleaned = re.sub(r"[ ]{2,}", " ", white_listed)
     return cleaned
 
 
 def process_url_list(url_str):
-    """ parse url_list to find valid youtube video or channel ids """
-    to_replace = ['watch?v=', 'playlist?list=']
-    url_list = re.split('\n+', url_str[0])
+    """parse url_list to find valid youtube video or channel ids"""
+    to_replace = ["watch?v=", "playlist?list="]
+    url_list = re.split("\n+", url_str[0])
     youtube_ids = []
     for url in url_list:
-        url_clean = url.strip().strip('/').split('/')[-1]
+        url_clean = url.strip().strip("/").split("/")[-1]
         for i in to_replace:
-            url_clean = url_clean.replace(i, '')
-        url_no_param = url_clean.split('&')[0]
+            url_clean = url_clean.replace(i, "")
+        url_no_param = url_clean.split("&")[0]
         str_len = len(url_no_param)
         if str_len == 11:
-            link_type = 'video'
+            link_type = "video"
         elif str_len == 24:
-            link_type = 'channel'
+            link_type = "channel"
         elif str_len == 34:
-            link_type = 'playlist'
+            link_type = "playlist"
         else:
             # unable to parse
-            raise ValueError('not a valid url: ' + url)
+            raise ValueError("not a valid url: " + url)
 
         youtube_ids.append({"url": url_no_param, "type": link_type})
 
@@ -67,19 +67,17 @@ def process_url_list(url_str):
 
 
 def set_message(key, message, expire=True):
-    """ write new message to redis """
+    """write new message to redis"""
     redis_connection = redis.Redis(host=REDIS_HOST)
-    redis_connection.execute_command(
-        'JSON.SET', key, '.', json.dumps(message)
-    )
+    redis_connection.execute_command("JSON.SET", key, ".", json.dumps(message))
     if expire:
-        redis_connection.execute_command('EXPIRE', key, 20)
+        redis_connection.execute_command("EXPIRE", key, 20)
 
 
 def get_message(key):
-    """ get any message from JSON key """
+    """get any message from JSON key"""
     redis_connection = redis.Redis(host=REDIS_HOST)
-    reply = redis_connection.execute_command('JSON.GET', key)
+    reply = redis_connection.execute_command("JSON.GET", key)
     if reply:
         json_str = json.loads(reply)
     else:
@@ -88,9 +86,9 @@ def get_message(key):
 
 
 def get_dl_message(cache_dir):
-    """ get latest message if available """
+    """get latest message if available"""
     redis_connection = redis.Redis(host=REDIS_HOST)
-    reply = redis_connection.execute_command('JSON.GET', 'progress:download')
+    reply = redis_connection.execute_command("JSON.GET", "progress:download")
     if reply:
         json_str = json.loads(reply)
     elif json_str := monitor_cache_dir(cache_dir):
@@ -101,7 +99,7 @@ def get_dl_message(cache_dir):
 
 
 def get_lock(lock_key):
-    """ handle lock for task management """
+    """handle lock for task management"""
     redis_lock = redis.Redis(host=REDIS_HOST).lock(lock_key)
     return redis_lock
 
@@ -110,15 +108,15 @@ def monitor_cache_dir(cache_dir):
     """
     look at download cache dir directly as alternative progress info
     """
-    dl_cache = os.path.join(cache_dir, 'download')
+    dl_cache = os.path.join(cache_dir, "download")
     cache_file = os.listdir(dl_cache)
     if cache_file:
-        filename = cache_file[0][12:].replace('_', ' ').split('.')[0]
+        filename = cache_file[0][12:].replace("_", " ").split(".")[0]
         mess_dict = {
             "status": "downloading",
             "level": "info",
             "title": "Downloading: " + filename,
-            "message": ""
+            "message": "",
         }
     else:
         return False
@@ -133,27 +131,37 @@ class DurationConverter:
 
     @staticmethod
     def get_sec(file_path):
-        """ read duration from file """
-        duration = subprocess.run([
-            "ffprobe", "-v", "error", "-show_entries", "format=duration",
-            "-of", "default=noprint_wrappers=1:nokey=1", file_path
-        ], capture_output=True, check=True)
+        """read duration from file"""
+        duration = subprocess.run(
+            [
+                "ffprobe",
+                "-v",
+                "error",
+                "-show_entries",
+                "format=duration",
+                "-of",
+                "default=noprint_wrappers=1:nokey=1",
+                file_path,
+            ],
+            capture_output=True,
+            check=True,
+        )
         duration_sec = int(float(duration.stdout.decode().strip()))
         return duration_sec
 
     @staticmethod
     def get_str(duration_sec):
-        """ takes duration in sec and returns clean string """
+        """takes duration in sec and returns clean string"""
         hours = duration_sec // 3600
         minutes = (duration_sec - (hours * 3600)) // 60
         secs = duration_sec - (hours * 3600) - (minutes * 60)
 
         duration_str = str()
         if hours:
-            duration_str = str(hours).zfill(2) + ':'
+            duration_str = str(hours).zfill(2) + ":"
         if minutes:
-            duration_str = duration_str + str(minutes).zfill(2) + ':'
+            duration_str = duration_str + str(minutes).zfill(2) + ":"
         else:
-            duration_str = duration_str + '00:'
+            duration_str = duration_str + "00:"
         duration_str = duration_str + str(secs).zfill(2)
         return duration_str
diff --git a/tubearchivist/home/src/index.py b/tubearchivist/home/src/index.py
index 796e606..9c53e56 100644
--- a/tubearchivist/home/src/index.py
+++ b/tubearchivist/home/src/index.py
@@ -19,11 +19,11 @@ from home.src.helper import DurationConverter, clean_string
 
 
 class YoutubeChannel:
-    """ represents a single youtube channel """
+    """represents a single youtube channel"""
 
     CONFIG = AppConfig().config
-    ES_URL = CONFIG['application']['es_url']
-    CACHE_DIR = CONFIG['application']['cache_dir']
+    ES_URL = CONFIG["application"]["es_url"]
+    CACHE_DIR = CONFIG["application"]["cache_dir"]
 
     def __init__(self, channel_id):
         self.channel_id = channel_id
@@ -32,193 +32,187 @@ class YoutubeChannel:
         self.channel_dict = self.build_channel_dict()
 
     def build_channel_dict(self, scrape=False):
-        """ combine the dicts build from extracted json payload """
+        """combine the dicts build from extracted json payload"""
         if scrape:
             channel_dict = False
         else:
             channel_dict = self.get_es_channel()
         if not channel_dict:
-            print('scrape data from youtube')
+            print("scrape data from youtube")
             self.scrape_channel()
             channel_dict = self.parse_channel_main()
             channel_dict.update(self.parse_channel_meta())
-            self.source = 'scraped'
+            self.source = "scraped"
         return channel_dict
 
     def get_es_channel(self):
-        """ get from elastic search first if possible """
+        """get from elastic search first if possible"""
         channel_id = self.channel_id
-        url = f'{self.ES_URL}/ta_channel/_doc/{channel_id}'
+        url = f"{self.ES_URL}/ta_channel/_doc/{channel_id}"
         response = requests.get(url)
         if response.ok:
-            channel_source = response.json()['_source']
-            self.source = 'elastic'
+            channel_source = response.json()["_source"]
+            self.source = "elastic"
             return channel_source
         return False
 
     def scrape_channel(self):
-        """ scrape channel page for additional infos """
+        """scrape channel page for additional infos"""
         channel_id = self.channel_id
-        url = f'https://www.youtube.com/channel/{channel_id}/about?hl=en'
-        cookies = {
-            'CONSENT': 'YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx'
-        }
+        url = f"https://www.youtube.com/channel/{channel_id}/about?hl=en"
+        cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"}
         response = requests.get(url, cookies=cookies)
         if response.ok:
             channel_page = response.text
         else:
-            print(f'failed to extract channel info for: {channel_id}')
+            print(f"failed to extract channel info for: {channel_id}")
             raise ConnectionError
-        soup = BeautifulSoup(channel_page, 'html.parser')
+        soup = BeautifulSoup(channel_page, "html.parser")
         # load script into json
-        all_scripts = soup.find('body').find_all('script')
+        all_scripts = soup.find("body").find_all("script")
         for script in all_scripts:
-            if 'var ytInitialData = ' in str(script):
+            if "var ytInitialData = " in str(script):
                 script_content = str(script)
                 break
         # extract payload
-        script_content = script_content.split('var ytInitialData = ')[1]
-        json_raw = script_content.rstrip(';')
+        script_content = script_content.split("var ytInitialData = ")[1]
+        json_raw = script_content.rstrip(";")
         json_data = json.loads(json_raw)
         # add to self
         self.json_data = json_data
 
     def parse_channel_main(self):
-        """ extract maintab values from scraped channel json data """
-        main_tab = self.json_data['header']['c4TabbedHeaderRenderer']
-        channel_name = main_tab['title']
+        """extract maintab values from scraped channel json data"""
+        main_tab = self.json_data["header"]["c4TabbedHeaderRenderer"]
+        channel_name = main_tab["title"]
         last_refresh = int(datetime.now().strftime("%s"))
         # channel_subs
         try:
-            sub_text_simple = main_tab['subscriberCountText']['simpleText']
-            sub_text = sub_text_simple.split(' ')[0]
-            if sub_text[-1] == 'K':
-                channel_subs = int(float(sub_text.replace('K', ''))*1000)
-            elif sub_text[-1] == 'M':
-                channel_subs = int(float(sub_text.replace('M', ''))*1000000)
+            sub_text_simple = main_tab["subscriberCountText"]["simpleText"]
+            sub_text = sub_text_simple.split(" ")[0]
+            if sub_text[-1] == "K":
+                channel_subs = int(float(sub_text.replace("K", "")) * 1000)
+            elif sub_text[-1] == "M":
+                channel_subs = int(float(sub_text.replace("M", "")) * 1000000)
             elif int(sub_text) >= 0:
                 channel_subs = int(sub_text)
             else:
-                message = f'{sub_text} not dealt with'
+                message = f"{sub_text} not dealt with"
                 print(message)
         except KeyError:
             channel_subs = 0
         # banner
         try:
-            all_banners = main_tab['banner']['thumbnails']
-            banner = sorted(all_banners, key=lambda k: k['width'])[-1]['url']
+            all_banners = main_tab["banner"]["thumbnails"]
+            banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"]
         except KeyError:
             banner = False
         # build and return dict
         main_channel_dict = {
-            'channel_active': True,
-            'channel_last_refresh': last_refresh,
-            'channel_subs': channel_subs,
-            'channel_banner_url': banner,
-            'channel_name': channel_name,
-            'channel_id': self.channel_id
+            "channel_active": True,
+            "channel_last_refresh": last_refresh,
+            "channel_subs": channel_subs,
+            "channel_banner_url": banner,
+            "channel_name": channel_name,
+            "channel_id": self.channel_id,
         }
         return main_channel_dict
 
     def parse_channel_meta(self):
-        """ extract meta tab values from channel payload """
+        """extract meta tab values from channel payload"""
         # meta tab
         json_data = self.json_data
-        meta_tab = json_data['metadata']['channelMetadataRenderer']
-        description = meta_tab['description']
-        all_thumbs = meta_tab['avatar']['thumbnails']
-        thumb_url = sorted(all_thumbs, key=lambda k: k['width'])[-1]['url']
+        meta_tab = json_data["metadata"]["channelMetadataRenderer"]
+        description = meta_tab["description"]
+        all_thumbs = meta_tab["avatar"]["thumbnails"]
+        thumb_url = sorted(all_thumbs, key=lambda k: k["width"])[-1]["url"]
         # stats tab
-        renderer = 'twoColumnBrowseResultsRenderer'
-        all_tabs = json_data['contents'][renderer]['tabs']
+        renderer = "twoColumnBrowseResultsRenderer"
+        all_tabs = json_data["contents"][renderer]["tabs"]
         for tab in all_tabs:
-            if 'tabRenderer' in tab.keys():
-                if tab['tabRenderer']['title'] == 'About':
-                    about_tab = (tab['tabRenderer']['content']
-                                 ['sectionListRenderer']['contents'][0]
-                                 ['itemSectionRenderer']['contents'][0]
-                                 ['channelAboutFullMetadataRenderer'])
+            if "tabRenderer" in tab.keys():
+                if tab["tabRenderer"]["title"] == "About":
+                    about_tab = tab["tabRenderer"]["content"][
+                        "sectionListRenderer"
+                    ]["contents"][0]["itemSectionRenderer"]["contents"][0][
+                        "channelAboutFullMetadataRenderer"
+                    ]
                     break
         try:
-            channel_views_text = about_tab['viewCountText']['simpleText']
+            channel_views_text = about_tab["viewCountText"]["simpleText"]
             channel_views = int(re.sub(r"\D", "", channel_views_text))
         except KeyError:
             channel_views = 0
 
         meta_channel_dict = {
-            'channel_description': description,
-            'channel_thumb_url': thumb_url,
-            'channel_views': channel_views
+            "channel_description": description,
+            "channel_thumb_url": thumb_url,
+            "channel_views": channel_views,
         }
 
         return meta_channel_dict
 
     def upload_to_es(self):
-        """ upload channel data to elastic search """
-        url = f'{self.ES_URL}/ta_channel/_doc/{self.channel_id}'
+        """upload channel data to elastic search"""
+        url = f"{self.ES_URL}/ta_channel/_doc/{self.channel_id}"
         response = requests.put(url, json=self.channel_dict)
-        print(f'added {self.channel_id} to es')
+        print(f"added {self.channel_id} to es")
         if not response.ok:
             print(response.text)
 
     def clear_cache(self):
-        """ delete banner and thumb from cache if available """
-        channel_cache = os.path.join(self.CACHE_DIR, 'channels')
-        thumb = os.path.join(channel_cache, self.channel_id + '_thumb.jpg')
-        banner = os.path.join(channel_cache, self.channel_id + '_banner.jpg')
+        """delete banner and thumb from cache if available"""
+        channel_cache = os.path.join(self.CACHE_DIR, "channels")
+        thumb = os.path.join(channel_cache, self.channel_id + "_thumb.jpg")
+        banner = os.path.join(channel_cache, self.channel_id + "_banner.jpg")
         if os.path.exists(thumb):
             os.remove(thumb)
         if os.path.exists(banner):
             os.remove(banner)
 
     def sync_to_videos(self):
-        """ sync new channel_dict to all videos of channel """
-        headers = {'Content-type': 'application/json'}
+        """sync new channel_dict to all videos of channel"""
+        headers = {"Content-type": "application/json"}
         channel_id = self.channel_id
         # add ingest pipeline
         processors = []
         for field, value in self.channel_dict.items():
             line = {"set": {"field": "channel." + field, "value": value}}
             processors.append(line)
-        data = {
-            "description": channel_id,
-            "processors": processors
-        }
+        data = {"description": channel_id, "processors": processors}
         payload = json.dumps(data)
-        url = self.ES_URL + '/_ingest/pipeline/' + channel_id
+        url = self.ES_URL + "/_ingest/pipeline/" + channel_id
         request = requests.put(url, data=payload, headers=headers)
         if not request.ok:
             print(request.text)
         # apply pipeline
-        data = {
-            "query": {"match": {"channel.channel_id": channel_id}}
-        }
+        data = {"query": {"match": {"channel.channel_id": channel_id}}}
         payload = json.dumps(data)
-        url = self.ES_URL + '/ta_video/_update_by_query?pipeline=' + channel_id
+        url = self.ES_URL + "/ta_video/_update_by_query?pipeline=" + channel_id
         request = requests.post(url, data=payload, headers=headers)
         if not request.ok:
             print(request.text)
 
     def get_total_hits(self):
-        """ get total channels indexed """
-        headers = {'Content-type': 'application/json'}
+        """get total channels indexed"""
+        headers = {"Content-type": "application/json"}
         data = {"query": {"match_all": {}}}
         payload = json.dumps(data)
-        url = f'{self.ES_URL}/ta_channel/_search?filter_path=hits.total'
+        url = f"{self.ES_URL}/ta_channel/_search?filter_path=hits.total"
         request = requests.post(url, data=payload, headers=headers)
         if not request.ok:
             print(request.text)
-        total_hits = json.loads(request.text)['hits']['total']['value']
+        total_hits = json.loads(request.text)["hits"]["total"]["value"]
         return total_hits
 
 
 class YoutubeVideo:
-    """ represents a single youtube video """
+    """represents a single youtube video"""
 
     CONFIG = AppConfig().config
-    ES_URL = CONFIG['application']['es_url']
-    CACHE_DIR = CONFIG['application']['cache_dir']
-    VIDEOS = CONFIG['application']['videos']
+    ES_URL = CONFIG["application"]["es_url"]
+    CACHE_DIR = CONFIG["application"]["cache_dir"]
+    VIDEOS = CONFIG["application"]["videos"]
 
     def __init__(self, youtube_id):
         self.youtube_id = youtube_id
@@ -226,8 +220,8 @@ class YoutubeVideo:
         self.vid_dict = self.get_wrapper()
 
     def get_wrapper(self):
-        """ wrapper to loop around youtube_dl to retry on failure """
-        print(f'get video data for {self.youtube_id}')
+        """wrapper to loop around youtube_dl to retry on failure"""
+        print(f"get video data for {self.youtube_id}")
         for i in range(3):
             try:
                 vid_dict = self.get_youtubedl_vid_data()
@@ -241,63 +235,63 @@ class YoutubeVideo:
         return vid_dict
 
     def get_youtubedl_vid_data(self):
-        """ parse youtubedl extract info """
+        """parse youtubedl extract info"""
         youtube_id = self.youtube_id
         obs = {
-            'quiet': True,
-            'default_search': 'ytsearch',
-            'skip_download': True
+            "quiet": True,
+            "default_search": "ytsearch",
+            "skip_download": True,
         }
         try:
             vid = youtube_dl.YoutubeDL(obs).extract_info(youtube_id)
         except (
-                youtube_dl.utils.ExtractorError,
-                youtube_dl.utils.DownloadError
-        ):
-            print('failed to get info for ' + youtube_id)
+            youtube_dl.utils.ExtractorError,
+            youtube_dl.utils.DownloadError,
+        ):
+            print("failed to get info for " + youtube_id)
             return False
         # extract
-        self.channel_id = vid['channel_id']
-        upload_date = vid['upload_date']
+        self.channel_id = vid["channel_id"]
+        upload_date = vid["upload_date"]
         upload_date_time = datetime.strptime(upload_date, "%Y%m%d")
         published = upload_date_time.strftime("%Y-%m-%d")
         last_refresh = int(datetime.now().strftime("%s"))
         # likes
         try:
-            like_count = vid['like_count']
+            like_count = vid["like_count"]
         except KeyError:
             like_count = 0
         try:
-            dislike_count = vid['dislike_count']
+            dislike_count = vid["dislike_count"]
         except KeyError:
             dislike_count = 0
         # build dicts
         stats = {
-            "view_count": vid['view_count'],
+            "view_count": vid["view_count"],
             "like_count": like_count,
             "dislike_count": dislike_count,
-            "average_rating": vid['average_rating']
+            "average_rating": vid["average_rating"],
         }
         vid_basic = {
-            "title": vid['title'],
-            "description": vid['description'],
-            "category": vid['categories'],
-            "vid_thumb_url": vid['thumbnail'],
-            "tags": vid['tags'],
+            "title": vid["title"],
+            "description": vid["description"],
+            "category": vid["categories"],
+            "vid_thumb_url": vid["thumbnail"],
+            "tags": vid["tags"],
             "published": published,
             "stats": stats,
             "vid_last_refresh": last_refresh,
             "date_downloaded": last_refresh,
             "youtube_id": youtube_id,
             "active": True,
-            "channel": False
+            "channel": False,
         }
 
         return vid_basic
 
     def add_player(self, missing_vid):
-        """ add player information for new videos """
-        cache_path = self.CACHE_DIR + '/download/'
+        """add player information for new videos"""
+        cache_path = self.CACHE_DIR + "/download/"
         videos = self.VIDEOS
 
         if missing_vid:
@@ -318,24 +312,24 @@ class YoutubeVideo:
         player = {
             "watched": False,
             "duration": duration,
-            "duration_str": duration_str
+            "duration_str": duration_str,
         }
-        self.vid_dict['player'] = player
+        self.vid_dict["player"] = player
 
     def build_file_path(self, channel_name):
-        """ build media_url from where file will be located """
+        """build media_url from where file will be located"""
         clean_channel_name = clean_string(channel_name)
-        timestamp = self.vid_dict['published'].replace('-', '')
-        youtube_id = self.vid_dict['youtube_id']
-        title = self.vid_dict['title']
+        timestamp = self.vid_dict["published"].replace("-", "")
+        youtube_id = self.vid_dict["youtube_id"]
+        title = self.vid_dict["title"]
         clean_title = clean_string(title)
-        filename = f'{timestamp}_{youtube_id}_{clean_title}.mp4'
+        filename = f"{timestamp}_{youtube_id}_{clean_title}.mp4"
         media_url = os.path.join(clean_channel_name, filename)
-        self.vid_dict['media_url'] = media_url
+        self.vid_dict["media_url"] = media_url
 
     def get_es_data(self):
-        """ get current data from elastic search """
-        url = self.ES_URL + '/ta_video/_doc/' + self.youtube_id
+        """get current data from elastic search"""
+        url = self.ES_URL + "/ta_video/_doc/" + self.youtube_id
         response = requests.get(url)
         if not response.ok:
             print(response.text)
@@ -343,48 +337,48 @@ class YoutubeVideo:
         return es_vid_dict
 
     def upload_to_es(self):
-        """ upload channel data to elastic search """
-        url = f'{self.ES_URL}/ta_video/_doc/{self.youtube_id}'
+        """upload channel data to elastic search"""
+        url = f"{self.ES_URL}/ta_video/_doc/{self.youtube_id}"
         response = requests.put(url, json=self.vid_dict)
         if not response.ok:
             print(response.text)
 
     def delete_cache(self):
-        """ delete thumbnail from cache if exist """
-        video_cache = os.path.join(self.CACHE_DIR, 'videos')
-        thumb = os.path.join(video_cache, self.youtube_id + '.jpg')
+        """delete thumbnail from cache if exist"""
+        video_cache = os.path.join(self.CACHE_DIR, "videos")
+        thumb = os.path.join(video_cache, self.youtube_id + ".jpg")
         if os.path.exists(thumb):
             os.remove(thumb)
 
     def deactivate(self):
-        """ deactivate document on extractor error """
+        """deactivate document on extractor error"""
         youtube_id = self.youtube_id
-        headers = {'Content-type': 'application/json'}
-        url = f'{self.ES_URL}/ta_video/_update/{youtube_id}'
+        headers = {"Content-type": "application/json"}
+        url = f"{self.ES_URL}/ta_video/_update/{youtube_id}"
         data = {"script": "ctx._source.active = false"}
         json_str = json.dumps(data)
         response = requests.post(url, data=json_str, headers=headers)
-        print(f'deactivated {youtube_id}')
+        print(f"deactivated {youtube_id}")
         if not response.ok:
             print(response.text)
 
 
 def index_new_video(youtube_id, missing_vid=False):
-    """ combine video and channel classes for new video index """
+    """combine video and channel classes for new video index"""
     vid_handler = YoutubeVideo(youtube_id)
     if not vid_handler.vid_dict:
-        raise ValueError('failed to get metadata for ' + youtube_id)
+        raise ValueError("failed to get metadata for " + youtube_id)
 
     channel_handler = YoutubeChannel(vid_handler.channel_id)
     # add filepath to vid_dict
-    channel_name = channel_handler.channel_dict['channel_name']
+    channel_name = channel_handler.channel_dict["channel_name"]
     vid_handler.build_file_path(channel_name)
     # add channel and player to video
     vid_handler.add_player(missing_vid)
-    vid_handler.vid_dict['channel'] = channel_handler.channel_dict
+    vid_handler.vid_dict["channel"] = channel_handler.channel_dict
    # add new channel to es
-    if channel_handler.source == 'scraped':
-        channel_handler.channel_dict['channel_subscribed'] = False
+    if channel_handler.source == "scraped":
+        channel_handler.channel_dict["channel_subscribed"] = False
         channel_handler.upload_to_es()
     # upload video to es
     vid_handler.upload_to_es()
diff --git a/tubearchivist/home/src/index_management.py b/tubearchivist/home/src/index_management.py
index 4189246..af26cc9 100644
--- a/tubearchivist/home/src/index_management.py
+++ b/tubearchivist/home/src/index_management.py
@@ -17,8 +17,8 @@ from home.src.config import AppConfig
 # expected mapping and settings
 INDEX_CONFIG = [
     {
-        'index_name': 'channel',
-        'expected_map': {
+        "index_name": "channel",
+        "expected_map": {
             "channel_id": {
                 "type": "keyword",
             },
@@ -28,53 +28,34 @@ INDEX_CONFIG = [
                     "keyword": {
                         "type": "keyword",
                         "ignore_above": 256,
-                        "normalizer": "to_lower"
+                        "normalizer": "to_lower",
                     },
                     "search_as_you_type": {
                         "type": "search_as_you_type",
                         "doc_values": False,
-                        "max_shingle_size": 3
-                    }
-                }
+                        "max_shingle_size": 3,
+                    },
+                },
             },
-            "channel_banner_url": {
-                "type": "keyword",
-                "index": False
-            },
-            "channel_thumb_url": {
-                "type": "keyword",
-                "index": False
-            },
-            "channel_description": {
-                "type": "text"
-            },
-            "channel_last_refresh": {
-                "type": "date",
-                "format": "epoch_second"
-            }
+            "channel_banner_url": {"type": "keyword", "index": False},
+            "channel_thumb_url": {"type": "keyword", "index": False},
+            "channel_description": {"type": "text"},
+            "channel_last_refresh": {"type": "date", "format": "epoch_second"},
         },
-        'expected_set': {
+        "expected_set": {
             "analysis": {
                 "normalizer": {
-                    "to_lower": {
-                        "type": "custom",
-                        "filter": ["lowercase"]
-                    }
+                    "to_lower": {"type": "custom", "filter": ["lowercase"]}
                 }
            },
-            "number_of_replicas": "0"
-        }
+            "number_of_replicas": "0",
+        },
     },
     {
-        'index_name': 'video',
-        'expected_map': {
-            "vid_thumb_url": {
-                "type": "text",
-                "index": False
-            },
-            "date_downloaded": {
-                "type": "date"
-            },
+        "index_name": "video",
+        "expected_map": {
+            "vid_thumb_url": {"type": "text", "index": False},
+            "date_downloaded": {"type": "date"},
             "channel": {
                 "properties": {
                     "channel_id": {
@@ -86,127 +67,92 @@ INDEX_CONFIG = [
                             "keyword": {
                                 "type": "keyword",
                                 "ignore_above": 256,
-                                "normalizer": "to_lower"
+                                "normalizer": "to_lower",
                             },
                             "search_as_you_type": {
                                 "type": "search_as_you_type",
                                 "doc_values": False,
-                                "max_shingle_size": 3
-                            }
-                        }
-                    },
-                    "channel_banner_url": {
-                        "type": "keyword",
-                        "index": False
-                    },
-                    "channel_thumb_url": {
-                        "type": "keyword",
-                        "index": False
-                    },
-                    "channel_description": {
-                        "type": "text"
+                                "max_shingle_size": 3,
+                            },
+                        },
                     },
+                    "channel_banner_url": {"type": "keyword", "index": False},
+                    "channel_thumb_url": {"type": "keyword", "index": False},
+                    "channel_description": {"type": "text"},
                     "channel_last_refresh": {
                         "type": "date",
-                        "format": "epoch_second"
-                    }
+                        "format": "epoch_second",
+                    },
                 }
             },
-            "description": {
-                "type": "text"
-            },
-            "media_url": {
-                "type": "keyword",
-                "index": False
-            },
+            "description": {"type": "text"},
+            "media_url": {"type": "keyword", "index": False},
             "title": {
                 "type": "text",
                 "fields": {
                     "keyword": {
                         "type": "keyword",
                         "ignore_above": 256,
-                        "normalizer": "to_lower"
+                        "normalizer": "to_lower",
                     },
                     "search_as_you_type": {
                         "type": "search_as_you_type",
                         "doc_values": False,
-                        "max_shingle_size": 3
-                    }
-                }
-            },
-            "vid_last_refresh": {
-                "type": "date"
-            },
-            "youtube_id": {
-                "type": "keyword"
-            },
-            "published": {
-                "type": "date"
+                        "max_shingle_size": 3,
+                    },
+                },
             },
+            "vid_last_refresh": {"type": "date"},
+            "youtube_id": {"type": "keyword"},
+            "published": {"type": "date"},
         },
-        'expected_set': {
+        "expected_set": {
            "analysis": {
                 "normalizer": {
-                    "to_lower": {
-                        "type": "custom",
-                        "filter": ["lowercase"]
-                    }
+                    "to_lower": {"type": "custom", "filter": ["lowercase"]}
                 }
             },
-            "number_of_replicas": "0"
-        }
+            "number_of_replicas": "0",
+        },
     },
     {
-        'index_name': 'download',
-        'expected_map': {
-            "timestamp": {
-                "type": "date"
-            },
-            "channel_id": {
-                "type": "keyword"
-            },
+        "index_name": "download",
+        "expected_map": {
+            "timestamp": {"type": "date"},
+            "channel_id": {"type": "keyword"},
             "channel_name": {
                 "type": "text",
                 "fields": {
                     "keyword": {
                         "type": "keyword",
                         "ignore_above": 256,
-                        "normalizer": "to_lower"
+                        "normalizer": "to_lower",
                     }
-                }
-            },
-            "status": {
-                "type": "keyword"
+                },
             },
+            "status": {"type": "keyword"},
             "title": {
                 "type": "text",
                 "fields": {
                     "keyword": {
                         "type": "keyword",
                         "ignore_above": 256,
-                        "normalizer": "to_lower"
+                        "normalizer": "to_lower",
                     }
-                }
+                },
             },
-            "vid_thumb_url": {
-                "type": "keyword"
-            },
-            "youtube_id": {
-                "type": "keyword"
-            }
+            "vid_thumb_url": {"type": "keyword"},
+            "youtube_id": {"type": "keyword"},
         },
-        'expected_set': {
+        "expected_set": {
             "analysis": {
                 "normalizer": {
-                    "to_lower": {
-                        "type": "custom",
-                        "filter": ["lowercase"]
-                    }
+                    "to_lower": {"type": "custom", "filter": ["lowercase"]}
                 }
             },
-            "number_of_replicas": "0"
-        }
-    }
+            "number_of_replicas": "0",
+        },
+    },
 ]
 
 
@@ -216,8 +162,8 @@ class ElasticIndex:
     """
 
     CONFIG = AppConfig().config
-    ES_URL = CONFIG['application']['es_url']
-    HEADERS = {'Content-type': 'application/json'}
+    ES_URL = CONFIG["application"]["es_url"]
+    HEADERS = {"Content-type": "application/json"}
 
     def __init__(self, index_name, expected_map, expected_set):
         self.index_name = index_name
@@ -226,14 +172,14 @@ class ElasticIndex:
         self.exists, self.details = self.index_exists()
 
     def index_exists(self):
-        """ check if index already exists and return mapping if it does """
+        """check if index already exists and return mapping if it does"""
         index_name = self.index_name
-        url = f'{self.ES_URL}/ta_{index_name}'
+        url = f"{self.ES_URL}/ta_{index_name}"
         response = requests.get(url)
         exists = response.ok

         if exists:
-            details = response.json()[f'ta_{index_name}']
+            details = response.json()[f"ta_{index_name}"]
         else:
             details = False

@@ -258,19 +204,19 @@ class ElasticIndex:
         return False
 
     def validate_mappings(self):
-        """ check if all mappings are as expected """
+        """check if all mappings are as expected"""
 
         expected_map = self.expected_map
-        now_map = self.details['mappings']['properties']
+        now_map = self.details["mappings"]["properties"]
 
         for key, value in expected_map.items():
             # nested
-            if list(value.keys()) == ['properties']:
-                for key_n, value_n in value['properties'].items():
-                    if key_n not in now_map[key]['properties'].keys():
+            if list(value.keys()) == ["properties"]:
+                for key_n, value_n in value["properties"].items():
+                    if key_n not in now_map[key]["properties"].keys():
                         print(key_n, value_n)
                         return True
-                    if not value_n == now_map[key]['properties'][key_n]:
+                    if not value_n == now_map[key]["properties"][key_n]:
                         print(key_n, value_n)
                         return True
 
@@ -287,9 +233,9 @@ class ElasticIndex:
         return False
 
     def validate_settings(self):
-        """ check if all settings are as expected """
+        """check if all settings are as expected"""
 
-        now_set = self.details['settings']['index']
+        now_set = self.details["settings"]["index"]
 
         for key, value in self.expected_set.items():
             if key not in now_set.keys():
@@ -303,53 +249,46 @@ class ElasticIndex:
         return False
 
     def rebuild_index(self):
-        """ rebuild with new mapping """
+        """rebuild with new mapping"""
         # backup
-        self.reindex('backup')
+        self.reindex("backup")
         # delete original
         self.delete_index(backup=False)
         # create new
         self.create_blank()
-        self.reindex('restore')
+        self.reindex("restore")
         # delete backup
         self.delete_index()
 
     def reindex(self, method):
-        """ create on elastic search """
+        """create on elastic search"""
         index_name = self.index_name
-        if method == 'backup':
-            source = f'ta_{index_name}'
-            destination = f'ta_{index_name}_backup'
-        elif method == 'restore':
-            source = f'ta_{index_name}_backup'
-            destination = f'ta_{index_name}'
+        if method == "backup":
+            source = f"ta_{index_name}"
+            destination = f"ta_{index_name}_backup"
+        elif method == "restore":
+            source = f"ta_{index_name}_backup"
+            destination = f"ta_{index_name}"
 
-        query = {
-            "source": {
-                "index": source
-            },
-            "dest": {
-                "index": destination
-            }
-        }
+        query = {"source": {"index": source}, "dest": {"index": destination}}
         data = json.dumps(query)
-        url = self.ES_URL + '/_reindex?refresh=true'
+        url = self.ES_URL + "/_reindex?refresh=true"
         response = requests.post(url=url, data=data, headers=self.HEADERS)
         if not response.ok:
             print(response.text)
 
     def delete_index(self, backup=True):
-        """ delete index passed as argument """
+        """delete index passed as argument"""
         if backup:
-            url = f'{self.ES_URL}/ta_{self.index_name}_backup'
+            url = f"{self.ES_URL}/ta_{self.index_name}_backup"
         else:
-            url = f'{self.ES_URL}/ta_{self.index_name}'
+            url = f"{self.ES_URL}/ta_{self.index_name}"
         response = requests.delete(url)
         if not response.ok:
             print(response.text)
 
     def create_blank(self):
-        """ apply new mapping and settings for blank new index """
+        """apply new mapping and settings for blank new index"""
         expected_map = self.expected_map
         expected_set = self.expected_set
         # stich payload
@@ -359,7 +298,7 @@ class ElasticIndex:
         if expected_map:
             payload.update({"mappings": {"properties": expected_map}})
         # create
-        url = f'{self.ES_URL}/ta_{self.index_name}'
+        url = f"{self.ES_URL}/ta_{self.index_name}"
         data = json.dumps(payload)
         response = requests.put(url=url, data=data, headers=self.HEADERS)
         if not response.ok:
@@ -367,103 +306,104 @@ class ElasticIndex:
 
 
 class ElasticBackup:
-    """ dump index to nd-json files for later bulk import """
+    """dump index to nd-json files for later bulk import"""
 
     def __init__(self, index_config):
         self.config = AppConfig().config
         self.index_config = index_config
-        self.timestamp = datetime.now().strftime('%Y%m%d')
+ self.timestamp = datetime.now().strftime("%Y%m%d") self.backup_files = [] def get_all_documents(self, index_name): - """ export all documents of a single index """ - headers = {'Content-type': 'application/json'} - es_url = self.config['application']['es_url'] + """export all documents of a single index""" + headers = {"Content-type": "application/json"} + es_url = self.config["application"]["es_url"] # get PIT ID - url = f'{es_url}/ta_{index_name}/_pit?keep_alive=1m' + url = f"{es_url}/ta_{index_name}/_pit?keep_alive=1m" response = requests.post(url) json_data = json.loads(response.text) - pit_id = json_data['id'] + pit_id = json_data["id"] # build query data = { "query": {"match_all": {}}, - "size": 100, "pit": {"id": pit_id, "keep_alive": "1m"}, - "sort": [{"_id": {"order": "asc"}}] + "size": 100, + "pit": {"id": pit_id, "keep_alive": "1m"}, + "sort": [{"_id": {"order": "asc"}}], } query_str = json.dumps(data) - url = es_url + '/_search' + url = es_url + "/_search" # loop until nothing left all_results = [] while True: response = requests.get(url, data=query_str, headers=headers) json_data = json.loads(response.text) - all_hits = json_data['hits']['hits'] + all_hits = json_data["hits"]["hits"] if all_hits: for hit in all_hits: - search_after = hit['sort'] + search_after = hit["sort"] all_results.append(hit) # update search_after with last hit data - data['search_after'] = search_after + data["search_after"] = search_after query_str = json.dumps(data) else: break # clean up PIT query_str = json.dumps({"id": pit_id}) - requests.delete(es_url + '/_pit', data=query_str, headers=headers) + requests.delete(es_url + "/_pit", data=query_str, headers=headers) return all_results @staticmethod def build_bulk(all_results): - """ build bulk query data from all_results """ + """build bulk query data from all_results""" bulk_list = [] for document in all_results: - document_id = document['_id'] - es_index = document['_index'] + document_id = document["_id"] + es_index = document["_index"] action = {"index": {"_index": es_index, "_id": document_id}} - source = document['_source'] + source = document["_source"] bulk_list.append(json.dumps(action)) bulk_list.append(json.dumps(source)) # add last newline - bulk_list.append('\n') - file_content = '\n'.join(bulk_list) + bulk_list.append("\n") + file_content = "\n".join(bulk_list) return file_content def write_es_json(self, file_content, index_name): - """ write nd-json file for es _bulk API to disk """ - cache_dir = self.config['application']['cache_dir'] - file_name = f'es_{index_name}-{self.timestamp}.json' - file_path = os.path.join(cache_dir, 'backup', file_name) - with open(file_path, 'w', encoding='utf-8') as f: + """write nd-json file for es _bulk API to disk""" + cache_dir = self.config["application"]["cache_dir"] + file_name = f"es_{index_name}-{self.timestamp}.json" + file_path = os.path.join(cache_dir, "backup", file_name) + with open(file_path, "w", encoding="utf-8") as f: f.write(file_content) self.backup_files.append(file_path) def write_ta_json(self, all_results, index_name): - """ write generic json file to disk """ - cache_dir = self.config['application']['cache_dir'] - file_name = f'ta_{index_name}-{self.timestamp}.json' - file_path = os.path.join(cache_dir, 'backup', file_name) - to_write = [i['_source'] for i in all_results] + """write generic json file to disk""" + cache_dir = self.config["application"]["cache_dir"] + file_name = f"ta_{index_name}-{self.timestamp}.json" + file_path = os.path.join(cache_dir, "backup", file_name) + 
to_write = [i["_source"] for i in all_results] file_content = json.dumps(to_write) - with open(file_path, 'w', encoding='utf-8') as f: + with open(file_path, "w", encoding="utf-8") as f: f.write(file_content) self.backup_files.append(file_path) def zip_it(self): - """ pack it up into single zip file """ - cache_dir = self.config['application']['cache_dir'] - file_name = f'ta_backup-{self.timestamp}.zip' - backup_folder = os.path.join(cache_dir, 'backup') + """pack it up into single zip file""" + cache_dir = self.config["application"]["cache_dir"] + file_name = f"ta_backup-{self.timestamp}.zip" + backup_folder = os.path.join(cache_dir, "backup") backup_file = os.path.join(backup_folder, file_name) with zipfile.ZipFile( - backup_file, 'w', compression=zipfile.ZIP_DEFLATED - ) as zip_f: + backup_file, "w", compression=zipfile.ZIP_DEFLATED + ) as zip_f: for backup_file in self.backup_files: zip_f.write(backup_file, os.path.basename(backup_file)) @@ -472,66 +412,67 @@ class ElasticBackup: os.remove(backup_file) def post_bulk_restore(self, file_name): - """ send bulk to es """ - cache_dir = self.config['application']['cache_dir'] - es_url = self.config['application']['es_url'] - headers = {'Content-type': 'application/x-ndjson'} + """send bulk to es""" + cache_dir = self.config["application"]["cache_dir"] + es_url = self.config["application"]["es_url"] + headers = {"Content-type": "application/x-ndjson"} file_path = os.path.join(cache_dir, file_name) - with open(file_path, 'r', encoding='utf-8') as f: + with open(file_path, "r", encoding="utf-8") as f: query_str = f.read() if not query_str.strip(): return - url = es_url + '/_bulk' + url = es_url + "/_bulk" request = requests.post(url, data=query_str, headers=headers) if not request.ok: print(request.text) def unpack_zip_backup(self): - """ extract backup zip and return filelist """ - cache_dir = self.config['application']['cache_dir'] - backup_dir = os.path.join(cache_dir, 'backup') + """extract backup zip and return filelist""" + cache_dir = self.config["application"]["cache_dir"] + backup_dir = os.path.join(cache_dir, "backup") all_available_backups = [ - i for i in os.listdir(backup_dir) if - i.startswith('ta_') and i.endswith('.zip') + i + for i in os.listdir(backup_dir) + if i.startswith("ta_") and i.endswith(".zip") ] all_available_backups.sort() newest_backup = all_available_backups[-1] file_path = os.path.join(backup_dir, newest_backup) - with zipfile.ZipFile(file_path, 'r') as z: + with zipfile.ZipFile(file_path, "r") as z: zip_content = z.namelist() z.extractall(backup_dir) return zip_content def restore_json_files(self, zip_content): - """ go through the unpacked files and restore """ + """go through the unpacked files and restore""" - cache_dir = self.config['application']['cache_dir'] - backup_dir = os.path.join(cache_dir, 'backup') + cache_dir = self.config["application"]["cache_dir"] + backup_dir = os.path.join(cache_dir, "backup") for json_f in zip_content: file_name = os.path.join(backup_dir, json_f) - if not json_f.startswith('es_') or not json_f.endswith('.json'): + if not json_f.startswith("es_") or not json_f.endswith(".json"): os.remove(file_name) continue - print('restoring: ' + json_f) + print("restoring: " + json_f) self.post_bulk_restore(file_name) os.remove(file_name) def backup_all_indexes(): - """ backup all es indexes to disk """ + """backup all es indexes to disk""" backup_handler = ElasticBackup(INDEX_CONFIG) for index in backup_handler.index_config: - index_name = index['index_name'] + index_name = 
index["index_name"] all_results = backup_handler.get_all_documents(index_name) file_content = backup_handler.build_bulk(all_results) backup_handler.write_es_json(file_content, index_name) @@ -541,7 +482,7 @@ def backup_all_indexes(): def restore_from_backup(): - """ restore indexes from backup file """ + """restore indexes from backup file""" # delete index_check(force_restore=True) # recreate @@ -551,14 +492,14 @@ def restore_from_backup(): def index_check(force_restore=False): - """ check if all indexes are created and have correct mapping """ + """check if all indexes are created and have correct mapping""" backed_up = False for index in INDEX_CONFIG: - index_name = index['index_name'] - expected_map = index['expected_map'] - expected_set = index['expected_set'] + index_name = index["index_name"] + expected_map = index["expected_map"] + expected_set = index["expected_set"] handler = ElasticIndex(index_name, expected_map, expected_set) # force restore if force_restore: @@ -568,7 +509,7 @@ def index_check(force_restore=False): # create new if not handler.exists: - print(f'create new blank index with name ta_{index_name}...') + print(f"create new blank index with name ta_{index_name}...") handler.create_blank() continue @@ -577,13 +518,13 @@ def index_check(force_restore=False): if rebuild: # make backup before rebuild if not backed_up: - print('running backup first') + print("running backup first") backup_all_indexes() backed_up = True - print(f'applying new mappings to index ta_{index_name}...') + print(f"applying new mappings to index ta_{index_name}...") handler.rebuild_index() continue # else all good - print(f'ta_{index_name} index is created and up to date...') + print(f"ta_{index_name} index is created and up to date...") diff --git a/tubearchivist/home/src/reindex.py b/tubearchivist/home/src/reindex.py index 11af35c..3a8a1cc 100644 --- a/tubearchivist/home/src/reindex.py +++ b/tubearchivist/home/src/reindex.py @@ -17,19 +17,23 @@ from time import sleep import requests from home.src.config import AppConfig from home.src.download import ChannelSubscription, PendingList, VideoDownloader -from home.src.helper import (clean_string, get_message, get_total_hits, - set_message) +from home.src.helper import ( + clean_string, + get_message, + get_total_hits, + set_message, +) from home.src.index import YoutubeChannel, YoutubeVideo, index_new_video class Reindex: - """ check for outdated documents and refresh data from youtube """ + """check for outdated documents and refresh data from youtube""" def __init__(self): # config config = AppConfig().config - self.sleep_interval = config['downloads']['sleep_interval'] - self.es_url = config['application']['es_url'] + self.sleep_interval = config["downloads"]["sleep_interval"] + self.es_url = config["application"]["es_url"] self.refresh_interval = 90 # scan self.video_daily, self.channel_daily = self.get_daily() @@ -37,20 +41,18 @@ class Reindex: self.all_channel_ids = False def get_daily(self): - """ get daily refresh values """ - total_videos = get_total_hits( - 'ta_video', self.es_url, 'active' - ) + """get daily refresh values""" + total_videos = get_total_hits("ta_video", self.es_url, "active") video_daily = ceil(total_videos / self.refresh_interval * 1.2) total_channels = get_total_hits( - 'ta_channel', self.es_url, 'channel_active' + "ta_channel", self.es_url, "channel_active" ) channel_daily = ceil(total_channels / self.refresh_interval * 1.2) return (video_daily, channel_daily) def get_outdated_vids(self): - """ get daily videos to 
refresh """ - headers = {'Content-type': 'application/json'} + """get daily videos to refresh""" + headers = {"Content-type": "application/json"} now = int(datetime.now().strftime("%s")) now_3m = now - 3 * 30 * 24 * 60 * 60 size = self.video_daily @@ -60,24 +62,25 @@ class Reindex: "bool": { "must": [ {"match": {"active": True}}, - {"range": {"vid_last_refresh": {"lte": now_3m}}} + {"range": {"vid_last_refresh": {"lte": now_3m}}}, ] } }, - "sort": [{"vid_last_refresh": {"order": "asc"}}], "_source": False + "sort": [{"vid_last_refresh": {"order": "asc"}}], + "_source": False, } query_str = json.dumps(data) - url = self.es_url + '/ta_video/_search' + url = self.es_url + "/ta_video/_search" response = requests.get(url, data=query_str, headers=headers) if not response.ok: print(response.text) response_dict = json.loads(response.text) - all_youtube_ids = [i['_id'] for i in response_dict['hits']['hits']] + all_youtube_ids = [i["_id"] for i in response_dict["hits"]["hits"]] return all_youtube_ids def get_outdated_channels(self): - """ get daily channels to refresh """ - headers = {'Content-type': 'application/json'} + """get daily channels to refresh""" + headers = {"Content-type": "application/json"} now = int(datetime.now().strftime("%s")) now_3m = now - 3 * 30 * 24 * 60 * 60 size = self.channel_daily @@ -87,52 +90,50 @@ class Reindex: "bool": { "must": [ {"match": {"channel_active": True}}, - {"range": {"channel_last_refresh": {"lte": now_3m}}} + {"range": {"channel_last_refresh": {"lte": now_3m}}}, ] } }, "sort": [{"channel_last_refresh": {"order": "asc"}}], - "_source": False + "_source": False, } query_str = json.dumps(data) - url = self.es_url + '/ta_channel/_search' + url = self.es_url + "/ta_channel/_search" response = requests.get(url, data=query_str, headers=headers) if not response.ok: print(response.text) response_dict = json.loads(response.text) - all_channel_ids = [i['_id'] for i in response_dict['hits']['hits']] + all_channel_ids = [i["_id"] for i in response_dict["hits"]["hits"]] return all_channel_ids def check_outdated(self): - """ add missing vids and channels """ + """add missing vids and channels""" self.all_youtube_ids = self.get_outdated_vids() self.all_channel_ids = self.get_outdated_channels() def rescrape_all_channels(self): - """ sync new data from channel to all matching videos """ + """sync new data from channel to all matching videos""" sleep_interval = self.sleep_interval channel_sub_handler = ChannelSubscription() - all_channels = channel_sub_handler.get_channels( - subscribed_only=False - ) - all_channel_ids = [i['channel_id'] for i in all_channels] + all_channels = channel_sub_handler.get_channels(subscribed_only=False) + all_channel_ids = [i["channel_id"] for i in all_channels] counter = 1 for channel_id in all_channel_ids: - message = f'Progress: {counter}/{len(all_channels)}' + message = f"Progress: {counter}/{len(all_channels)}" mess_dict = { "status": "scraping", "level": "info", "title": "Scraping all youtube channels", - "message": message + "message": message, } - set_message('progress:download', mess_dict) + set_message("progress:download", mess_dict) channel_index = YoutubeChannel(channel_id) - subscribed = channel_index.channel_dict['channel_subscribed'] + subscribed = channel_index.channel_dict["channel_subscribed"] channel_index.channel_dict = channel_index.build_channel_dict( scrape=True ) - channel_index.channel_dict['channel_subscribed'] = subscribed + channel_index.channel_dict["channel_subscribed"] = subscribed channel_index.upload_to_es() 
channel_index.sync_to_videos() counter = counter + 1 @@ -141,7 +142,7 @@ class Reindex: @staticmethod def reindex_single_video(youtube_id): - """ refresh data for single video """ + """refresh data for single video""" vid_handler = YoutubeVideo(youtube_id) if not vid_handler.vid_dict: # stop if deactivated @@ -149,42 +150,42 @@ class Reindex: return es_vid_dict = vid_handler.get_es_data() - player = es_vid_dict['_source']['player'] - date_downloaded = es_vid_dict['_source']['date_downloaded'] - channel_dict = es_vid_dict['_source']['channel'] - channel_name = channel_dict['channel_name'] + player = es_vid_dict["_source"]["player"] + date_downloaded = es_vid_dict["_source"]["date_downloaded"] + channel_dict = es_vid_dict["_source"]["channel"] + channel_name = channel_dict["channel_name"] vid_handler.build_file_path(channel_name) # add to vid_dict - vid_handler.vid_dict['player'] = player - vid_handler.vid_dict['date_downloaded'] = date_downloaded - vid_handler.vid_dict['channel'] = channel_dict + vid_handler.vid_dict["player"] = player + vid_handler.vid_dict["date_downloaded"] = date_downloaded + vid_handler.vid_dict["channel"] = channel_dict # update vid_handler.upload_to_es() vid_handler.delete_cache() @staticmethod def reindex_single_channel(channel_id): - """ refresh channel data and sync to videos """ + """refresh channel data and sync to videos""" channel_handler = YoutubeChannel(channel_id) - subscribed = channel_handler.channel_dict['channel_subscribed'] + subscribed = channel_handler.channel_dict["channel_subscribed"] channel_handler.channel_dict = channel_handler.build_channel_dict( scrape=True ) - channel_handler.channel_dict['channel_subscribed'] = subscribed + channel_handler.channel_dict["channel_subscribed"] = subscribed channel_handler.upload_to_es() channel_handler.sync_to_videos() channel_handler.clear_cache() def reindex(self): - """ reindex what's needed """ + """reindex what's needed""" # videos - print(f'reindexing {len(self.all_youtube_ids)} videos') + print(f"reindexing {len(self.all_youtube_ids)} videos") for youtube_id in self.all_youtube_ids: self.reindex_single_video(youtube_id) if self.sleep_interval: sleep(self.sleep_interval) # channels - print(f'reindexing {len(self.all_channel_ids)} channels') + print(f"reindexing {len(self.all_channel_ids)} channels") for channel_id in self.all_channel_ids: self.reindex_single_channel(channel_id) if self.sleep_interval: @@ -192,11 +193,11 @@ class Reindex: class FilesystemScanner: - """ handle scanning and fixing from filesystem """ + """handle scanning and fixing from filesystem""" CONFIG = AppConfig().config - ES_URL = CONFIG['application']['es_url'] - VIDEOS = CONFIG['application']['videos'] + ES_URL = CONFIG["application"]["es_url"] + VIDEOS = CONFIG["application"]["videos"] def __init__(self): self.all_downloaded = self.get_all_downloaded() @@ -207,7 +208,7 @@ class FilesystemScanner: self.to_delete = None def get_all_downloaded(self): - """ get a list of all video files downloaded """ + """get a list of all video files downloaded""" all_channels = os.listdir(self.VIDEOS) all_channels.sort() all_downloaded = [] @@ -221,26 +222,26 @@ class FilesystemScanner: @staticmethod def get_all_indexed(): - """ get a list of all indexed videos """ + """get a list of all indexed videos""" index_handler = PendingList() all_indexed_raw = index_handler.get_all_indexed() all_indexed = [] for video in all_indexed_raw: - youtube_id = video['_id'] - media_url = video['_source']['media_url'] - published = video['_source']['published'] - 
title = video['_source']['title'] + youtube_id = video["_id"] + media_url = video["_source"]["media_url"] + published = video["_source"]["published"] + title = video["_source"]["title"] all_indexed.append((youtube_id, media_url, published, title)) return all_indexed def list_comarison(self): - """ compare the lists to figure out what to do """ + """compare the lists to figure out what to do""" self.find_unindexed() self.find_missing() self.find_bad_media_url() def find_unindexed(self): - """ find video files without a matching document indexed """ + """find video files without a matching document indexed""" all_indexed_ids = [i[0] for i in self.all_indexed] to_index = [] for downloaded in self.all_downloaded: @@ -250,7 +251,7 @@ class FilesystemScanner: self.to_index = to_index def find_missing(self): - """ find indexed videos without matching media file """ + """find indexed videos without matching media file""" all_downloaded_ids = [i[2] for i in self.all_downloaded] to_delete = [] for video in self.all_indexed: @@ -261,7 +262,7 @@ class FilesystemScanner: self.to_delete = to_delete def find_bad_media_url(self): - """ rename media files not matching the indexed title """ + """rename media files not matching the indexed title""" to_fix = [] to_rename = [] for downloaded in self.all_downloaded: @@ -272,8 +273,8 @@ class FilesystemScanner: if indexed_id == downloaded_id: # found it title_c = clean_string(title) - pub = published.replace('-', '') - expected_filename = f'{pub}_{indexed_id}_{title_c}.mp4' + pub = published.replace("-", "") + expected_filename = f"{pub}_{indexed_id}_{title_c}.mp4" new_url = os.path.join(channel, expected_filename) if expected_filename != filename: # file to rename @@ -290,7 +291,7 @@ class FilesystemScanner: self.to_rename = to_rename def rename_files(self): - """ rename media files as identified by find_bad_media_url """ + """rename media files as identified by find_bad_media_url""" for bad_filename in self.to_rename: channel, filename, expected_filename = bad_filename old_path = os.path.join(self.VIDEOS, channel, filename) @@ -298,71 +299,72 @@ class FilesystemScanner: os.rename(old_path, new_path) def send_mismatch_bulk(self): - """ build bulk update """ + """build bulk update""" bulk_list = [] for video_mismatch in self.mismatch: youtube_id, media_url = video_mismatch - action = {"update": {"_id": youtube_id, "_index": 'ta_video'}} + action = {"update": {"_id": youtube_id, "_index": "ta_video"}} source = {"doc": {"media_url": media_url}} bulk_list.append(json.dumps(action)) bulk_list.append(json.dumps(source)) # add last newline - bulk_list.append('\n') - query_str = '\n'.join(bulk_list) + bulk_list.append("\n") + query_str = "\n".join(bulk_list) # make the call - headers = {'Content-type': 'application/x-ndjson'} - url = self.ES_URL + '/_bulk' + headers = {"Content-type": "application/x-ndjson"} + url = self.ES_URL + "/_bulk" request = requests.post(url, data=query_str, headers=headers) if not request.ok: print(request.text) def delete_from_index(self): - """ find indexed but deleted mediafile """ + """find indexed but deleted mediafile""" for indexed in self.to_delete: youtube_id, _ = indexed - url = self.ES_URL + '/ta_video/_doc/' + youtube_id + url = self.ES_URL + "/ta_video/_doc/" + youtube_id request = requests.delete(url) if not request.ok: print(request.text) class ManualImport: - """ import and indexing existing video files """ + """import and indexing existing video files""" CONFIG = AppConfig().config - CACHE_DIR = 
CONFIG['application']['cache_dir'] - IMPORT_DIR = os.path.join(CACHE_DIR, 'import') + CACHE_DIR = CONFIG["application"]["cache_dir"] + IMPORT_DIR = os.path.join(CACHE_DIR, "import") def __init__(self): self.identified = self.import_folder_parser() def import_folder_parser(self): - """ detect files in import folder """ + """detect files in import folder""" to_import = os.listdir(self.IMPORT_DIR) to_import.sort() - video_files = [i for i in to_import if not i.endswith('.json')] + video_files = [i for i in to_import if not i.endswith(".json")] identified = [] for file_path in video_files: - file_dict = {'video_file': file_path} + file_dict = {"video_file": file_path} file_name, _ = os.path.splitext(file_path) matching_json = [ - i for i in to_import if i.startswith(file_name) - and i.endswith('.json') + i + for i in to_import + if i.startswith(file_name) and i.endswith(".json") ] if matching_json: json_file = matching_json[0] youtube_id = self.extract_id_from_json(json_file) - file_dict.update({'json_file': json_file}) + file_dict.update({"json_file": json_file}) else: youtube_id = self.extract_id_from_filename(file_name) - file_dict.update({'json_file': False}) + file_dict.update({"json_file": False}) - file_dict.update({'youtube_id': youtube_id}) + file_dict.update({"youtube_id": youtube_id}) identified.append(file_dict) return identified @@ -373,33 +375,33 @@ class ManualImport: look at the file name for the youtube id expects filename ending in []. """ - id_search = re.search(r'\[([a-zA-Z0-9_-]{11})\]$', file_name) + id_search = re.search(r"\[([a-zA-Z0-9_-]{11})\]$", file_name) if id_search: youtube_id = id_search.group(1) return youtube_id - print('failed to extract youtube id for: ' + file_name) + print("failed to extract youtube id for: " + file_name) raise Exception def extract_id_from_json(self, json_file): - """ open json file and extract id """ - json_path = os.path.join(self.CACHE_DIR, 'import', json_file) - with open(json_path, 'r', encoding='utf-8') as f: + """open json file and extract id""" + json_path = os.path.join(self.CACHE_DIR, "import", json_file) + with open(json_path, "r", encoding="utf-8") as f: json_content = f.read() - youtube_id = json.loads(json_content)['id'] + youtube_id = json.loads(json_content)["id"] return youtube_id def process_import(self): - """ go through identified media files """ + """go through identified media files""" for media_file in self.identified: - json_file = media_file['json_file'] - video_file = media_file['video_file'] - youtube_id = media_file['youtube_id'] + json_file = media_file["json_file"] + video_file = media_file["video_file"] + youtube_id = media_file["youtube_id"] - video_path = os.path.join(self.CACHE_DIR, 'import', video_file) + video_path = os.path.join(self.CACHE_DIR, "import", video_file) self.move_to_cache(video_path, youtube_id) @@ -411,35 +413,43 @@ class ManualImport: if os.path.exists(video_path): os.remove(video_path) if json_file: - json_path = os.path.join(self.CACHE_DIR, 'import', json_file) + json_path = os.path.join(self.CACHE_DIR, "import", json_file) os.remove(json_path) def move_to_cache(self, video_path, youtube_id): - """ move identified video file to cache, convert to mp4 """ + """move identified video file to cache, convert to mp4""" file_name = os.path.split(video_path)[-1] video_file, ext = os.path.splitext(file_name) # make sure youtube_id is in filename if youtube_id not in video_file: - video_file = f'{video_file}_{youtube_id}' + video_file = f"{video_file}_{youtube_id}" # move, convert if needed - if 
ext == '.mp4': + if ext == ".mp4": new_file = video_file + ext - dest_path = os.path.join(self.CACHE_DIR, 'download', new_file) + dest_path = os.path.join(self.CACHE_DIR, "download", new_file) shutil.move(video_path, dest_path) else: - print(f'processing with ffmpeg: {video_file}') - new_file = video_file + '.mp4' - dest_path = os.path.join(self.CACHE_DIR, 'download', new_file) + print(f"processing with ffmpeg: {video_file}") + new_file = video_file + ".mp4" + dest_path = os.path.join(self.CACHE_DIR, "download", new_file) subprocess.run( - ["ffmpeg", "-i", video_path, dest_path, - "-loglevel", "warning", "-stats"], check=True + [ + "ffmpeg", + "-i", + video_path, + dest_path, + "-loglevel", + "warning", + "-stats", + ], + check=True, ) def scan_filesystem(): - """ grouped function to delete and update index """ + """grouped function to delete and update index""" filesystem_handler = FilesystemScanner() filesystem_handler.list_comarison() if filesystem_handler.to_rename: @@ -455,10 +465,10 @@ def scan_filesystem(): def reindex_old_documents(): - """ daily refresh of old documents """ + """daily refresh of old documents""" # check needed last run now = int(datetime.now().strftime("%s")) - last_reindex = get_message('last_reindex') + last_reindex = get_message("last_reindex") if isinstance(last_reindex, int) and now - last_reindex < 60 * 60 * 24: return # continue if needed @@ -466,4 +476,4 @@ def reindex_old_documents(): reindex_handler.check_outdated() reindex_handler.reindex() # set timestamp - set_message('last_reindex', now, expire=False) + set_message("last_reindex", now, expire=False) diff --git a/tubearchivist/home/src/searching.py b/tubearchivist/home/src/searching.py index 3dcddb6..e02b5fe 100644 --- a/tubearchivist/home/src/searching.py +++ b/tubearchivist/home/src/searching.py @@ -17,10 +17,10 @@ from PIL import Image class SearchHandler: - """ search elastic search """ + """search elastic search""" CONFIG = AppConfig().config - CACHE_DIR = CONFIG['application']['cache_dir'] + CACHE_DIR = CONFIG["application"]["cache_dir"] def __init__(self, url, data, cache=True): self.max_hits = None @@ -29,15 +29,15 @@ class SearchHandler: self.cache = cache def get_data(self): - """ get the data """ + """get the data""" if self.data: response = requests.get(self.url, json=self.data).json() else: response = requests.get(self.url).json() - if 'hits' in response.keys(): - self.max_hits = response['hits']['total']['value'] - return_value = response['hits']['hits'] + if "hits" in response.keys(): + self.max_hits = response["hits"]["total"]["value"] + return_value = response["hits"]["hits"] else: # simulate list for single result to reuse rest of class return_value = [response] @@ -50,13 +50,13 @@ class SearchHandler: all_channels = [] for idx, hit in enumerate(return_value): return_value[idx] = self.hit_cleanup(hit) - if hit['_index'] == 'ta_video': + if hit["_index"] == "ta_video": video_dict, channel_dict = self.vid_cache_link(hit) if video_dict not in all_videos: all_videos.append(video_dict) if channel_dict not in all_channels: all_channels.append(channel_dict) - elif hit['_index'] == 'ta_channel': + elif hit["_index"] == "ta_channel": channel_dict = self.channel_cache_link(hit) if channel_dict not in all_channels: all_channels.append(channel_dict) @@ -69,52 +69,49 @@ class SearchHandler: @staticmethod def vid_cache_link(hit): - """ download thumbnails into cache """ - vid_thumb = hit['source']['vid_thumb_url'] - youtube_id = hit['source']['youtube_id'] - channel_id_hit = 
hit['source']['channel']['channel_id'] - chan_thumb = hit['source']['channel']['channel_thumb_url'] + """download thumbnails into cache""" + vid_thumb = hit["source"]["vid_thumb_url"] + youtube_id = hit["source"]["youtube_id"] + channel_id_hit = hit["source"]["channel"]["channel_id"] + chan_thumb = hit["source"]["channel"]["channel_thumb_url"] try: - chan_banner = hit['source']['channel']['channel_banner_url'] + chan_banner = hit["source"]["channel"]["channel_banner_url"] except KeyError: chan_banner = False - video_dict = { - 'youtube_id': youtube_id, - 'vid_thumb': vid_thumb - } + video_dict = {"youtube_id": youtube_id, "vid_thumb": vid_thumb} channel_dict = { - 'channel_id': channel_id_hit, - 'chan_thumb': chan_thumb, - 'chan_banner': chan_banner + "channel_id": channel_id_hit, + "chan_thumb": chan_thumb, + "chan_banner": chan_banner, } return video_dict, channel_dict @staticmethod def channel_cache_link(hit): - """ build channel thumb links """ - channel_id_hit = hit['source']['channel_id'] - chan_thumb = hit['source']['channel_thumb_url'] + """build channel thumb links""" + channel_id_hit = hit["source"]["channel_id"] + chan_thumb = hit["source"]["channel_thumb_url"] try: - chan_banner = hit['source']['channel_banner_url'] + chan_banner = hit["source"]["channel_banner_url"] except KeyError: chan_banner = False channel_dict = { - 'channel_id': channel_id_hit, - 'chan_thumb': chan_thumb, - 'chan_banner': chan_banner + "channel_id": channel_id_hit, + "chan_thumb": chan_thumb, + "chan_banner": chan_banner, } return channel_dict def cache_dl_vids(self, all_videos): - """ video thumbs links for cache """ - vid_cache = os.path.join(self.CACHE_DIR, 'videos') + """video thumbs links for cache""" + vid_cache = os.path.join(self.CACHE_DIR, "videos") all_vid_cached = os.listdir(vid_cache) # videos for video_dict in all_videos: - youtube_id = video_dict['youtube_id'] - if not youtube_id + '.jpg' in all_vid_cached: - cache_path = os.path.join(vid_cache, youtube_id + '.jpg') - thumb_url = video_dict['vid_thumb'] + youtube_id = video_dict["youtube_id"] + if not youtube_id + ".jpg" in all_vid_cached: + cache_path = os.path.join(vid_cache, youtube_id + ".jpg") + thumb_url = video_dict["vid_thumb"] img_raw = requests.get(thumb_url, stream=True).raw img = Image.open(img_raw) width, height = img.size @@ -125,62 +122,62 @@ class SearchHandler: img.convert("RGB").save(cache_path) def cache_dl_chan(self, all_channels): - """ download channel thumbs """ - chan_cache = os.path.join(self.CACHE_DIR, 'channels') + """download channel thumbs""" + chan_cache = os.path.join(self.CACHE_DIR, "channels") all_chan_cached = os.listdir(chan_cache) for channel_dict in all_channels: - channel_id_cache = channel_dict['channel_id'] - channel_banner_url = channel_dict['chan_banner'] - channel_banner = channel_id_cache + '_banner.jpg' - channel_thumb_url = channel_dict['chan_thumb'] - channel_thumb = channel_id_cache + '_thumb.jpg' + channel_id_cache = channel_dict["channel_id"] + channel_banner_url = channel_dict["chan_banner"] + channel_banner = channel_id_cache + "_banner.jpg" + channel_thumb_url = channel_dict["chan_thumb"] + channel_thumb = channel_id_cache + "_thumb.jpg" # thumb if channel_thumb_url and channel_thumb not in all_chan_cached: cache_path = os.path.join(chan_cache, channel_thumb) img_raw = requests.get(channel_thumb_url, stream=True).content - with open(cache_path, 'wb') as f: + with open(cache_path, "wb") as f: f.write(img_raw) # banner if channel_banner_url and channel_banner not in all_chan_cached: 
cache_path = os.path.join(chan_cache, channel_banner) img_raw = requests.get(channel_banner_url, stream=True).content - with open(cache_path, 'wb') as f: + with open(cache_path, "wb") as f: f.write(img_raw) @staticmethod def hit_cleanup(hit): - """ clean up and parse data from a single hit """ - hit['source'] = hit.pop('_source') - hit_keys = hit['source'].keys() - if 'media_url' in hit_keys: - parsed_url = urllib.parse.quote(hit['source']['media_url']) - hit['source']['media_url'] = parsed_url + """clean up and parse data from a single hit""" + hit["source"] = hit.pop("_source") + hit_keys = hit["source"].keys() + if "media_url" in hit_keys: + parsed_url = urllib.parse.quote(hit["source"]["media_url"]) + hit["source"]["media_url"] = parsed_url - if 'published' in hit_keys: - published = hit['source']['published'] + if "published" in hit_keys: + published = hit["source"]["published"] date_pub = datetime.strptime(published, "%Y-%m-%d") date_str = datetime.strftime(date_pub, "%d %b, %Y") - hit['source']['published'] = date_str + hit["source"]["published"] = date_str - if 'vid_last_refresh' in hit_keys: - vid_last_refresh = hit['source']['vid_last_refresh'] + if "vid_last_refresh" in hit_keys: + vid_last_refresh = hit["source"]["vid_last_refresh"] date_refresh = datetime.fromtimestamp(vid_last_refresh) date_str = datetime.strftime(date_refresh, "%d %b, %Y") - hit['source']['vid_last_refresh'] = date_str + hit["source"]["vid_last_refresh"] = date_str - if 'channel_last_refresh' in hit_keys: - refreshed = hit['source']['channel_last_refresh'] + if "channel_last_refresh" in hit_keys: + refreshed = hit["source"]["channel_last_refresh"] date_refresh = datetime.fromtimestamp(refreshed) date_str = datetime.strftime(date_refresh, "%d %b, %Y") - hit['source']['channel_last_refresh'] = date_str + hit["source"]["channel_last_refresh"] = date_str - if 'channel' in hit_keys: - channel_keys = hit['source']['channel'].keys() - if 'channel_last_refresh' in channel_keys: - refreshed = hit['source']['channel']['channel_last_refresh'] + if "channel" in hit_keys: + channel_keys = hit["source"]["channel"].keys() + if "channel_last_refresh" in channel_keys: + refreshed = hit["source"]["channel"]["channel_last_refresh"] date_refresh = datetime.fromtimestamp(refreshed) date_str = datetime.strftime(date_refresh, "%d %b, %Y") - hit['source']['channel']['channel_last_refresh'] = date_str + hit["source"]["channel"]["channel_last_refresh"] = date_str return hit @@ -192,13 +189,13 @@ class Pagination: def __init__(self, page_get, search_get=False): config = AppConfig().config - self.page_size = config['archive']['page_size'] + self.page_size = config["archive"]["page_size"] self.page_get = page_get self.search_get = search_get self.pagination = self.first_guess() def first_guess(self): - """ build first guess before api call """ + """build first guess before api call""" page_get = self.page_get if page_get in [0, 1]: page_from = 0 @@ -213,22 +210,22 @@ class Pagination: "page_size": self.page_size, "page_from": page_from, "prev_pages": prev_pages, - "current_page": page_get + "current_page": page_get, } if self.search_get: pagination.update({"search_get": self.search_get}) return pagination def validate(self, total_hits): - """ validate pagination with total_hits after making api call """ + """validate pagination with total_hits after making api call""" page_get = self.page_get max_pages = math.ceil(total_hits / self.page_size) if page_get < max_pages and max_pages > 1: - self.pagination['last_page'] = max_pages + 
self.pagination["last_page"] = max_pages else: - self.pagination['last_page'] = False + self.pagination["last_page"] = False next_pages = [ i for i in range(page_get + 1, page_get + 6) if 1 < i < max_pages ] - self.pagination['next_pages'] = next_pages + self.pagination["next_pages"] = next_pages diff --git a/tubearchivist/home/tasks.py b/tubearchivist/home/tasks.py index 5b19b9f..87949a8 100644 --- a/tubearchivist/home/tasks.py +++ b/tubearchivist/home/tasks.py @@ -14,17 +14,17 @@ from home.src.index_management import backup_all_indexes, restore_from_backup from home.src.reindex import ManualImport, reindex_old_documents CONFIG = AppConfig().config -REDIS_HOST = CONFIG['application']['REDIS_HOST'] +REDIS_HOST = CONFIG["application"]["REDIS_HOST"] -os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'home.settings') -app = Celery('tasks', broker='redis://' + REDIS_HOST) -app.config_from_object('django.conf:settings', namespace='CELERY') +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "home.settings") +app = Celery("tasks", broker="redis://" + REDIS_HOST) +app.config_from_object("django.conf:settings", namespace="CELERY") app.autodiscover_tasks() @shared_task def update_subscribed(): - """ look for missing videos and add to pending """ + """look for missing videos and add to pending""" channel_handler = ChannelSubscription() missing_videos = channel_handler.find_missing() if missing_videos: @@ -36,10 +36,10 @@ def update_subscribed(): @shared_task def download_pending(): - """ download latest pending videos """ + """download latest pending videos""" pending_handler = PendingList() pending_vids = pending_handler.get_all_pending()[0] - to_download = [i['youtube_id'] for i in pending_vids] + to_download = [i["youtube_id"] for i in pending_vids] to_download.reverse() if to_download: download_handler = VideoDownloader(to_download) @@ -48,14 +48,14 @@ def download_pending(): @shared_task def download_single(youtube_id): - """ start download single video now """ + """start download single video now""" download_handler = VideoDownloader([youtube_id]) download_handler.download_list() @shared_task def extrac_dl(youtube_ids): - """ parse list passed and add to pending """ + """parse list passed and add to pending""" pending_handler = PendingList() missing_videos = pending_handler.parse_url_list(youtube_ids) pending_handler.add_to_pending(missing_videos) @@ -63,17 +63,17 @@ def extrac_dl(youtube_ids): @shared_task def check_reindex(): - """ run the reindex main command """ + """run the reindex main command""" reindex_old_documents() @shared_task def run_manual_import(): - """ called from settings page, to go through import folder """ + """called from settings page, to go through import folder""" - print('starting media file import') + print("starting media file import") have_lock = False - my_lock = get_lock('manual_import') + my_lock = get_lock("manual_import") try: have_lock = my_lock.acquire(blocking=False) @@ -91,13 +91,13 @@ def run_manual_import(): @shared_task def run_backup(): - """ called from settings page, dump backup to zip file """ + """called from settings page, dump backup to zip file""" backup_all_indexes() - print('backup finished') + print("backup finished") @shared_task def run_restore_backup(): - """ called from settings page, dump backup to zip file """ + """called from settings page, dump backup to zip file""" restore_from_backup() - print('index restore finished') + print("index restore finished") diff --git a/tubearchivist/home/templates/home/base.html 
b/tubearchivist/home/templates/home/base.html index 48d4253..c52898d 100644 --- a/tubearchivist/home/templates/home/base.html +++ b/tubearchivist/home/templates/home/base.html @@ -96,7 +96,7 @@ diff --git a/tubearchivist/home/urls.py b/tubearchivist/home/urls.py index 70d482f..4a319bb 100644 --- a/tubearchivist/home/urls.py +++ b/tubearchivist/home/urls.py @@ -1,22 +1,30 @@ """ all home app urls """ from django.urls import path -from home.views import (AboutView, ChannelIdView, ChannelView, DownloadView, - HomeView, SettingsView, VideoView) +from home.views import ( + AboutView, + ChannelIdView, + ChannelView, + DownloadView, + HomeView, + SettingsView, + VideoView, +) from . import views urlpatterns = [ - path('', HomeView.as_view(), name='home'), - path('about/', AboutView.as_view(), name='about'), - path('downloads/', DownloadView.as_view(), name='downloads'), - path('settings/', SettingsView.as_view(), name='settings'), - path('process/', views.process, name='process'), - path('downloads/progress', views.progress, name='progress'), - path('channel/', ChannelView.as_view(), name='channel'), + path("", HomeView.as_view(), name="home"), + path("about/", AboutView.as_view(), name="about"), + path("downloads/", DownloadView.as_view(), name="downloads"), + path("settings/", SettingsView.as_view(), name="settings"), + path("process/", views.process, name="process"), + path("downloads/progress", views.progress, name="progress"), + path("channel/", ChannelView.as_view(), name="channel"), path( - 'channel//', - ChannelIdView.as_view(), name='channel_id' + "channel//", + ChannelIdView.as_view(), + name="channel_id", ), - path('video//', VideoView.as_view(), name='video') + path("video//", VideoView.as_view(), name="video"), ] diff --git a/tubearchivist/home/views.py b/tubearchivist/home/views.py index d107a10..e67eee0 100644 --- a/tubearchivist/home/views.py +++ b/tubearchivist/home/views.py @@ -16,36 +16,46 @@ from django.utils.http import urlencode from django.views import View from home.src.config import AppConfig from home.src.download import ChannelSubscription, PendingList -from home.src.helper import (get_dl_message, get_message, process_url_list, - set_message) +from home.src.helper import ( + get_dl_message, + get_message, + process_url_list, + set_message, +) from home.src.searching import Pagination, SearchHandler -from home.tasks import (download_pending, download_single, extrac_dl, - run_backup, run_manual_import, run_restore_backup, - update_subscribed) +from home.tasks import ( + download_pending, + download_single, + extrac_dl, + run_backup, + run_manual_import, + run_restore_backup, + update_subscribed, +) class HomeView(View): - """ resolves to / + """resolves to / handle home page and video search post functionality """ CONFIG = AppConfig().config - ES_URL = CONFIG['application']['es_url'] + ES_URL = CONFIG["application"]["es_url"] def get(self, request): - """ return home search results """ + """return home search results""" colors, sort_order, hide_watched = self.read_config() # handle search - search_get = request.GET.get('search', False) + search_get = request.GET.get("search", False) if search_get: search_encoded = urllib.parse.quote(search_get) else: search_encoded = False # define page size - page_get = int(request.GET.get('page', 0)) + page_get = int(request.GET.get("page", 0)) pagination_handler = Pagination(page_get, search_encoded) - url = self.ES_URL + '/ta_video/_search' + url = self.ES_URL + "/ta_video/_search" data = self.build_data( pagination_handler, 
sort_order, search_get, hide_watched @@ -56,95 +66,94 @@ class HomeView(View): max_hits = search.max_hits pagination_handler.validate(max_hits) context = { - 'videos': videos_hits, - 'pagination': pagination_handler.pagination, - 'sortorder': sort_order, - 'hide_watched': hide_watched, - 'colors': colors + "videos": videos_hits, + "pagination": pagination_handler.pagination, + "sortorder": sort_order, + "hide_watched": hide_watched, + "colors": colors, } - return render(request, 'home/home.html', context) + return render(request, "home/home.html", context) @staticmethod def build_data(pagination_handler, sort_order, search_get, hide_watched): - """ build the data dict for the search query """ - page_size = pagination_handler.pagination['page_size'] - page_from = pagination_handler.pagination['page_from'] + """build the data dict for the search query""" + page_size = pagination_handler.pagination["page_size"] + page_from = pagination_handler.pagination["page_from"] data = { - "size": page_size, "from": page_from, "query": {"match_all": {}}, + "size": page_size, + "from": page_from, + "query": {"match_all": {}}, "sort": [ {"published": {"order": "desc"}}, - {"date_downloaded": {"order": "desc"}} - ] + {"date_downloaded": {"order": "desc"}}, + ], } # define sort - if sort_order == 'downloaded': - del data['sort'][0] + if sort_order == "downloaded": + del data["sort"][0] if search_get: - del data['sort'] + del data["sort"] if hide_watched: - data['query'] = {"term": {"player.watched": {"value": False}}} + data["query"] = {"term": {"player.watched": {"value": False}}} if search_get: query = { "multi_match": { "query": search_get, "fields": ["title", "channel.channel_name", "tags"], "type": "cross_fields", - "operator": "and" + "operator": "and", } } - data['query'] = query + data["query"] = query return data @staticmethod def read_config(): - """ read needed values from redis """ + """read needed values from redis""" config_handler = AppConfig().config - colors = config_handler['application']['colors'] - sort_order = get_message('sort_order') - hide_watched = get_message('hide_watched') + colors = config_handler["application"]["colors"] + sort_order = get_message("sort_order") + hide_watched = get_message("hide_watched") return colors, sort_order, hide_watched @staticmethod def post(request): - """ handle post from search form """ + """handle post from search form""" post_data = dict(request.POST) - search_query = post_data['videoSearch'][0] - search_url = '/?' + urlencode({'search': search_query}) + search_query = post_data["videoSearch"][0] + search_url = "/?" 
+ urlencode({"search": search_query}) return redirect(search_url, permanent=True) class AboutView(View): - """ resolves to /about/ + """resolves to /about/ show helpful how to information """ @staticmethod def get(request): - """ handle http get """ + """handle http get""" config = AppConfig().config - colors = config['application']['colors'] - context = { - 'title': 'About', - 'colors': colors - } - return render(request, 'home/about.html', context) + colors = config["application"]["colors"] + context = {"title": "About", "colors": colors} + return render(request, "home/about.html", context) class DownloadView(View): - """ resolves to /download/ + """resolves to /download/ takes POST for downloading youtube links """ def get(self, request): - """ handle get requests """ + """handle get requests""" config = AppConfig().config - colors = config['application']['colors'] + colors = config["application"]["colors"] - page_get = int(request.GET.get('page', 0)) + page_get = int(request.GET.get("page", 0)) pagination_handler = Pagination(page_get) - url = config['application']['es_url'] + '/ta_download/_search' + url = config["application"]["es_url"] + "/ta_download/_search" data = self.build_data(pagination_handler) search = SearchHandler(url, data, cache=False) @@ -152,7 +161,7 @@ class DownloadView(View): max_hits = search.max_hits if videos_hits: - all_pending = [i['source'] for i in videos_hits] + all_pending = [i["source"] for i in videos_hits] pagination_handler.validate(max_hits) pagination = pagination_handler.pagination else: @@ -160,33 +169,34 @@ class DownloadView(View): pagination = False context = { - 'pending': all_pending, - 'max_hits': max_hits, - 'pagination': pagination, - 'title': 'Downloads', - 'colors': colors + "pending": all_pending, + "max_hits": max_hits, + "pagination": pagination, + "title": "Downloads", + "colors": colors, } - return render(request, 'home/downloads.html', context) + return render(request, "home/downloads.html", context) @staticmethod def build_data(pagination_handler): - """ build data dict for search """ - page_size = pagination_handler.pagination['page_size'] - page_from = pagination_handler.pagination['page_from'] + """build data dict for search""" + page_size = pagination_handler.pagination["page_size"] + page_from = pagination_handler.pagination["page_from"] data = { - "size": page_size, "from": page_from, + "size": page_size, + "from": page_from, "query": {"term": {"status": {"value": "pending"}}}, - "sort": [{"timestamp": {"order": "desc"}}] + "sort": [{"timestamp": {"order": "desc"}}], } return data @staticmethod def post(request): - """ handle post requests """ + """handle post requests""" download_post = dict(request.POST) - if 'vid-url' in download_post.keys(): - url_str = download_post['vid-url'] - print('adding to queue') + if "vid-url" in download_post.keys(): + url_str = download_post["vid-url"] + print("adding to queue") youtube_ids = process_url_list(url_str) if not youtube_ids: # failed to process @@ -194,52 +204,52 @@ class DownloadView(View): mess_dict = { "status": "downloading", "level": "error", - "title": 'Failed to extract links.', - "message": '' + "title": "Failed to extract links.", + "message": "", } - set_message('progress:download', mess_dict) - return redirect('downloads') + set_message("progress:download", mess_dict) + return redirect("downloads") print(youtube_ids) extrac_dl.delay(youtube_ids) sleep(2) - return redirect('downloads', permanent=True) + return redirect("downloads", permanent=True) class 
ChannelIdView(View): - """ resolves to /channel// + """resolves to /channel// display single channel page from channel_id """ def get(self, request, channel_id_detail): - """ get method """ + """get method""" es_url, colors = self.read_config() context = self.get_channel_videos(request, channel_id_detail, es_url) - context.update({'colors': colors}) - return render(request, 'home/channel_id.html', context) + context.update({"colors": colors}) + return render(request, "home/channel_id.html", context) @staticmethod def read_config(): - """ read config file """ + """read config file""" config = AppConfig().config - es_url = config['application']['es_url'] - colors = config['application']['colors'] + es_url = config["application"]["es_url"] + colors = config["application"]["colors"] return es_url, colors def get_channel_videos(self, request, channel_id_detail, es_url): - """ get channel from video index """ - page_get = int(request.GET.get('page', 0)) + """get channel from video index""" + page_get = int(request.GET.get("page", 0)) pagination_handler = Pagination(page_get) # get data - url = es_url + '/ta_video/_search' + url = es_url + "/ta_video/_search" data = self.build_data(pagination_handler, channel_id_detail) search = SearchHandler(url, data) videos_hits = search.get_data() max_hits = search.max_hits if max_hits: - channel_info = videos_hits[0]['source']['channel'] - channel_name = channel_info['channel_name'] + channel_info = videos_hits[0]["source"]["channel"] + channel_name = channel_info["channel_name"] pagination_handler.validate(max_hits) pagination = pagination_handler.pagination else: @@ -251,218 +261,223 @@ class ChannelIdView(View): pagination = False context = { - 'channel_info': channel_info, - 'videos': videos_hits, - 'max_hits': max_hits, - 'pagination': pagination, - 'title': 'Channel: ' + channel_name, + "channel_info": channel_info, + "videos": videos_hits, + "max_hits": max_hits, + "pagination": pagination, + "title": "Channel: " + channel_name, } return context @staticmethod def build_data(pagination_handler, channel_id_detail): - """ build data dict for search """ - page_size = pagination_handler.pagination['page_size'] - page_from = pagination_handler.pagination['page_from'] + """build data dict for search""" + page_size = pagination_handler.pagination["page_size"] + page_from = pagination_handler.pagination["page_from"] data = { - "size": page_size, "from": page_from, + "size": page_size, + "from": page_from, "query": { "term": {"channel.channel_id": {"value": channel_id_detail}} }, "sort": [ {"published": {"order": "desc"}}, - {"date_downloaded": {"order": "desc"}} - ] + {"date_downloaded": {"order": "desc"}}, + ], } return data @staticmethod def get_channel_info(channel_id_detail, es_url): - """ get channel info from channel index if no videos """ - url = f'{es_url}/ta_channel/_doc/{channel_id_detail}' + """get channel info from channel index if no videos""" + url = f"{es_url}/ta_channel/_doc/{channel_id_detail}" data = False search = SearchHandler(url, data) channel_data = search.get_data() - channel_info = channel_data[0]['source'] - channel_name = channel_info['channel_name'] + channel_info = channel_data[0]["source"] + channel_name = channel_info["channel_name"] return channel_info, channel_name class ChannelView(View): - """ resolves to /channel/ + """resolves to /channel/ handle functionality for channel overview page, subscribe to channel, search as you type for channel name """ def get(self, request): - """ handle http get requests """ + """handle http 
get requests""" es_url, colors = self.read_config() - page_get = int(request.GET.get('page', 0)) + page_get = int(request.GET.get("page", 0)) pagination_handler = Pagination(page_get) - page_size = pagination_handler.pagination['page_size'] - page_from = pagination_handler.pagination['page_from'] + page_size = pagination_handler.pagination["page_size"] + page_from = pagination_handler.pagination["page_from"] # get - url = es_url + '/ta_channel/_search' + url = es_url + "/ta_channel/_search" data = { - "size": page_size, "from": page_from, "query": {"match_all": {}}, - "sort": [{"channel_name.keyword": {"order": "asc"}}] + "size": page_size, + "from": page_from, + "query": {"match_all": {}}, + "sort": [{"channel_name.keyword": {"order": "asc"}}], } - show_subed_only = get_message('show_subed_only') + show_subed_only = get_message("show_subed_only") if show_subed_only: - data['query'] = {"term": {"channel_subscribed": {"value": True}}} + data["query"] = {"term": {"channel_subscribed": {"value": True}}} search = SearchHandler(url, data) channel_hits = search.get_data() max_hits = search.max_hits pagination_handler.validate(search.max_hits) context = { - 'channels': channel_hits, - 'max_hits': max_hits, - 'pagination': pagination_handler.pagination, - 'show_subed_only': show_subed_only, - 'title': 'Channels', - 'colors': colors + "channels": channel_hits, + "max_hits": max_hits, + "pagination": pagination_handler.pagination, + "show_subed_only": show_subed_only, + "title": "Channels", + "colors": colors, } - return render(request, 'home/channel.html', context) + return render(request, "home/channel.html", context) @staticmethod def read_config(): - """ read config file """ + """read config file""" config = AppConfig().config - es_url = config['application']['es_url'] - colors = config['application']['colors'] + es_url = config["application"]["es_url"] + colors = config["application"]["colors"] return es_url, colors def post(self, request): - """ handle http post requests """ + """handle http post requests""" subscriptions_post = dict(request.POST) print(subscriptions_post) subscriptions_post = dict(request.POST) - if 'subscribe' in subscriptions_post.keys(): - sub_str = subscriptions_post['subscribe'] + if "subscribe" in subscriptions_post.keys(): + sub_str = subscriptions_post["subscribe"] try: youtube_ids = process_url_list(sub_str) self.subscribe_to(youtube_ids) except ValueError: - print('parsing subscribe ids failed!') + print("parsing subscribe ids failed!") print(sub_str) sleep(1) - return redirect('channel', permanent=True) + return redirect("channel", permanent=True) @staticmethod def subscribe_to(youtube_ids): - """ process the subscribe ids """ + """process the subscribe ids""" for youtube_id in youtube_ids: - if youtube_id['type'] == 'video': - to_sub = youtube_id['url'] + if youtube_id["type"] == "video": + to_sub = youtube_id["url"] vid_details = PendingList().get_youtube_details(to_sub) - channel_id_sub = vid_details['channel_id'] - elif youtube_id['type'] == 'channel': - channel_id_sub = youtube_id['url'] + channel_id_sub = vid_details["channel_id"] + elif youtube_id["type"] == "channel": + channel_id_sub = youtube_id["url"] else: - raise ValueError('failed to subscribe to: ' + youtube_id) + raise ValueError("failed to subscribe to: " + youtube_id) ChannelSubscription().change_subscribe( channel_id_sub, channel_subscribed=True ) - print('subscribed to: ' + channel_id_sub) + print("subscribed to: " + channel_id_sub) class VideoView(View): - """ resolves to /video// + """resolves 
to /video// display details about a single video """ def get(self, request, video_id): - """ get single video """ + """get single video""" es_url, colors = self.read_config() - url = f'{es_url}/ta_video/_doc/{video_id}' + url = f"{es_url}/ta_video/_doc/{video_id}" data = None look_up = SearchHandler(url, data) video_hit = look_up.get_data() - video_data = video_hit[0]['source'] - video_title = video_data['title'] - context = { - 'video': video_data, - 'title': video_title, - 'colors': colors - } - return render(request, 'home/video.html', context) + video_data = video_hit[0]["source"] + video_title = video_data["title"] + context = {"video": video_data, "title": video_title, "colors": colors} + return render(request, "home/video.html", context) @staticmethod def read_config(): - """ read config file """ + """read config file""" config = AppConfig().config - es_url = config['application']['es_url'] - colors = config['application']['colors'] + es_url = config["application"]["es_url"] + colors = config["application"]["colors"] return es_url, colors class SettingsView(View): - """ resolves to /settings/ + """resolves to /settings/ handle the settings page, display current settings, take post request from the form to update settings """ @staticmethod def get(request): - """ read and display current settings """ + """read and display current settings""" config = AppConfig().config - colors = config['application']['colors'] + colors = config["application"]["colors"] - context = { - 'title': 'Settings', - 'config': config, - 'colors': colors - } + context = {"title": "Settings", "config": config, "colors": colors} - return render(request, 'home/settings.html', context) + return render(request, "home/settings.html", context) @staticmethod def post(request): - """ handle form post to update settings """ + """handle form post to update settings""" form_post = dict(request.POST) - del form_post['csrfmiddlewaretoken'] + del form_post["csrfmiddlewaretoken"] print(form_post) config_handler = AppConfig() config_handler.update_config(form_post) - return redirect('settings', permanent=True) + return redirect("settings", permanent=True) def progress(request): # pylint: disable=unused-argument - """ endpoint for download progress ajax calls """ + """endpoint for download progress ajax calls""" config = AppConfig().config - cache_dir = config['application']['cache_dir'] + cache_dir = config["application"]["cache_dir"] json_data = get_dl_message(cache_dir) return JsonResponse(json_data) def process(request): - """ handle all the buttons calls via POST ajax """ - if request.method == 'POST': + """handle all the buttons calls via POST ajax""" + if request.method == "POST": post_dict = json.loads(request.body.decode()) post_handler = PostData(post_dict) if post_handler.to_do: task_result = post_handler.run_task() return JsonResponse(task_result) - return JsonResponse({'success': False}) + return JsonResponse({"success": False}) class PostData: - """ generic post handler from process route """ + """generic post handler from process route""" CONFIG = AppConfig().config - ES_URL = CONFIG['application']['es_url'] + ES_URL = CONFIG["application"]["es_url"] VALID_KEYS = [ - "watched", "rescan_pending", "ignore", "dl_pending", - "unsubscribe", "sort_order", "hide_watched", "show_subed_only", - "channel-search", "video-search", "dlnow", "manual-import", - "db-backup", "db-restore" + "watched", + "rescan_pending", + "ignore", + "dl_pending", + "unsubscribe", + "sort_order", + "hide_watched", + "show_subed_only", + 
"channel-search", + "video-search", + "dlnow", + "manual-import", + "db-backup", + "db-restore", ] def __init__(self, post_dict): @@ -470,81 +485,81 @@ class PostData: self.to_do = self.validate() def validate(self): - """ validate the post_dict """ + """validate the post_dict""" to_do = [] for key, value in self.post_dict.items(): if key in self.VALID_KEYS: - task_item = {'task': key, 'status': value} + task_item = {"task": key, "status": value} print(task_item) to_do.append(task_item) else: - print(key + ' not a valid key') + print(key + " not a valid key") return to_do def run_task(self): - """ run through the tasks to do """ + """run through the tasks to do""" for item in self.to_do: - task = item['task'] - if task == 'watched': - youtube_id = item['status'] + task = item["task"] + if task == "watched": + youtube_id = item["status"] self.parse_watched(youtube_id) - elif task == 'rescan_pending': - print('rescan subscribed channels') + elif task == "rescan_pending": + print("rescan subscribed channels") update_subscribed.delay() - elif task == 'ignore': - print('ignore video') + elif task == "ignore": + print("ignore video") handler = PendingList() - ignore_list = item['status'] + ignore_list = item["status"] handler.ignore_from_pending([ignore_list]) - elif task == 'dl_pending': - print('download pending') + elif task == "dl_pending": + print("download pending") download_pending.delay() - elif task == 'unsubscribe': - channel_id_unsub = item['status'] - print('unsubscribe from ' + channel_id_unsub) + elif task == "unsubscribe": + channel_id_unsub = item["status"] + print("unsubscribe from " + channel_id_unsub) ChannelSubscription().change_subscribe( channel_id_unsub, channel_subscribed=False ) - elif task == 'sort_order': - sort_order = item['status'] - set_message('sort_order', sort_order, expire=False) - elif task == 'hide_watched': - hide_watched = bool(int(item['status'])) - print(item['status']) - set_message('hide_watched', hide_watched, expire=False) - elif task == 'show_subed_only': - show_subed_only = bool(int(item['status'])) + elif task == "sort_order": + sort_order = item["status"] + set_message("sort_order", sort_order, expire=False) + elif task == "hide_watched": + hide_watched = bool(int(item["status"])) + print(item["status"]) + set_message("hide_watched", hide_watched, expire=False) + elif task == "show_subed_only": + show_subed_only = bool(int(item["status"])) print(show_subed_only) - set_message('show_subed_only', show_subed_only, expire=False) - elif task == 'channel-search': - search_query = item['status'] - print('searching for: ' + search_query) + set_message("show_subed_only", show_subed_only, expire=False) + elif task == "channel-search": + search_query = item["status"] + print("searching for: " + search_query) search_results = self.search_channels(search_query) return search_results - elif task == 'video-search': - search_query = item['status'] - print('searching for: ' + search_query) + elif task == "video-search": + search_query = item["status"] + print("searching for: " + search_query) search_results = self.search_videos(search_query) return search_results - elif task == 'dlnow': - youtube_id = item['status'] - print('downloading: ' + youtube_id) + elif task == "dlnow": + youtube_id = item["status"] + print("downloading: " + youtube_id) download_single.delay(youtube_id=youtube_id) - elif task == 'manual-import': - print('starting manual import') + elif task == "manual-import": + print("starting manual import") run_manual_import.delay() - elif task == 
'db-backup': - print('backing up database') + elif task == "db-backup": + print("backing up database") run_backup.delay() - elif task == 'db-restore': - print('restoring index from backup zip') + elif task == "db-restore": + print("restoring index from backup zip") run_restore_backup.delay() - return {'success': True} + return {"success": True} def search_channels(self, search_query): - """ fancy searching channels as you type """ - url = self.ES_URL + '/ta_channel/_search' + """fancy searching channels as you type""" + url = self.ES_URL + "/ta_channel/_search" data = { "size": 10, "query": { @@ -554,18 +569,18 @@ class PostData: "fields": [ "channel_name.search_as_you_type", "channel_name._2gram", - "channel_name._3gram" - ] + "channel_name._3gram", + ], } - } + }, } look_up = SearchHandler(url, data, cache=False) search_results = look_up.get_data() - return {'results': search_results} + return {"results": search_results} def search_videos(self, search_query): - """ fancy searching videos as you type """ - url = self.ES_URL + '/ta_video/_search' + """fancy searching videos as you type""" + url = self.ES_URL + "/ta_video/_search" data = { "size": 10, "query": { @@ -575,51 +590,51 @@ class PostData: "fields": [ "title.search_as_you_type", "title._2gram", - "title._3gram" - ] + "title._3gram", + ], } - } + }, } look_up = SearchHandler(url, data, cache=False) search_results = look_up.get_data() - return {'results': search_results} + return {"results": search_results} def parse_watched(self, youtube_id): - """ marked as watched based on id type """ + """marked as watched based on id type""" es_url = self.ES_URL - id_type = process_url_list([youtube_id])[0]['type'] + id_type = process_url_list([youtube_id])[0]["type"] stamp = int(datetime.now().strftime("%s")) - if id_type == 'video': + if id_type == "video": stamp = int(datetime.now().strftime("%s")) - url = self.ES_URL + '/ta_video/_update/' + youtube_id + url = self.ES_URL + "/ta_video/_update/" + youtube_id source = { "doc": {"player": {"watched": True, "watched_date": stamp}} } request = requests.post(url, json=source) if not request.ok: print(request.text) - elif id_type == 'channel': - headers = {'Content-type': 'application/json'} + elif id_type == "channel": + headers = {"Content-type": "application/json"} data = { "description": youtube_id, "processors": [ {"set": {"field": "player.watched", "value": True}}, - {"set": {"field": "player.watched_date", "value": stamp}} - ] + {"set": {"field": "player.watched_date", "value": stamp}}, + ], } payload = json.dumps(data) - url = es_url + '/_ingest/pipeline/' + youtube_id + url = es_url + "/_ingest/pipeline/" + youtube_id request = requests.put(url, data=payload, headers=headers) if not request.ok: print(request.text) # apply pipeline must_list = [ {"term": {"channel.channel_id": {"value": youtube_id}}}, - {"term": {"player.watched": {"value": False}}} + {"term": {"player.watched": {"value": False}}}, ] data = {"query": {"bool": {"must": must_list}}} payload = json.dumps(data) - url = f'{es_url}/ta_video/_update_by_query?pipeline={youtube_id}' + url = f"{es_url}/ta_video/_update_by_query?pipeline={youtube_id}" request = requests.post(url, data=payload, headers=headers) if not request.ok: print(request.text) diff --git a/tubearchivist/manage.py b/tubearchivist/manage.py index 78679b3..f17c8b6 100755 --- a/tubearchivist/manage.py +++ b/tubearchivist/manage.py @@ -7,7 +7,7 @@ import sys def main(): # pylint: disable=import-outside-toplevel """Run administrative tasks.""" - 
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings') + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings") try: from django.core.management import execute_from_command_line except ImportError as exc: @@ -19,5 +19,5 @@ def main(): execute_from_command_line(sys.argv) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/version_check.py b/version_check.py index cac5e4b..0ada678 100755 --- a/version_check.py +++ b/version_check.py @@ -8,10 +8,10 @@ import requests class Requirements: - """ handle requirements.txt """ + """handle requirements.txt""" - FILE_PATH = 'tubearchivist/requirements.txt' - LOCK = '/tmp/tubearchivist-requirements.lock' + FILE_PATH = "tubearchivist/requirements.txt" + LOCK = "/tmp/tubearchivist-requirements.lock" def __init__(self): self.exists = self.checked_today() @@ -19,24 +19,24 @@ class Requirements: self.all_updates = False def checked_today(self): - """ skip requirements check when lock file exists """ + """skip requirements check when lock file exists""" exists = pathlib.Path(self.LOCK).exists() return exists def look_for_updates(self): - """ look through requirements and check for updates """ + """look through requirements and check for updates""" self.all_requirements = self.get_dependencies() self.all_updates = self.check_packages() def get_dependencies(self): - """ read out requirements.txt """ + """read out requirements.txt""" all_requirements = [] - with open(self.FILE_PATH, 'r', encoding='utf-8') as f: + with open(self.FILE_PATH, "r", encoding="utf-8") as f: dependencies = f.readlines() for dependency in dependencies: - package, version = dependency.split('==') + package, version = dependency.split("==") all_requirements.append((package, version.strip())) all_requirements.sort(key=lambda x: x[0].lower()) @@ -44,33 +44,32 @@ class Requirements: return all_requirements def check_packages(self): - """ compare installed with remote version """ + """compare installed with remote version""" total = len(self.all_requirements) - print(f'checking versions for {total} packages...') + print(f"checking versions for {total} packages...") all_updates = {} for dependency in self.all_requirements: package, version_installed = dependency - url = f'https://pypi.org/pypi/{package}/json' + url = f"https://pypi.org/pypi/{package}/json" response = requests.get(url).json() - version_remote = response['info']['version'] - homepage = response['info']['home_page'] + version_remote = response["info"]["version"] + homepage = response["info"]["home_page"] if version_remote != version_installed: to_update = { - package: { - "from": version_installed, - "to": version_remote - } + package: {"from": version_installed, "to": version_remote} } all_updates.update(to_update) - message = (f'update {package} {version_installed}' + - f'==> {version_remote}\n {homepage}') + message = ( + f"update {package} {version_installed}" + + f"==> {version_remote}\n {homepage}" + ) print(message) if not all_updates: - print('no updates found') + print("no updates found") # remember that pathlib.Path(self.LOCK).touch() @@ -78,7 +77,7 @@ class Requirements: return all_updates def apply_updates(self): - """ update requirements.txt file with new versions """ + """update requirements.txt file with new versions""" to_write = [] @@ -86,31 +85,31 @@ class Requirements: package, old_version = requirement if package in self.all_updates.keys(): - package_version = self.all_updates[package]['to'] + package_version = self.all_updates[package]["to"] else: 
                package_version = old_version

-            to_write.append(f'{package}=={package_version}\n')
+            to_write.append(f"{package}=={package_version}\n")

-        with open(self.FILE_PATH, 'w', encoding='utf-8') as f:
+        with open(self.FILE_PATH, "w", encoding="utf-8") as f:
             f.writelines(to_write)

-        print('requirements.txt updates')
+        print("requirements.txt updated")


 def main():
-    """ main to check for updates """
+    """main to check for updates"""
     handler = Requirements()
     if handler.exists:
         return

     handler.look_for_updates()
     if handler.all_updates:
-        input_response = input('\nupdate requirements.txt? [y/n] ')
-        if input_response == 'y':
+        input_response = input("\nupdate requirements.txt? [y/n] ")
+        if input_response == "y":
             handler.apply_updates()
         else:
-            print('cancle update...')
+            print("cancel update...")
             sys.exit(1)