linting everything in black

simon 2021-09-21 16:25:22 +07:00
parent 69e6e490f4
commit 2433e0e7d8
19 changed files with 1100 additions and 1111 deletions
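
For reference, a pass like this is normally produced by running Black over the repository. Below is a minimal sketch of the reformatting rule most visible in this diff, using Black's Python API (assumes Black is installed; the exact version and flags used for this commit are not recorded here):

    # quote normalization is the bulk of this diff: single quotes become double quotes
    import black

    source = "ROOT_URLCONF = 'config.urls'\n"
    formatted = black.format_str(source, mode=black.Mode())
    print(formatted)  # ROOT_URLCONF = "config.urls"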

View File

@@ -11,6 +11,6 @@ import os
from django.core.asgi import get_asgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
application = get_asgi_application()

View File

@@ -21,67 +21,67 @@ BASE_DIR = Path(__file__).resolve().parent.parent
# See https://docs.djangoproject.com/en/3.2/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = "Fvid^aUL6LohRZz*kZFvq85B&JW&kB9o*#jdzWsdWE8*XkCLR8"
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = bool(environ.get("DJANGO_DEBUG"))
ALLOWED_HOSTS = ["*"]
# Application definition
INSTALLED_APPS = [
    "home.apps.HomeConfig",
    "django.contrib.admin",
    "django.contrib.auth",
    "django.contrib.contenttypes",
    "django.contrib.sessions",
    "django.contrib.messages",
    "whitenoise.runserver_nostatic",
    "django.contrib.staticfiles",
    "django.contrib.humanize",
]
MIDDLEWARE = [
    "django.middleware.security.SecurityMiddleware",
    "django.contrib.sessions.middleware.SessionMiddleware",
    "whitenoise.middleware.WhiteNoiseMiddleware",
    "django.middleware.common.CommonMiddleware",
    "django.middleware.csrf.CsrfViewMiddleware",
    "django.contrib.auth.middleware.AuthenticationMiddleware",
    "django.contrib.messages.middleware.MessageMiddleware",
    "django.middleware.clickjacking.XFrameOptionsMiddleware",
]
ROOT_URLCONF = "config.urls"
TEMPLATES = [
    {
        "BACKEND": "django.template.backends.django.DjangoTemplates",
        "DIRS": [],
        "APP_DIRS": True,
        "OPTIONS": {
            "context_processors": [
                "django.template.context_processors.debug",
                "django.template.context_processors.request",
                "django.contrib.auth.context_processors.auth",
                "django.contrib.messages.context_processors.messages",
            ],
        },
    },
]
WSGI_APPLICATION = "config.wsgi.application"
# Database
# https://docs.djangoproject.com/en/3.2/ref/settings/#databases
DATABASES = {
    "default": {
        "ENGINE": "django.db.backends.sqlite3",
        "NAME": BASE_DIR / "db.sqlite3",
    }
}
@@ -91,16 +91,16 @@ DATABASES = {
AUTH_PASSWORD_VALIDATORS = [
    {
        "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",  # noqa: E501
    },
    {
        "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",  # noqa: E501
    },
    {
        "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",  # noqa: E501
    },
    {
        "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",  # noqa: E501
    },
]
@@ -108,9 +108,9 @@ AUTH_PASSWORD_VALIDATORS = [
# Internationalization
# https://docs.djangoproject.com/en/3.2/topics/i18n/
LANGUAGE_CODE = "en-us"
TIME_ZONE = "UTC"
USE_I18N = True
@@ -122,7 +122,7 @@ USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/3.2/howto/static-files/
STATIC_URL = "/static/"
# STATICFILES_DIRS = [
#     str(BASE_DIR.joinpath('static')),
@@ -130,15 +130,15 @@ STATIC_URL = '/static/'
# ]
# STATIC_URL = '/static/'
STATICFILES_DIRS = (str(BASE_DIR.joinpath("static")),)
# MEDIA_ROOT = str(BASE_DIR.joinpath('media'))
# MEDIA_URL = '/media/'
STATIC_ROOT = str(BASE_DIR.joinpath("staticfiles"))
STATICFILES_STORAGE = "whitenoise.storage.CompressedManifestStaticFilesStorage"
# Default primary key field type
# https://docs.djangoproject.com/en/3.2/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"

View File

@@ -17,6 +17,6 @@ from django.contrib import admin
from django.urls import include, path
urlpatterns = [
    path("", include("home.urls")),
    path("admin/", admin.site.urls),
]

View File

@@ -11,6 +11,6 @@ import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
application = get_wsgi_application()

View File

@@ -10,24 +10,24 @@ from .tasks import app as celery_app
def sync_redis_state():
    """make sure redis gets the config.json values"""
    print("sync redis")
    config_handler = AppConfig()
    config_handler.load_new_defaults()
    config = config_handler.config
    sort_order = config["archive"]["sort"]
    set_message("sort_order", sort_order, expire=False)
    hide_watched = bool(int(config["archive"]["hide_watched"]))
    set_message("hide_watched", hide_watched, expire=False)
    show_subed_only = bool(int(config["archive"]["show_subed_only"]))
    set_message("show_subed_only", show_subed_only, expire=False)
def make_folders():
    """make needed cache folders here so docker doesn't mess it up"""
    folders = ["download", "channels", "videos", "import", "backup"]
    config = AppConfig().config
    cache_dir = config["application"]["cache_dir"]
    for folder in folders:
        folder_path = os.path.join(cache_dir, folder)
        try:
@@ -36,7 +36,7 @@ def make_folders():
            continue
__all__ = ("celery_app",)
make_folders()
sync_redis_state()
index_check()

View File

@@ -2,5 +2,5 @@ from django.apps import AppConfig
class HomeConfig(AppConfig):
    default_auto_field = "django.db.models.BigAutoField"
    name = "home"

View File

@@ -12,71 +12,71 @@ from home.src.helper import get_message, set_message
class AppConfig:
    """handle user settings and application variables"""
    def __init__(self):
        self.config = self.get_config()
    def get_config(self):
        """get config from default file or redis if changed"""
        config = self.get_config_redis()
        if not config:
            config = self.get_config_file()
        config["application"].update(self.get_config_env())
        return config
    def get_config_file(self):
        """read the defaults from config.json"""
        with open("home/config.json", "r", encoding="utf-8") as f:
            config_str = f.read()
            config_file = json.loads(config_str)
        config_file["application"].update(self.get_config_env())
        return config_file
    @staticmethod
    def get_config_env():
        """read environment application variables"""
        application = {
            "REDIS_HOST": os.environ.get("REDIS_HOST"),
            "es_url": os.environ.get("ES_URL"),
            "HOST_UID": int(os.environ.get("HOST_UID")),
            "HOST_GID": int(os.environ.get("HOST_GID")),
        }
        return application
    @staticmethod
    def get_config_redis():
        """read config json set from redis to overwrite defaults"""
        config = get_message("config")
        if not list(config.values())[0]:
            return False
        return config
    def update_config(self, form_post):
        """update config values from settings form"""
        config = self.config
        for key, value in form_post.items():
            to_write = value[0]
            if len(to_write):
                if to_write == "0":
                    to_write = False
                elif to_write == "1":
                    to_write = True
                elif to_write.isdigit():
                    to_write = int(to_write)
                config_dict, config_value = key.split(".")
                config[config_dict][config_value] = to_write
        set_message("config", config, expire=False)
    def load_new_defaults(self):
        """check config.json for missing defaults"""
        default_config = self.get_config_file()
        redis_config = self.get_config_redis()
@@ -100,4 +100,4 @@ class AppConfig:
                    needs_update = True
        if needs_update:
            set_message("config", redis_config, expire=False)

View File

@@ -19,15 +19,15 @@ from home.src.index import YoutubeChannel, index_new_video
class PendingList:
    """manage the pending videos list"""
    CONFIG = AppConfig().config
    ES_URL = CONFIG["application"]["es_url"]
    VIDEOS = CONFIG["application"]["videos"]
    @staticmethod
    def parse_url_list(youtube_ids):
        """extract youtube ids from list"""
        missing_videos = []
        for entry in youtube_ids:
            # notify
@@ -35,31 +35,31 @@ class PendingList:
                "status": "pending",
                "level": "info",
                "title": "Adding to download queue.",
                "message": "Extracting lists",
            }
            set_message("progress:download", mess_dict)
            # extract
            url = entry["url"]
            url_type = entry["type"]
            if url_type == "video":
                missing_videos.append(url)
            elif url_type == "channel":
                youtube_ids = ChannelSubscription().get_last_youtube_videos(
                    url, limit=False
                )
                missing_videos = missing_videos + youtube_ids
            elif url_type == "playlist":
                youtube_ids = playlist_extractor(url)
                missing_videos = missing_videos + youtube_ids
        return missing_videos
    def add_to_pending(self, missing_videos):
        """build the bulk json data from pending"""
        # check if channel is indexed
        channel_handler = ChannelSubscription()
        all_indexed = channel_handler.get_channels(subscribed_only=False)
        all_channel_ids = [i["channel_id"] for i in all_indexed]
        # check if already there
        all_downloaded = self.get_all_downloaded()
        # loop
@@ -77,11 +77,11 @@ class PendingList:
            if not video:
                continue
            if video["channel_id"] in all_channel_ids:
                video["channel_indexed"] = True
            else:
                video["channel_indexed"] = False
            video["status"] = "pending"
            action = {"create": {"_id": youtube_id, "_index": "ta_download"}}
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(video))
@@ -90,128 +90,130 @@ class PendingList:
            "status": "pending",
            "level": "info",
            "title": "Adding to download queue.",
            "message": "Processing IDs...",
        }
        set_message("progress:download", mess_dict)
        # add last newline
        bulk_list.append("\n")
        query_str = "\n".join(bulk_list)
        headers = {"Content-type": "application/x-ndjson"}
        url = self.ES_URL + "/_bulk"
        request = requests.post(url, data=query_str, headers=headers)
        if not request.ok:
            print(request)
    @staticmethod
    def get_youtube_details(youtube_id):
        """get details from youtubedl for single pending video"""
        obs = {
            "default_search": "ytsearch",
            "quiet": True,
            "skip_download": True,
        }
        try:
            vid = youtube_dl.YoutubeDL(obs).extract_info(youtube_id)
        except youtube_dl.utils.DownloadError:
            print("failed to extract info for: " + youtube_id)
            return False
        # parse response
        seconds = vid["duration"]
        duration_str = DurationConverter.get_str(seconds)
        upload_date = vid["upload_date"]
        upload_dt = datetime.strptime(upload_date, "%Y%m%d")
        published = upload_dt.strftime("%Y-%m-%d")
        # build dict
        youtube_details = {
            "youtube_id": youtube_id,
            "channel_name": vid["channel"],
            "vid_thumb_url": vid["thumbnail"],
            "title": vid["title"],
            "channel_id": vid["channel_id"],
            "duration": duration_str,
            "published": published,
            "timestamp": int(datetime.now().strftime("%s")),
        }
        return youtube_details
    def get_all_pending(self):
        """get a list of all pending videos in ta_download"""
        headers = {"Content-type": "application/json"}
        # get PIT ID
        url = self.ES_URL + "/ta_download/_pit?keep_alive=1m"
        response = requests.post(url)
        json_data = json.loads(response.text)
        pit_id = json_data["id"]
        # query
        data = {
            "size": 50,
            "query": {"match_all": {}},
            "pit": {"id": pit_id, "keep_alive": "1m"},
            "sort": [{"timestamp": {"order": "desc"}}],
        }
        query_str = json.dumps(data)
        url = self.ES_URL + "/_search"
        all_pending = []
        all_ignore = []
        while True:
            response = requests.get(url, data=query_str, headers=headers)
            json_data = json.loads(response.text)
            all_hits = json_data["hits"]["hits"]
            if all_hits:
                for hit in all_hits:
                    youtube_id = hit["_source"]["youtube_id"]
                    status = hit["_source"]["status"]
                    if status == "pending":
                        all_pending.append(hit["_source"])
                    elif status == "ignore":
                        all_ignore.append(youtube_id)
                    search_after = hit["sort"]
                # update search_after with last hit data
                data["search_after"] = search_after
                query_str = json.dumps(data)
            else:
                break
        # clean up PIT
        query_str = json.dumps({"id": pit_id})
        requests.delete(self.ES_URL + "/_pit", data=query_str, headers=headers)
        return all_pending, all_ignore
    def get_all_indexed(self):
        """get a list of all videos indexed"""
        headers = {"Content-type": "application/json"}
        # get PIT ID
        url = self.ES_URL + "/ta_video/_pit?keep_alive=1m"
        response = requests.post(url)
        json_data = json.loads(response.text)
        pit_id = json_data["id"]
        # query
        data = {
            "size": 500,
            "query": {"match_all": {}},
            "pit": {"id": pit_id, "keep_alive": "1m"},
            "sort": [{"published": {"order": "desc"}}],
        }
        query_str = json.dumps(data)
        url = self.ES_URL + "/_search"
        all_indexed = []
        while True:
            response = requests.get(url, data=query_str, headers=headers)
            json_data = json.loads(response.text)
            all_hits = json_data["hits"]["hits"]
            if all_hits:
                for hit in all_hits:
                    all_indexed.append(hit)
                    search_after = hit["sort"]
                # update search_after with last hit data
                data["search_after"] = search_after
                query_str = json.dumps(data)
            else:
                break
        # clean up PIT
        query_str = json.dumps({"id": pit_id})
        requests.delete(self.ES_URL + "/_pit", data=query_str, headers=headers)
        return all_indexed
    def get_all_downloaded(self):
        """get a list of all videos in archive"""
        all_channel_folders = os.listdir(self.VIDEOS)
        all_downloaded = []
        for channel_folder in all_channel_folders:
@@ -223,125 +225,131 @@ class PendingList:
        return all_downloaded
    def delete_from_pending(self, youtube_id):
        """delete the youtube_id from ta_download"""
        url = f"{self.ES_URL}/ta_download/_doc/{youtube_id}"
        response = requests.delete(url)
        if not response.ok:
            print(response.text)
    def ignore_from_pending(self, ignore_list):
        """build the bulk query string"""
        stamp = int(datetime.now().strftime("%s"))
        bulk_list = []
        for youtube_id in ignore_list:
            action = {"update": {"_id": youtube_id, "_index": "ta_download"}}
            source = {"doc": {"status": "ignore", "timestamp": stamp}}
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(source))
        # add last newline
        bulk_list.append("\n")
        query_str = "\n".join(bulk_list)
        headers = {"Content-type": "application/x-ndjson"}
        url = self.ES_URL + "/_bulk"
        request = requests.post(url, data=query_str, headers=headers)
        mess_dict = {
            "status": "ignore",
            "level": "info",
            "title": "Added to ignore list",
            "message": "",
        }
        set_message("progress:download", mess_dict)
        if not request.ok:
            print(request)
class ChannelSubscription:
    """manage the list of channels subscribed"""
    def __init__(self):
        config = AppConfig().config
        self.es_url = config["application"]["es_url"]
        self.channel_size = config["subscriptions"]["channel_size"]
    def get_channels(self, subscribed_only=True):
        """get a list of all channels subscribed to"""
        headers = {"Content-type": "application/json"}
        # get PIT ID
        url = self.es_url + "/ta_channel/_pit?keep_alive=1m"
        response = requests.post(url)
        json_data = json.loads(response.text)
        pit_id = json_data["id"]
        # query
        if subscribed_only:
            data = {
                "query": {"term": {"channel_subscribed": {"value": True}}},
                "size": 50,
                "pit": {"id": pit_id, "keep_alive": "1m"},
                "sort": [{"channel_name.keyword": {"order": "asc"}}],
            }
        else:
            data = {
                "query": {"match_all": {}},
                "size": 50,
                "pit": {"id": pit_id, "keep_alive": "1m"},
                "sort": [{"channel_name.keyword": {"order": "asc"}}],
            }
        query_str = json.dumps(data)
        url = self.es_url + "/_search"
        all_channels = []
        while True:
            response = requests.get(url, data=query_str, headers=headers)
            json_data = json.loads(response.text)
            all_hits = json_data["hits"]["hits"]
            if all_hits:
                for hit in all_hits:
                    source = hit["_source"]
                    search_after = hit["sort"]
                    all_channels.append(source)
                # update search_after with last hit data
                data["search_after"] = search_after
                query_str = json.dumps(data)
            else:
                break
        # clean up PIT
        query_str = json.dumps({"id": pit_id})
        requests.delete(self.es_url + "/_pit", data=query_str, headers=headers)
        return all_channels
    def get_last_youtube_videos(self, channel_id, limit=True):
        """get a list of last videos from channel"""
        url = f"https://www.youtube.com/channel/{channel_id}/videos"
        obs = {
            "default_search": "ytsearch",
            "quiet": True,
            "skip_download": True,
            "extract_flat": True,
        }
        if limit:
            obs["playlistend"] = self.channel_size
        chan = youtube_dl.YoutubeDL(obs).extract_info(url, download=False)
        last_videos = [(i["id"], i["title"]) for i in chan["entries"]]
        return last_videos
    def find_missing(self):
        """add missing videos from subscribed channels to pending"""
        all_channels = self.get_channels()
        pending_handler = PendingList()
        all_pending, all_ignore = pending_handler.get_all_pending()
        all_pending_ids = [i["youtube_id"] for i in all_pending]
        all_downloaded = pending_handler.get_all_downloaded()
        to_ignore = all_pending_ids + all_ignore + all_downloaded
        missing_videos = []
        counter = 1
        for channel in all_channels:
            channel_id = channel["channel_id"]
            last_videos = self.get_last_youtube_videos(channel_id)
            set_message(
                "progress:download",
                {
                    "status": "rescan",
                    "level": "info",
                    "title": "Rescanning: Looking for new videos.",
                    "message": f"Progress: {counter}/{len(all_channels)}",
                },
            )
            for video in last_videos:
                youtube_id = video[0]
@@ -352,22 +360,22 @@ class ChannelSubscription:
        return missing_videos
    def change_subscribe(self, channel_id, channel_subscribed):
        """subscribe or unsubscribe from channel and update"""
        if not isinstance(channel_subscribed, bool):
            print("invalid status, should be bool")
            return
        headers = {"Content-type": "application/json"}
        channel_handler = YoutubeChannel(channel_id)
        channel_dict = channel_handler.channel_dict
        channel_dict["channel_subscribed"] = channel_subscribed
        if channel_subscribed:
            # handle subscribe
            url = self.es_url + "/ta_channel/_doc/" + channel_id
            payload = json.dumps(channel_dict)
            print(channel_dict)
        else:
            url = self.es_url + "/ta_channel/_update/" + channel_id
            payload = json.dumps({"doc": channel_dict})
        # update channel
        request = requests.post(url, data=payload, headers=headers)
        if not request.ok:
@@ -377,27 +385,30 @@ class ChannelSubscription:
def playlist_extractor(playlist_id):
    """return youtube_ids from a playlist_id"""
    url = "https://www.youtube.com/playlist?list=" + playlist_id
    obs = {
        "default_search": "ytsearch",
        "quiet": True,
        "ignoreerrors": True,
        "skip_download": True,
        "extract_flat": True,
    }
    playlist = youtube_dl.YoutubeDL(obs).extract_info(url, download=False)
    playlist_vids = [(i["id"], i["title"]) for i in playlist["entries"]]
    return playlist_vids
class VideoDownloader:
    """handle the video download functionality"""
    def __init__(self, youtube_id_list):
        self.youtube_id_list = youtube_id_list
        self.config = AppConfig().config
    def download_list(self):
        """download the list of youtube_ids"""
        limit_count = self.config["downloads"]["limit_count"]
        if limit_count:
            self.youtube_id_list = self.youtube_id_list[:limit_count]
@@ -405,112 +416,118 @@ class VideoDownloader:
            try:
                self.dl_single_vid(youtube_id)
            except youtube_dl.utils.DownloadError:
                print("failed to download " + youtube_id)
                continue
            vid_dict = index_new_video(youtube_id)
            self.move_to_archive(vid_dict)
            self.delete_from_pending(youtube_id)
            if self.config["downloads"]["sleep_interval"]:
                sleep(self.config["downloads"]["sleep_interval"])
    @staticmethod
    def progress_hook(response):
        """process the progress_hooks from youtube_dl"""
        # title
        filename = response["filename"][12:].replace("_", " ")
        title = "Downloading: " + os.path.split(filename)[-1]
        # message
        try:
            percent = response["_percent_str"]
            size = response["_total_bytes_str"]
            speed = response["_speed_str"]
            eta = response["_eta_str"]
            message = f"{percent} of {size} at {speed} - time left: {eta}"
        except KeyError:
            message = ""
        mess_dict = {
            "status": "downloading",
            "level": "info",
            "title": title,
            "message": message,
        }
        set_message("progress:download", mess_dict)
    def dl_single_vid(self, youtube_id):
        """download single video"""
        obs = {
            "default_search": "ytsearch",
            "merge_output_format": "mp4",
            "restrictfilenames": True,
            "outtmpl": (
                self.config["application"]["cache_dir"]
                + "/download/"
                + self.config["application"]["file_template"]
            ),
            "progress_hooks": [self.progress_hook],
            "quiet": True,
            "continuedl": True,
            "retries": 3,
        }
        if self.config["downloads"]["format"]:
            obs["format"] = self.config["downloads"]["format"]
        if self.config["downloads"]["limit_speed"]:
            obs["ratelimit"] = self.config["downloads"]["limit_speed"] * 1024
        external = False
        if external:
            obs["external_downloader"] = "aria2c"
        postprocessors = []
        if self.config["downloads"]["add_metadata"]:
            postprocessors.append(
                {
                    "key": "FFmpegMetadata",
                    "add_chapters": True,
                    "add_metadata": True,
                }
            )
        obs["postprocessors"] = postprocessors
        # check if already in cache to continue from there
        cache_dir = self.config["application"]["cache_dir"]
        all_cached = os.listdir(cache_dir + "/download/")
        for file_name in all_cached:
            if youtube_id in file_name:
                obs["outtmpl"] = cache_dir + "/download/" + file_name
        with youtube_dl.YoutubeDL(obs) as ydl:
            try:
                ydl.download([youtube_id])
            except youtube_dl.utils.DownloadError:
                print("retry failed download: " + youtube_id)
                sleep(10)
                ydl.download([youtube_id])
    def move_to_archive(self, vid_dict):
        """move downloaded video from cache to archive"""
        videos = self.config["application"]["videos"]
        channel_name = vid_dict["channel"]["channel_name"]
        channel_name_clean = clean_string(channel_name)
        media_url = vid_dict["media_url"]
        youtube_id = vid_dict["youtube_id"]
        # make archive folder
        videos = self.config["application"]["videos"]
        new_folder = os.path.join(videos, channel_name_clean)
        os.makedirs(new_folder, exist_ok=True)
        # find real filename
        cache_dir = self.config["application"]["cache_dir"]
        for file_str in os.listdir(cache_dir + "/download"):
            if youtube_id in file_str:
                old_file = file_str
        old_file_path = os.path.join(cache_dir, "download", old_file)
        new_file_path = os.path.join(videos, media_url)
        # move and fix permission
        shutil.move(old_file_path, new_file_path)
        os.chown(
            new_file_path,
            self.config["application"]["HOST_UID"],
            self.config["application"]["HOST_GID"],
        )
    def delete_from_pending(self, youtube_id):
        """delete downloaded video from pending index if its there"""
        es_url = self.config["application"]["es_url"]
        url = f"{es_url}/ta_download/_doc/{youtube_id}"
        response = requests.delete(url)
        if not response.ok and not response.status_code == 404:
            print(response.text)
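
As an aside, the bulk helpers above all build the same newline-delimited JSON payload for the Elasticsearch _bulk endpoint; a minimal sketch of that payload shape, with an illustrative document id:

    import json

    bulk_list = []
    action = {"create": {"_id": "dQw4w9WgXcQ", "_index": "ta_download"}}  # illustrative id
    source = {"youtube_id": "dQw4w9WgXcQ", "status": "pending"}           # illustrative doc
    bulk_list.append(json.dumps(action))
    bulk_list.append(json.dumps(source))
    bulk_list.append("\n")  # trailing newline required by the _bulk API
    query_str = "\n".join(bulk_list)
    print(query_str)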

View File

@@ -13,53 +13,53 @@ import unicodedata
import redis
import requests
REDIS_HOST = os.environ.get("REDIS_HOST")
def get_total_hits(index, es_url, match_field):
    """get total hits from index"""
    headers = {"Content-type": "application/json"}
    data = {"query": {"match": {match_field: True}}}
    payload = json.dumps(data)
    url = f"{es_url}/{index}/_search?filter_path=hits.total"
    request = requests.post(url, data=payload, headers=headers)
    if not request.ok:
        print(request.text)
    total_json = json.loads(request.text)
    total_hits = total_json["hits"]["total"]["value"]
    return total_hits
def clean_string(file_name):
    """clean string to only asci characters"""
    whitelist = "-_.() " + string.ascii_letters + string.digits
    normalized = unicodedata.normalize("NFKD", file_name)
    ascii_only = normalized.encode("ASCII", "ignore").decode().strip()
    white_listed = "".join(c for c in ascii_only if c in whitelist)
    cleaned = re.sub(r"[ ]{2,}", " ", white_listed)
    return cleaned
def process_url_list(url_str):
    """parse url_list to find valid youtube video or channel ids"""
    to_replace = ["watch?v=", "playlist?list="]
    url_list = re.split("\n+", url_str[0])
    youtube_ids = []
    for url in url_list:
        url_clean = url.strip().strip("/").split("/")[-1]
        for i in to_replace:
            url_clean = url_clean.replace(i, "")
        url_no_param = url_clean.split("&")[0]
        str_len = len(url_no_param)
        if str_len == 11:
            link_type = "video"
        elif str_len == 24:
            link_type = "channel"
        elif str_len == 34:
            link_type = "playlist"
        else:
            # unable to parse
            raise ValueError("not a valid url: " + url)
        youtube_ids.append({"url": url_no_param, "type": link_type})
@@ -67,19 +67,17 @@ def process_url_list(url_str):
def set_message(key, message, expire=True):
    """write new message to redis"""
    redis_connection = redis.Redis(host=REDIS_HOST)
    redis_connection.execute_command("JSON.SET", key, ".", json.dumps(message))
    if expire:
        redis_connection.execute_command("EXPIRE", key, 20)
def get_message(key):
    """get any message from JSON key"""
    redis_connection = redis.Redis(host=REDIS_HOST)
    reply = redis_connection.execute_command("JSON.GET", key)
    if reply:
        json_str = json.loads(reply)
    else:
@@ -88,9 +86,9 @@ def get_message(key):
def get_dl_message(cache_dir):
    """get latest message if available"""
    redis_connection = redis.Redis(host=REDIS_HOST)
    reply = redis_connection.execute_command("JSON.GET", "progress:download")
    if reply:
        json_str = json.loads(reply)
    elif json_str := monitor_cache_dir(cache_dir):
@@ -101,7 +99,7 @@ def get_dl_message(cache_dir):
def get_lock(lock_key):
    """handle lock for task management"""
    redis_lock = redis.Redis(host=REDIS_HOST).lock(lock_key)
    return redis_lock
@@ -110,15 +108,15 @@ def monitor_cache_dir(cache_dir):
    """
    look at download cache dir directly as alternative progress info
    """
    dl_cache = os.path.join(cache_dir, "download")
    cache_file = os.listdir(dl_cache)
    if cache_file:
        filename = cache_file[0][12:].replace("_", " ").split(".")[0]
        mess_dict = {
            "status": "downloading",
            "level": "info",
            "title": "Downloading: " + filename,
            "message": "",
        }
    else:
        return False
@@ -133,27 +131,37 @@ class DurationConverter:
    @staticmethod
    def get_sec(file_path):
        """read duration from file"""
        duration = subprocess.run(
            [
                "ffprobe",
                "-v",
                "error",
                "-show_entries",
                "format=duration",
                "-of",
                "default=noprint_wrappers=1:nokey=1",
                file_path,
            ],
            capture_output=True,
            check=True,
        )
        duration_sec = int(float(duration.stdout.decode().strip()))
        return duration_sec
    @staticmethod
    def get_str(duration_sec):
        """takes duration in sec and returns clean string"""
        hours = duration_sec // 3600
        minutes = (duration_sec - (hours * 3600)) // 60
        secs = duration_sec - (hours * 3600) - (minutes * 60)
        duration_str = str()
        if hours:
            duration_str = str(hours).zfill(2) + ":"
        if minutes:
            duration_str = duration_str + str(minutes).zfill(2) + ":"
        else:
            duration_str = duration_str + "00:"
        duration_str = duration_str + str(secs).zfill(2)
        return duration_str
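
For context, a quick hypothetical spot-check of the DurationConverter.get_str helper reformatted above:

    from home.src.helper import DurationConverter

    print(DurationConverter.get_str(59))    # 00:59
    print(DurationConverter.get_str(3725))  # 01:02:05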

View File

@@ -19,11 +19,11 @@ from home.src.helper import DurationConverter, clean_string
class YoutubeChannel:
    """represents a single youtube channel"""
    CONFIG = AppConfig().config
    ES_URL = CONFIG["application"]["es_url"]
    CACHE_DIR = CONFIG["application"]["cache_dir"]
    def __init__(self, channel_id):
        self.channel_id = channel_id
@@ -32,193 +32,187 @@ class YoutubeChannel:
        self.channel_dict = self.build_channel_dict()
    def build_channel_dict(self, scrape=False):
        """combine the dicts build from extracted json payload"""
        if scrape:
            channel_dict = False
        else:
            channel_dict = self.get_es_channel()
        if not channel_dict:
            print("scrape data from youtube")
            self.scrape_channel()
            channel_dict = self.parse_channel_main()
            channel_dict.update(self.parse_channel_meta())
            self.source = "scraped"
        return channel_dict
    def get_es_channel(self):
        """get from elastic search first if possible"""
        channel_id = self.channel_id
        url = f"{self.ES_URL}/ta_channel/_doc/{channel_id}"
        response = requests.get(url)
        if response.ok:
            channel_source = response.json()["_source"]
            self.source = "elastic"
            return channel_source
        return False
    def scrape_channel(self):
        """scrape channel page for additional infos"""
        channel_id = self.channel_id
        url = f"https://www.youtube.com/channel/{channel_id}/about?hl=en"
        cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"}
        response = requests.get(url, cookies=cookies)
        if response.ok:
            channel_page = response.text
        else:
            print(f"failed to extract channel info for: {channel_id}")
            raise ConnectionError
        soup = BeautifulSoup(channel_page, "html.parser")
        # load script into json
        all_scripts = soup.find("body").find_all("script")
        for script in all_scripts:
            if "var ytInitialData = " in str(script):
                script_content = str(script)
                break
        # extract payload
        script_content = script_content.split("var ytInitialData = ")[1]
        json_raw = script_content.rstrip(";</script>")
        json_data = json.loads(json_raw)
        # add to self
        self.json_data = json_data
    def parse_channel_main(self):
        """extract maintab values from scraped channel json data"""
        main_tab = self.json_data["header"]["c4TabbedHeaderRenderer"]
        channel_name = main_tab["title"]
        last_refresh = int(datetime.now().strftime("%s"))
        # channel_subs
        try:
            sub_text_simple = main_tab["subscriberCountText"]["simpleText"]
            sub_text = sub_text_simple.split(" ")[0]
            if sub_text[-1] == "K":
                channel_subs = int(float(sub_text.replace("K", "")) * 1000)
            elif sub_text[-1] == "M":
                channel_subs = int(float(sub_text.replace("M", "")) * 1000000)
            elif int(sub_text) >= 0:
                channel_subs = int(sub_text)
            else:
                message = f"{sub_text} not dealt with"
                print(message)
        except KeyError:
            channel_subs = 0
        # banner
        try:
            all_banners = main_tab["banner"]["thumbnails"]
            banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"]
        except KeyError:
            banner = False
        # build and return dict
        main_channel_dict = {
            "channel_active": True,
            "channel_last_refresh": last_refresh,
            "channel_subs": channel_subs,
            "channel_banner_url": banner,
            "channel_name": channel_name,
            "channel_id": self.channel_id,
        }
        return main_channel_dict
    def parse_channel_meta(self):
        """extract meta tab values from channel payload"""
        # meta tab
        json_data = self.json_data
        meta_tab = json_data["metadata"]["channelMetadataRenderer"]
        description = meta_tab["description"]
        all_thumbs = meta_tab["avatar"]["thumbnails"]
        thumb_url = sorted(all_thumbs, key=lambda k: k["width"])[-1]["url"]
        # stats tab
        renderer = "twoColumnBrowseResultsRenderer"
        all_tabs = json_data["contents"][renderer]["tabs"]
        for tab in all_tabs:
            if "tabRenderer" in tab.keys():
                if tab["tabRenderer"]["title"] == "About":
                    about_tab = tab["tabRenderer"]["content"][
                        "sectionListRenderer"
                    ]["contents"][0]["itemSectionRenderer"]["contents"][0][
                        "channelAboutFullMetadataRenderer"
                    ]
                    break
        try:
            channel_views_text = about_tab["viewCountText"]["simpleText"]
            channel_views = int(re.sub(r"\D", "", channel_views_text))
        except KeyError:
            channel_views = 0
        meta_channel_dict = {
            "channel_description": description,
            "channel_thumb_url": thumb_url,
            "channel_views": channel_views,
        }
        return meta_channel_dict
    def upload_to_es(self):
        """upload channel data to elastic search"""
        url = f"{self.ES_URL}/ta_channel/_doc/{self.channel_id}"
        response = requests.put(url, json=self.channel_dict)
        print(f"added {self.channel_id} to es")
        if not response.ok:
            print(response.text)
    def clear_cache(self):
        """delete banner and thumb from cache if available"""
        channel_cache = os.path.join(self.CACHE_DIR, "channels")
        thumb = os.path.join(channel_cache, self.channel_id + "_thumb.jpg")
        banner = os.path.join(channel_cache, self.channel_id + "_banner.jpg")
        if os.path.exists(thumb):
            os.remove(thumb)
        if os.path.exists(banner):
            os.remove(banner)
def sync_to_videos(self): def sync_to_videos(self):
""" sync new channel_dict to all videos of channel """ """sync new channel_dict to all videos of channel"""
headers = {'Content-type': 'application/json'} headers = {"Content-type": "application/json"}
channel_id = self.channel_id channel_id = self.channel_id
# add ingest pipeline # add ingest pipeline
processors = [] processors = []
for field, value in self.channel_dict.items(): for field, value in self.channel_dict.items():
line = {"set": {"field": "channel." + field, "value": value}} line = {"set": {"field": "channel." + field, "value": value}}
processors.append(line) processors.append(line)
data = { data = {"description": channel_id, "processors": processors}
"description": channel_id,
"processors": processors
}
payload = json.dumps(data) payload = json.dumps(data)
url = self.ES_URL + '/_ingest/pipeline/' + channel_id url = self.ES_URL + "/_ingest/pipeline/" + channel_id
request = requests.put(url, data=payload, headers=headers) request = requests.put(url, data=payload, headers=headers)
if not request.ok: if not request.ok:
print(request.text) print(request.text)
# apply pipeline # apply pipeline
data = { data = {"query": {"match": {"channel.channel_id": channel_id}}}
"query": {"match": {"channel.channel_id": channel_id}}
}
payload = json.dumps(data) payload = json.dumps(data)
url = self.ES_URL + '/ta_video/_update_by_query?pipeline=' + channel_id url = self.ES_URL + "/ta_video/_update_by_query?pipeline=" + channel_id
request = requests.post(url, data=payload, headers=headers) request = requests.post(url, data=payload, headers=headers)
if not request.ok: if not request.ok:
print(request.text) print(request.text)
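    # Illustrative only, not part of the commit: for a channel_dict such as
    # {"channel_name": "Example", "channel_subs": 1000}, the pipeline body
    # PUT to /_ingest/pipeline/<channel_id> would look roughly like
    #   {
    #       "description": "<channel_id>",
    #       "processors": [
    #           {"set": {"field": "channel.channel_name", "value": "Example"}},
    #           {"set": {"field": "channel.channel_subs", "value": 1000}},
    #       ],
    #   }
    # which the _update_by_query?pipeline=<channel_id> call then applies to
    # every video document matching the channel_id.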
    def get_total_hits(self):
        """get total channels indexed"""
        headers = {"Content-type": "application/json"}
        data = {"query": {"match_all": {}}}
        payload = json.dumps(data)
        url = f"{self.ES_URL}/ta_channel/_search?filter_path=hits.total"
        request = requests.post(url, data=payload, headers=headers)
        if not request.ok:
            print(request.text)
        total_hits = json.loads(request.text)["hits"]["total"]["value"]
        return total_hits


class YoutubeVideo:
    """represents a single youtube video"""

    CONFIG = AppConfig().config
    ES_URL = CONFIG["application"]["es_url"]
    CACHE_DIR = CONFIG["application"]["cache_dir"]
    VIDEOS = CONFIG["application"]["videos"]

    def __init__(self, youtube_id):
        self.youtube_id = youtube_id
@@ -226,8 +220,8 @@ class YoutubeVideo:
        self.vid_dict = self.get_wrapper()

    def get_wrapper(self):
        """wrapper to loop around youtube_dl to retry on failure"""
        print(f"get video data for {self.youtube_id}")
        for i in range(3):
            try:
                vid_dict = self.get_youtubedl_vid_data()
@@ -241,63 +235,63 @@ class YoutubeVideo:
        return vid_dict

    def get_youtubedl_vid_data(self):
        """parse youtubedl extract info"""
        youtube_id = self.youtube_id
        obs = {
            "quiet": True,
            "default_search": "ytsearch",
            "skip_download": True,
        }
        try:
            vid = youtube_dl.YoutubeDL(obs).extract_info(youtube_id)
        except (
            youtube_dl.utils.ExtractorError,
            youtube_dl.utils.DownloadError,
        ):
            print("failed to get info for " + youtube_id)
            return False
        # extract
        self.channel_id = vid["channel_id"]
        upload_date = vid["upload_date"]
        upload_date_time = datetime.strptime(upload_date, "%Y%m%d")
        published = upload_date_time.strftime("%Y-%m-%d")
        last_refresh = int(datetime.now().strftime("%s"))
        # likes
        try:
            like_count = vid["like_count"]
        except KeyError:
            like_count = 0
        try:
            dislike_count = vid["dislike_count"]
        except KeyError:
            dislike_count = 0
        # build dicts
        stats = {
            "view_count": vid["view_count"],
            "like_count": like_count,
            "dislike_count": dislike_count,
            "average_rating": vid["average_rating"],
        }
        vid_basic = {
            "title": vid["title"],
            "description": vid["description"],
            "category": vid["categories"],
            "vid_thumb_url": vid["thumbnail"],
            "tags": vid["tags"],
            "published": published,
            "stats": stats,
            "vid_last_refresh": last_refresh,
            "date_downloaded": last_refresh,
            "youtube_id": youtube_id,
            "active": True,
            "channel": False,
        }
        return vid_basic
    def add_player(self, missing_vid):
        """add player information for new videos"""
        cache_path = self.CACHE_DIR + "/download/"
        videos = self.VIDEOS

        if missing_vid:
@@ -318,24 +312,24 @@ class YoutubeVideo:
        player = {
            "watched": False,
            "duration": duration,
            "duration_str": duration_str,
        }
        self.vid_dict["player"] = player

    def build_file_path(self, channel_name):
        """build media_url from where file will be located"""
        clean_channel_name = clean_string(channel_name)
        timestamp = self.vid_dict["published"].replace("-", "")
        youtube_id = self.vid_dict["youtube_id"]
        title = self.vid_dict["title"]
        clean_title = clean_string(title)
        filename = f"{timestamp}_{youtube_id}_{clean_title}.mp4"
        media_url = os.path.join(clean_channel_name, filename)
        self.vid_dict["media_url"] = media_url

    def get_es_data(self):
        """get current data from elastic search"""
        url = self.ES_URL + "/ta_video/_doc/" + self.youtube_id
        response = requests.get(url)
        if not response.ok:
            print(response.text)
@@ -343,48 +337,48 @@ class YoutubeVideo:
        return es_vid_dict
    def upload_to_es(self):
        """upload video data to elastic search"""
        url = f"{self.ES_URL}/ta_video/_doc/{self.youtube_id}"
        response = requests.put(url, json=self.vid_dict)
        if not response.ok:
            print(response.text)
    def delete_cache(self):
        """delete thumbnail from cache if exist"""
        video_cache = os.path.join(self.CACHE_DIR, "videos")
        thumb = os.path.join(video_cache, self.youtube_id + ".jpg")
        if os.path.exists(thumb):
            os.remove(thumb)

    def deactivate(self):
        """deactivate document on extractor error"""
        youtube_id = self.youtube_id
        headers = {"Content-type": "application/json"}
        url = f"{self.ES_URL}/ta_video/_update/{youtube_id}"
        data = {"script": "ctx._source.active = false"}
        json_str = json.dumps(data)
        response = requests.post(url, data=json_str, headers=headers)
        print(f"deactivated {youtube_id}")
        if not response.ok:
            print(response.text)


def index_new_video(youtube_id, missing_vid=False):
    """combine video and channel classes for new video index"""
    vid_handler = YoutubeVideo(youtube_id)
    if not vid_handler.vid_dict:
        raise ValueError("failed to get metadata for " + youtube_id)
    channel_handler = YoutubeChannel(vid_handler.channel_id)
    # add filepath to vid_dict
    channel_name = channel_handler.channel_dict["channel_name"]
    vid_handler.build_file_path(channel_name)
    # add channel and player to video
    vid_handler.add_player(missing_vid)
    vid_handler.vid_dict["channel"] = channel_handler.channel_dict
    # add new channel to es
    if channel_handler.source == "scraped":
        channel_handler.channel_dict["channel_subscribed"] = False
        channel_handler.upload_to_es()
    # upload video to es
    vid_handler.upload_to_es()
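# A minimal usage sketch, not part of the commit: after a download finishes,
# the new id is handed to index_new_video, which scrapes video and channel
# metadata and writes both documents to elastic search. The id below is a
# placeholder, not a real video.
#
#   from home.src.index import index_new_video
#
#   index_new_video("xxxxxxxxxxx", missing_vid=False)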

View File

@@ -17,8 +17,8 @@ from home.src.config import AppConfig
# expected mapping and settings
INDEX_CONFIG = [
    {
        "index_name": "channel",
        "expected_map": {
            "channel_id": {
                "type": "keyword",
            },
@@ -28,53 +28,34 @@ INDEX_CONFIG = [
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256,
                        "normalizer": "to_lower",
                    },
                    "search_as_you_type": {
                        "type": "search_as_you_type",
                        "doc_values": False,
                        "max_shingle_size": 3,
                    },
                },
            },
            "channel_banner_url": {"type": "keyword", "index": False},
            "channel_thumb_url": {"type": "keyword", "index": False},
            "channel_description": {"type": "text"},
            "channel_last_refresh": {"type": "date", "format": "epoch_second"},
        },
        "expected_set": {
            "analysis": {
                "normalizer": {
                    "to_lower": {"type": "custom", "filter": ["lowercase"]}
                }
            },
            "number_of_replicas": "0",
        },
    },
    {
        "index_name": "video",
        "expected_map": {
            "vid_thumb_url": {"type": "text", "index": False},
            "date_downloaded": {"type": "date"},
            "channel": {
                "properties": {
                    "channel_id": {
@@ -86,127 +67,92 @@ INDEX_CONFIG = [
                            "keyword": {
                                "type": "keyword",
                                "ignore_above": 256,
                                "normalizer": "to_lower",
                            },
                            "search_as_you_type": {
                                "type": "search_as_you_type",
                                "doc_values": False,
                                "max_shingle_size": 3,
                            },
                        },
                    },
                    "channel_banner_url": {"type": "keyword", "index": False},
                    "channel_thumb_url": {"type": "keyword", "index": False},
                    "channel_description": {"type": "text"},
                    "channel_last_refresh": {
                        "type": "date",
                        "format": "epoch_second",
                    },
                }
            },
            "description": {"type": "text"},
            "media_url": {"type": "keyword", "index": False},
            "title": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256,
                        "normalizer": "to_lower",
                    },
                    "search_as_you_type": {
                        "type": "search_as_you_type",
                        "doc_values": False,
                        "max_shingle_size": 3,
                    },
                },
            },
            "vid_last_refresh": {"type": "date"},
            "youtube_id": {"type": "keyword"},
            "published": {"type": "date"},
        },
        "expected_set": {
            "analysis": {
                "normalizer": {
                    "to_lower": {"type": "custom", "filter": ["lowercase"]}
                }
            },
            "number_of_replicas": "0",
        },
    },
    {
        "index_name": "download",
        "expected_map": {
            "timestamp": {"type": "date"},
            "channel_id": {"type": "keyword"},
            "channel_name": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256,
                        "normalizer": "to_lower",
                    }
                },
            },
            "status": {"type": "keyword"},
            "title": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256,
                        "normalizer": "to_lower",
                    }
                },
            },
            "vid_thumb_url": {"type": "keyword"},
            "youtube_id": {"type": "keyword"},
        },
        "expected_set": {
            "analysis": {
                "normalizer": {
                    "to_lower": {"type": "custom", "filter": ["lowercase"]}
                }
            },
            "number_of_replicas": "0",
        },
    },
]
@@ -216,8 +162,8 @@ class ElasticIndex:
    """

    CONFIG = AppConfig().config
    ES_URL = CONFIG["application"]["es_url"]
    HEADERS = {"Content-type": "application/json"}

    def __init__(self, index_name, expected_map, expected_set):
        self.index_name = index_name
@@ -226,14 +172,14 @@ class ElasticIndex:
        self.exists, self.details = self.index_exists()

    def index_exists(self):
        """check if index already exists and return mapping if it does"""
        index_name = self.index_name
        url = f"{self.ES_URL}/ta_{index_name}"
        response = requests.get(url)
        exists = response.ok
        if exists:
            details = response.json()[f"ta_{index_name}"]
        else:
            details = False
@@ -258,19 +204,19 @@ class ElasticIndex:
        return False

    def validate_mappings(self):
        """check if all mappings are as expected"""
        expected_map = self.expected_map
        now_map = self.details["mappings"]["properties"]
        for key, value in expected_map.items():
            # nested
            if list(value.keys()) == ["properties"]:
                for key_n, value_n in value["properties"].items():
                    if key_n not in now_map[key]["properties"].keys():
                        print(key_n, value_n)
                        return True
                    if not value_n == now_map[key]["properties"][key_n]:
                        print(key_n, value_n)
                        return True
@@ -287,9 +233,9 @@ class ElasticIndex:
        return False

    def validate_settings(self):
        """check if all settings are as expected"""
        now_set = self.details["settings"]["index"]
        for key, value in self.expected_set.items():
            if key not in now_set.keys():
@@ -303,53 +249,46 @@ class ElasticIndex:
        return False

    def rebuild_index(self):
        """rebuild with new mapping"""
        # backup
        self.reindex("backup")
        # delete original
        self.delete_index(backup=False)
        # create new
        self.create_blank()
        self.reindex("restore")
        # delete backup
        self.delete_index()

    def reindex(self, method):
        """create on elastic search"""
        index_name = self.index_name
        if method == "backup":
            source = f"ta_{index_name}"
            destination = f"ta_{index_name}_backup"
        elif method == "restore":
            source = f"ta_{index_name}_backup"
            destination = f"ta_{index_name}"

        query = {"source": {"index": source}, "dest": {"index": destination}}
        data = json.dumps(query)
        url = self.ES_URL + "/_reindex?refresh=true"
        response = requests.post(url=url, data=data, headers=self.HEADERS)
        if not response.ok:
            print(response.text)

    def delete_index(self, backup=True):
        """delete index passed as argument"""
        if backup:
            url = f"{self.ES_URL}/ta_{self.index_name}_backup"
        else:
            url = f"{self.ES_URL}/ta_{self.index_name}"
        response = requests.delete(url)
        if not response.ok:
            print(response.text)

    def create_blank(self):
        """apply new mapping and settings for blank new index"""
        expected_map = self.expected_map
        expected_set = self.expected_set
        # stich payload
@@ -359,7 +298,7 @@ class ElasticIndex:
        if expected_map:
            payload.update({"mappings": {"properties": expected_map}})
        # create
        url = f"{self.ES_URL}/ta_{self.index_name}"
        data = json.dumps(payload)
        response = requests.put(url=url, data=data, headers=self.HEADERS)
        if not response.ok:
@@ -367,102 +306,103 @@ class ElasticIndex:
class ElasticBackup:
    """dump index to nd-json files for later bulk import"""

    def __init__(self, index_config):
        self.config = AppConfig().config
        self.index_config = index_config
        self.timestamp = datetime.now().strftime("%Y%m%d")
        self.backup_files = []

    def get_all_documents(self, index_name):
        """export all documents of a single index"""
        headers = {"Content-type": "application/json"}
        es_url = self.config["application"]["es_url"]
        # get PIT ID
        url = f"{es_url}/ta_{index_name}/_pit?keep_alive=1m"
        response = requests.post(url)
        json_data = json.loads(response.text)
        pit_id = json_data["id"]
        # build query
        data = {
            "query": {"match_all": {}},
            "size": 100,
            "pit": {"id": pit_id, "keep_alive": "1m"},
            "sort": [{"_id": {"order": "asc"}}],
        }
        query_str = json.dumps(data)
        url = es_url + "/_search"
        # loop until nothing left
        all_results = []
        while True:
            response = requests.get(url, data=query_str, headers=headers)
            json_data = json.loads(response.text)
            all_hits = json_data["hits"]["hits"]
            if all_hits:
                for hit in all_hits:
                    search_after = hit["sort"]
                    all_results.append(hit)
                # update search_after with last hit data
                data["search_after"] = search_after
                query_str = json.dumps(data)
            else:
                break
        # clean up PIT
        query_str = json.dumps({"id": pit_id})
        requests.delete(es_url + "/_pit", data=query_str, headers=headers)
        return all_results

    @staticmethod
    def build_bulk(all_results):
        """build bulk query data from all_results"""
        bulk_list = []
        for document in all_results:
            document_id = document["_id"]
            es_index = document["_index"]
            action = {"index": {"_index": es_index, "_id": document_id}}
            source = document["_source"]
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(source))
        # add last newline
        bulk_list.append("\n")
        file_content = "\n".join(bulk_list)
        return file_content
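    # Illustrative only, not part of the commit: the resulting nd-json pairs an
    # action line with its source document, one pair per exported document, e.g.
    #   {"index": {"_index": "ta_video", "_id": "xxxxxxxxxxx"}}
    #   {"title": "some video", "youtube_id": "xxxxxxxxxxx"}
    # ending with a trailing newline, as the elastic search _bulk API expects.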
    def write_es_json(self, file_content, index_name):
        """write nd-json file for es _bulk API to disk"""
        cache_dir = self.config["application"]["cache_dir"]
        file_name = f"es_{index_name}-{self.timestamp}.json"
        file_path = os.path.join(cache_dir, "backup", file_name)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(file_content)
        self.backup_files.append(file_path)

    def write_ta_json(self, all_results, index_name):
        """write generic json file to disk"""
        cache_dir = self.config["application"]["cache_dir"]
        file_name = f"ta_{index_name}-{self.timestamp}.json"
        file_path = os.path.join(cache_dir, "backup", file_name)
        to_write = [i["_source"] for i in all_results]
        file_content = json.dumps(to_write)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(file_content)
        self.backup_files.append(file_path)

    def zip_it(self):
        """pack it up into single zip file"""
        cache_dir = self.config["application"]["cache_dir"]
        file_name = f"ta_backup-{self.timestamp}.zip"
        backup_folder = os.path.join(cache_dir, "backup")
        backup_file = os.path.join(backup_folder, file_name)
        with zipfile.ZipFile(
            backup_file, "w", compression=zipfile.ZIP_DEFLATED
        ) as zip_f:
            for backup_file in self.backup_files:
                zip_f.write(backup_file, os.path.basename(backup_file))
@@ -472,66 +412,67 @@ class ElasticBackup:
            os.remove(backup_file)

    def post_bulk_restore(self, file_name):
        """send bulk to es"""
        cache_dir = self.config["application"]["cache_dir"]
        es_url = self.config["application"]["es_url"]
        headers = {"Content-type": "application/x-ndjson"}
        file_path = os.path.join(cache_dir, file_name)
        with open(file_path, "r", encoding="utf-8") as f:
            query_str = f.read()
        if not query_str.strip():
            return
        url = es_url + "/_bulk"
        request = requests.post(url, data=query_str, headers=headers)
        if not request.ok:
            print(request.text)

    def unpack_zip_backup(self):
        """extract backup zip and return filelist"""
        cache_dir = self.config["application"]["cache_dir"]
        backup_dir = os.path.join(cache_dir, "backup")
        all_available_backups = [
            i
            for i in os.listdir(backup_dir)
            if i.startswith("ta_") and i.endswith(".zip")
        ]
        all_available_backups.sort()
        newest_backup = all_available_backups[-1]
        file_path = os.path.join(backup_dir, newest_backup)
        with zipfile.ZipFile(file_path, "r") as z:
            zip_content = z.namelist()
            z.extractall(backup_dir)
        return zip_content

    def restore_json_files(self, zip_content):
        """go through the unpacked files and restore"""
        cache_dir = self.config["application"]["cache_dir"]
        backup_dir = os.path.join(cache_dir, "backup")
        for json_f in zip_content:
            file_name = os.path.join(backup_dir, json_f)
            if not json_f.startswith("es_") or not json_f.endswith(".json"):
                os.remove(file_name)
                continue
            print("restoring: " + json_f)
            self.post_bulk_restore(file_name)
            os.remove(file_name)


def backup_all_indexes():
    """backup all es indexes to disk"""
    backup_handler = ElasticBackup(INDEX_CONFIG)
    for index in backup_handler.index_config:
        index_name = index["index_name"]
        all_results = backup_handler.get_all_documents(index_name)
        file_content = backup_handler.build_bulk(all_results)
        backup_handler.write_es_json(file_content, index_name)
@@ -541,7 +482,7 @@ def backup_all_indexes():


def restore_from_backup():
    """restore indexes from backup file"""
    # delete
    index_check(force_restore=True)
    # recreate
@@ -551,14 +492,14 @@ def restore_from_backup():


def index_check(force_restore=False):
    """check if all indexes are created and have correct mapping"""
    backed_up = False
    for index in INDEX_CONFIG:
        index_name = index["index_name"]
        expected_map = index["expected_map"]
        expected_set = index["expected_set"]
        handler = ElasticIndex(index_name, expected_map, expected_set)
        # force restore
        if force_restore:
@@ -568,7 +509,7 @@ def index_check(force_restore=False):
        # create new
        if not handler.exists:
            print(f"create new blank index with name ta_{index_name}...")
            handler.create_blank()
            continue
@@ -577,13 +518,13 @@ def index_check(force_restore=False):
        if rebuild:
            # make backup before rebuild
            if not backed_up:
                print("running backup first")
                backup_all_indexes()
                backed_up = True
            print(f"applying new mappings to index ta_{index_name}...")
            handler.rebuild_index()
            continue
        # else all good
        print(f"ta_{index_name} index is created and up to date...")

View File

@@ -17,19 +17,23 @@ from time import sleep
import requests
from home.src.config import AppConfig
from home.src.download import ChannelSubscription, PendingList, VideoDownloader
from home.src.helper import (
    clean_string,
    get_message,
    get_total_hits,
    set_message,
)
from home.src.index import YoutubeChannel, YoutubeVideo, index_new_video


class Reindex:
    """check for outdated documents and refresh data from youtube"""

    def __init__(self):
        # config
        config = AppConfig().config
        self.sleep_interval = config["downloads"]["sleep_interval"]
        self.es_url = config["application"]["es_url"]
        self.refresh_interval = 90
        # scan
        self.video_daily, self.channel_daily = self.get_daily()
@@ -37,20 +41,18 @@ class Reindex:
        self.all_channel_ids = False

    def get_daily(self):
        """get daily refresh values"""
        total_videos = get_total_hits("ta_video", self.es_url, "active")
        video_daily = ceil(total_videos / self.refresh_interval * 1.2)
        total_channels = get_total_hits(
            "ta_channel", self.es_url, "channel_active"
        )
        channel_daily = ceil(total_channels / self.refresh_interval * 1.2)
        return (video_daily, channel_daily)
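    # Worked example with assumed numbers, not from the commit: with 1000
    # active videos and the 90 day refresh_interval, ceil(1000 / 90 * 1.2)
    # comes out to 14 videos per daily run; the 1.2 factor adds headroom so
    # the refresh queue keeps up with a growing index.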
    def get_outdated_vids(self):
        """get daily videos to refresh"""
        headers = {"Content-type": "application/json"}
        now = int(datetime.now().strftime("%s"))
        now_3m = now - 3 * 30 * 24 * 60 * 60
        size = self.video_daily
@@ -60,24 +62,25 @@ class Reindex:
                "bool": {
                    "must": [
                        {"match": {"active": True}},
                        {"range": {"vid_last_refresh": {"lte": now_3m}}},
                    ]
                }
            },
            "sort": [{"vid_last_refresh": {"order": "asc"}}],
            "_source": False,
        }
        query_str = json.dumps(data)
        url = self.es_url + "/ta_video/_search"
        response = requests.get(url, data=query_str, headers=headers)
        if not response.ok:
            print(response.text)
        response_dict = json.loads(response.text)
        all_youtube_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
        return all_youtube_ids

    def get_outdated_channels(self):
        """get daily channels to refresh"""
        headers = {"Content-type": "application/json"}
        now = int(datetime.now().strftime("%s"))
        now_3m = now - 3 * 30 * 24 * 60 * 60
        size = self.channel_daily
@@ -87,52 +90,50 @@ class Reindex:
                "bool": {
                    "must": [
                        {"match": {"channel_active": True}},
                        {"range": {"channel_last_refresh": {"lte": now_3m}}},
                    ]
                }
            },
            "sort": [{"channel_last_refresh": {"order": "asc"}}],
            "_source": False,
        }
        query_str = json.dumps(data)
        url = self.es_url + "/ta_channel/_search"
        response = requests.get(url, data=query_str, headers=headers)
        if not response.ok:
            print(response.text)
        response_dict = json.loads(response.text)
        all_channel_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
        return all_channel_ids

    def check_outdated(self):
        """add missing vids and channels"""
        self.all_youtube_ids = self.get_outdated_vids()
        self.all_channel_ids = self.get_outdated_channels()

    def rescrape_all_channels(self):
        """sync new data from channel to all matching videos"""
        sleep_interval = self.sleep_interval
        channel_sub_handler = ChannelSubscription()
        all_channels = channel_sub_handler.get_channels(subscribed_only=False)
        all_channel_ids = [i["channel_id"] for i in all_channels]
        counter = 1
        for channel_id in all_channel_ids:
            message = f"Progress: {counter}/{len(all_channels)}"
            mess_dict = {
                "status": "scraping",
                "level": "info",
                "title": "Scraping all youtube channels",
                "message": message,
            }
            set_message("progress:download", mess_dict)
            channel_index = YoutubeChannel(channel_id)
            subscribed = channel_index.channel_dict["channel_subscribed"]
            channel_index.channel_dict = channel_index.build_channel_dict(
                scrape=True
            )
            channel_index.channel_dict["channel_subscribed"] = subscribed
            channel_index.upload_to_es()
            channel_index.sync_to_videos()
            counter = counter + 1
@@ -141,7 +142,7 @@ class Reindex:

    @staticmethod
    def reindex_single_video(youtube_id):
        """refresh data for single video"""
        vid_handler = YoutubeVideo(youtube_id)
        if not vid_handler.vid_dict:
            # stop if deactivated
@@ -149,42 +150,42 @@ class Reindex:
            return
        es_vid_dict = vid_handler.get_es_data()
        player = es_vid_dict["_source"]["player"]
        date_downloaded = es_vid_dict["_source"]["date_downloaded"]
        channel_dict = es_vid_dict["_source"]["channel"]
        channel_name = channel_dict["channel_name"]
        vid_handler.build_file_path(channel_name)
        # add to vid_dict
        vid_handler.vid_dict["player"] = player
        vid_handler.vid_dict["date_downloaded"] = date_downloaded
        vid_handler.vid_dict["channel"] = channel_dict
        # update
        vid_handler.upload_to_es()
        vid_handler.delete_cache()

    @staticmethod
    def reindex_single_channel(channel_id):
        """refresh channel data and sync to videos"""
        channel_handler = YoutubeChannel(channel_id)
        subscribed = channel_handler.channel_dict["channel_subscribed"]
        channel_handler.channel_dict = channel_handler.build_channel_dict(
            scrape=True
        )
        channel_handler.channel_dict["channel_subscribed"] = subscribed
        channel_handler.upload_to_es()
        channel_handler.sync_to_videos()
        channel_handler.clear_cache()

    def reindex(self):
        """reindex what's needed"""
        # videos
        print(f"reindexing {len(self.all_youtube_ids)} videos")
        for youtube_id in self.all_youtube_ids:
            self.reindex_single_video(youtube_id)
            if self.sleep_interval:
                sleep(self.sleep_interval)
        # channels
        print(f"reindexing {len(self.all_channel_ids)} channels")
        for channel_id in self.all_channel_ids:
            self.reindex_single_channel(channel_id)
            if self.sleep_interval:
@@ -192,11 +193,11 @@ class Reindex:


class FilesystemScanner:
    """handle scanning and fixing from filesystem"""

    CONFIG = AppConfig().config
    ES_URL = CONFIG["application"]["es_url"]
    VIDEOS = CONFIG["application"]["videos"]

    def __init__(self):
        self.all_downloaded = self.get_all_downloaded()
@@ -207,7 +208,7 @@ class FilesystemScanner:
        self.to_delete = None

    def get_all_downloaded(self):
        """get a list of all video files downloaded"""
        all_channels = os.listdir(self.VIDEOS)
        all_channels.sort()
        all_downloaded = []
@@ -221,26 +222,26 @@ class FilesystemScanner:

    @staticmethod
    def get_all_indexed():
        """get a list of all indexed videos"""
        index_handler = PendingList()
        all_indexed_raw = index_handler.get_all_indexed()
        all_indexed = []
        for video in all_indexed_raw:
            youtube_id = video["_id"]
            media_url = video["_source"]["media_url"]
            published = video["_source"]["published"]
            title = video["_source"]["title"]
            all_indexed.append((youtube_id, media_url, published, title))
        return all_indexed

    def list_comarison(self):
        """compare the lists to figure out what to do"""
        self.find_unindexed()
        self.find_missing()
        self.find_bad_media_url()

    def find_unindexed(self):
        """find video files without a matching document indexed"""
        all_indexed_ids = [i[0] for i in self.all_indexed]
        to_index = []
        for downloaded in self.all_downloaded:
@@ -250,7 +251,7 @@ class FilesystemScanner:
        self.to_index = to_index

    def find_missing(self):
        """find indexed videos without matching media file"""
        all_downloaded_ids = [i[2] for i in self.all_downloaded]
        to_delete = []
        for video in self.all_indexed:
@@ -261,7 +262,7 @@ class FilesystemScanner:
        self.to_delete = to_delete

    def find_bad_media_url(self):
        """rename media files not matching the indexed title"""
        to_fix = []
        to_rename = []
        for downloaded in self.all_downloaded:
@@ -272,8 +273,8 @@ class FilesystemScanner:
                if indexed_id == downloaded_id:
                    # found it
                    title_c = clean_string(title)
                    pub = published.replace("-", "")
                    expected_filename = f"{pub}_{indexed_id}_{title_c}.mp4"
                    new_url = os.path.join(channel, expected_filename)
                    if expected_filename != filename:
                        # file to rename
@@ -290,7 +291,7 @@ class FilesystemScanner:
        self.to_rename = to_rename

    def rename_files(self):
        """rename media files as identified by find_bad_media_url"""
        for bad_filename in self.to_rename:
            channel, filename, expected_filename = bad_filename
            old_path = os.path.join(self.VIDEOS, channel, filename)
@@ -298,71 +299,72 @@ class FilesystemScanner:
            os.rename(old_path, new_path)
    def send_mismatch_bulk(self):
        """build bulk update"""
        bulk_list = []
        for video_mismatch in self.mismatch:
            youtube_id, media_url = video_mismatch
            action = {"update": {"_id": youtube_id, "_index": "ta_video"}}
            source = {"doc": {"media_url": media_url}}
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(source))
        # add last newline
        bulk_list.append("\n")
        query_str = "\n".join(bulk_list)
        # make the call
        headers = {"Content-type": "application/x-ndjson"}
        url = self.ES_URL + "/_bulk"
        request = requests.post(url, data=query_str, headers=headers)
        if not request.ok:
            print(request.text)

    def delete_from_index(self):
        """find indexed but deleted mediafile"""
        for indexed in self.to_delete:
            youtube_id, _ = indexed
            url = self.ES_URL + "/ta_video/_doc/" + youtube_id
            request = requests.delete(url)
            if not request.ok:
                print(request.text)


class ManualImport:
    """import and indexing existing video files"""

    CONFIG = AppConfig().config
    CACHE_DIR = CONFIG["application"]["cache_dir"]
    IMPORT_DIR = os.path.join(CACHE_DIR, "import")

    def __init__(self):
        self.identified = self.import_folder_parser()

    def import_folder_parser(self):
        """detect files in import folder"""
        to_import = os.listdir(self.IMPORT_DIR)
        to_import.sort()
        video_files = [i for i in to_import if not i.endswith(".json")]
        identified = []
        for file_path in video_files:
            file_dict = {"video_file": file_path}
            file_name, _ = os.path.splitext(file_path)
            matching_json = [
                i
                for i in to_import
                if i.startswith(file_name) and i.endswith(".json")
            ]
            if matching_json:
                json_file = matching_json[0]
                youtube_id = self.extract_id_from_json(json_file)
                file_dict.update({"json_file": json_file})
            else:
                youtube_id = self.extract_id_from_filename(file_name)
                file_dict.update({"json_file": False})
            file_dict.update({"youtube_id": youtube_id})
            identified.append(file_dict)
        return identified
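    # Illustrative folder layout with assumed file names, not from the commit:
    # a media file is matched to a metadata file that shares its basename, e.g.
    #   cache/import/My Video [xxxxxxxxxxx].mkv
    #   cache/import/My Video [xxxxxxxxxxx].info.json
    # without a matching json file, the 11 character id is parsed from the
    # [brackets] at the end of the filename instead.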
@@ -373,33 +375,33 @@ class ManualImport:
        look at the file name for the youtube id
        expects filename ending in [<youtube_id>].<ext>
        """
        id_search = re.search(r"\[([a-zA-Z0-9_-]{11})\]$", file_name)
        if id_search:
            youtube_id = id_search.group(1)
            return youtube_id

        print("failed to extract youtube id for: " + file_name)
        raise Exception

    def extract_id_from_json(self, json_file):
        """open json file and extract id"""
        json_path = os.path.join(self.CACHE_DIR, "import", json_file)
        with open(json_path, "r", encoding="utf-8") as f:
            json_content = f.read()
        youtube_id = json.loads(json_content)["id"]
        return youtube_id

    def process_import(self):
        """go through identified media files"""
        for media_file in self.identified:
            json_file = media_file["json_file"]
            video_file = media_file["video_file"]
            youtube_id = media_file["youtube_id"]
            video_path = os.path.join(self.CACHE_DIR, "import", video_file)
            self.move_to_cache(video_path, youtube_id)
@@ -411,35 +413,43 @@ class ManualImport:
            if os.path.exists(video_path):
                os.remove(video_path)
            if json_file:
                json_path = os.path.join(self.CACHE_DIR, "import", json_file)
                os.remove(json_path)

    def move_to_cache(self, video_path, youtube_id):
        """move identified video file to cache, convert to mp4"""
        file_name = os.path.split(video_path)[-1]
        video_file, ext = os.path.splitext(file_name)
        # make sure youtube_id is in filename
        if youtube_id not in video_file:
            video_file = f"{video_file}_{youtube_id}"
        # move, convert if needed
        if ext == ".mp4":
            new_file = video_file + ext
            dest_path = os.path.join(self.CACHE_DIR, "download", new_file)
            shutil.move(video_path, dest_path)
        else:
            print(f"processing with ffmpeg: {video_file}")
            new_file = video_file + ".mp4"
            dest_path = os.path.join(self.CACHE_DIR, "download", new_file)
            subprocess.run(
                [
                    "ffmpeg",
                    "-i",
                    video_path,
                    dest_path,
                    "-loglevel",
                    "warning",
                    "-stats",
                ],
                check=True,
            )


def scan_filesystem():
    """grouped function to delete and update index"""
    filesystem_handler = FilesystemScanner()
    filesystem_handler.list_comarison()
    if filesystem_handler.to_rename:
@@ -455,10 +465,10 @@ def scan_filesystem():


def reindex_old_documents():
    """daily refresh of old documents"""
    # check needed last run
    now = int(datetime.now().strftime("%s"))
    last_reindex = get_message("last_reindex")
    if isinstance(last_reindex, int) and now - last_reindex < 60 * 60 * 24:
        return
    # continue if needed
@@ -466,4 +476,4 @@ def reindex_old_documents():
    reindex_handler.check_outdated()
    reindex_handler.reindex()
    # set timestamp
    set_message("last_reindex", now, expire=False)


@ -17,10 +17,10 @@ from PIL import Image
class SearchHandler: class SearchHandler:
""" search elastic search """ """search elastic search"""
CONFIG = AppConfig().config CONFIG = AppConfig().config
CACHE_DIR = CONFIG['application']['cache_dir'] CACHE_DIR = CONFIG["application"]["cache_dir"]
def __init__(self, url, data, cache=True): def __init__(self, url, data, cache=True):
self.max_hits = None self.max_hits = None
@ -29,15 +29,15 @@ class SearchHandler:
self.cache = cache self.cache = cache
def get_data(self): def get_data(self):
""" get the data """ """get the data"""
if self.data: if self.data:
response = requests.get(self.url, json=self.data).json() response = requests.get(self.url, json=self.data).json()
else: else:
response = requests.get(self.url).json() response = requests.get(self.url).json()
if 'hits' in response.keys(): if "hits" in response.keys():
self.max_hits = response['hits']['total']['value'] self.max_hits = response["hits"]["total"]["value"]
return_value = response['hits']['hits'] return_value = response["hits"]["hits"]
else: else:
# simulate list for single result to reuse rest of class # simulate list for single result to reuse rest of class
return_value = [response] return_value = [response]
@ -50,13 +50,13 @@ class SearchHandler:
all_channels = [] all_channels = []
for idx, hit in enumerate(return_value): for idx, hit in enumerate(return_value):
return_value[idx] = self.hit_cleanup(hit) return_value[idx] = self.hit_cleanup(hit)
if hit['_index'] == 'ta_video': if hit["_index"] == "ta_video":
video_dict, channel_dict = self.vid_cache_link(hit) video_dict, channel_dict = self.vid_cache_link(hit)
if video_dict not in all_videos: if video_dict not in all_videos:
all_videos.append(video_dict) all_videos.append(video_dict)
if channel_dict not in all_channels: if channel_dict not in all_channels:
all_channels.append(channel_dict) all_channels.append(channel_dict)
elif hit['_index'] == 'ta_channel': elif hit["_index"] == "ta_channel":
channel_dict = self.channel_cache_link(hit) channel_dict = self.channel_cache_link(hit)
if channel_dict not in all_channels: if channel_dict not in all_channels:
all_channels.append(channel_dict) all_channels.append(channel_dict)
@ -69,52 +69,49 @@ class SearchHandler:
@staticmethod @staticmethod
def vid_cache_link(hit): def vid_cache_link(hit):
""" download thumbnails into cache """ """download thumbnails into cache"""
vid_thumb = hit['source']['vid_thumb_url'] vid_thumb = hit["source"]["vid_thumb_url"]
youtube_id = hit['source']['youtube_id'] youtube_id = hit["source"]["youtube_id"]
channel_id_hit = hit['source']['channel']['channel_id'] channel_id_hit = hit["source"]["channel"]["channel_id"]
chan_thumb = hit['source']['channel']['channel_thumb_url'] chan_thumb = hit["source"]["channel"]["channel_thumb_url"]
try: try:
chan_banner = hit['source']['channel']['channel_banner_url'] chan_banner = hit["source"]["channel"]["channel_banner_url"]
except KeyError: except KeyError:
chan_banner = False chan_banner = False
video_dict = { video_dict = {"youtube_id": youtube_id, "vid_thumb": vid_thumb}
'youtube_id': youtube_id,
'vid_thumb': vid_thumb
}
channel_dict = { channel_dict = {
'channel_id': channel_id_hit, "channel_id": channel_id_hit,
'chan_thumb': chan_thumb, "chan_thumb": chan_thumb,
'chan_banner': chan_banner "chan_banner": chan_banner,
} }
return video_dict, channel_dict return video_dict, channel_dict
@staticmethod @staticmethod
def channel_cache_link(hit): def channel_cache_link(hit):
""" build channel thumb links """ """build channel thumb links"""
channel_id_hit = hit['source']['channel_id'] channel_id_hit = hit["source"]["channel_id"]
chan_thumb = hit['source']['channel_thumb_url'] chan_thumb = hit["source"]["channel_thumb_url"]
try: try:
chan_banner = hit['source']['channel_banner_url'] chan_banner = hit["source"]["channel_banner_url"]
except KeyError: except KeyError:
chan_banner = False chan_banner = False
channel_dict = { channel_dict = {
'channel_id': channel_id_hit, "channel_id": channel_id_hit,
'chan_thumb': chan_thumb, "chan_thumb": chan_thumb,
'chan_banner': chan_banner "chan_banner": chan_banner,
} }
return channel_dict return channel_dict
def cache_dl_vids(self, all_videos): def cache_dl_vids(self, all_videos):
""" video thumbs links for cache """ """video thumbs links for cache"""
vid_cache = os.path.join(self.CACHE_DIR, 'videos') vid_cache = os.path.join(self.CACHE_DIR, "videos")
all_vid_cached = os.listdir(vid_cache) all_vid_cached = os.listdir(vid_cache)
# videos # videos
for video_dict in all_videos: for video_dict in all_videos:
youtube_id = video_dict['youtube_id'] youtube_id = video_dict["youtube_id"]
if not youtube_id + '.jpg' in all_vid_cached: if not youtube_id + ".jpg" in all_vid_cached:
cache_path = os.path.join(vid_cache, youtube_id + '.jpg') cache_path = os.path.join(vid_cache, youtube_id + ".jpg")
thumb_url = video_dict['vid_thumb'] thumb_url = video_dict["vid_thumb"]
img_raw = requests.get(thumb_url, stream=True).raw img_raw = requests.get(thumb_url, stream=True).raw
img = Image.open(img_raw) img = Image.open(img_raw)
width, height = img.size width, height = img.size
@ -125,62 +122,62 @@ class SearchHandler:
img.convert("RGB").save(cache_path) img.convert("RGB").save(cache_path)
def cache_dl_chan(self, all_channels): def cache_dl_chan(self, all_channels):
""" download channel thumbs """ """download channel thumbs"""
chan_cache = os.path.join(self.CACHE_DIR, 'channels') chan_cache = os.path.join(self.CACHE_DIR, "channels")
all_chan_cached = os.listdir(chan_cache) all_chan_cached = os.listdir(chan_cache)
for channel_dict in all_channels: for channel_dict in all_channels:
channel_id_cache = channel_dict['channel_id'] channel_id_cache = channel_dict["channel_id"]
channel_banner_url = channel_dict['chan_banner'] channel_banner_url = channel_dict["chan_banner"]
channel_banner = channel_id_cache + '_banner.jpg' channel_banner = channel_id_cache + "_banner.jpg"
channel_thumb_url = channel_dict['chan_thumb'] channel_thumb_url = channel_dict["chan_thumb"]
channel_thumb = channel_id_cache + '_thumb.jpg' channel_thumb = channel_id_cache + "_thumb.jpg"
# thumb # thumb
if channel_thumb_url and channel_thumb not in all_chan_cached: if channel_thumb_url and channel_thumb not in all_chan_cached:
cache_path = os.path.join(chan_cache, channel_thumb) cache_path = os.path.join(chan_cache, channel_thumb)
img_raw = requests.get(channel_thumb_url, stream=True).content img_raw = requests.get(channel_thumb_url, stream=True).content
with open(cache_path, 'wb') as f: with open(cache_path, "wb") as f:
f.write(img_raw) f.write(img_raw)
# banner # banner
if channel_banner_url and channel_banner not in all_chan_cached: if channel_banner_url and channel_banner not in all_chan_cached:
cache_path = os.path.join(chan_cache, channel_banner) cache_path = os.path.join(chan_cache, channel_banner)
img_raw = requests.get(channel_banner_url, stream=True).content img_raw = requests.get(channel_banner_url, stream=True).content
with open(cache_path, 'wb') as f: with open(cache_path, "wb") as f:
f.write(img_raw) f.write(img_raw)
@staticmethod @staticmethod
def hit_cleanup(hit): def hit_cleanup(hit):
""" clean up and parse data from a single hit """ """clean up and parse data from a single hit"""
hit['source'] = hit.pop('_source') hit["source"] = hit.pop("_source")
hit_keys = hit['source'].keys() hit_keys = hit["source"].keys()
if 'media_url' in hit_keys: if "media_url" in hit_keys:
parsed_url = urllib.parse.quote(hit['source']['media_url']) parsed_url = urllib.parse.quote(hit["source"]["media_url"])
hit['source']['media_url'] = parsed_url hit["source"]["media_url"] = parsed_url
if 'published' in hit_keys: if "published" in hit_keys:
published = hit['source']['published'] published = hit["source"]["published"]
date_pub = datetime.strptime(published, "%Y-%m-%d") date_pub = datetime.strptime(published, "%Y-%m-%d")
date_str = datetime.strftime(date_pub, "%d %b, %Y") date_str = datetime.strftime(date_pub, "%d %b, %Y")
hit['source']['published'] = date_str hit["source"]["published"] = date_str
if 'vid_last_refresh' in hit_keys: if "vid_last_refresh" in hit_keys:
vid_last_refresh = hit['source']['vid_last_refresh'] vid_last_refresh = hit["source"]["vid_last_refresh"]
date_refresh = datetime.fromtimestamp(vid_last_refresh) date_refresh = datetime.fromtimestamp(vid_last_refresh)
date_str = datetime.strftime(date_refresh, "%d %b, %Y") date_str = datetime.strftime(date_refresh, "%d %b, %Y")
hit['source']['vid_last_refresh'] = date_str hit["source"]["vid_last_refresh"] = date_str
if 'channel_last_refresh' in hit_keys: if "channel_last_refresh" in hit_keys:
refreshed = hit['source']['channel_last_refresh'] refreshed = hit["source"]["channel_last_refresh"]
date_refresh = datetime.fromtimestamp(refreshed) date_refresh = datetime.fromtimestamp(refreshed)
date_str = datetime.strftime(date_refresh, "%d %b, %Y") date_str = datetime.strftime(date_refresh, "%d %b, %Y")
hit['source']['channel_last_refresh'] = date_str hit["source"]["channel_last_refresh"] = date_str
if 'channel' in hit_keys: if "channel" in hit_keys:
channel_keys = hit['source']['channel'].keys() channel_keys = hit["source"]["channel"].keys()
if 'channel_last_refresh' in channel_keys: if "channel_last_refresh" in channel_keys:
refreshed = hit['source']['channel']['channel_last_refresh'] refreshed = hit["source"]["channel"]["channel_last_refresh"]
date_refresh = datetime.fromtimestamp(refreshed) date_refresh = datetime.fromtimestamp(refreshed)
date_str = datetime.strftime(date_refresh, "%d %b, %Y") date_str = datetime.strftime(date_refresh, "%d %b, %Y")
hit['source']['channel']['channel_last_refresh'] = date_str hit["source"]["channel"]["channel_last_refresh"] = date_str
return hit return hit
@ -192,13 +189,13 @@ class Pagination:
def __init__(self, page_get, search_get=False): def __init__(self, page_get, search_get=False):
config = AppConfig().config config = AppConfig().config
self.page_size = config['archive']['page_size'] self.page_size = config["archive"]["page_size"]
self.page_get = page_get self.page_get = page_get
self.search_get = search_get self.search_get = search_get
self.pagination = self.first_guess() self.pagination = self.first_guess()
def first_guess(self): def first_guess(self):
""" build first guess before api call """ """build first guess before api call"""
page_get = self.page_get page_get = self.page_get
if page_get in [0, 1]: if page_get in [0, 1]:
page_from = 0 page_from = 0
@ -213,22 +210,22 @@ class Pagination:
"page_size": self.page_size, "page_size": self.page_size,
"page_from": page_from, "page_from": page_from,
"prev_pages": prev_pages, "prev_pages": prev_pages,
"current_page": page_get "current_page": page_get,
} }
if self.search_get: if self.search_get:
pagination.update({"search_get": self.search_get}) pagination.update({"search_get": self.search_get})
return pagination return pagination
def validate(self, total_hits): def validate(self, total_hits):
""" validate pagination with total_hits after making api call """ """validate pagination with total_hits after making api call"""
page_get = self.page_get page_get = self.page_get
max_pages = math.ceil(total_hits / self.page_size) max_pages = math.ceil(total_hits / self.page_size)
if page_get < max_pages and max_pages > 1: if page_get < max_pages and max_pages > 1:
self.pagination['last_page'] = max_pages self.pagination["last_page"] = max_pages
else: else:
self.pagination['last_page'] = False self.pagination["last_page"] = False
next_pages = [ next_pages = [
i for i in range(page_get + 1, page_get + 6) if 1 < i < max_pages i for i in range(page_get + 1, page_get + 6) if 1 < i < max_pages
] ]
self.pagination['next_pages'] = next_pages self.pagination["next_pages"] = next_pages


@ -14,17 +14,17 @@ from home.src.index_management import backup_all_indexes, restore_from_backup
from home.src.reindex import ManualImport, reindex_old_documents from home.src.reindex import ManualImport, reindex_old_documents
CONFIG = AppConfig().config CONFIG = AppConfig().config
REDIS_HOST = CONFIG['application']['REDIS_HOST'] REDIS_HOST = CONFIG["application"]["REDIS_HOST"]
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'home.settings') os.environ.setdefault("DJANGO_SETTINGS_MODULE", "home.settings")
app = Celery('tasks', broker='redis://' + REDIS_HOST) app = Celery("tasks", broker="redis://" + REDIS_HOST)
app.config_from_object('django.conf:settings', namespace='CELERY') app.config_from_object("django.conf:settings", namespace="CELERY")
app.autodiscover_tasks() app.autodiscover_tasks()
@shared_task @shared_task
def update_subscribed(): def update_subscribed():
""" look for missing videos and add to pending """ """look for missing videos and add to pending"""
channel_handler = ChannelSubscription() channel_handler = ChannelSubscription()
missing_videos = channel_handler.find_missing() missing_videos = channel_handler.find_missing()
if missing_videos: if missing_videos:
@ -36,10 +36,10 @@ def update_subscribed():
@shared_task @shared_task
def download_pending(): def download_pending():
""" download latest pending videos """ """download latest pending videos"""
pending_handler = PendingList() pending_handler = PendingList()
pending_vids = pending_handler.get_all_pending()[0] pending_vids = pending_handler.get_all_pending()[0]
to_download = [i['youtube_id'] for i in pending_vids] to_download = [i["youtube_id"] for i in pending_vids]
to_download.reverse() to_download.reverse()
if to_download: if to_download:
download_handler = VideoDownloader(to_download) download_handler = VideoDownloader(to_download)
@ -48,14 +48,14 @@ def download_pending():
@shared_task @shared_task
def download_single(youtube_id): def download_single(youtube_id):
""" start download single video now """ """start download single video now"""
download_handler = VideoDownloader([youtube_id]) download_handler = VideoDownloader([youtube_id])
download_handler.download_list() download_handler.download_list()
@shared_task @shared_task
def extrac_dl(youtube_ids): def extrac_dl(youtube_ids):
""" parse list passed and add to pending """ """parse list passed and add to pending"""
pending_handler = PendingList() pending_handler = PendingList()
missing_videos = pending_handler.parse_url_list(youtube_ids) missing_videos = pending_handler.parse_url_list(youtube_ids)
pending_handler.add_to_pending(missing_videos) pending_handler.add_to_pending(missing_videos)
@ -63,17 +63,17 @@ def extrac_dl(youtube_ids):
@shared_task @shared_task
def check_reindex(): def check_reindex():
""" run the reindex main command """ """run the reindex main command"""
reindex_old_documents() reindex_old_documents()
@shared_task @shared_task
def run_manual_import(): def run_manual_import():
""" called from settings page, to go through import folder """ """called from settings page, to go through import folder"""
print('starting media file import') print("starting media file import")
have_lock = False have_lock = False
my_lock = get_lock('manual_import') my_lock = get_lock("manual_import")
try: try:
have_lock = my_lock.acquire(blocking=False) have_lock = my_lock.acquire(blocking=False)
@ -91,13 +91,13 @@ def run_manual_import():
@shared_task @shared_task
def run_backup(): def run_backup():
""" called from settings page, dump backup to zip file """ """called from settings page, dump backup to zip file"""
backup_all_indexes() backup_all_indexes()
print('backup finished') print("backup finished")
@shared_task @shared_task
def run_restore_backup(): def run_restore_backup():
""" called from settings page, dump backup to zip file """ """called from settings page, dump backup to zip file"""
restore_from_backup() restore_from_backup()
print('index restore finished') print("index restore finished")


@ -96,7 +96,7 @@
</div> </div>
<div class="footer"> <div class="footer">
<div class="boxed-content"> <div class="boxed-content">
<span>© 2021 The Tube Archivist | <a href="https://github.com/bbilly1/tubearchivist" target="_blank">Github</a> | <a href="https://hub.docker.com/r/bbilly1/tubearchivist" target="_blank">Docker Hub</a></span> <span>© 2021 The Tube Archivist v0.0.3 | <a href="https://github.com/bbilly1/tubearchivist" target="_blank">Github</a> | <a href="https://hub.docker.com/r/bbilly1/tubearchivist" target="_blank">Docker Hub</a></span>
</div> </div>
</div> </div>
</body> </body>


@ -1,22 +1,30 @@
""" all home app urls """ """ all home app urls """
from django.urls import path from django.urls import path
from home.views import (AboutView, ChannelIdView, ChannelView, DownloadView, from home.views import (
HomeView, SettingsView, VideoView) AboutView,
ChannelIdView,
ChannelView,
DownloadView,
HomeView,
SettingsView,
VideoView,
)
from . import views from . import views
urlpatterns = [ urlpatterns = [
path('', HomeView.as_view(), name='home'), path("", HomeView.as_view(), name="home"),
path('about/', AboutView.as_view(), name='about'), path("about/", AboutView.as_view(), name="about"),
path('downloads/', DownloadView.as_view(), name='downloads'), path("downloads/", DownloadView.as_view(), name="downloads"),
path('settings/', SettingsView.as_view(), name='settings'), path("settings/", SettingsView.as_view(), name="settings"),
path('process/', views.process, name='process'), path("process/", views.process, name="process"),
path('downloads/progress', views.progress, name='progress'), path("downloads/progress", views.progress, name="progress"),
path('channel/', ChannelView.as_view(), name='channel'), path("channel/", ChannelView.as_view(), name="channel"),
path( path(
'channel/<slug:channel_id_detail>/', "channel/<slug:channel_id_detail>/",
ChannelIdView.as_view(), name='channel_id' ChannelIdView.as_view(),
name="channel_id",
), ),
path('video/<slug:video_id>/', VideoView.as_view(), name='video') path("video/<slug:video_id>/", VideoView.as_view(), name="video"),
] ]


@ -16,36 +16,46 @@ from django.utils.http import urlencode
from django.views import View from django.views import View
from home.src.config import AppConfig from home.src.config import AppConfig
from home.src.download import ChannelSubscription, PendingList from home.src.download import ChannelSubscription, PendingList
from home.src.helper import (get_dl_message, get_message, process_url_list, from home.src.helper import (
set_message) get_dl_message,
get_message,
process_url_list,
set_message,
)
from home.src.searching import Pagination, SearchHandler from home.src.searching import Pagination, SearchHandler
from home.tasks import (download_pending, download_single, extrac_dl, from home.tasks import (
run_backup, run_manual_import, run_restore_backup, download_pending,
update_subscribed) download_single,
extrac_dl,
run_backup,
run_manual_import,
run_restore_backup,
update_subscribed,
)
class HomeView(View): class HomeView(View):
""" resolves to / """resolves to /
handle home page and video search post functionality handle home page and video search post functionality
""" """
CONFIG = AppConfig().config CONFIG = AppConfig().config
ES_URL = CONFIG['application']['es_url'] ES_URL = CONFIG["application"]["es_url"]
def get(self, request): def get(self, request):
""" return home search results """ """return home search results"""
colors, sort_order, hide_watched = self.read_config() colors, sort_order, hide_watched = self.read_config()
# handle search # handle search
search_get = request.GET.get('search', False) search_get = request.GET.get("search", False)
if search_get: if search_get:
search_encoded = urllib.parse.quote(search_get) search_encoded = urllib.parse.quote(search_get)
else: else:
search_encoded = False search_encoded = False
# define page size # define page size
page_get = int(request.GET.get('page', 0)) page_get = int(request.GET.get("page", 0))
pagination_handler = Pagination(page_get, search_encoded) pagination_handler = Pagination(page_get, search_encoded)
url = self.ES_URL + '/ta_video/_search' url = self.ES_URL + "/ta_video/_search"
data = self.build_data( data = self.build_data(
pagination_handler, sort_order, search_get, hide_watched pagination_handler, sort_order, search_get, hide_watched
@ -56,95 +66,94 @@ class HomeView(View):
max_hits = search.max_hits max_hits = search.max_hits
pagination_handler.validate(max_hits) pagination_handler.validate(max_hits)
context = { context = {
'videos': videos_hits, "videos": videos_hits,
'pagination': pagination_handler.pagination, "pagination": pagination_handler.pagination,
'sortorder': sort_order, "sortorder": sort_order,
'hide_watched': hide_watched, "hide_watched": hide_watched,
'colors': colors "colors": colors,
} }
return render(request, 'home/home.html', context) return render(request, "home/home.html", context)
@staticmethod @staticmethod
def build_data(pagination_handler, sort_order, search_get, hide_watched): def build_data(pagination_handler, sort_order, search_get, hide_watched):
""" build the data dict for the search query """ """build the data dict for the search query"""
page_size = pagination_handler.pagination['page_size'] page_size = pagination_handler.pagination["page_size"]
page_from = pagination_handler.pagination['page_from'] page_from = pagination_handler.pagination["page_from"]
data = { data = {
"size": page_size, "from": page_from, "query": {"match_all": {}}, "size": page_size,
"from": page_from,
"query": {"match_all": {}},
"sort": [ "sort": [
{"published": {"order": "desc"}}, {"published": {"order": "desc"}},
{"date_downloaded": {"order": "desc"}} {"date_downloaded": {"order": "desc"}},
] ],
} }
# define sort # define sort
if sort_order == 'downloaded': if sort_order == "downloaded":
del data['sort'][0] del data["sort"][0]
if search_get: if search_get:
del data['sort'] del data["sort"]
if hide_watched: if hide_watched:
data['query'] = {"term": {"player.watched": {"value": False}}} data["query"] = {"term": {"player.watched": {"value": False}}}
if search_get: if search_get:
query = { query = {
"multi_match": { "multi_match": {
"query": search_get, "query": search_get,
"fields": ["title", "channel.channel_name", "tags"], "fields": ["title", "channel.channel_name", "tags"],
"type": "cross_fields", "type": "cross_fields",
"operator": "and" "operator": "and",
} }
} }
data['query'] = query data["query"] = query
return data return data
@staticmethod @staticmethod
def read_config(): def read_config():
""" read needed values from redis """ """read needed values from redis"""
config_handler = AppConfig().config config_handler = AppConfig().config
colors = config_handler['application']['colors'] colors = config_handler["application"]["colors"]
sort_order = get_message('sort_order') sort_order = get_message("sort_order")
hide_watched = get_message('hide_watched') hide_watched = get_message("hide_watched")
return colors, sort_order, hide_watched return colors, sort_order, hide_watched
@staticmethod @staticmethod
def post(request): def post(request):
""" handle post from search form """ """handle post from search form"""
post_data = dict(request.POST) post_data = dict(request.POST)
search_query = post_data['videoSearch'][0] search_query = post_data["videoSearch"][0]
search_url = '/?' + urlencode({'search': search_query}) search_url = "/?" + urlencode({"search": search_query})
return redirect(search_url, permanent=True) return redirect(search_url, permanent=True)
class AboutView(View): class AboutView(View):
""" resolves to /about/ """resolves to /about/
show helpful how to information show helpful how to information
""" """
@staticmethod @staticmethod
def get(request): def get(request):
""" handle http get """ """handle http get"""
config = AppConfig().config config = AppConfig().config
colors = config['application']['colors'] colors = config["application"]["colors"]
context = { context = {"title": "About", "colors": colors}
'title': 'About', return render(request, "home/about.html", context)
'colors': colors
}
return render(request, 'home/about.html', context)
class DownloadView(View): class DownloadView(View):
""" resolves to /download/ """resolves to /download/
takes POST for downloading youtube links takes POST for downloading youtube links
""" """
def get(self, request): def get(self, request):
""" handle get requests """ """handle get requests"""
config = AppConfig().config config = AppConfig().config
colors = config['application']['colors'] colors = config["application"]["colors"]
page_get = int(request.GET.get('page', 0)) page_get = int(request.GET.get("page", 0))
pagination_handler = Pagination(page_get) pagination_handler = Pagination(page_get)
url = config['application']['es_url'] + '/ta_download/_search' url = config["application"]["es_url"] + "/ta_download/_search"
data = self.build_data(pagination_handler) data = self.build_data(pagination_handler)
search = SearchHandler(url, data, cache=False) search = SearchHandler(url, data, cache=False)
@ -152,7 +161,7 @@ class DownloadView(View):
max_hits = search.max_hits max_hits = search.max_hits
if videos_hits: if videos_hits:
all_pending = [i['source'] for i in videos_hits] all_pending = [i["source"] for i in videos_hits]
pagination_handler.validate(max_hits) pagination_handler.validate(max_hits)
pagination = pagination_handler.pagination pagination = pagination_handler.pagination
else: else:
@ -160,33 +169,34 @@ class DownloadView(View):
pagination = False pagination = False
context = { context = {
'pending': all_pending, "pending": all_pending,
'max_hits': max_hits, "max_hits": max_hits,
'pagination': pagination, "pagination": pagination,
'title': 'Downloads', "title": "Downloads",
'colors': colors "colors": colors,
} }
return render(request, 'home/downloads.html', context) return render(request, "home/downloads.html", context)
@staticmethod @staticmethod
def build_data(pagination_handler): def build_data(pagination_handler):
""" build data dict for search """ """build data dict for search"""
page_size = pagination_handler.pagination['page_size'] page_size = pagination_handler.pagination["page_size"]
page_from = pagination_handler.pagination['page_from'] page_from = pagination_handler.pagination["page_from"]
data = { data = {
"size": page_size, "from": page_from, "size": page_size,
"from": page_from,
"query": {"term": {"status": {"value": "pending"}}}, "query": {"term": {"status": {"value": "pending"}}},
"sort": [{"timestamp": {"order": "desc"}}] "sort": [{"timestamp": {"order": "desc"}}],
} }
return data return data
@staticmethod @staticmethod
def post(request): def post(request):
""" handle post requests """ """handle post requests"""
download_post = dict(request.POST) download_post = dict(request.POST)
if 'vid-url' in download_post.keys(): if "vid-url" in download_post.keys():
url_str = download_post['vid-url'] url_str = download_post["vid-url"]
print('adding to queue') print("adding to queue")
youtube_ids = process_url_list(url_str) youtube_ids = process_url_list(url_str)
if not youtube_ids: if not youtube_ids:
# failed to process # failed to process
@ -194,52 +204,52 @@ class DownloadView(View):
mess_dict = { mess_dict = {
"status": "downloading", "status": "downloading",
"level": "error", "level": "error",
"title": 'Failed to extract links.', "title": "Failed to extract links.",
"message": '' "message": "",
} }
set_message('progress:download', mess_dict) set_message("progress:download", mess_dict)
return redirect('downloads') return redirect("downloads")
print(youtube_ids) print(youtube_ids)
extrac_dl.delay(youtube_ids) extrac_dl.delay(youtube_ids)
sleep(2) sleep(2)
return redirect('downloads', permanent=True) return redirect("downloads", permanent=True)
class ChannelIdView(View): class ChannelIdView(View):
""" resolves to /channel/<channel-id>/ """resolves to /channel/<channel-id>/
display single channel page from channel_id display single channel page from channel_id
""" """
def get(self, request, channel_id_detail): def get(self, request, channel_id_detail):
""" get method """ """get method"""
es_url, colors = self.read_config() es_url, colors = self.read_config()
context = self.get_channel_videos(request, channel_id_detail, es_url) context = self.get_channel_videos(request, channel_id_detail, es_url)
context.update({'colors': colors}) context.update({"colors": colors})
return render(request, 'home/channel_id.html', context) return render(request, "home/channel_id.html", context)
@staticmethod @staticmethod
def read_config(): def read_config():
""" read config file """ """read config file"""
config = AppConfig().config config = AppConfig().config
es_url = config['application']['es_url'] es_url = config["application"]["es_url"]
colors = config['application']['colors'] colors = config["application"]["colors"]
return es_url, colors return es_url, colors
def get_channel_videos(self, request, channel_id_detail, es_url): def get_channel_videos(self, request, channel_id_detail, es_url):
""" get channel from video index """ """get channel from video index"""
page_get = int(request.GET.get('page', 0)) page_get = int(request.GET.get("page", 0))
pagination_handler = Pagination(page_get) pagination_handler = Pagination(page_get)
# get data # get data
url = es_url + '/ta_video/_search' url = es_url + "/ta_video/_search"
data = self.build_data(pagination_handler, channel_id_detail) data = self.build_data(pagination_handler, channel_id_detail)
search = SearchHandler(url, data) search = SearchHandler(url, data)
videos_hits = search.get_data() videos_hits = search.get_data()
max_hits = search.max_hits max_hits = search.max_hits
if max_hits: if max_hits:
channel_info = videos_hits[0]['source']['channel'] channel_info = videos_hits[0]["source"]["channel"]
channel_name = channel_info['channel_name'] channel_name = channel_info["channel_name"]
pagination_handler.validate(max_hits) pagination_handler.validate(max_hits)
pagination = pagination_handler.pagination pagination = pagination_handler.pagination
else: else:
@ -251,218 +261,223 @@ class ChannelIdView(View):
pagination = False pagination = False
context = { context = {
'channel_info': channel_info, "channel_info": channel_info,
'videos': videos_hits, "videos": videos_hits,
'max_hits': max_hits, "max_hits": max_hits,
'pagination': pagination, "pagination": pagination,
'title': 'Channel: ' + channel_name, "title": "Channel: " + channel_name,
} }
return context return context
@staticmethod @staticmethod
def build_data(pagination_handler, channel_id_detail): def build_data(pagination_handler, channel_id_detail):
""" build data dict for search """ """build data dict for search"""
page_size = pagination_handler.pagination['page_size'] page_size = pagination_handler.pagination["page_size"]
page_from = pagination_handler.pagination['page_from'] page_from = pagination_handler.pagination["page_from"]
data = { data = {
"size": page_size, "from": page_from, "size": page_size,
"from": page_from,
"query": { "query": {
"term": {"channel.channel_id": {"value": channel_id_detail}} "term": {"channel.channel_id": {"value": channel_id_detail}}
}, },
"sort": [ "sort": [
{"published": {"order": "desc"}}, {"published": {"order": "desc"}},
{"date_downloaded": {"order": "desc"}} {"date_downloaded": {"order": "desc"}},
] ],
} }
return data return data
@staticmethod @staticmethod
def get_channel_info(channel_id_detail, es_url): def get_channel_info(channel_id_detail, es_url):
""" get channel info from channel index if no videos """ """get channel info from channel index if no videos"""
url = f'{es_url}/ta_channel/_doc/{channel_id_detail}' url = f"{es_url}/ta_channel/_doc/{channel_id_detail}"
data = False data = False
search = SearchHandler(url, data) search = SearchHandler(url, data)
channel_data = search.get_data() channel_data = search.get_data()
channel_info = channel_data[0]['source'] channel_info = channel_data[0]["source"]
channel_name = channel_info['channel_name'] channel_name = channel_info["channel_name"]
return channel_info, channel_name return channel_info, channel_name
class ChannelView(View): class ChannelView(View):
""" resolves to /channel/ """resolves to /channel/
handle functionality for channel overview page, subscribe to channel, handle functionality for channel overview page, subscribe to channel,
search as you type for channel name search as you type for channel name
""" """
def get(self, request): def get(self, request):
""" handle http get requests """ """handle http get requests"""
es_url, colors = self.read_config() es_url, colors = self.read_config()
page_get = int(request.GET.get('page', 0)) page_get = int(request.GET.get("page", 0))
pagination_handler = Pagination(page_get) pagination_handler = Pagination(page_get)
page_size = pagination_handler.pagination['page_size'] page_size = pagination_handler.pagination["page_size"]
page_from = pagination_handler.pagination['page_from'] page_from = pagination_handler.pagination["page_from"]
# get # get
url = es_url + '/ta_channel/_search' url = es_url + "/ta_channel/_search"
data = { data = {
"size": page_size, "from": page_from, "query": {"match_all": {}}, "size": page_size,
"sort": [{"channel_name.keyword": {"order": "asc"}}] "from": page_from,
"query": {"match_all": {}},
"sort": [{"channel_name.keyword": {"order": "asc"}}],
} }
show_subed_only = get_message('show_subed_only') show_subed_only = get_message("show_subed_only")
if show_subed_only: if show_subed_only:
data['query'] = {"term": {"channel_subscribed": {"value": True}}} data["query"] = {"term": {"channel_subscribed": {"value": True}}}
search = SearchHandler(url, data) search = SearchHandler(url, data)
channel_hits = search.get_data() channel_hits = search.get_data()
max_hits = search.max_hits max_hits = search.max_hits
pagination_handler.validate(search.max_hits) pagination_handler.validate(search.max_hits)
context = { context = {
'channels': channel_hits, "channels": channel_hits,
'max_hits': max_hits, "max_hits": max_hits,
'pagination': pagination_handler.pagination, "pagination": pagination_handler.pagination,
'show_subed_only': show_subed_only, "show_subed_only": show_subed_only,
'title': 'Channels', "title": "Channels",
'colors': colors "colors": colors,
} }
return render(request, 'home/channel.html', context) return render(request, "home/channel.html", context)
@staticmethod @staticmethod
def read_config(): def read_config():
""" read config file """ """read config file"""
config = AppConfig().config config = AppConfig().config
es_url = config['application']['es_url'] es_url = config["application"]["es_url"]
colors = config['application']['colors'] colors = config["application"]["colors"]
return es_url, colors return es_url, colors
def post(self, request): def post(self, request):
""" handle http post requests """ """handle http post requests"""
subscriptions_post = dict(request.POST) subscriptions_post = dict(request.POST)
print(subscriptions_post) print(subscriptions_post)
subscriptions_post = dict(request.POST) subscriptions_post = dict(request.POST)
if 'subscribe' in subscriptions_post.keys(): if "subscribe" in subscriptions_post.keys():
sub_str = subscriptions_post['subscribe'] sub_str = subscriptions_post["subscribe"]
try: try:
youtube_ids = process_url_list(sub_str) youtube_ids = process_url_list(sub_str)
self.subscribe_to(youtube_ids) self.subscribe_to(youtube_ids)
except ValueError: except ValueError:
print('parsing subscribe ids failed!') print("parsing subscribe ids failed!")
print(sub_str) print(sub_str)
sleep(1) sleep(1)
return redirect('channel', permanent=True) return redirect("channel", permanent=True)
@staticmethod @staticmethod
def subscribe_to(youtube_ids): def subscribe_to(youtube_ids):
""" process the subscribe ids """ """process the subscribe ids"""
for youtube_id in youtube_ids: for youtube_id in youtube_ids:
if youtube_id['type'] == 'video': if youtube_id["type"] == "video":
to_sub = youtube_id['url'] to_sub = youtube_id["url"]
vid_details = PendingList().get_youtube_details(to_sub) vid_details = PendingList().get_youtube_details(to_sub)
channel_id_sub = vid_details['channel_id'] channel_id_sub = vid_details["channel_id"]
elif youtube_id['type'] == 'channel': elif youtube_id["type"] == "channel":
channel_id_sub = youtube_id['url'] channel_id_sub = youtube_id["url"]
else: else:
raise ValueError('failed to subscribe to: ' + youtube_id) raise ValueError("failed to subscribe to: " + youtube_id)
ChannelSubscription().change_subscribe( ChannelSubscription().change_subscribe(
channel_id_sub, channel_subscribed=True channel_id_sub, channel_subscribed=True
) )
print('subscribed to: ' + channel_id_sub) print("subscribed to: " + channel_id_sub)
class VideoView(View): class VideoView(View):
""" resolves to /video/<video-id>/ """resolves to /video/<video-id>/
display details about a single video display details about a single video
""" """
def get(self, request, video_id): def get(self, request, video_id):
""" get single video """ """get single video"""
es_url, colors = self.read_config() es_url, colors = self.read_config()
url = f'{es_url}/ta_video/_doc/{video_id}' url = f"{es_url}/ta_video/_doc/{video_id}"
data = None data = None
look_up = SearchHandler(url, data) look_up = SearchHandler(url, data)
video_hit = look_up.get_data() video_hit = look_up.get_data()
video_data = video_hit[0]['source'] video_data = video_hit[0]["source"]
video_title = video_data['title'] video_title = video_data["title"]
context = { context = {"video": video_data, "title": video_title, "colors": colors}
'video': video_data, return render(request, "home/video.html", context)
'title': video_title,
'colors': colors
}
return render(request, 'home/video.html', context)
@staticmethod @staticmethod
def read_config(): def read_config():
""" read config file """ """read config file"""
config = AppConfig().config config = AppConfig().config
es_url = config['application']['es_url'] es_url = config["application"]["es_url"]
colors = config['application']['colors'] colors = config["application"]["colors"]
return es_url, colors return es_url, colors
class SettingsView(View): class SettingsView(View):
""" resolves to /settings/ """resolves to /settings/
handle the settings page, display current settings, handle the settings page, display current settings,
take post request from the form to update settings take post request from the form to update settings
""" """
@staticmethod @staticmethod
def get(request): def get(request):
""" read and display current settings """ """read and display current settings"""
config = AppConfig().config config = AppConfig().config
colors = config['application']['colors'] colors = config["application"]["colors"]
context = { context = {"title": "Settings", "config": config, "colors": colors}
'title': 'Settings',
'config': config,
'colors': colors
}
return render(request, 'home/settings.html', context) return render(request, "home/settings.html", context)
@staticmethod @staticmethod
def post(request): def post(request):
""" handle form post to update settings """ """handle form post to update settings"""
form_post = dict(request.POST) form_post = dict(request.POST)
del form_post['csrfmiddlewaretoken'] del form_post["csrfmiddlewaretoken"]
print(form_post) print(form_post)
config_handler = AppConfig() config_handler = AppConfig()
config_handler.update_config(form_post) config_handler.update_config(form_post)
return redirect('settings', permanent=True) return redirect("settings", permanent=True)
def progress(request): def progress(request):
# pylint: disable=unused-argument # pylint: disable=unused-argument
""" endpoint for download progress ajax calls """ """endpoint for download progress ajax calls"""
config = AppConfig().config config = AppConfig().config
cache_dir = config['application']['cache_dir'] cache_dir = config["application"]["cache_dir"]
json_data = get_dl_message(cache_dir) json_data = get_dl_message(cache_dir)
return JsonResponse(json_data) return JsonResponse(json_data)
def process(request): def process(request):
""" handle all the buttons calls via POST ajax """ """handle all the buttons calls via POST ajax"""
if request.method == 'POST': if request.method == "POST":
post_dict = json.loads(request.body.decode()) post_dict = json.loads(request.body.decode())
post_handler = PostData(post_dict) post_handler = PostData(post_dict)
if post_handler.to_do: if post_handler.to_do:
task_result = post_handler.run_task() task_result = post_handler.run_task()
return JsonResponse(task_result) return JsonResponse(task_result)
return JsonResponse({'success': False}) return JsonResponse({"success": False})
class PostData: class PostData:
""" generic post handler from process route """ """generic post handler from process route"""
CONFIG = AppConfig().config CONFIG = AppConfig().config
ES_URL = CONFIG['application']['es_url'] ES_URL = CONFIG["application"]["es_url"]
VALID_KEYS = [ VALID_KEYS = [
"watched", "rescan_pending", "ignore", "dl_pending", "watched",
"unsubscribe", "sort_order", "hide_watched", "show_subed_only", "rescan_pending",
"channel-search", "video-search", "dlnow", "manual-import", "ignore",
"db-backup", "db-restore" "dl_pending",
"unsubscribe",
"sort_order",
"hide_watched",
"show_subed_only",
"channel-search",
"video-search",
"dlnow",
"manual-import",
"db-backup",
"db-restore",
] ]
def __init__(self, post_dict): def __init__(self, post_dict):
@ -470,81 +485,81 @@ class PostData:
self.to_do = self.validate() self.to_do = self.validate()
def validate(self): def validate(self):
""" validate the post_dict """ """validate the post_dict"""
to_do = [] to_do = []
for key, value in self.post_dict.items(): for key, value in self.post_dict.items():
if key in self.VALID_KEYS: if key in self.VALID_KEYS:
task_item = {'task': key, 'status': value} task_item = {"task": key, "status": value}
print(task_item) print(task_item)
to_do.append(task_item) to_do.append(task_item)
else: else:
print(key + ' not a valid key') print(key + " not a valid key")
return to_do return to_do
def run_task(self): def run_task(self):
""" run through the tasks to do """ """run through the tasks to do"""
for item in self.to_do: for item in self.to_do:
task = item['task'] task = item["task"]
if task == 'watched': if task == "watched":
youtube_id = item['status'] youtube_id = item["status"]
self.parse_watched(youtube_id) self.parse_watched(youtube_id)
elif task == 'rescan_pending': elif task == "rescan_pending":
print('rescan subscribed channels') print("rescan subscribed channels")
update_subscribed.delay() update_subscribed.delay()
elif task == 'ignore': elif task == "ignore":
print('ignore video') print("ignore video")
handler = PendingList() handler = PendingList()
ignore_list = item['status'] ignore_list = item["status"]
handler.ignore_from_pending([ignore_list]) handler.ignore_from_pending([ignore_list])
elif task == 'dl_pending': elif task == "dl_pending":
print('download pending') print("download pending")
download_pending.delay() download_pending.delay()
elif task == 'unsubscribe': elif task == "unsubscribe":
channel_id_unsub = item['status'] channel_id_unsub = item["status"]
print('unsubscribe from ' + channel_id_unsub) print("unsubscribe from " + channel_id_unsub)
ChannelSubscription().change_subscribe( ChannelSubscription().change_subscribe(
channel_id_unsub, channel_subscribed=False channel_id_unsub, channel_subscribed=False
) )
elif task == 'sort_order': elif task == "sort_order":
sort_order = item['status'] sort_order = item["status"]
set_message('sort_order', sort_order, expire=False) set_message("sort_order", sort_order, expire=False)
elif task == 'hide_watched': elif task == "hide_watched":
hide_watched = bool(int(item['status'])) hide_watched = bool(int(item["status"]))
print(item['status']) print(item["status"])
set_message('hide_watched', hide_watched, expire=False) set_message("hide_watched", hide_watched, expire=False)
elif task == 'show_subed_only': elif task == "show_subed_only":
show_subed_only = bool(int(item['status'])) show_subed_only = bool(int(item["status"]))
print(show_subed_only) print(show_subed_only)
set_message('show_subed_only', show_subed_only, expire=False) set_message("show_subed_only", show_subed_only, expire=False)
elif task == 'channel-search': elif task == "channel-search":
search_query = item['status'] search_query = item["status"]
print('searching for: ' + search_query) print("searching for: " + search_query)
search_results = self.search_channels(search_query) search_results = self.search_channels(search_query)
return search_results return search_results
elif task == 'video-search': elif task == "video-search":
search_query = item['status'] search_query = item["status"]
print('searching for: ' + search_query) print("searching for: " + search_query)
search_results = self.search_videos(search_query) search_results = self.search_videos(search_query)
return search_results return search_results
elif task == 'dlnow': elif task == "dlnow":
youtube_id = item['status'] youtube_id = item["status"]
print('downloading: ' + youtube_id) print("downloading: " + youtube_id)
download_single.delay(youtube_id=youtube_id) download_single.delay(youtube_id=youtube_id)
elif task == 'manual-import': elif task == "manual-import":
print('starting manual import') print("starting manual import")
run_manual_import.delay() run_manual_import.delay()
elif task == 'db-backup': elif task == "db-backup":
print('backing up database') print("backing up database")
run_backup.delay() run_backup.delay()
elif task == 'db-restore': elif task == "db-restore":
print('restoring index from backup zip') print("restoring index from backup zip")
run_restore_backup.delay() run_restore_backup.delay()
return {'success': True} return {"success": True}
def search_channels(self, search_query): def search_channels(self, search_query):
""" fancy searching channels as you type """ """fancy searching channels as you type"""
url = self.ES_URL + '/ta_channel/_search' url = self.ES_URL + "/ta_channel/_search"
data = { data = {
"size": 10, "size": 10,
"query": { "query": {
@ -554,18 +569,18 @@ class PostData:
"fields": [ "fields": [
"channel_name.search_as_you_type", "channel_name.search_as_you_type",
"channel_name._2gram", "channel_name._2gram",
"channel_name._3gram" "channel_name._3gram",
] ],
}
} }
},
} }
look_up = SearchHandler(url, data, cache=False) look_up = SearchHandler(url, data, cache=False)
search_results = look_up.get_data() search_results = look_up.get_data()
return {'results': search_results} return {"results": search_results}
def search_videos(self, search_query): def search_videos(self, search_query):
""" fancy searching videos as you type """ """fancy searching videos as you type"""
url = self.ES_URL + '/ta_video/_search' url = self.ES_URL + "/ta_video/_search"
data = { data = {
"size": 10, "size": 10,
"query": { "query": {
@ -575,51 +590,51 @@ class PostData:
"fields": [ "fields": [
"title.search_as_you_type", "title.search_as_you_type",
"title._2gram", "title._2gram",
"title._3gram" "title._3gram",
] ],
}
} }
},
} }
look_up = SearchHandler(url, data, cache=False) look_up = SearchHandler(url, data, cache=False)
search_results = look_up.get_data() search_results = look_up.get_data()
return {'results': search_results} return {"results": search_results}
def parse_watched(self, youtube_id): def parse_watched(self, youtube_id):
""" marked as watched based on id type """ """marked as watched based on id type"""
es_url = self.ES_URL es_url = self.ES_URL
id_type = process_url_list([youtube_id])[0]['type'] id_type = process_url_list([youtube_id])[0]["type"]
stamp = int(datetime.now().strftime("%s")) stamp = int(datetime.now().strftime("%s"))
if id_type == 'video': if id_type == "video":
stamp = int(datetime.now().strftime("%s")) stamp = int(datetime.now().strftime("%s"))
url = self.ES_URL + '/ta_video/_update/' + youtube_id url = self.ES_URL + "/ta_video/_update/" + youtube_id
source = { source = {
"doc": {"player": {"watched": True, "watched_date": stamp}} "doc": {"player": {"watched": True, "watched_date": stamp}}
} }
request = requests.post(url, json=source) request = requests.post(url, json=source)
if not request.ok: if not request.ok:
print(request.text) print(request.text)
elif id_type == 'channel': elif id_type == "channel":
headers = {'Content-type': 'application/json'} headers = {"Content-type": "application/json"}
data = { data = {
"description": youtube_id, "description": youtube_id,
"processors": [ "processors": [
{"set": {"field": "player.watched", "value": True}}, {"set": {"field": "player.watched", "value": True}},
{"set": {"field": "player.watched_date", "value": stamp}} {"set": {"field": "player.watched_date", "value": stamp}},
] ],
} }
payload = json.dumps(data) payload = json.dumps(data)
url = es_url + '/_ingest/pipeline/' + youtube_id url = es_url + "/_ingest/pipeline/" + youtube_id
request = requests.put(url, data=payload, headers=headers) request = requests.put(url, data=payload, headers=headers)
if not request.ok: if not request.ok:
print(request.text) print(request.text)
# apply pipeline # apply pipeline
must_list = [ must_list = [
{"term": {"channel.channel_id": {"value": youtube_id}}}, {"term": {"channel.channel_id": {"value": youtube_id}}},
{"term": {"player.watched": {"value": False}}} {"term": {"player.watched": {"value": False}}},
] ]
data = {"query": {"bool": {"must": must_list}}} data = {"query": {"bool": {"must": must_list}}}
payload = json.dumps(data) payload = json.dumps(data)
url = f'{es_url}/ta_video/_update_by_query?pipeline={youtube_id}' url = f"{es_url}/ta_video/_update_by_query?pipeline={youtube_id}"
request = requests.post(url, data=payload, headers=headers) request = requests.post(url, data=payload, headers=headers)
if not request.ok: if not request.ok:
print(request.text) print(request.text)


@ -7,7 +7,7 @@ import sys
def main(): def main():
# pylint: disable=import-outside-toplevel # pylint: disable=import-outside-toplevel
"""Run administrative tasks.""" """Run administrative tasks."""
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings') os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
try: try:
from django.core.management import execute_from_command_line from django.core.management import execute_from_command_line
except ImportError as exc: except ImportError as exc:
@ -19,5 +19,5 @@ def main():
execute_from_command_line(sys.argv) execute_from_command_line(sys.argv)
if __name__ == '__main__': if __name__ == "__main__":
main() main()


@ -8,10 +8,10 @@ import requests
class Requirements: class Requirements:
""" handle requirements.txt """ """handle requirements.txt"""
FILE_PATH = 'tubearchivist/requirements.txt' FILE_PATH = "tubearchivist/requirements.txt"
LOCK = '/tmp/tubearchivist-requirements.lock' LOCK = "/tmp/tubearchivist-requirements.lock"
def __init__(self): def __init__(self):
self.exists = self.checked_today() self.exists = self.checked_today()
@ -19,24 +19,24 @@ class Requirements:
self.all_updates = False self.all_updates = False
def checked_today(self): def checked_today(self):
""" skip requirements check when lock file exists """ """skip requirements check when lock file exists"""
exists = pathlib.Path(self.LOCK).exists() exists = pathlib.Path(self.LOCK).exists()
return exists return exists
def look_for_updates(self): def look_for_updates(self):
""" look through requirements and check for updates """ """look through requirements and check for updates"""
self.all_requirements = self.get_dependencies() self.all_requirements = self.get_dependencies()
self.all_updates = self.check_packages() self.all_updates = self.check_packages()
def get_dependencies(self): def get_dependencies(self):
""" read out requirements.txt """ """read out requirements.txt"""
all_requirements = [] all_requirements = []
with open(self.FILE_PATH, 'r', encoding='utf-8') as f: with open(self.FILE_PATH, "r", encoding="utf-8") as f:
dependencies = f.readlines() dependencies = f.readlines()
for dependency in dependencies: for dependency in dependencies:
package, version = dependency.split('==') package, version = dependency.split("==")
all_requirements.append((package, version.strip())) all_requirements.append((package, version.strip()))
all_requirements.sort(key=lambda x: x[0].lower()) all_requirements.sort(key=lambda x: x[0].lower())
@ -44,33 +44,32 @@ class Requirements:
return all_requirements return all_requirements
def check_packages(self): def check_packages(self):
""" compare installed with remote version """ """compare installed with remote version"""
total = len(self.all_requirements) total = len(self.all_requirements)
print(f'checking versions for {total} packages...') print(f"checking versions for {total} packages...")
all_updates = {} all_updates = {}
for dependency in self.all_requirements: for dependency in self.all_requirements:
package, version_installed = dependency package, version_installed = dependency
url = f'https://pypi.org/pypi/{package}/json' url = f"https://pypi.org/pypi/{package}/json"
response = requests.get(url).json() response = requests.get(url).json()
version_remote = response['info']['version'] version_remote = response["info"]["version"]
homepage = response['info']['home_page'] homepage = response["info"]["home_page"]
if version_remote != version_installed: if version_remote != version_installed:
to_update = { to_update = {
package: { package: {"from": version_installed, "to": version_remote}
"from": version_installed,
"to": version_remote
}
} }
all_updates.update(to_update) all_updates.update(to_update)
message = (f'update {package} {version_installed}' + message = (
f'==> {version_remote}\n {homepage}') f"update {package} {version_installed}"
+ f"==> {version_remote}\n {homepage}"
)
print(message) print(message)
if not all_updates: if not all_updates:
print('no updates found') print("no updates found")
# remember that # remember that
pathlib.Path(self.LOCK).touch() pathlib.Path(self.LOCK).touch()
@ -78,7 +77,7 @@ class Requirements:
return all_updates return all_updates
def apply_updates(self): def apply_updates(self):
""" update requirements.txt file with new versions """ """update requirements.txt file with new versions"""
to_write = [] to_write = []
@ -86,31 +85,31 @@ class Requirements:
package, old_version = requirement package, old_version = requirement
if package in self.all_updates.keys(): if package in self.all_updates.keys():
package_version = self.all_updates[package]['to'] package_version = self.all_updates[package]["to"]
else: else:
package_version = old_version package_version = old_version
to_write.append(f'{package}=={package_version}\n') to_write.append(f"{package}=={package_version}\n")
with open(self.FILE_PATH, 'w', encoding='utf-8') as f: with open(self.FILE_PATH, "w", encoding="utf-8") as f:
f.writelines(to_write) f.writelines(to_write)
print('requirements.txt updates') print("requirements.txt updates")
def main(): def main():
""" main to check for updates """ """main to check for updates"""
handler = Requirements() handler = Requirements()
if handler.exists: if handler.exists:
return return
handler.look_for_updates() handler.look_for_updates()
if handler.all_updates: if handler.all_updates:
input_response = input('\nupdate requirements.txt? [y/n] ') input_response = input("\nupdate requirements.txt? [y/n] ")
if input_response == 'y': if input_response == "y":
handler.apply_updates() handler.apply_updates()
else: else:
print('cancle update...') print("cancel update...")
sys.exit(1) sys.exit(1)