From 3d26f320bfe14fa463d0f8d57bba2421badd500c Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 18 Jan 2022 11:26:26 +0700 Subject: [PATCH 01/18] new logout icon --- tubearchivist/static/img/icon-exit.svg | 60 +++++++++++++++----------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/tubearchivist/static/img/icon-exit.svg b/tubearchivist/static/img/icon-exit.svg index afc40a6..88580d1 100644 --- a/tubearchivist/static/img/icon-exit.svg +++ b/tubearchivist/static/img/icon-exit.svg @@ -9,15 +9,15 @@ xmlns="http://www.w3.org/2000/svg" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" - width="500" - height="500" - viewBox="0 0 132.29197 132.29167" + width="210mm" + height="210mm" + viewBox="0 0 210 210" version="1.1" - id="svg1303" + id="svg1566" inkscape:version="0.92.4 (5da689c313, 2019-01-14)" - sodipodi:docname="Icons_exit.svg"> + sodipodi:docname="Icons_exit 05.svg"> + id="defs1560" /> + inkscape:window-width="1920" + inkscape:window-height="1009" + inkscape:window-x="-8" + inkscape:window-y="-8" + inkscape:window-maximized="1" /> + id="metadata1563"> @@ -53,15 +52,24 @@ inkscape:label="Ebene 1" inkscape:groupmode="layer" id="layer1" - transform="translate(0,-164.70764)"> - - - + transform="translate(0,-87)"> + + + From 8b15ea5dc8db9d95a247c6887f90ec9e8368f24d Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 18 Jan 2022 11:27:24 +0700 Subject: [PATCH 02/18] bump redis version --- tubearchivist/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tubearchivist/requirements.txt b/tubearchivist/requirements.txt index 2bf1310..c1085f7 100644 --- a/tubearchivist/requirements.txt +++ b/tubearchivist/requirements.txt @@ -1,10 +1,10 @@ beautifulsoup4==4.10.0 celery==5.2.3 -django-cors-headers==3.11.0 Django==4.0.1 +django-cors-headers==3.11.0 djangorestframework==3.13.1 Pillow==9.0.0 -redis==4.1.0 +redis==4.1.1 requests==2.27.1 ryd-client==0.0.3 uWSGI==2.0.20 From af2783c18ac78455a53d94a277b79492150f8e48 Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 18 Jan 2022 13:35:48 +0700 Subject: [PATCH 03/18] remove unneeded auth --- tubearchivist/home/src/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tubearchivist/home/src/index.py b/tubearchivist/home/src/index.py index 8a872f0..7d27ec6 100644 --- a/tubearchivist/home/src/index.py +++ b/tubearchivist/home/src/index.py @@ -65,7 +65,7 @@ class YoutubeChannel: channel_id = self.channel_id url = f"https://www.youtube.com/channel/{channel_id}/about?hl=en" cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"} - response = requests.get(url, cookies=cookies, auth=self.ES_AUTH) + response = requests.get(url, cookies=cookies) if response.ok: channel_page = response.text else: From 3eb0353fa9968f78bf9268194a855b1352c2ae9e Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 18 Jan 2022 13:36:41 +0700 Subject: [PATCH 04/18] new base class to make all es calls from --- tubearchivist/home/src/es.py | 75 ++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 tubearchivist/home/src/es.py diff --git a/tubearchivist/home/src/es.py b/tubearchivist/home/src/es.py new file mode 100644 index 0000000..75553c6 --- /dev/null +++ b/tubearchivist/home/src/es.py @@ -0,0 +1,75 @@ +"""holds es connection manager""" + +import json + +import requests +from home.src.config import AppConfig + + +class ElasticWrap: + """makes all calls to elastic search + returns response json and status code tuple + """ 
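# A minimal usage sketch of the ElasticWrap wrapper added here (the index
# path and document id are placeholders; es_url and es_auth are assumed to
# come from AppConfig as read in _get_config below):
#
#   response, status_code = ElasticWrap("ta_video/_doc/abc123").get()
#   if status_code == 200:
#       print(response["_source"]["title"])
#
#   data = {"youtube_id": "abc123", "active": True}
#   response, status_code = ElasticWrap("ta_video/_doc/abc123").put(data)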
+ + def __init__(self, path, config=False): + self.url = False + self.auth = False + self.path = path + self.config = config + self._get_config() + + def _get_config(self): + """add config if not passed""" + if not self.config: + self.config = AppConfig().config + + es_url = self.config["application"]["es_url"] + self.auth = self.config["application"]["es_auth"] + self.url = f"{es_url}/{self.path}" + + def get(self, data=False): + """get data from es""" + + if data: + response = requests.get(self.url, json=data, auth=self.auth) + else: + response = requests.get(self.url, auth=self.auth) + if not response.ok: + print(response.text) + + return response.json(), response.status_code + + def post(self, data, ndjson=False): + """post data to es""" + if ndjson: + headers = {"Content-type": "application/x-ndjson"} + payload = data + else: + headers = {"Content-type": "application/json"} + payload = json.dumps(data) + + response = requests.post( + self.url, data=payload, header=headers, auth=self.auth + ) + + if not response.ok: + print(response.text) + + return response.json(), response.status_code + + def put(self, data): + """put data to es""" + + response = requests.put(self.url, json=data, auth=self.auth) + if not response.ok: + print(response.text) + + return response.json(), response.status_code + + def delete(self): + """delete document from es""" + response = requests.delete(self.url, auth=self.auth) + if not response.ok: + print(response.text) + + return response.json(), response.status_code From 68f19b17199982e5f65e8c380dc205ec23d47ac0 Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 18 Jan 2022 13:37:22 +0700 Subject: [PATCH 05/18] make SearchHandler use new ElasticWrap class --- tubearchivist/home/src/searching.py | 23 +++++++---------------- tubearchivist/home/views.py | 29 ++++++++++++++--------------- 2 files changed, 21 insertions(+), 31 deletions(-) diff --git a/tubearchivist/home/src/searching.py b/tubearchivist/home/src/searching.py index ffd11d4..23c3ddd 100644 --- a/tubearchivist/home/src/searching.py +++ b/tubearchivist/home/src/searching.py @@ -10,8 +10,8 @@ import math import urllib.parse from datetime import datetime -import requests from home.src.config import AppConfig +from home.src.es import ElasticWrap from home.src.helper import RedisArchivist from home.src.thumbnails import ThumbManager @@ -19,23 +19,15 @@ from home.src.thumbnails import ThumbManager class SearchHandler: """search elastic search""" - CONFIG = AppConfig().config - CACHE_DIR = CONFIG["application"]["cache_dir"] - ES_AUTH = CONFIG["application"]["es_auth"] - - def __init__(self, url, data): + def __init__(self, path, config, data=False): self.max_hits = None - self.url = url + self.path = path + self.config = config self.data = data def get_data(self): """get the data""" - if self.data: - response = requests.get( - self.url, json=self.data, auth=self.ES_AUTH - ).json() - else: - response = requests.get(self.url, auth=self.ES_AUTH).json() + response, _ = ElasticWrap(self.path, config=self.config).get(self.data) if "hits" in response.keys(): self.max_hits = response["hits"]["total"]["value"] @@ -153,11 +145,10 @@ class SearchForm: """build query from search form data""" CONFIG = AppConfig().config - ES_URL = CONFIG["application"]["es_url"] def multi_search(self, search_query): """searching through index""" - url = self.ES_URL + "/ta_video,ta_channel,ta_playlist/_search" + path = "ta_video,ta_channel,ta_playlist/_search" data = { "size": 30, "query": { @@ -184,7 +175,7 @@ class SearchForm: } }, } - look_up 
= SearchHandler(url, data) + look_up = SearchHandler(path, config=self.CONFIG, data=data) search_results = look_up.get_data() all_results = self.build_results(search_results) diff --git a/tubearchivist/home/views.py b/tubearchivist/home/views.py index ca83db9..b3de6c4 100644 --- a/tubearchivist/home/views.py +++ b/tubearchivist/home/views.py @@ -169,8 +169,7 @@ class ArchivistResultsView(ArchivistViewConfig): def single_lookup(self, es_path): """retrieve a single item from url""" - es_url = self.default_conf["application"]["es_url"] - search = SearchHandler(f"{es_url}/{es_path}", data=False) + search = SearchHandler(es_path, config=self.default_conf) result = search.get_data()[0]["source"] return result @@ -189,8 +188,9 @@ class ArchivistResultsView(ArchivistViewConfig): def find_results(self): """add results and pagination to context""" - url = self.default_conf["application"]["es_url"] + self.es_search - search = SearchHandler(url, self.data) + search = SearchHandler( + self.es_search, config=self.default_conf, data=self.data + ) self.context["results"] = search.get_data() self.pagination_handler.validate(search.max_hits) self.context["max_hits"] = search.max_hits @@ -203,7 +203,7 @@ class HomeView(ArchivistResultsView): """ view_origin = "home" - es_search = "/ta_video/_search" + es_search = "ta_video/_search" def get(self, request): """handle get requests""" @@ -284,7 +284,7 @@ class DownloadView(ArchivistResultsView): """ view_origin = "downloads" - es_search = "/ta_download/_search" + es_search = "ta_download/_search" def get(self, request): """handle get request""" @@ -346,7 +346,7 @@ class ChannelIdView(ArchivistResultsView): """ view_origin = "home" - es_search = "/ta_video/_search" + es_search = "ta_video/_search" def get(self, request, channel_id): """get request""" @@ -395,7 +395,7 @@ class ChannelView(ArchivistResultsView): """ view_origin = "channel" - es_search = "/ta_channel/_search" + es_search = "ta_channel/_search" def get(self, request): """handle get request""" @@ -445,7 +445,7 @@ class PlaylistIdView(ArchivistResultsView): """ view_origin = "home" - es_search = "/ta_video/_search" + es_search = "ta_video/_search" def get(self, request, playlist_id): """handle get request""" @@ -521,7 +521,7 @@ class PlaylistView(ArchivistResultsView): """ view_origin = "playlist" - es_search = "/ta_playlist/_search" + es_search = "ta_playlist/_search" def get(self, request): """handle get request""" @@ -592,9 +592,9 @@ class VideoView(View): def get(self, request, video_id): """get single video""" - es_url, colors, cast = self.read_config(user_id=request.user.id) - url = f"{es_url}/ta_video/_doc/{video_id}" - look_up = SearchHandler(url, None) + colors, cast = self.read_config(user_id=request.user.id) + path = f"ta_video/_doc/{video_id}" + look_up = SearchHandler(path, config=False) video_hit = look_up.get_data() video_data = video_hit[0]["source"] try: @@ -636,10 +636,9 @@ class VideoView(View): def read_config(user_id): """read config file""" config_handler = AppConfig(user_id) - es_url = config_handler.config["application"]["es_url"] cast = config_handler.config["application"]["enable_cast"] colors = config_handler.colors - return es_url, colors, cast + return colors, cast @staticmethod def star_creator(rating): From 2f4d4e715b46a39fdfe2e4ae2ac7171b461fcdf6 Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 19 Jan 2022 13:04:15 +0700 Subject: [PATCH 06/18] refactor index class in to base and video classes --- tubearchivist/home/src/es.py | 7 +- tubearchivist/home/src/index.py | 185 
++++++++++++++++++++++++++++++++ 2 files changed, 190 insertions(+), 2 deletions(-) diff --git a/tubearchivist/home/src/es.py b/tubearchivist/home/src/es.py index 75553c6..ea44a24 100644 --- a/tubearchivist/home/src/es.py +++ b/tubearchivist/home/src/es.py @@ -59,10 +59,13 @@ class ElasticWrap: def put(self, data): """put data to es""" - - response = requests.put(self.url, json=data, auth=self.auth) + response = requests.put( + f"{self.url}/?refresh=true", json=data, auth=self.auth + ) if not response.ok: print(response.text) + print(data) + raise ValueError("failed to add item to index") return response.json(), response.status_code diff --git a/tubearchivist/home/src/index.py b/tubearchivist/home/src/index.py index 7d27ec6..c7deda7 100644 --- a/tubearchivist/home/src/index.py +++ b/tubearchivist/home/src/index.py @@ -15,11 +15,196 @@ import requests import yt_dlp from bs4 import BeautifulSoup from home.src.config import AppConfig +from home.src.es import ElasticWrap from home.src.helper import DurationConverter, UrlListParser, clean_string from home.src.thumbnails import ThumbManager from ryd_client import ryd_client +class YouTubeItem: + """base class for youtube""" + + es_path = False + index_name = False + + def __init__(self, youtube_id): + self.app_conf = self._get_conf() + self.youtube_id = youtube_id + self.youtube_meta = False + self.json_data = False + + @staticmethod + def _get_conf(): + """read user conf""" + config = AppConfig().config + return config["application"] + + def get_from_youtube(self): + """use yt-dlp to get meta data from youtube""" + print("getting from youtube") + obs = { + "quiet": True, + "default_search": "ytsearch", + "skip_download": True, + "check_formats": "selected", + "noplaylist": True, + } + + try: + response = yt_dlp.YoutubeDL(obs).extract_info(self.youtube_id) + except ( + yt_dlp.utils.ExtractorError, + yt_dlp.utils.DownloadError, + ): + print("failed to get info for " + self.youtube_id) + self.youtube_meta = False + + self.youtube_meta = response + + def get_from_es(self): + """get indexed data from elastic search""" + print("get from es") + response, _ = ElasticWrap(self.es_path).get() + self.json_data = response + + def upload_to_es(self): + """add json_data to elastic""" + _, _ = ElasticWrap(self.es_path).put(self.json_data) + + def deactivate(self): + """deactivate document in es""" + key_match = { + "video": "active", + "channel": "channel_active", + "playlist": "playlist_active", + } + update_path = f"ta_{self.index_name}/_update/{self.youtube_id}" + data = { + "script": f"ctx._source.{key_match.get(self.index_name)} = false" + } + _, _ = ElasticWrap(update_path).post(data) + + def del_in_es(self): + """delete item from elastic search""" + print("delete from es") + _, _ = ElasticWrap(self.es_path).delete() + + +class YoutubeVideoNew(YouTubeItem): + """represents a single youtube video""" + + es_path = False + index_name = "video" + + def __init__(self, youtube_id): + super().__init__(youtube_id) + self.channel_id = False + self.es_path = f"ta_video/_doc/{youtube_id}" + + def build_json(self): + """build json dict of video""" + self.get_from_youtube() + self.process_youtube_meta() + self.add_stats() + + def process_youtube_meta(self): + """extract relevant fields from youtube""" + # extract + self.channel_id = self.youtube_meta["channel_id"] + upload_date = self.youtube_meta["upload_date"] + upload_date_time = datetime.strptime(upload_date, "%Y%m%d") + published = upload_date_time.strftime("%Y-%m-%d") + last_refresh = 
int(datetime.now().strftime("%s")) + # build json_data basics + self.json_data = { + "title": self.youtube_meta["title"], + "description": self.youtube_meta["description"], + "category": self.youtube_meta["categories"], + "vid_thumb_url": self.youtube_meta["thumbnail"], + "tags": self.youtube_meta["tags"], + "published": published, + "vid_last_refresh": last_refresh, + "date_downloaded": last_refresh, + "youtube_id": self.youtube_id, + "active": True, + } + + def add_stats(self): + """add stats dicst to json_data""" + # likes + like_count = self.youtube_meta.get("like_count", 0) + dislike_count = self.youtube_meta.get("dislike_count", 0) + self.json_data.update( + { + "stats": { + "view_count": self.youtube_meta["view_count"], + "like_count": like_count, + "dislike_count": dislike_count, + "average_rating": self.youtube_meta["average_rating"], + } + } + ) + + def add_player(self, vid_path): + """add player information for new videos""" + duration_handler = DurationConverter() + duration = duration_handler.get_sec(vid_path) + duration_str = duration_handler.get_str(duration) + self.json_data.update( + { + "player": { + "watched": False, + "duration": duration, + "duration_str": duration_str, + } + } + ) + + def add_file_path(self): + """build media_url for where file will be located""" + channel_name = self.json_data["channel"]["channel_name"] + clean_channel_name = clean_string(channel_name) + timestamp = self.json_data["published"].replace("-", "") + youtube_id = self.json_data["youtube_id"] + title = self.json_data["title"] + clean_title = clean_string(title) + filename = f"{timestamp}_{youtube_id}_{clean_title}.mp4" + media_url = os.path.join(clean_channel_name, filename) + self.json_data["media_url"] = media_url + + def delete_media_file(self): + """delete video file, meta data, thumbnails""" + # delete media file + video_base = self.app_conf["videos"] + media_url = self.json_data["media_url"] + print(f"delete {media_url} from file system") + to_delete = os.path.join(video_base, media_url) + os.remove(to_delete) + # delete from index + self.del_in_es() + + def get_ryd_stats(self): + """get optional stats from returnyoutubedislikeapi.com""" + try: + print(f"get ryd stats for: {self.youtube_id}") + result = ryd_client.get(self.youtube_id) + except requests.exceptions.ConnectionError: + print(f"failed to query ryd api, skipping {self.youtube_id}") + return False + + if result["status"] == 404: + return False + + dislikes = { + "dislike_count": result["dislikes"], + "average_rating": result["rating"], + } + self.json_data["stats"].update(dislikes) + + return True + + + class YoutubeChannel: """represents a single youtube channel""" From b64a2f81951add4204ca6c48d1409a2bb7663a07 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 22 Jan 2022 17:48:54 +0700 Subject: [PATCH 07/18] squash index.py refactor commits --- tubearchivist/home/src/download.py | 138 ++--- tubearchivist/home/src/es.py | 88 ++- tubearchivist/home/src/frontend.py | 2 +- tubearchivist/home/src/index.py | 898 +++++++++-------------------- tubearchivist/home/src/reindex.py | 104 ++-- tubearchivist/home/tasks.py | 30 +- tubearchivist/home/views.py | 10 +- 7 files changed, 464 insertions(+), 806 deletions(-) diff --git a/tubearchivist/home/src/download.py b/tubearchivist/home/src/download.py index 429c90d..0dbd0be 100644 --- a/tubearchivist/home/src/download.py +++ b/tubearchivist/home/src/download.py @@ -15,6 +15,7 @@ from time import sleep import requests import yt_dlp from home.src.config import AppConfig +from home.src.es 
import IndexPaginate from home.src.helper import ( DurationConverter, RedisArchivist, @@ -23,7 +24,6 @@ from home.src.helper import ( ignore_filelist, ) from home.src.index import ( - IndexPaginate, YoutubeChannel, YoutubePlaylist, YoutubeVideo, @@ -69,7 +69,9 @@ class PendingList: missing_videos = missing_videos + youtube_ids elif url_type == "playlist": self.missing_from_playlists.append(entry) - video_results = YoutubePlaylist(url).get_entries() + playlist = YoutubePlaylist(url) + playlist.build_json() + video_results = playlist.json_data.get("playlist_entries") youtube_ids = [i["youtube_id"] for i in video_results] missing_videos = missing_videos + youtube_ids @@ -346,34 +348,14 @@ class ChannelSubscription: return missing_videos - def change_subscribe(self, channel_id, channel_subscribed): + @staticmethod + def change_subscribe(channel_id, channel_subscribed): """subscribe or unsubscribe from channel and update""" - if not isinstance(channel_subscribed, bool): - print("invalid status, should be bool") - return - headers = {"Content-type": "application/json"} - channel_handler = YoutubeChannel(channel_id) - channel_dict = channel_handler.channel_dict - channel_dict["channel_subscribed"] = channel_subscribed - if channel_subscribed: - # handle subscribe - url = self.es_url + "/ta_channel/_doc/" + channel_id - payload = json.dumps(channel_dict) - print(channel_dict) - else: - url = self.es_url + "/ta_channel/_update/" + channel_id - payload = json.dumps({"doc": channel_dict}) - # update channel - request = requests.post( - url, data=payload, headers=headers, auth=self.es_auth - ) - if not request.ok: - print(request.text) - raise ValueError("failed change subscribe status") - # sync to videos - channel_handler.sync_to_videos() - if channel_handler.source == "scraped": - channel_handler.get_channel_art() + channel = YoutubeChannel(channel_id) + channel.build_json() + channel.json_data["channel_subscribed"] = channel_subscribed + channel.upload_to_es() + channel.sync_to_videos() class PlaylistSubscription: @@ -413,20 +395,15 @@ class PlaylistSubscription: print(f"{playlist_id} not a playlist, skipping...") continue - playlist_h = YoutubePlaylist( - playlist_id, all_youtube_ids=all_youtube_ids - ) - if not playlist_h.get_es_playlist(): - playlist_h.get_playlist_dict() - playlist_h.playlist_dict["playlist_subscribed"] = subscribed - playlist_h.upload_to_es() - playlist_h.add_vids_to_playlist() - thumb = playlist_h.playlist_dict["playlist_thumbnail"] - new_thumbs.append((playlist_id, thumb)) - self.channel_validate(playlist_h) - else: - self.change_subscribe(playlist_id, subscribe_status=True) - + playlist_h = YoutubePlaylist(playlist_id) + playlist_h.all_youtube_ids = all_youtube_ids + playlist_h.build_json() + playlist_h.json_data["playlist_subscribed"] = subscribed + playlist_h.upload_to_es() + playlist_h.add_vids_to_playlist() + self.channel_validate(playlist_h.json_data["playlist_channel_id"]) + thumb = playlist_h.json_data["playlist_thumbnail"] + new_thumbs.append((playlist_id, thumb)) # notify message = { "status": "message:subplaylist", @@ -441,41 +418,18 @@ class PlaylistSubscription: return new_thumbs @staticmethod - def channel_validate(playlist_handler): + def channel_validate(channel_id): """make sure channel of playlist is there""" - channel_id = playlist_handler.playlist_dict["playlist_channel_id"] - channel_handler = YoutubeChannel(channel_id) - if channel_handler.source == "scraped": - channel_handler.channel_dict["channel_subscribed"] = False - channel_handler.upload_to_es() 
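# For reference, the new subscribe flow in change_subscribe above reduces to
# the YoutubeChannel interface introduced by this patch's index.py rewrite
# (channel id is a placeholder):
#
#   channel = YoutubeChannel("UCxxxxxxxxxxxxxxxxxxxxxxxx")
#   channel.build_json()       # try es first, scrape from youtube if missing
#   channel.json_data["channel_subscribed"] = True
#   channel.upload_to_es()
#   channel.sync_to_videos()   # sync updated channel dict to indexed videos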
- channel_handler.get_channel_art() + channel = YoutubeChannel(channel_id) + channel.build_json() - def change_subscribe(self, playlist_id, subscribe_status): + @staticmethod + def change_subscribe(playlist_id, subscribe_status): """change the subscribe status of a playlist""" - es_url = self.config["application"]["es_url"] - es_auth = self.config["application"]["es_auth"] - playlist_handler = YoutubePlaylist(playlist_id) - playlist_handler.get_playlist_dict() - subed_now = playlist_handler.playlist_dict["playlist_subscribed"] - - if subed_now == subscribe_status: - # status already as expected, do nothing - return False - - # update subscribed status - headers = {"Content-type": "application/json"} - url = f"{es_url}/ta_playlist/_update/{playlist_id}" - payload = json.dumps( - {"doc": {"playlist_subscribed": subscribe_status}} - ) - response = requests.post( - url, data=payload, headers=headers, auth=es_auth - ) - if not response.ok: - print(response.text) - raise ValueError("failed to change subscribe status") - - return True + playlist = YoutubePlaylist(playlist_id) + playlist.build_json() + playlist.json_data["playlist_subscribed"] = subscribe_status + playlist.upload_to_es() @staticmethod def get_to_ignore(): @@ -493,26 +447,25 @@ class PlaylistSubscription: to_ignore = self.get_to_ignore() missing_videos = [] - counter = 1 - for playlist_id in all_playlists: + for idx, playlist_id in enumerate(all_playlists): size_limit = self.config["subscriptions"]["channel_size"] - playlist_handler = YoutubePlaylist(playlist_id) - playlist = playlist_handler.update_playlist() + playlist = YoutubePlaylist(playlist_id) + playlist.update_playlist() if not playlist: - playlist_handler.deactivate() + playlist.deactivate() continue + playlist_entries = playlist.json_data["playlist_entries"] if size_limit: - playlist_entries = playlist["playlist_entries"][:size_limit] - else: - playlist_entries = playlist["playlist_entries"] + del playlist_entries[size_limit:] + all_missing = [i for i in playlist_entries if not i["downloaded"]] message = { "status": "message:rescan", "level": "info", "title": "Scanning playlists: Looking for new videos.", - "message": f"Progress: {counter}/{len(all_playlists)}", + "message": f"Progress: {idx + 1}/{len(all_playlists)}", } RedisArchivist().set_message("message:rescan", message=message) @@ -520,7 +473,6 @@ class PlaylistSubscription: youtube_id = video["youtube_id"] if youtube_id not in to_ignore: missing_videos.append(youtube_id) - counter = counter + 1 return missing_videos @@ -751,15 +703,15 @@ class VideoDownloader: playlists = YoutubeChannel(channel_id).get_indexed_playlists() all_playlist_ids = [i["playlist_id"] for i in playlists] for id_p, playlist_id in enumerate(all_playlist_ids): - playlist_handler = YoutubePlaylist( - playlist_id, all_youtube_ids=all_youtube_ids - ) - playlist_dict = playlist_handler.update_playlist() - if not playlist_dict: - playlist_handler.deactivate() - continue + playlist = YoutubePlaylist(playlist_id) + playlist.all_youtube_ids = all_youtube_ids + playlist.build_json(scrape=True) + if not playlist.json_data: + playlist.deactivate() + + playlist.add_vids_to_playlist() + playlist.upload_to_es() - playlist_handler.add_vids_to_playlist() # notify title = ( "Processing playlists for channels: " diff --git a/tubearchivist/home/src/es.py b/tubearchivist/home/src/es.py index ea44a24..ce72863 100644 --- a/tubearchivist/home/src/es.py +++ b/tubearchivist/home/src/es.py @@ -29,7 +29,6 @@ class ElasticWrap: def get(self, data=False): """get data from 
es""" - if data: response = requests.get(self.url, json=data, auth=self.auth) else: @@ -39,7 +38,7 @@ class ElasticWrap: return response.json(), response.status_code - def post(self, data, ndjson=False): + def post(self, data=False, ndjson=False): """post data to es""" if ndjson: headers = {"Content-type": "application/x-ndjson"} @@ -48,20 +47,23 @@ class ElasticWrap: headers = {"Content-type": "application/json"} payload = json.dumps(data) - response = requests.post( - self.url, data=payload, header=headers, auth=self.auth - ) + if data: + response = requests.post( + self.url, data=payload, headers=headers, auth=self.auth + ) + else: + response = requests.post(self.url, headers=headers, auth=self.auth) if not response.ok: print(response.text) return response.json(), response.status_code - def put(self, data): + def put(self, data, refresh=False): """put data to es""" - response = requests.put( - f"{self.url}/?refresh=true", json=data, auth=self.auth - ) + if refresh: + self.url = f"{self.url}/?refresh=true" + response = requests.put(f"{self.url}", json=data, auth=self.auth) if not response.ok: print(response.text) print(data) @@ -69,10 +71,74 @@ class ElasticWrap: return response.json(), response.status_code - def delete(self): + def delete(self, data=False): """delete document from es""" - response = requests.delete(self.url, auth=self.auth) + if data: + response = requests.delete(self.url, json=data, auth=self.auth) + else: + response = requests.delete(self.url, auth=self.auth) + if not response.ok: print(response.text) return response.json(), response.status_code + + +class IndexPaginate: + """use search_after to go through whole index""" + + DEFAULT_SIZE = 500 + + def __init__(self, index_name, data, size=False): + self.index_name = index_name + self.data = data + self.pit_id = False + self.size = size + + def get_results(self): + """get all results""" + self.get_pit() + self.validate_data() + all_results = self.run_loop() + self.clean_pit() + return all_results + + def get_pit(self): + """get pit for index""" + path = f"{self.index_name}/_pit?keep_alive=10m" + response, _ = ElasticWrap(path).post() + self.pit_id = response["id"] + + def validate_data(self): + """add pit and size to data""" + if "sort" not in self.data.keys(): + print(self.data) + raise ValueError("missing sort key in data") + + size = self.size or self.DEFAULT_SIZE + + self.data["size"] = size + self.data["pit"] = {"id": self.pit_id, "keep_alive": "10m"} + + def run_loop(self): + """loop through results until last hit""" + all_results = [] + while True: + response, _ = ElasticWrap("_search").get(data=self.data) + all_hits = response["hits"]["hits"] + if all_hits: + for hit in all_hits: + source = hit["_source"] + search_after = hit["sort"] + all_results.append(source) + # update search_after with last hit data + self.data["search_after"] = search_after + else: + break + + return all_results + + def clean_pit(self): + """delete pit from elastic search""" + data = {"id": self.pit_id} + ElasticWrap("_pit").delete(data=data) diff --git a/tubearchivist/home/src/frontend.py b/tubearchivist/home/src/frontend.py index c81283e..b6451e6 100644 --- a/tubearchivist/home/src/frontend.py +++ b/tubearchivist/home/src/frontend.py @@ -306,7 +306,7 @@ class PostData: playlist_dict = self.exec_val playlist_id = playlist_dict["playlist-id"] playlist_action = playlist_dict["playlist-action"] - print(f"delete {playlist_action} from playlist {playlist_id}") + print(f"{playlist_id}: delete playlist {playlist_action}") if playlist_action == 
"metadata": YoutubePlaylist(playlist_id).delete_metadata() elif playlist_action == "all": diff --git a/tubearchivist/home/src/index.py b/tubearchivist/home/src/index.py index c7deda7..fd22f17 100644 --- a/tubearchivist/home/src/index.py +++ b/tubearchivist/home/src/index.py @@ -9,13 +9,12 @@ import json import os import re from datetime import datetime -from time import sleep import requests import yt_dlp from bs4 import BeautifulSoup from home.src.config import AppConfig -from home.src.es import ElasticWrap +from home.src.es import ElasticWrap, IndexPaginate from home.src.helper import DurationConverter, UrlListParser, clean_string from home.src.thumbnails import ThumbManager from ryd_client import ryd_client @@ -26,50 +25,53 @@ class YouTubeItem: es_path = False index_name = False + yt_base = False + yt_obs = { + "quiet": True, + "default_search": "ytsearch", + "skip_download": True, + "check_formats": "selected", + "noplaylist": True, + } def __init__(self, youtube_id): - self.app_conf = self._get_conf() self.youtube_id = youtube_id + self.config = False + self.app_conf = False self.youtube_meta = False self.json_data = False + self._get_conf() - @staticmethod - def _get_conf(): + def _get_conf(self): """read user conf""" - config = AppConfig().config - return config["application"] + self.config = AppConfig().config + self.app_conf = self.config["application"] def get_from_youtube(self): """use yt-dlp to get meta data from youtube""" - print("getting from youtube") - obs = { - "quiet": True, - "default_search": "ytsearch", - "skip_download": True, - "check_formats": "selected", - "noplaylist": True, - } - + print(f"{self.youtube_id}: get metadata from youtube") try: - response = yt_dlp.YoutubeDL(obs).extract_info(self.youtube_id) + yt_item = yt_dlp.YoutubeDL(self.yt_obs) + response = yt_item.extract_info(self.yt_base + self.youtube_id) except ( yt_dlp.utils.ExtractorError, yt_dlp.utils.DownloadError, ): - print("failed to get info for " + self.youtube_id) + print(f"{self.youtube_id}: failed to get info from youtube") self.youtube_meta = False self.youtube_meta = response def get_from_es(self): """get indexed data from elastic search""" - print("get from es") - response, _ = ElasticWrap(self.es_path).get() - self.json_data = response + print(f"{self.youtube_id}: get metadata from es") + response, _ = ElasticWrap(f"{self.es_path}").get() + source = response.get("_source") + self.json_data = source def upload_to_es(self): """add json_data to elastic""" - _, _ = ElasticWrap(self.es_path).put(self.json_data) + _, _ = ElasticWrap(self.es_path).put(self.json_data, refresh=True) def deactivate(self): """deactivate document in es""" @@ -78,7 +80,7 @@ class YouTubeItem: "channel": "channel_active", "playlist": "playlist_active", } - update_path = f"ta_{self.index_name}/_update/{self.youtube_id}" + update_path = f"{self.index_name}/_update/{self.youtube_id}" data = { "script": f"ctx._source.{key_match.get(self.index_name)} = false" } @@ -86,28 +88,39 @@ class YouTubeItem: def del_in_es(self): """delete item from elastic search""" - print("delete from es") + print(f"{self.youtube_id}: delete from es") _, _ = ElasticWrap(self.es_path).delete() -class YoutubeVideoNew(YouTubeItem): +class YoutubeVideo(YouTubeItem): """represents a single youtube video""" es_path = False - index_name = "video" + index_name = "ta_video" + yt_base = "https://www.youtube.com/watch?v=" def __init__(self, youtube_id): super().__init__(youtube_id) self.channel_id = False - self.es_path = f"ta_video/_doc/{youtube_id}" + 
self.es_path = f"{self.index_name}/_doc/{youtube_id}" def build_json(self): """build json dict of video""" self.get_from_youtube() - self.process_youtube_meta() - self.add_stats() + if not self.youtube_meta: + return - def process_youtube_meta(self): + self._process_youtube_meta() + self._add_channel() + self._add_stats() + self.add_file_path() + self.add_player() + if self.config["downloads"]["integrate_ryd"]: + self._get_ryd_stats() + + return + + def _process_youtube_meta(self): """extract relevant fields from youtube""" # extract self.channel_id = self.youtube_meta["channel_id"] @@ -129,7 +142,13 @@ class YoutubeVideoNew(YouTubeItem): "active": True, } - def add_stats(self): + def _add_channel(self): + """add channel dict to video json_data""" + channel = YoutubeChannel(self.channel_id) + channel.build_json(upload=True) + self.json_data.update({"channel": channel.json_data}) + + def _add_stats(self): """add stats dicst to json_data""" # likes like_count = self.youtube_meta.get("like_count", 0) @@ -145,8 +164,28 @@ class YoutubeVideoNew(YouTubeItem): } ) - def add_player(self, vid_path): + def build_dl_cache_path(self): + """find video path in dl cache""" + cache_dir = self.app_conf["cache_dir"] + cache_path = f"{cache_dir}/download/" + all_cached = os.listdir(cache_path) + for file_cached in all_cached: + if self.youtube_id in file_cached: + vid_path = os.path.join(cache_path, file_cached) + return vid_path + + return False + + def add_player(self): """add player information for new videos""" + try: + # when indexing from download task + vid_path = self.build_dl_cache_path() + except FileNotFoundError: + # when reindexing + base = self.app_conf["videos"] + vid_path = os.path.join(base, self.json_data["media_url"]) + duration_handler = DurationConverter() duration = duration_handler.get_sec(vid_path) duration_str = duration_handler.get_str(duration) @@ -173,23 +212,22 @@ class YoutubeVideoNew(YouTubeItem): self.json_data["media_url"] = media_url def delete_media_file(self): - """delete video file, meta data, thumbnails""" - # delete media file + """delete video file, meta data""" + self.get_from_es() video_base = self.app_conf["videos"] media_url = self.json_data["media_url"] - print(f"delete {media_url} from file system") + print(f"{self.youtube_id}: delete {media_url} from file system") to_delete = os.path.join(video_base, media_url) os.remove(to_delete) - # delete from index self.del_in_es() - def get_ryd_stats(self): + def _get_ryd_stats(self): """get optional stats from returnyoutubedislikeapi.com""" try: - print(f"get ryd stats for: {self.youtube_id}") + print(f"{self.youtube_id}: get ryd stats") result = ryd_client.get(self.youtube_id) except requests.exceptions.ConnectionError: - print(f"failed to query ryd api, skipping {self.youtube_id}") + print(f"{self.youtube_id}: failed to query ryd api, skipping") return False if result["status"] == 404: @@ -204,61 +242,42 @@ class YoutubeVideoNew(YouTubeItem): return True - -class YoutubeChannel: - """represents a single youtube channel""" - - CONFIG = AppConfig().config - ES_URL = CONFIG["application"]["es_url"] - ES_AUTH = CONFIG["application"]["es_auth"] - CACHE_DIR = CONFIG["application"]["cache_dir"] - VIDEOS = CONFIG["application"]["videos"] +class ChannelScraper: + """custom scraper using bs4 to scrape channel about page + will be able to be integrated into yt-dlp + once #2237 and #2350 are merged upstream + """ def __init__(self, channel_id): self.channel_id = channel_id - self.json_data = None - self.source = None - 
self.channel_dict = self.build_channel_dict() + self.soup = False + self.yt_json = False + self.json_data = False - def build_channel_dict(self, scrape=False): - """combine the dicts build from extracted json payload""" - if scrape: - channel_dict = False - else: - channel_dict = self.get_es_channel() - if not channel_dict: - print("scrape data from youtube") - self.scrape_channel() - channel_dict = self.parse_channel_main() - channel_dict.update(self.parse_channel_meta()) - self.source = "scraped" - return channel_dict + def get_json(self): + """main method to return channel dict""" + self.get_soup() + self._extract_yt_json() + self._parse_channel_main() + self._parse_channel_meta() + return self.json_data - def get_es_channel(self): - """get from elastic search first if possible""" - channel_id = self.channel_id - url = f"{self.ES_URL}/ta_channel/_doc/{channel_id}" - response = requests.get(url, auth=self.ES_AUTH) - if response.ok: - channel_source = response.json()["_source"] - self.source = "elastic" - return channel_source - return False - - def scrape_channel(self): - """scrape channel page for additional infos""" - channel_id = self.channel_id - url = f"https://www.youtube.com/channel/{channel_id}/about?hl=en" + def get_soup(self): + """return soup from youtube""" + print(f"{self.channel_id}: scrape channel data from youtube") + url = f"https://www.youtube.com/channel/{self.channel_id}/about?hl=en" cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"} response = requests.get(url, cookies=cookies) if response.ok: channel_page = response.text else: - print(f"failed to extract channel info for: {channel_id}") + print(f"{self.channel_id}: failed to extract channel info") raise ConnectionError - soup = BeautifulSoup(channel_page, "html.parser") - # load script into json - all_scripts = soup.find("body").find_all("script") + self.soup = BeautifulSoup(channel_page, "html.parser") + + def _extract_yt_json(self): + """parse soup and get ytInitialData json""" + all_scripts = self.soup.find("body").find_all("script") for script in all_scripts: if "var ytInitialData = " in str(script): script_content = str(script) @@ -266,16 +285,37 @@ class YoutubeChannel: # extract payload script_content = script_content.split("var ytInitialData = ")[1] json_raw = script_content.rstrip(";") - json_data = json.loads(json_raw) - # add to self - self.json_data = json_data + self.yt_json = json.loads(json_raw) - def parse_channel_main(self): + def _parse_channel_main(self): """extract maintab values from scraped channel json data""" - main_tab = self.json_data["header"]["c4TabbedHeaderRenderer"] - channel_name = main_tab["title"] - last_refresh = int(datetime.now().strftime("%s")) - # channel_subs + main_tab = self.yt_json["header"]["c4TabbedHeaderRenderer"] + # build and return dict + self.json_data = { + "channel_active": True, + "channel_last_refresh": int(datetime.now().strftime("%s")), + "channel_subs": self._get_channel_subs(main_tab), + "channel_name": main_tab["title"], + "channel_banner_url": self._get_thumbnails(main_tab, "banner"), + "channel_tvart_url": self._get_thumbnails(main_tab, "tvBanner"), + "channel_id": self.channel_id, + "channel_subscribed": False, + } + + @staticmethod + def _get_thumbnails(main_tab, thumb_name): + """extract banner url from main_tab""" + try: + all_banners = main_tab[thumb_name]["thumbnails"] + banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"] + except KeyError: + banner = False + + return banner + + @staticmethod + def _get_channel_subs(main_tab): + 
"""process main_tab to get channel subs as int""" try: sub_text_simple = main_tab["subscriberCountText"]["simpleText"] sub_text = sub_text_simple.split(" ")[0] @@ -290,34 +330,18 @@ class YoutubeChannel: print(message) except KeyError: channel_subs = 0 - # banner - try: - all_banners = main_tab["banner"]["thumbnails"] - banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"] - except KeyError: - banner = False - # build and return dict - main_channel_dict = { - "channel_active": True, - "channel_last_refresh": last_refresh, - "channel_subs": channel_subs, - "channel_banner_url": banner, - "channel_name": channel_name, - "channel_id": self.channel_id, - } - return main_channel_dict - def parse_channel_meta(self): + return channel_subs + + def _parse_channel_meta(self): """extract meta tab values from channel payload""" # meta tab - json_data = self.json_data - meta_tab = json_data["metadata"]["channelMetadataRenderer"] - description = meta_tab["description"] + meta_tab = self.yt_json["metadata"]["channelMetadataRenderer"] all_thumbs = meta_tab["avatar"]["thumbnails"] thumb_url = sorted(all_thumbs, key=lambda k: k["width"])[-1]["url"] # stats tab renderer = "twoColumnBrowseResultsRenderer" - all_tabs = json_data["contents"][renderer]["tabs"] + all_tabs = self.yt_json["contents"][renderer]["tabs"] for tab in all_tabs: if "tabRenderer" in tab.keys(): if tab["tabRenderer"]["title"] == "About": @@ -333,81 +357,81 @@ class YoutubeChannel: except KeyError: channel_views = 0 - meta_channel_dict = { - "channel_description": description, - "channel_thumb_url": thumb_url, - "channel_views": channel_views, - } + self.json_data.update( + { + "channel_description": meta_tab["description"], + "channel_thumb_url": thumb_url, + "channel_views": channel_views, + } + ) - return meta_channel_dict + +class YoutubeChannel(YouTubeItem): + """represents a single youtube channel""" + + es_path = False + index_name = "ta_channel" + yt_base = "https://www.youtube.com/channel/" + + def __init__(self, youtube_id): + super().__init__(youtube_id) + self.es_path = f"{self.index_name}/_doc/{youtube_id}" + + def build_json(self, upload=False): + """get from es or from youtube""" + self.get_from_es() + if self.json_data: + return + + self.get_from_youtube() + if upload: + self.upload_to_es() + return + + def get_from_youtube(self): + """use bs4 to scrape channel about page""" + self.json_data = ChannelScraper(self.youtube_id).get_json() + self.get_channel_art() def get_channel_art(self): """download channel art for new channels""" - channel_id = self.channel_id - channel_thumb = self.channel_dict["channel_thumb_url"] - channel_banner = self.channel_dict["channel_banner_url"] + channel_id = self.youtube_id + channel_thumb = self.json_data["channel_thumb_url"] + channel_banner = self.json_data["channel_banner_url"] ThumbManager().download_chan( [(channel_id, channel_thumb, channel_banner)] ) - def upload_to_es(self): - """upload channel data to elastic search""" - url = f"{self.ES_URL}/ta_channel/_doc/{self.channel_id}" - response = requests.put(url, json=self.channel_dict, auth=self.ES_AUTH) - print(f"added {self.channel_id} to es") - if not response.ok: - print(response.text) - raise ValueError("failed to add channel to index") - def sync_to_videos(self): """sync new channel_dict to all videos of channel""" - headers = {"Content-type": "application/json"} - channel_id = self.channel_id # add ingest pipeline processors = [] - for field, value in self.channel_dict.items(): + for field, value in 
self.json_data.items(): line = {"set": {"field": "channel." + field, "value": value}} processors.append(line) - data = {"description": channel_id, "processors": processors} - payload = json.dumps(data) - url = self.ES_URL + "/_ingest/pipeline/" + channel_id - request = requests.put( - url, data=payload, headers=headers, auth=self.ES_AUTH - ) - if not request.ok: - print(request.text) + data = {"description": self.youtube_id, "processors": processors} + ingest_path = f"_ingest/pipeline/{self.youtube_id}" + _, _ = ElasticWrap(ingest_path).put(data) # apply pipeline - data = {"query": {"match": {"channel.channel_id": channel_id}}} - payload = json.dumps(data) - url = self.ES_URL + "/ta_video/_update_by_query?pipeline=" + channel_id - request = requests.post( - url, data=payload, headers=headers, auth=self.ES_AUTH - ) - if not request.ok: - print(request.text) + data = {"query": {"match": {"channel.channel_id": self.youtube_id}}} + update_path = f"ta_video/_update_by_query?pipeline={self.youtube_id}" + _, _ = ElasticWrap(update_path).post(data) def get_folder_path(self): """get folder where media files get stored""" - channel_name = self.channel_dict["channel_name"] + channel_name = self.json_data["channel_name"] folder_name = clean_string(channel_name) - folder_path = os.path.join(self.VIDEOS, folder_name) + folder_path = os.path.join(self.app_conf["videos"], folder_name) return folder_path def delete_es_videos(self): """delete all channel documents from elasticsearch""" - headers = {"Content-type": "application/json"} data = { "query": { - "term": {"channel.channel_id": {"value": self.channel_id}} + "term": {"channel.channel_id": {"value": self.youtube_id}} } } - payload = json.dumps(data) - url = self.ES_URL + "/ta_video/_delete_by_query" - response = requests.post( - url, data=payload, headers=headers, auth=self.ES_AUTH - ) - if not response.ok: - print(response.text) + _, _ = ElasticWrap("ta_video/_delete_by_query").post(data) def delete_playlists(self): """delete all indexed playlist from es""" @@ -418,9 +442,10 @@ class YoutubeChannel: def delete_channel(self): """delete channel and all videos""" - print(f"deleting {self.channel_id} and all matching media files") + print(f"{self.youtube_id}: delete channel") + self.get_from_es() folder_path = self.get_folder_path() - print("delete all media files") + print(f"{self.youtube_id}: delete all media files") try: all_videos = os.listdir(folder_path) for video in all_videos: @@ -430,20 +455,16 @@ class YoutubeChannel: except FileNotFoundError: print(f"no videos found for {folder_path}") - ThumbManager().delete_chan_thumb(self.channel_id) - print("delete indexed playlists") + print(f"{self.youtube_id}: delete indexed playlists") self.delete_playlists() - print("delete indexed videos") + print(f"{self.youtube_id}: delete indexed videos") self.delete_es_videos() - url = self.ES_URL + "/ta_channel/_doc/" + self.channel_id - response = requests.delete(url, auth=self.ES_AUTH) - if not response.ok: - print(response.text) + self.del_in_es() def get_all_playlists(self): """get all playlists owned by this channel""" url = ( - f"https://www.youtube.com/channel/{self.channel_id}" + f"https://www.youtube.com/channel/{self.youtube_id}" + "/playlists?view=1&sort=dd&shelf_id=0" ) obs = { @@ -460,7 +481,7 @@ class YoutubeChannel: """get all indexed playlists from channel""" data = { "query": { - "term": {"playlist_channel_id": {"value": self.channel_id}} + "term": {"playlist_channel_id": {"value": self.youtube_id}} }, "sort": [{"playlist_channel.keyword": 
{"order": "desc"}}], } @@ -468,328 +489,85 @@ class YoutubeChannel: return all_playlists -class YoutubeVideo: - """represents a single youtube video""" +class YoutubePlaylist(YouTubeItem): + """represents a single youtube playlist""" - CONFIG = AppConfig().config - ES_URL = CONFIG["application"]["es_url"] - ES_AUTH = CONFIG["application"]["es_auth"] - CACHE_DIR = CONFIG["application"]["cache_dir"] - VIDEOS = CONFIG["application"]["videos"] + es_path = False + index_name = "ta_playlist" + yt_obs = { + "default_search": "ytsearch", + "quiet": True, + "skip_download": True, + "extract_flat": True, + } + yt_base = "https://www.youtube.com/playlist?list=" def __init__(self, youtube_id): - self.youtube_id = youtube_id - self.channel_id = None - self.vid_dict = None + super().__init__(youtube_id) + self.es_path = f"{self.index_name}/_doc/{youtube_id}" + self.all_members = False + self.nav = False + self.all_youtube_ids = [] - def get_vid_dict(self): - """wrapper to loop around yt_dlp to retry on failure""" - print(f"get video data for {self.youtube_id}") - vid_dict = False - for i in range(3): - try: - vid_dict = self.get_youtubedl_vid_data() - except KeyError as e: - print(e) - sleep((i + 1) ** 2) - continue - else: - break + def build_json(self, scrape=False): + """collection to create json_data""" + if not scrape: + self.get_from_es() - self.vid_dict = vid_dict - if self.CONFIG["downloads"]["integrate_ryd"]: - self.get_ryd_stats() + if scrape or not self.json_data: + self.get_from_youtube() + self.process_youtube_meta() + self.get_entries() + self.json_data["playlist_entries"] = self.all_members + self.get_playlist_art() - def get_youtubedl_vid_data(self): - """parse youtubedl extract info""" - youtube_id = self.youtube_id - obs = { - "quiet": True, - "default_search": "ytsearch", - "skip_download": True, - "check_formats": "selected", - "noplaylist": True, - } - try: - vid = yt_dlp.YoutubeDL(obs).extract_info(youtube_id) - except ( - yt_dlp.utils.ExtractorError, - yt_dlp.utils.DownloadError, - ): - print("failed to get info for " + youtube_id) - return False - # extract - self.channel_id = vid["channel_id"] - upload_date = vid["upload_date"] - upload_date_time = datetime.strptime(upload_date, "%Y%m%d") - published = upload_date_time.strftime("%Y-%m-%d") - last_refresh = int(datetime.now().strftime("%s")) - # likes - try: - like_count = vid["like_count"] - except KeyError: - like_count = 0 - try: - dislike_count = vid["dislike_count"] - except KeyError: - dislike_count = 0 - # build dicts - stats = { - "view_count": vid["view_count"], - "like_count": like_count, - "dislike_count": dislike_count, - "average_rating": vid["average_rating"], - } - vid_basic = { - "title": vid["title"], - "description": vid["description"], - "category": vid["categories"], - "vid_thumb_url": vid["thumbnail"], - "tags": vid["tags"], - "published": published, - "stats": stats, - "vid_last_refresh": last_refresh, - "date_downloaded": last_refresh, - "youtube_id": youtube_id, - "active": True, - "channel": False, - } - - return vid_basic - - def add_player(self, missing_vid): - """add player information for new videos""" - cache_path = self.CACHE_DIR + "/download/" - videos = self.VIDEOS - - if missing_vid: - # coming from scan_filesystem - channel_name, file_name, _ = missing_vid - vid_path = os.path.join(videos, channel_name, file_name) - else: - # coming from VideoDownload - all_cached = os.listdir(cache_path) - for file_cached in all_cached: - if self.youtube_id in file_cached: - vid_path = os.path.join(cache_path, 
file_cached) - break - - duration_handler = DurationConverter() - duration = duration_handler.get_sec(vid_path) - duration_str = duration_handler.get_str(duration) - player = { - "watched": False, - "duration": duration, - "duration_str": duration_str, - } - self.vid_dict["player"] = player - - def build_file_path(self, channel_name): - """build media_url from where file will be located""" - clean_channel_name = clean_string(channel_name) - timestamp = self.vid_dict["published"].replace("-", "") - youtube_id = self.vid_dict["youtube_id"] - title = self.vid_dict["title"] - clean_title = clean_string(title) - filename = f"{timestamp}_{youtube_id}_{clean_title}.mp4" - media_url = os.path.join(clean_channel_name, filename) - self.vid_dict["media_url"] = media_url - - def get_es_data(self): - """get current data from elastic search""" - url = self.ES_URL + "/ta_video/_doc/" + self.youtube_id - response = requests.get(url, auth=self.ES_AUTH) - if not response.ok: - print(response.text) - es_vid_dict = json.loads(response.text) - return es_vid_dict - - def upload_to_es(self): - """upload video data to elastic search""" - url = f"{self.ES_URL}/ta_video/_doc/{self.youtube_id}/?refresh=true" - response = requests.put(url, json=self.vid_dict, auth=self.ES_AUTH) - if not response.ok: - print(response.text) - raise ValueError("failed to add video to index") - - def deactivate(self): - """deactivate document on extractor error""" - youtube_id = self.youtube_id - headers = {"Content-type": "application/json"} - url = f"{self.ES_URL}/ta_video/_update/{youtube_id}" - data = {"script": "ctx._source.active = false"} - json_str = json.dumps(data) - response = requests.post( - url, data=json_str, headers=headers, auth=self.ES_AUTH - ) - print(f"deactivated {youtube_id}") - if not response.ok: - print(response.text) - - def delete_media_file(self): - """delete video file, meta data, thumbnails""" - # delete media file - es_vid_dict = self.get_es_data() - media_url = es_vid_dict["_source"]["media_url"] - print(f"delete {media_url} from file system") - to_delete = os.path.join(self.VIDEOS, media_url) - os.remove(to_delete) - # delete from index - url = f"{self.ES_URL}/ta_video/_doc/{self.youtube_id}" - response = requests.delete(url, auth=self.ES_AUTH) - if not response.ok: - print(response.text) - # delete thumbs from cache - ThumbManager().delete_vid_thumb(self.youtube_id) - - def get_ryd_stats(self): - """get optional stats from returnyoutubedislikeapi.com""" - try: - print(f"get ryd stats for: {self.youtube_id}") - result = ryd_client.get(self.youtube_id) - except requests.exceptions.ConnectionError: - print(f"failed to query ryd api, skipping {self.youtube_id}") - return False - - if result["status"] == 404: - return False - - dislikes = { - "dislike_count": result["dislikes"], - "average_rating": result["rating"], - } - self.vid_dict["stats"].update(dislikes) - - return True - - -class YoutubePlaylist: - """represent a single playlist on YouTube""" - - CONFIG = AppConfig().config - ES_URL = CONFIG["application"]["es_url"] - ES_AUTH = CONFIG["application"]["es_auth"] - - def __init__(self, playlist_id, all_youtube_ids=False): - self.playlist_id = playlist_id - self.stamp = int(datetime.now().strftime("%s")) - self.all_youtube_ids = all_youtube_ids - self.playlist_dict = False - - def get_playlist_dict(self, scrape=False): - """get data from es or youtube""" - print(f"get playlist with id {self.playlist_id}") - - if scrape: - playlist_dict = self.get_youtube_playlist() - if not playlist_dict: - return False - 
playlist_dict["playlist_entries"] = self.get_entries() - else: - playlist_dict = self.get_es_playlist() - if not playlist_dict: - playlist_dict = self.get_youtube_playlist() - playlist_dict["playlist_entries"] = self.get_entries() - - self.playlist_dict = playlist_dict - return True - - def get_youtube_playlist(self): - """get meta data dict from youtube""" - url = "https://www.youtube.com/playlist?list=" + self.playlist_id - obs = { - "default_search": "ytsearch", - "quiet": True, - "skip_download": True, - "extract_flat": True, - "playlistend": 0, - } - try: - playlist = yt_dlp.YoutubeDL(obs).extract_info(url, download=False) - except ( - yt_dlp.utils.ExtractorError, - yt_dlp.utils.DownloadError, - ): - print("failed to get info for " + self.playlist_id) - return False - - playlist_es = { - "playlist_id": self.playlist_id, + def process_youtube_meta(self): + """extract relevant fields from youtube""" + self.json_data = { + "playlist_id": self.youtube_id, "playlist_active": True, "playlist_subscribed": False, - "playlist_name": playlist["title"], - "playlist_channel": playlist["channel"], - "playlist_channel_id": playlist["channel_id"], - "playlist_thumbnail": playlist["thumbnails"][-1]["url"], - "playlist_description": playlist["description"] or False, - "playlist_last_refresh": self.stamp, + "playlist_name": self.youtube_meta["title"], + "playlist_channel": self.youtube_meta["channel"], + "playlist_channel_id": self.youtube_meta["channel_id"], + "playlist_thumbnail": self.youtube_meta["thumbnails"][-1]["url"], + "playlist_description": self.youtube_meta["description"] or False, + "playlist_last_refresh": int(datetime.now().strftime("%s")), } - return playlist_es - - def get_es_playlist(self): - """get indexed data from es""" - url = f"{self.ES_URL}/ta_playlist/_doc/{self.playlist_id}" - response = requests.get(url, auth=self.ES_AUTH) - if response.ok: - return json.loads(response.text)["_source"] - - return False - def get_entries(self, playlistend=False): """get all videos in playlist""" - url = "https://www.youtube.com/playlist?list=" + self.playlist_id - obs = { - "default_search": "ytsearch", - "quiet": True, - "skip_download": True, - "extract_flat": True, - } if playlistend: - obs["playlistend"] = playlistend - - try: - playlist = yt_dlp.YoutubeDL(obs).extract_info(url, download=False) - except ( - yt_dlp.utils.ExtractorError, - yt_dlp.utils.DownloadError, - ): - print("failed to get plealist entries for " + self.playlist_id) - return False - + # implement playlist end + print(playlistend) all_members = [] - for idx, entry in enumerate(playlist["entries"]): - uploader = entry["uploader"] - youtube_id = entry["id"] + for idx, entry in enumerate(self.youtube_meta["entries"]): if self.all_youtube_ids: - downloaded = youtube_id in self.all_youtube_ids + downloaded = entry["id"] in self.all_youtube_ids else: downloaded = False - if not uploader: + if not entry["uploader"]: continue to_append = { - "youtube_id": youtube_id, + "youtube_id": entry["id"], "title": entry["title"], - "uploader": uploader, + "uploader": entry["uploader"], "idx": idx, "downloaded": downloaded, } all_members.append(to_append) - return all_members + self.all_members = all_members - def upload_to_es(self): - """add playlist to es with its entries""" - playlist = self.playlist_dict - url = f"{self.ES_URL}/ta_playlist/_doc/{self.playlist_id}" - response = requests.put(url, json=playlist, auth=self.ES_AUTH) - if not response.ok: - print(response.text) - raise ValueError("failed to add playlist to index") + 
@staticmethod + def get_playlist_art(): + """download artwork of playlist""" + thumbnails = ThumbManager() + missing_playlists = thumbnails.get_missing_playlists() + thumbnails.download_playlist(missing_playlists) def add_vids_to_playlist(self): """sync the playlist id to videos""" - playlist_dict = self.playlist_dict script = ( 'if (!ctx._source.containsKey("playlist")) ' + "{ctx._source.playlist = [params.playlist]} " @@ -799,14 +577,14 @@ class YoutubePlaylist: ) bulk_list = [] - for entry in playlist_dict["playlist_entries"]: - youtube_id = entry["youtube_id"] - action = {"update": {"_id": youtube_id, "_index": "ta_video"}} + for entry in self.json_data["playlist_entries"]: + video_id = entry["youtube_id"] + action = {"update": {"_id": video_id, "_index": "ta_video"}} source = { "script": { "source": script, "lang": "painless", - "params": {"playlist": self.playlist_id}, + "params": {"playlist": self.youtube_id}, } } bulk_list.append(json.dumps(action)) @@ -815,34 +593,30 @@ class YoutubePlaylist: # add last newline bulk_list.append("\n") query_str = "\n".join(bulk_list) - headers = {"Content-type": "application/x-ndjson"} - url = self.ES_URL + "/_bulk" - response = requests.post( - url, data=query_str, headers=headers, auth=self.ES_AUTH - ) - if not response.ok: - print(response.text) + + ElasticWrap("_bulk").post(query_str, ndjson=True) def update_playlist(self): """update metadata for playlist with data from YouTube""" - subscribed = self.get_es_playlist()["playlist_subscribed"] - self.get_playlist_dict(scrape=True) - if not self.playlist_dict: + self.get_from_es() + subscribed = self.json_data["playlist_subscribed"] + self.get_from_youtube() + if not self.json_data: # return false to deactivate return False - self.playlist_dict["playlist_subscribed"] = subscribed + self.json_data["playlist_subscribed"] = subscribed self.upload_to_es() - return self.playlist_dict + return True def build_nav(self, youtube_id): """find next and previous in playlist of a given youtube_id""" - all_entries_available = self.playlist_dict["playlist_entries"] + all_entries_available = self.json_data["playlist_entries"] all_entries = [i for i in all_entries_available if i["downloaded"]] current = [i for i in all_entries if i["youtube_id"] == youtube_id] # stop if not found or playlist of 1 if not current or not len(all_entries) > 1: - return False + return current_idx = all_entries.index(current[0]) if current_idx == 0: @@ -861,17 +635,17 @@ class YoutubePlaylist: next_thumb = ThumbManager().vid_thumb_path(next_item["youtube_id"]) next_item["vid_thumb"] = next_thumb - nav = { + self.nav = { "playlist_meta": { "current_idx": current[0]["idx"], - "playlist_id": self.playlist_id, - "playlist_name": self.playlist_dict["playlist_name"], - "playlist_channel": self.playlist_dict["playlist_channel"], + "playlist_id": self.youtube_id, + "playlist_name": self.json_data["playlist_name"], + "playlist_channel": self.json_data["playlist_channel"], }, "playlist_previous": previous_item, "playlist_next": next_item, } - return nav + return def delete_metadata(self): """delete metadata for playlist""" @@ -881,58 +655,30 @@ class YoutubePlaylist: ) data = { "query": { - "term": {"playlist.keyword": {"value": self.playlist_id}} + "term": {"playlist.keyword": {"value": self.youtube_id}} }, "script": { "source": script, "lang": "painless", - "params": {"playlist": self.playlist_id}, + "params": {"playlist": self.youtube_id}, }, } - payload = json.dumps(data) - url = f"{self.ES_URL}/ta_video/_update_by_query" - headers = 
{"Content-type": "application/json"} - response = requests.post( - url, data=payload, headers=headers, auth=self.ES_AUTH - ) - if not response.ok: - print(response.text) - - self.delete_playlist() + _, _ = ElasticWrap("ta_video/_update_by_query").post(data) + self.del_in_es() def delete_videos_playlist(self): """delete playlist with all videos""" - print(f"delete playlist {self.playlist_id} with all videos") - self.get_playlist_dict() + print(f"{self.youtube_id}: delete playlist") + self.get_from_es() all_youtube_id = [ i["youtube_id"] - for i in self.playlist_dict["playlist_entries"] + for i in self.json_data["playlist_entries"] if i["downloaded"] ] for youtube_id in all_youtube_id: YoutubeVideo(youtube_id).delete_media_file() - self.delete_playlist() - - def delete_playlist(self): - """delete only playlist document""" - url = f"{self.ES_URL}/ta_playlist/_doc/{self.playlist_id}" - response = requests.delete(url, auth=self.ES_AUTH) - if not response.ok: - print(response.text) - - def deactivate(self): - """deactivate document on extractor error""" - headers = {"Content-type": "application/json"} - url = f"{self.ES_URL}/ta_playlist/_update/{self.playlist_id}" - data = {"script": "ctx._source.playlist_active = false"} - json_str = json.dumps(data) - response = requests.post( - url, data=json_str, headers=headers, auth=self.ES_AUTH - ) - print(f"deactivated {self.playlist_id}") - if not response.ok: - print(response.text) + self.delete_metadata() class WatchState: @@ -1052,104 +798,12 @@ class WatchState: raise ValueError("failed mark playlist as watched") -class IndexPaginate: - """use search_after to go through whole index""" - - CONFIG = AppConfig().config - ES_URL = CONFIG["application"]["es_url"] - ES_AUTH = CONFIG["application"]["es_auth"] - HEADERS = {"Content-type": "application/json"} - DEFAULT_SIZE = 500 - - def __init__(self, index_name, data, size=False): - self.index_name = index_name - self.data = data - self.pit_id = False - self.size = size - - def get_results(self): - """get all results""" - self.get_pit() - self.validate_data() - all_results = self.run_loop() - self.clean_pit() - return all_results - - def get_pit(self): - """get pit for index""" - url = f"{self.ES_URL}/{self.index_name}/_pit?keep_alive=10m" - response = requests.post(url, auth=self.ES_AUTH) - json_data = json.loads(response.text) - - self.pit_id = json_data["id"] - - def validate_data(self): - """add pit and size to data""" - if "sort" not in self.data.keys(): - print(self.data) - raise ValueError("missing sort key in data") - - size = self.size or self.DEFAULT_SIZE - - self.data["size"] = size - self.data["pit"] = {"id": self.pit_id, "keep_alive": "10m"} - - def run_loop(self): - """loop through results until last hit""" - query_str = json.dumps(self.data) - url = self.ES_URL + "/_search" - - all_results = [] - while True: - response = requests.get( - url, data=query_str, headers=self.HEADERS, auth=self.ES_AUTH - ) - json_data = json.loads(response.text) - all_hits = json_data["hits"]["hits"] - if all_hits: - for hit in all_hits: - source = hit["_source"] - search_after = hit["sort"] - all_results.append(source) - # update search_after with last hit data - self.data["search_after"] = search_after - query_str = json.dumps(self.data) - else: - break - - return all_results - - def clean_pit(self): - """delete pit from elastic search""" - query_str = json.dumps({"id": self.pit_id}) - requests.delete( - self.ES_URL + "/_pit", - data=query_str, - headers=self.HEADERS, - auth=self.ES_AUTH, - ) - - -def 
index_new_video(youtube_id, missing_vid=False): - """combine video and channel classes for new video index""" - vid_handler = YoutubeVideo(youtube_id) - vid_handler.get_vid_dict() - if not vid_handler.vid_dict: +def index_new_video(youtube_id): + """combined classes to create new video in index""" + video = YoutubeVideo(youtube_id) + video.build_json() + if not video.json_data: raise ValueError("failed to get metadata for " + youtube_id) - channel_handler = YoutubeChannel(vid_handler.channel_id) - # add filepath to vid_dict - channel_name = channel_handler.channel_dict["channel_name"] - vid_handler.build_file_path(channel_name) - # add channel and player to video - vid_handler.add_player(missing_vid) - vid_handler.vid_dict["channel"] = channel_handler.channel_dict - # add new channel to es - if channel_handler.source == "scraped": - channel_handler.channel_dict["channel_subscribed"] = False - channel_handler.upload_to_es() - channel_handler.get_channel_art() - # upload video to es - vid_handler.upload_to_es() - # return vid_dict for further processing - return vid_handler.vid_dict + video.upload_to_es() + return video.json_data diff --git a/tubearchivist/home/src/reindex.py b/tubearchivist/home/src/reindex.py index ea9900a..bb93cae 100644 --- a/tubearchivist/home/src/reindex.py +++ b/tubearchivist/home/src/reindex.py @@ -189,87 +189,73 @@ class Reindex: all_channels = channel_sub_handler.get_channels(subscribed_only=False) all_channel_ids = [i["channel_id"] for i in all_channels] - counter = 1 for channel_id in all_channel_ids: - channel_index = YoutubeChannel(channel_id) - subscribed = channel_index.channel_dict["channel_subscribed"] - channel_index.channel_dict = channel_index.build_channel_dict( - scrape=True - ) - channel_index.channel_dict["channel_subscribed"] = subscribed - channel_index.upload_to_es() - channel_index.sync_to_videos() - counter = counter + 1 + channel = YoutubeChannel(channel_id) + subscribed = channel.json_data["channel_subscribed"] + channel.get_from_youtube() + channel.json_data["channel_subscribed"] = subscribed + channel.upload_to_es() + channel.sync_to_videos() + if sleep_interval: sleep(sleep_interval) @staticmethod def reindex_single_video(youtube_id): """refresh data for single video""" - vid_handler = YoutubeVideo(youtube_id) - vid_handler.get_vid_dict() - if not vid_handler.vid_dict: - # stop if deactivated - vid_handler.deactivate() - return + video = YoutubeVideo(youtube_id) - es_vid_dict = vid_handler.get_es_data() - player = es_vid_dict["_source"]["player"] - date_downloaded = es_vid_dict["_source"]["date_downloaded"] - channel_dict = es_vid_dict["_source"]["channel"] - channel_name = channel_dict["channel_name"] - try: - playlist = es_vid_dict["_source"]["playlist"] - except KeyError: - playlist = False + # read current state + video.get_from_es() + player = video.json_data["player"] + date_downloaded = video.json_data["date_downloaded"] + channel_dict = video.json_data["channel"] + playlist = video.json_data.get("playlist") - vid_handler.build_file_path(channel_name) - # add to vid_dict - vid_handler.vid_dict["player"] = player - vid_handler.vid_dict["date_downloaded"] = date_downloaded - vid_handler.vid_dict["channel"] = channel_dict + # get new + video.build_json() + if not video.json_data: + video.deactivate() + + # add back + video.json_data["player"] = player + video.json_data["date_downloaded"] = date_downloaded + video.json_data["channel"] = channel_dict if playlist: - vid_handler.vid_dict["playlist"] = playlist - # update - 
vid_handler.upload_to_es() + video.json_data["playlist"] = playlist + + video.upload_to_es() + thumb_handler = ThumbManager() thumb_handler.delete_vid_thumb(youtube_id) - to_download = (youtube_id, vid_handler.vid_dict["vid_thumb_url"]) + to_download = (youtube_id, video.json_data["vid_thumb_url"]) thumb_handler.download_vid([to_download], notify=False) @staticmethod def reindex_single_channel(channel_id): """refresh channel data and sync to videos""" - channel_handler = YoutubeChannel(channel_id) - subscribed = channel_handler.channel_dict["channel_subscribed"] - channel_handler.channel_dict = channel_handler.build_channel_dict( - scrape=True - ) - channel_handler.channel_dict["channel_subscribed"] = subscribed - # update - channel_handler.upload_to_es() - channel_handler.sync_to_videos() - thumb_handler = ThumbManager() - thumb_handler.delete_chan_thumb(channel_id) - channel_thumb = channel_handler.channel_dict["channel_thumb_url"] - channel_banner = channel_handler.channel_dict["channel_banner_url"] - to_download = (channel_id, channel_thumb, channel_banner) - thumb_handler.download_chan([to_download]) + channel = YoutubeChannel(channel_id) + channel.get_from_es() + subscribed = channel.json_data["channel_subscribed"] + channel.get_from_youtube() + channel.json_data["channel_subscribed"] = subscribed + channel.upload_to_es() + channel.sync_to_videos() @staticmethod def reindex_single_playlist(playlist_id, all_indexed_ids): """refresh playlist data""" - playlist_handler = YoutubePlaylist( - playlist_id, all_youtube_ids=all_indexed_ids - ) - playlist = playlist_handler.update_playlist() - if not playlist: - playlist_handler.deactivate() + playlist = YoutubePlaylist(playlist_id) + playlist.get_from_es() + subscribed = playlist.json_data["playlist_subscribed"] + playlist.all_youtube_ids = all_indexed_ids + playlist.build_json(scrape=True) + if not playlist.json_data: + playlist.deactivate() return - playlist_thumbnail = (playlist_id, playlist["playlist_thumbnail"]) - thumb_handler = ThumbManager() - thumb_handler.download_playlist([playlist_thumbnail]) + playlist.json_data["playlist_subscribed"] = subscribed + playlist.upload_to_es() return def reindex(self): @@ -586,7 +572,7 @@ def scan_filesystem(): print("index new videos") for missing_vid in filesystem_handler.to_index: youtube_id = missing_vid[2] - index_new_video(youtube_id, missing_vid=missing_vid) + index_new_video(youtube_id) def reindex_old_documents(): diff --git a/tubearchivist/home/tasks.py b/tubearchivist/home/tasks.py index ad43d67..995cf8f 100644 --- a/tubearchivist/home/tasks.py +++ b/tubearchivist/home/tasks.py @@ -266,17 +266,16 @@ def subscribe_to(url_str): @shared_task def index_channel_playlists(channel_id): """add all playlists of channel to index""" - channel_handler = YoutubeChannel(channel_id) - channel_name = channel_handler.channel_dict["channel_name"] + channel = YoutubeChannel(channel_id) # notify mess_dict = { "status": "message:playlistscan", "level": "info", "title": "Looking for playlists", - "message": f'Scanning channel "{channel_name}" in progress', + "message": f'Scanning channel "{channel.youtube_id}" in progress', } RedisArchivist().set_message("message:playlistscan", mess_dict) - all_playlists = channel_handler.get_all_playlists() + all_playlists = channel.get_all_playlists() if not all_playlists: print(f"no playlists found for channel {channel_id}") @@ -295,28 +294,29 @@ def index_channel_playlists(channel_id): } RedisArchivist().set_message("message:playlistscan", mess_dict) print("add playlist: " 
+ playlist_title) - playlist_handler = YoutubePlaylist( - playlist_id, all_youtube_ids=all_youtube_ids - ) - playlist_handler.get_playlist_dict() - if not playlist_handler.playlist_dict: + + playlist = YoutubePlaylist(playlist_id) + playlist.all_youtube_ids = all_youtube_ids + playlist.build_json() + + if not playlist.json_data: # skip if not available continue + # don't add if no videos downloaded downloaded = [ i - for i in playlist_handler.playlist_dict["playlist_entries"] + for i in playlist.json_data["playlist_entries"] if i["downloaded"] ] if not downloaded: continue - playlist_handler.upload_to_es() - playlist_handler.add_vids_to_playlist() + + playlist.upload_to_es() + playlist.add_vids_to_playlist() if all_playlists: - handler = ThumbManager() - missing_playlists = handler.get_missing_playlists() - handler.download_playlist(missing_playlists) + playlist.get_playlist_art() return diff --git a/tubearchivist/home/views.py b/tubearchivist/home/views.py index b3de6c4..687c0aa 100644 --- a/tubearchivist/home/views.py +++ b/tubearchivist/home/views.py @@ -624,11 +624,11 @@ class VideoView(View): """build playlist nav if available""" all_navs = [] for playlist_id in playlists: - handler = YoutubePlaylist(playlist_id) - handler.get_playlist_dict() - nav = handler.build_nav(video_id) - if nav: - all_navs.append(nav) + playlist = YoutubePlaylist(playlist_id) + playlist.get_from_es() + playlist.build_nav(video_id) + if playlist.nav: + all_navs.append(playlist.nav) return all_navs From c57d6c73cc86c286fae21f69b4f8f2b568b63616 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 22 Jan 2022 17:52:13 +0700 Subject: [PATCH 08/18] remove bandit, too many false positive --- .github/workflows/lint_python.yml | 1 - deploy.sh | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/lint_python.yml b/.github/workflows/lint_python.yml index d34cfdc..8e0dc7e 100644 --- a/.github/workflows/lint_python.yml +++ b/.github/workflows/lint_python.yml @@ -9,7 +9,6 @@ jobs: - run: pip install --upgrade pip wheel - run: pip install bandit black codespell flake8 flake8-bugbear flake8-comprehensions isort - - run: bandit --recursive --skip B105,B108,B404,B603,B607 . - run: black --check --diff --line-length 79 . - run: codespell - run: flake8 . 
--count --max-complexity=12 --max-line-length=79
diff --git a/deploy.sh b/deploy.sh
index 8e900e3..1af7865 100755
--- a/deploy.sh
+++ b/deploy.sh
@@ -85,9 +85,7 @@ function validate {
     fi
 
     echo "run validate on $check_path"
-
-    echo "running bandit"
-    bandit --recursive --skip B105,B108,B404,B603,B607 "$check_path"
+
     echo "running black"
     black --diff --color --check -l 79 "$check_path"
     echo "running codespell"

From f3efca7464bcb64573cc61bf6fed72b217b36b6c Mon Sep 17 00:00:00 2001
From: simon
Date: Sat, 22 Jan 2022 17:57:36 +0700
Subject: [PATCH 09/18] bump yt_dlp version

---
 tubearchivist/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tubearchivist/requirements.txt b/tubearchivist/requirements.txt
index c1085f7..d08678e 100644
--- a/tubearchivist/requirements.txt
+++ b/tubearchivist/requirements.txt
@@ -9,4 +9,4 @@ requests==2.27.1
 ryd-client==0.0.3
 uWSGI==2.0.20
 whitenoise==5.3.0
-yt_dlp==2021.12.27
+yt_dlp==2022.1.21

From 0fc0cc8e874646e61009ec515e5715c9886a8909 Mon Sep 17 00:00:00 2001
From: simon
Date: Sat, 22 Jan 2022 22:13:37 +0700
Subject: [PATCH 10/18] major refactor, split up modules

---
 tubearchivist/api/views.py                     |   6 +-
 tubearchivist/config/settings.py               |   2 +-
 tubearchivist/home/apps.py                     |   6 +-
 tubearchivist/home/src/download.py             | 754 ----------------
 tubearchivist/home/src/download/__init__.py    |   0
 tubearchivist/home/src/download/queue.py       | 259 ++++++
 .../home/src/download/subscriptions.py         | 210 +++++
 .../home/src/{ => download}/thumbnails.py      |  25 +-
 .../home/src/download/yt_dlp_handler.py        | 295 +++++++
 tubearchivist/home/src/es/__init__.py          |   0
 .../home/src/{es.py => es/connect.py}          |   2 +-
 tubearchivist/home/src/es/index_mapping.json   | 274 ++++++
 .../index_setup.py}                            | 298 ++-----
 tubearchivist/home/src/frontend/__init__.py    |   0
 .../{frontend.py => frontend/api_calls.py}     |  19 +-
 tubearchivist/home/{ => src/frontend}/forms.py |   0
 .../home/src/{ => frontend}/searching.py       |  67 +-
 tubearchivist/home/src/frontend/watched.py     | 125 +++
 tubearchivist/home/src/index.py                | 809 ------------------
 tubearchivist/home/src/index/__init__.py       |   0
 tubearchivist/home/src/index/channel.py        | 262 ++++++
 .../src/{reindex.py => index/filesystem.py}    | 275 +-----
 tubearchivist/home/src/index/generic.py        | 139 +++
 tubearchivist/home/src/index/playlist.py       | 201 +++++
 tubearchivist/home/src/index/reindex.py        | 267 ++++++
 tubearchivist/home/src/index/video.py          | 171 ++++
 tubearchivist/home/src/ta/__init__.py          |   0
 tubearchivist/home/src/{ => ta}/config.py      |   2 +-
 tubearchivist/home/src/{ => ta}/helper.py      | 149 ----
 tubearchivist/home/src/ta/ta_redis.py          | 154 ++++
 tubearchivist/home/tasks.py                    |  20 +-
 tubearchivist/home/views.py                    |  16 +-
 32 files changed, 2467 insertions(+), 2340 deletions(-)
 delete mode 100644 tubearchivist/home/src/download.py
 create mode 100644 tubearchivist/home/src/download/__init__.py
 create mode 100644 tubearchivist/home/src/download/queue.py
 create mode 100644 tubearchivist/home/src/download/subscriptions.py
 rename tubearchivist/home/src/{ => download}/thumbnails.py (94%)
 create mode 100644 tubearchivist/home/src/download/yt_dlp_handler.py
 create mode 100644 tubearchivist/home/src/es/__init__.py
 rename tubearchivist/home/src/{es.py => es/connect.py} (99%)
 create mode 100644 tubearchivist/home/src/es/index_mapping.json
 rename tubearchivist/home/src/{index_management.py => es/index_setup.py} (66%)
 create mode 100644 tubearchivist/home/src/frontend/__init__.py
 rename tubearchivist/home/src/{frontend.py => frontend/api_calls.py} (96%)
 rename tubearchivist/home/{ => src/frontend}/forms.py (100%)
 rename tubearchivist/home/src/{ => frontend}/searching.py (77%)
 create mode 100644 tubearchivist/home/src/frontend/watched.py
 delete mode 100644 tubearchivist/home/src/index.py
 create mode 100644 tubearchivist/home/src/index/__init__.py
 create mode 100644 tubearchivist/home/src/index/channel.py
 rename tubearchivist/home/src/{reindex.py => index/filesystem.py} (52%)
 create mode 100644 tubearchivist/home/src/index/generic.py
 create mode 100644 tubearchivist/home/src/index/playlist.py
 create mode 100644 tubearchivist/home/src/index/reindex.py
 create mode 100644 tubearchivist/home/src/index/video.py
 create mode 100644 tubearchivist/home/src/ta/__init__.py
 rename tubearchivist/home/src/{ => ta}/config.py (99%)
 rename tubearchivist/home/src/{ => ta}/helper.py (57%)
 create mode 100644 tubearchivist/home/src/ta/ta_redis.py

diff --git a/tubearchivist/api/views.py b/tubearchivist/api/views.py
index 165ca60..c4bcb08 100644
--- a/tubearchivist/api/views.py
+++ b/tubearchivist/api/views.py
@@ -1,9 +1,9 @@
 """all API views"""
 
 import requests
-from home.src.config import AppConfig
-from home.src.helper import UrlListParser
-from home.src.thumbnails import ThumbManager
+from home.src.ta.config import AppConfig
+from home.src.ta.helper import UrlListParser
+from home.src.download.thumbnails import ThumbManager
 from home.tasks import extrac_dl, subscribe_to
 from rest_framework.authentication import (
     SessionAuthentication,
diff --git a/tubearchivist/config/settings.py b/tubearchivist/config/settings.py
index c1eb432..5447788 100644
--- a/tubearchivist/config/settings.py
+++ b/tubearchivist/config/settings.py
@@ -15,7 +15,7 @@ from os import environ, path
 from pathlib import Path
 
 from corsheaders.defaults import default_headers
-from home.src.config import AppConfig
+from home.src.ta.config import AppConfig
 
 # Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent diff --git a/tubearchivist/home/apps.py b/tubearchivist/home/apps.py index 0bdf819..f58193f 100644 --- a/tubearchivist/home/apps.py +++ b/tubearchivist/home/apps.py @@ -3,9 +3,9 @@ import os from django.apps import AppConfig -from home.src.config import AppConfig as ArchivistConfig -from home.src.helper import RedisArchivist -from home.src.index_management import index_check +from home.src.es.index_setup import index_check +from home.src.ta.config import AppConfig as ArchivistConfig +from home.src.ta.ta_redis import RedisArchivist def sync_redis_state(): diff --git a/tubearchivist/home/src/download.py b/tubearchivist/home/src/download.py deleted file mode 100644 index 0dbd0be..0000000 --- a/tubearchivist/home/src/download.py +++ /dev/null @@ -1,754 +0,0 @@ -""" -Functionality: -- handele the download queue -- manage subscriptions to channels -- manage subscriptions to playlists -- downloading videos -""" - -import json -import os -import shutil -from datetime import datetime -from time import sleep - -import requests -import yt_dlp -from home.src.config import AppConfig -from home.src.es import IndexPaginate -from home.src.helper import ( - DurationConverter, - RedisArchivist, - RedisQueue, - clean_string, - ignore_filelist, -) -from home.src.index import ( - YoutubeChannel, - YoutubePlaylist, - YoutubeVideo, - index_new_video, -) - - -class PendingList: - """manage the pending videos list""" - - CONFIG = AppConfig().config - ES_URL = CONFIG["application"]["es_url"] - ES_AUTH = CONFIG["application"]["es_auth"] - VIDEOS = CONFIG["application"]["videos"] - - def __init__(self): - self.all_channel_ids = False - self.all_downloaded = False - self.missing_from_playlists = [] - - def parse_url_list(self, youtube_ids): - """extract youtube ids from list""" - missing_videos = [] - for entry in youtube_ids: - # notify - mess_dict = { - "status": "message:add", - "level": "info", - "title": "Adding to download queue.", - "message": "Extracting lists", - } - RedisArchivist().set_message("message:add", mess_dict) - # extract - url = entry["url"] - url_type = entry["type"] - if url_type == "video": - missing_videos.append(url) - elif url_type == "channel": - video_results = ChannelSubscription().get_last_youtube_videos( - url, limit=False - ) - youtube_ids = [i[0] for i in video_results] - missing_videos = missing_videos + youtube_ids - elif url_type == "playlist": - self.missing_from_playlists.append(entry) - playlist = YoutubePlaylist(url) - playlist.build_json() - video_results = playlist.json_data.get("playlist_entries") - youtube_ids = [i["youtube_id"] for i in video_results] - missing_videos = missing_videos + youtube_ids - - return missing_videos - - def add_to_pending(self, missing_videos, ignore=False): - """build the bulk json data from pending""" - # check if channel is indexed - channel_handler = ChannelSubscription() - all_indexed = channel_handler.get_channels(subscribed_only=False) - self.all_channel_ids = [i["channel_id"] for i in all_indexed] - # check if already there - self.all_downloaded = self.get_all_downloaded() - - bulk_list, all_videos_added = self.build_bulk(missing_videos, ignore) - # add last newline - bulk_list.append("\n") - query_str = "\n".join(bulk_list) - headers = {"Content-type": "application/x-ndjson"} - url = self.ES_URL + "/_bulk" - request = requests.post( - url, data=query_str, headers=headers, auth=self.ES_AUTH - ) - if not request.ok: - print(request) - raise ValueError("failed to add video to download queue") 
- - return all_videos_added - - def build_bulk(self, missing_videos, ignore=False): - """build the bulk lists""" - bulk_list = [] - all_videos_added = [] - - for idx, youtube_id in enumerate(missing_videos): - # check if already downloaded - if youtube_id in self.all_downloaded: - continue - - video = self.get_youtube_details(youtube_id) - # skip on download error - if not video: - continue - - channel_indexed = video["channel_id"] in self.all_channel_ids - video["channel_indexed"] = channel_indexed - if ignore: - video["status"] = "ignore" - else: - video["status"] = "pending" - action = {"create": {"_id": youtube_id, "_index": "ta_download"}} - bulk_list.append(json.dumps(action)) - bulk_list.append(json.dumps(video)) - all_videos_added.append((youtube_id, video["vid_thumb_url"])) - # notify - progress = f"{idx + 1}/{len(missing_videos)}" - mess_dict = { - "status": "message:add", - "level": "info", - "title": "Adding new videos to download queue.", - "message": "Progress: " + progress, - } - if idx + 1 == len(missing_videos): - RedisArchivist().set_message( - "message:add", mess_dict, expire=4 - ) - else: - RedisArchivist().set_message("message:add", mess_dict) - if idx + 1 % 25 == 0: - print("adding to queue progress: " + progress) - - return bulk_list, all_videos_added - - @staticmethod - def get_youtube_details(youtube_id): - """get details from youtubedl for single pending video""" - obs = { - "default_search": "ytsearch", - "quiet": True, - "check_formats": "selected", - "noplaylist": True, - "writethumbnail": True, - "simulate": True, - } - try: - vid = yt_dlp.YoutubeDL(obs).extract_info(youtube_id) - except yt_dlp.utils.DownloadError: - print("failed to extract info for: " + youtube_id) - return False - # stop if video is streaming live now - if vid["is_live"]: - return False - # parse response - seconds = vid["duration"] - duration_str = DurationConverter.get_str(seconds) - if duration_str == "NA": - print(f"skip extracting duration for: {youtube_id}") - upload_date = vid["upload_date"] - upload_dt = datetime.strptime(upload_date, "%Y%m%d") - published = upload_dt.strftime("%Y-%m-%d") - # build dict - youtube_details = { - "youtube_id": youtube_id, - "channel_name": vid["channel"], - "vid_thumb_url": vid["thumbnail"], - "title": vid["title"], - "channel_id": vid["channel_id"], - "duration": duration_str, - "published": published, - "timestamp": int(datetime.now().strftime("%s")), - } - return youtube_details - - @staticmethod - def get_all_pending(): - """get a list of all pending videos in ta_download""" - data = { - "query": {"match_all": {}}, - "sort": [{"timestamp": {"order": "asc"}}], - } - all_results = IndexPaginate("ta_download", data).get_results() - - all_pending = [] - all_ignore = [] - - for result in all_results: - if result["status"] == "pending": - all_pending.append(result) - elif result["status"] == "ignore": - all_ignore.append(result) - - return all_pending, all_ignore - - @staticmethod - def get_all_indexed(): - """get a list of all videos indexed""" - - data = { - "query": {"match_all": {}}, - "sort": [{"published": {"order": "desc"}}], - } - all_indexed = IndexPaginate("ta_video", data).get_results() - - return all_indexed - - def get_all_downloaded(self): - """get a list of all videos in archive""" - channel_folders = os.listdir(self.VIDEOS) - all_channel_folders = ignore_filelist(channel_folders) - all_downloaded = [] - for channel_folder in all_channel_folders: - channel_path = os.path.join(self.VIDEOS, channel_folder) - videos = os.listdir(channel_path) 
- all_videos = ignore_filelist(videos) - youtube_vids = [i[9:20] for i in all_videos] - for youtube_id in youtube_vids: - all_downloaded.append(youtube_id) - return all_downloaded - - def delete_from_pending(self, youtube_id): - """delete the youtube_id from ta_download""" - url = f"{self.ES_URL}/ta_download/_doc/{youtube_id}" - response = requests.delete(url, auth=self.ES_AUTH) - if not response.ok: - print(response.text) - - def delete_pending(self, status): - """delete download queue based on status value""" - data = {"query": {"term": {"status": {"value": status}}}} - payload = json.dumps(data) - url = self.ES_URL + "/ta_download/_delete_by_query" - headers = {"Content-type": "application/json"} - response = requests.post( - url, data=payload, headers=headers, auth=self.ES_AUTH - ) - if not response.ok: - print(response.text) - - def ignore_from_pending(self, ignore_list): - """build the bulk query string""" - - stamp = int(datetime.now().strftime("%s")) - bulk_list = [] - - for youtube_id in ignore_list: - action = {"update": {"_id": youtube_id, "_index": "ta_download"}} - source = {"doc": {"status": "ignore", "timestamp": stamp}} - bulk_list.append(json.dumps(action)) - bulk_list.append(json.dumps(source)) - - # add last newline - bulk_list.append("\n") - query_str = "\n".join(bulk_list) - - headers = {"Content-type": "application/x-ndjson"} - url = self.ES_URL + "/_bulk" - request = requests.post( - url, data=query_str, headers=headers, auth=self.ES_AUTH - ) - if not request.ok: - print(request) - raise ValueError("failed to set video to ignore") - - -class ChannelSubscription: - """manage the list of channels subscribed""" - - def __init__(self): - config = AppConfig().config - self.es_url = config["application"]["es_url"] - self.es_auth = config["application"]["es_auth"] - self.channel_size = config["subscriptions"]["channel_size"] - - @staticmethod - def get_channels(subscribed_only=True): - """get a list of all channels subscribed to""" - data = { - "sort": [{"channel_name.keyword": {"order": "asc"}}], - } - if subscribed_only: - data["query"] = {"term": {"channel_subscribed": {"value": True}}} - else: - data["query"] = {"match_all": {}} - - all_channels = IndexPaginate("ta_channel", data).get_results() - - return all_channels - - def get_last_youtube_videos(self, channel_id, limit=True): - """get a list of last videos from channel""" - url = f"https://www.youtube.com/channel/{channel_id}/videos" - obs = { - "default_search": "ytsearch", - "quiet": True, - "skip_download": True, - "extract_flat": True, - } - if limit: - obs["playlistend"] = self.channel_size - chan = yt_dlp.YoutubeDL(obs).extract_info(url, download=False) - last_videos = [(i["id"], i["title"]) for i in chan["entries"]] - return last_videos - - def find_missing(self): - """add missing videos from subscribed channels to pending""" - all_channels = self.get_channels() - pending_handler = PendingList() - all_pending, all_ignore = pending_handler.get_all_pending() - all_ids = [i["youtube_id"] for i in all_ignore + all_pending] - all_downloaded = pending_handler.get_all_downloaded() - to_ignore = all_ids + all_downloaded - - missing_videos = [] - - for idx, channel in enumerate(all_channels): - channel_id = channel["channel_id"] - last_videos = self.get_last_youtube_videos(channel_id) - for video in last_videos: - if video[0] not in to_ignore: - missing_videos.append(video[0]) - # notify - message = { - "status": "message:rescan", - "level": "info", - "title": "Scanning channels: Looking for new videos.", - "message": 
f"Progress: {idx + 1}/{len(all_channels)}", - } - if idx + 1 == len(all_channels): - RedisArchivist().set_message( - "message:rescan", message=message, expire=4 - ) - else: - RedisArchivist().set_message("message:rescan", message=message) - - return missing_videos - - @staticmethod - def change_subscribe(channel_id, channel_subscribed): - """subscribe or unsubscribe from channel and update""" - channel = YoutubeChannel(channel_id) - channel.build_json() - channel.json_data["channel_subscribed"] = channel_subscribed - channel.upload_to_es() - channel.sync_to_videos() - - -class PlaylistSubscription: - """manage the playlist download functionality""" - - def __init__(self): - self.config = AppConfig().config - - @staticmethod - def get_playlists(subscribed_only=True): - """get a list of all active playlists""" - data = { - "sort": [{"playlist_channel.keyword": {"order": "desc"}}], - } - data["query"] = { - "bool": {"must": [{"term": {"playlist_active": {"value": True}}}]} - } - if subscribed_only: - data["query"]["bool"]["must"].append( - {"term": {"playlist_subscribed": {"value": True}}} - ) - - all_playlists = IndexPaginate("ta_playlist", data).get_results() - - return all_playlists - - def process_url_str(self, new_playlists, subscribed=True): - """process playlist subscribe form url_str""" - all_indexed = PendingList().get_all_indexed() - all_youtube_ids = [i["youtube_id"] for i in all_indexed] - - new_thumbs = [] - for idx, playlist in enumerate(new_playlists): - url_type = playlist["type"] - playlist_id = playlist["url"] - if not url_type == "playlist": - print(f"{playlist_id} not a playlist, skipping...") - continue - - playlist_h = YoutubePlaylist(playlist_id) - playlist_h.all_youtube_ids = all_youtube_ids - playlist_h.build_json() - playlist_h.json_data["playlist_subscribed"] = subscribed - playlist_h.upload_to_es() - playlist_h.add_vids_to_playlist() - self.channel_validate(playlist_h.json_data["playlist_channel_id"]) - thumb = playlist_h.json_data["playlist_thumbnail"] - new_thumbs.append((playlist_id, thumb)) - # notify - message = { - "status": "message:subplaylist", - "level": "info", - "title": "Subscribing to Playlists", - "message": f"Processing {idx + 1} of {len(new_playlists)}", - } - RedisArchivist().set_message( - "message:subplaylist", message=message - ) - - return new_thumbs - - @staticmethod - def channel_validate(channel_id): - """make sure channel of playlist is there""" - channel = YoutubeChannel(channel_id) - channel.build_json() - - @staticmethod - def change_subscribe(playlist_id, subscribe_status): - """change the subscribe status of a playlist""" - playlist = YoutubePlaylist(playlist_id) - playlist.build_json() - playlist.json_data["playlist_subscribed"] = subscribe_status - playlist.upload_to_es() - - @staticmethod - def get_to_ignore(): - """get all youtube_ids already downloaded or ignored""" - pending_handler = PendingList() - all_pending, all_ignore = pending_handler.get_all_pending() - all_ids = [i["youtube_id"] for i in all_ignore + all_pending] - all_downloaded = pending_handler.get_all_downloaded() - to_ignore = all_ids + all_downloaded - return to_ignore - - def find_missing(self): - """find videos in subscribed playlists not downloaded yet""" - all_playlists = [i["playlist_id"] for i in self.get_playlists()] - to_ignore = self.get_to_ignore() - - missing_videos = [] - for idx, playlist_id in enumerate(all_playlists): - size_limit = self.config["subscriptions"]["channel_size"] - playlist = YoutubePlaylist(playlist_id) - playlist.update_playlist() - 
if not playlist: - playlist.deactivate() - continue - - playlist_entries = playlist.json_data["playlist_entries"] - if size_limit: - del playlist_entries[size_limit:] - - all_missing = [i for i in playlist_entries if not i["downloaded"]] - - message = { - "status": "message:rescan", - "level": "info", - "title": "Scanning playlists: Looking for new videos.", - "message": f"Progress: {idx + 1}/{len(all_playlists)}", - } - RedisArchivist().set_message("message:rescan", message=message) - - for video in all_missing: - youtube_id = video["youtube_id"] - if youtube_id not in to_ignore: - missing_videos.append(youtube_id) - - return missing_videos - - -class VideoDownloader: - """ - handle the video download functionality - if not initiated with list, take from queue - """ - - def __init__(self, youtube_id_list=False): - self.youtube_id_list = youtube_id_list - self.config = AppConfig().config - self.channels = set() - - def run_queue(self): - """setup download queue in redis loop until no more items""" - queue = RedisQueue("dl_queue") - - limit_queue = self.config["downloads"]["limit_count"] - if limit_queue: - queue.trim(limit_queue - 1) - - while True: - youtube_id = queue.get_next() - if not youtube_id: - break - - try: - self.dl_single_vid(youtube_id) - except yt_dlp.utils.DownloadError: - print("failed to download " + youtube_id) - continue - vid_dict = index_new_video(youtube_id) - self.channels.add(vid_dict["channel"]["channel_id"]) - self.move_to_archive(vid_dict) - self.delete_from_pending(youtube_id) - - autodelete_days = self.config["downloads"]["autodelete_days"] - if autodelete_days: - print(f"auto delete older than {autodelete_days} days") - self.auto_delete_watched(autodelete_days) - - @staticmethod - def add_pending(): - """add pending videos to download queue""" - mess_dict = { - "status": "message:download", - "level": "info", - "title": "Looking for videos to download", - "message": "Scanning your download queue.", - } - RedisArchivist().set_message("message:download", mess_dict) - all_pending, _ = PendingList().get_all_pending() - to_add = [i["youtube_id"] for i in all_pending] - if not to_add: - # there is nothing pending - print("download queue is empty") - mess_dict = { - "status": "message:download", - "level": "error", - "title": "Download queue is empty", - "message": "Add some videos to the queue first.", - } - RedisArchivist().set_message("message:download", mess_dict) - return - - queue = RedisQueue("dl_queue") - queue.add_list(to_add) - - @staticmethod - def progress_hook(response): - """process the progress_hooks from yt_dlp""" - # title - path = os.path.split(response["filename"])[-1][12:] - filename = os.path.splitext(os.path.splitext(path)[0])[0] - filename_clean = filename.replace("_", " ") - title = "Downloading: " + filename_clean - # message - try: - percent = response["_percent_str"] - size = response["_total_bytes_str"] - speed = response["_speed_str"] - eta = response["_eta_str"] - message = f"{percent} of {size} at {speed} - time left: {eta}" - except KeyError: - message = "processing" - mess_dict = { - "status": "message:download", - "level": "info", - "title": title, - "message": message, - } - RedisArchivist().set_message("message:download", mess_dict) - - def build_obs(self): - """build obs dictionary for yt-dlp""" - obs = { - "default_search": "ytsearch", - "merge_output_format": "mp4", - "restrictfilenames": True, - "outtmpl": ( - self.config["application"]["cache_dir"] - + "/download/" - + self.config["application"]["file_template"] - ), - 
"progress_hooks": [self.progress_hook], - "noprogress": True, - "quiet": True, - "continuedl": True, - "retries": 3, - "writethumbnail": False, - "noplaylist": True, - "check_formats": "selected", - } - if self.config["downloads"]["format"]: - obs["format"] = self.config["downloads"]["format"] - if self.config["downloads"]["limit_speed"]: - obs["ratelimit"] = self.config["downloads"]["limit_speed"] * 1024 - - throttle = self.config["downloads"]["throttledratelimit"] - if throttle: - obs["throttledratelimit"] = throttle * 1024 - - postprocessors = [] - - if self.config["downloads"]["add_metadata"]: - postprocessors.append( - { - "key": "FFmpegMetadata", - "add_chapters": True, - "add_metadata": True, - } - ) - - if self.config["downloads"]["add_thumbnail"]: - postprocessors.append( - { - "key": "EmbedThumbnail", - "already_have_thumbnail": True, - } - ) - obs["writethumbnail"] = True - - obs["postprocessors"] = postprocessors - - return obs - - def dl_single_vid(self, youtube_id): - """download single video""" - dl_cache = self.config["application"]["cache_dir"] + "/download/" - obs = self.build_obs() - - # check if already in cache to continue from there - all_cached = ignore_filelist(os.listdir(dl_cache)) - for file_name in all_cached: - if youtube_id in file_name: - obs["outtmpl"] = os.path.join(dl_cache, file_name) - with yt_dlp.YoutubeDL(obs) as ydl: - try: - ydl.download([youtube_id]) - except yt_dlp.utils.DownloadError: - print("retry failed download: " + youtube_id) - sleep(10) - ydl.download([youtube_id]) - - if obs["writethumbnail"]: - # webp files don't get cleaned up automatically - all_cached = ignore_filelist(os.listdir(dl_cache)) - to_clean = [i for i in all_cached if not i.endswith(".mp4")] - for file_name in to_clean: - file_path = os.path.join(dl_cache, file_name) - os.remove(file_path) - - def move_to_archive(self, vid_dict): - """move downloaded video from cache to archive""" - videos = self.config["application"]["videos"] - host_uid = self.config["application"]["HOST_UID"] - host_gid = self.config["application"]["HOST_GID"] - channel_name = clean_string(vid_dict["channel"]["channel_name"]) - # make archive folder with correct permissions - new_folder = os.path.join(videos, channel_name) - if not os.path.exists(new_folder): - os.makedirs(new_folder) - if host_uid and host_gid: - os.chown(new_folder, host_uid, host_gid) - # find real filename - cache_dir = self.config["application"]["cache_dir"] - all_cached = ignore_filelist(os.listdir(cache_dir + "/download/")) - for file_str in all_cached: - if vid_dict["youtube_id"] in file_str: - old_file = file_str - old_file_path = os.path.join(cache_dir, "download", old_file) - new_file_path = os.path.join(videos, vid_dict["media_url"]) - # move media file and fix permission - shutil.move(old_file_path, new_file_path) - if host_uid and host_gid: - os.chown(new_file_path, host_uid, host_gid) - - def delete_from_pending(self, youtube_id): - """delete downloaded video from pending index if its there""" - es_url = self.config["application"]["es_url"] - es_auth = self.config["application"]["es_auth"] - url = f"{es_url}/ta_download/_doc/{youtube_id}" - response = requests.delete(url, auth=es_auth) - if not response.ok and not response.status_code == 404: - print(response.text) - - def add_subscribed_channels(self): - """add all channels subscribed to refresh""" - all_subscribed = PlaylistSubscription().get_playlists() - if not all_subscribed: - return - - channel_ids = [i["playlist_channel_id"] for i in all_subscribed] - for channel_id 
in channel_ids: - self.channels.add(channel_id) - - return - - def validate_playlists(self): - """look for playlist needing to update""" - print("sync playlists") - self.add_subscribed_channels() - all_indexed = PendingList().get_all_indexed() - all_youtube_ids = [i["youtube_id"] for i in all_indexed] - for id_c, channel_id in enumerate(self.channels): - playlists = YoutubeChannel(channel_id).get_indexed_playlists() - all_playlist_ids = [i["playlist_id"] for i in playlists] - for id_p, playlist_id in enumerate(all_playlist_ids): - playlist = YoutubePlaylist(playlist_id) - playlist.all_youtube_ids = all_youtube_ids - playlist.build_json(scrape=True) - if not playlist.json_data: - playlist.deactivate() - - playlist.add_vids_to_playlist() - playlist.upload_to_es() - - # notify - title = ( - "Processing playlists for channels: " - + f"{id_c + 1}/{len(self.channels)}" - ) - message = f"Progress: {id_p + 1}/{len(all_playlist_ids)}" - mess_dict = { - "status": "message:download", - "level": "info", - "title": title, - "message": message, - } - if id_p + 1 == len(all_playlist_ids): - RedisArchivist().set_message( - "message:download", mess_dict, expire=4 - ) - else: - RedisArchivist().set_message("message:download", mess_dict) - - @staticmethod - def auto_delete_watched(autodelete_days): - """delete watched videos after x days""" - now = int(datetime.now().strftime("%s")) - now_lte = now - autodelete_days * 24 * 60 * 60 - data = { - "query": {"range": {"player.watched_date": {"lte": now_lte}}}, - "sort": [{"player.watched_date": {"order": "asc"}}], - } - all_to_delete = IndexPaginate("ta_video", data).get_results() - all_youtube_ids = [i["youtube_id"] for i in all_to_delete] - if not all_youtube_ids: - return - - for youtube_id in all_youtube_ids: - print(f"autodelete {youtube_id}") - YoutubeVideo(youtube_id).delete_media_file() - - print("add deleted to ignore list") - pending_handler = PendingList() - pending_handler.add_to_pending(all_youtube_ids, ignore=True) diff --git a/tubearchivist/home/src/download/__init__.py b/tubearchivist/home/src/download/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tubearchivist/home/src/download/queue.py b/tubearchivist/home/src/download/queue.py new file mode 100644 index 0000000..c8748e6 --- /dev/null +++ b/tubearchivist/home/src/download/queue.py @@ -0,0 +1,259 @@ +"""handle download queue""" + +import json +import os +from datetime import datetime + +import requests +import yt_dlp +from home.src.download.subscriptions import ChannelSubscription +from home.src.es.connect import IndexPaginate +from home.src.index.playlist import YoutubePlaylist +from home.src.ta.config import AppConfig +from home.src.ta.helper import DurationConverter, ignore_filelist +from home.src.ta.ta_redis import RedisArchivist + + +class PendingList: + """manage the pending videos list""" + + CONFIG = AppConfig().config + ES_URL = CONFIG["application"]["es_url"] + ES_AUTH = CONFIG["application"]["es_auth"] + VIDEOS = CONFIG["application"]["videos"] + + def __init__(self): + self.all_channel_ids = False + self.all_downloaded = False + self.missing_from_playlists = [] + + def parse_url_list(self, youtube_ids): + """extract youtube ids from list""" + missing_videos = [] + for entry in youtube_ids: + # notify + mess_dict = { + "status": "message:add", + "level": "info", + "title": "Adding to download queue.", + "message": "Extracting lists", + } + RedisArchivist().set_message("message:add", mess_dict) + # extract + url = entry["url"] + url_type = entry["type"] + if 
url_type == "video": + missing_videos.append(url) + elif url_type == "channel": + video_results = ChannelSubscription().get_last_youtube_videos( + url, limit=False + ) + youtube_ids = [i[0] for i in video_results] + missing_videos = missing_videos + youtube_ids + elif url_type == "playlist": + self.missing_from_playlists.append(entry) + playlist = YoutubePlaylist(url) + playlist.build_json() + video_results = playlist.json_data.get("playlist_entries") + youtube_ids = [i["youtube_id"] for i in video_results] + missing_videos = missing_videos + youtube_ids + + return missing_videos + + def add_to_pending(self, missing_videos, ignore=False): + """build the bulk json data from pending""" + # check if channel is indexed + channel_handler = ChannelSubscription() + all_indexed = channel_handler.get_channels(subscribed_only=False) + self.all_channel_ids = [i["channel_id"] for i in all_indexed] + # check if already there + self.all_downloaded = self.get_all_downloaded() + + bulk_list, all_videos_added = self.build_bulk(missing_videos, ignore) + # add last newline + bulk_list.append("\n") + query_str = "\n".join(bulk_list) + headers = {"Content-type": "application/x-ndjson"} + url = self.ES_URL + "/_bulk" + request = requests.post( + url, data=query_str, headers=headers, auth=self.ES_AUTH + ) + if not request.ok: + print(request) + raise ValueError("failed to add video to download queue") + + return all_videos_added + + def build_bulk(self, missing_videos, ignore=False): + """build the bulk lists""" + bulk_list = [] + all_videos_added = [] + + for idx, youtube_id in enumerate(missing_videos): + # check if already downloaded + if youtube_id in self.all_downloaded: + continue + + video = self.get_youtube_details(youtube_id) + # skip on download error + if not video: + continue + + channel_indexed = video["channel_id"] in self.all_channel_ids + video["channel_indexed"] = channel_indexed + if ignore: + video["status"] = "ignore" + else: + video["status"] = "pending" + action = {"create": {"_id": youtube_id, "_index": "ta_download"}} + bulk_list.append(json.dumps(action)) + bulk_list.append(json.dumps(video)) + all_videos_added.append((youtube_id, video["vid_thumb_url"])) + # notify + progress = f"{idx + 1}/{len(missing_videos)}" + mess_dict = { + "status": "message:add", + "level": "info", + "title": "Adding new videos to download queue.", + "message": "Progress: " + progress, + } + if idx + 1 == len(missing_videos): + RedisArchivist().set_message( + "message:add", mess_dict, expire=4 + ) + else: + RedisArchivist().set_message("message:add", mess_dict) + if idx + 1 % 25 == 0: + print("adding to queue progress: " + progress) + + return bulk_list, all_videos_added + + @staticmethod + def get_youtube_details(youtube_id): + """get details from youtubedl for single pending video""" + obs = { + "default_search": "ytsearch", + "quiet": True, + "check_formats": "selected", + "noplaylist": True, + "writethumbnail": True, + "simulate": True, + } + try: + vid = yt_dlp.YoutubeDL(obs).extract_info(youtube_id) + except yt_dlp.utils.DownloadError: + print("failed to extract info for: " + youtube_id) + return False + # stop if video is streaming live now + if vid["is_live"]: + return False + # parse response + seconds = vid["duration"] + duration_str = DurationConverter.get_str(seconds) + if duration_str == "NA": + print(f"skip extracting duration for: {youtube_id}") + upload_date = vid["upload_date"] + upload_dt = datetime.strptime(upload_date, "%Y%m%d") + published = upload_dt.strftime("%Y-%m-%d") + # build dict + 
youtube_details = { + "youtube_id": youtube_id, + "channel_name": vid["channel"], + "vid_thumb_url": vid["thumbnail"], + "title": vid["title"], + "channel_id": vid["channel_id"], + "duration": duration_str, + "published": published, + "timestamp": int(datetime.now().strftime("%s")), + } + return youtube_details + + @staticmethod + def get_all_pending(): + """get a list of all pending videos in ta_download""" + data = { + "query": {"match_all": {}}, + "sort": [{"timestamp": {"order": "asc"}}], + } + all_results = IndexPaginate("ta_download", data).get_results() + + all_pending = [] + all_ignore = [] + + for result in all_results: + if result["status"] == "pending": + all_pending.append(result) + elif result["status"] == "ignore": + all_ignore.append(result) + + return all_pending, all_ignore + + @staticmethod + def get_all_indexed(): + """get a list of all videos indexed""" + + data = { + "query": {"match_all": {}}, + "sort": [{"published": {"order": "desc"}}], + } + all_indexed = IndexPaginate("ta_video", data).get_results() + + return all_indexed + + def get_all_downloaded(self): + """get a list of all videos in archive""" + channel_folders = os.listdir(self.VIDEOS) + all_channel_folders = ignore_filelist(channel_folders) + all_downloaded = [] + for channel_folder in all_channel_folders: + channel_path = os.path.join(self.VIDEOS, channel_folder) + videos = os.listdir(channel_path) + all_videos = ignore_filelist(videos) + youtube_vids = [i[9:20] for i in all_videos] + for youtube_id in youtube_vids: + all_downloaded.append(youtube_id) + return all_downloaded + + def delete_from_pending(self, youtube_id): + """delete the youtube_id from ta_download""" + url = f"{self.ES_URL}/ta_download/_doc/{youtube_id}" + response = requests.delete(url, auth=self.ES_AUTH) + if not response.ok: + print(response.text) + + def delete_pending(self, status): + """delete download queue based on status value""" + data = {"query": {"term": {"status": {"value": status}}}} + payload = json.dumps(data) + url = self.ES_URL + "/ta_download/_delete_by_query" + headers = {"Content-type": "application/json"} + response = requests.post( + url, data=payload, headers=headers, auth=self.ES_AUTH + ) + if not response.ok: + print(response.text) + + def ignore_from_pending(self, ignore_list): + """build the bulk query string""" + + stamp = int(datetime.now().strftime("%s")) + bulk_list = [] + + for youtube_id in ignore_list: + action = {"update": {"_id": youtube_id, "_index": "ta_download"}} + source = {"doc": {"status": "ignore", "timestamp": stamp}} + bulk_list.append(json.dumps(action)) + bulk_list.append(json.dumps(source)) + + # add last newline + bulk_list.append("\n") + query_str = "\n".join(bulk_list) + + headers = {"Content-type": "application/x-ndjson"} + url = self.ES_URL + "/_bulk" + request = requests.post( + url, data=query_str, headers=headers, auth=self.ES_AUTH + ) + if not request.ok: + print(request) + raise ValueError("failed to set video to ignore") diff --git a/tubearchivist/home/src/download/subscriptions.py b/tubearchivist/home/src/download/subscriptions.py new file mode 100644 index 0000000..d610137 --- /dev/null +++ b/tubearchivist/home/src/download/subscriptions.py @@ -0,0 +1,210 @@ +"""handle subscriptions""" + +import yt_dlp +from home.src.download import queue # partial import +from home.src.es.connect import IndexPaginate +from home.src.index.channel import YoutubeChannel +from home.src.index.playlist import YoutubePlaylist +from home.src.ta.config import AppConfig +from home.src.ta.ta_redis import 
RedisArchivist + + +class ChannelSubscription: + """manage the list of channels subscribed""" + + def __init__(self): + config = AppConfig().config + self.es_url = config["application"]["es_url"] + self.es_auth = config["application"]["es_auth"] + self.channel_size = config["subscriptions"]["channel_size"] + + @staticmethod + def get_channels(subscribed_only=True): + """get a list of all channels subscribed to""" + data = { + "sort": [{"channel_name.keyword": {"order": "asc"}}], + } + if subscribed_only: + data["query"] = {"term": {"channel_subscribed": {"value": True}}} + else: + data["query"] = {"match_all": {}} + + all_channels = IndexPaginate("ta_channel", data).get_results() + + return all_channels + + def get_last_youtube_videos(self, channel_id, limit=True): + """get a list of last videos from channel""" + url = f"https://www.youtube.com/channel/{channel_id}/videos" + obs = { + "default_search": "ytsearch", + "quiet": True, + "skip_download": True, + "extract_flat": True, + } + if limit: + obs["playlistend"] = self.channel_size + chan = yt_dlp.YoutubeDL(obs).extract_info(url, download=False) + last_videos = [(i["id"], i["title"]) for i in chan["entries"]] + return last_videos + + def find_missing(self): + """add missing videos from subscribed channels to pending""" + all_channels = self.get_channels() + pending_handler = queue.PendingList() + all_pending, all_ignore = pending_handler.get_all_pending() + all_ids = [i["youtube_id"] for i in all_ignore + all_pending] + all_downloaded = pending_handler.get_all_downloaded() + to_ignore = all_ids + all_downloaded + + missing_videos = [] + + for idx, channel in enumerate(all_channels): + channel_id = channel["channel_id"] + last_videos = self.get_last_youtube_videos(channel_id) + for video in last_videos: + if video[0] not in to_ignore: + missing_videos.append(video[0]) + # notify + message = { + "status": "message:rescan", + "level": "info", + "title": "Scanning channels: Looking for new videos.", + "message": f"Progress: {idx + 1}/{len(all_channels)}", + } + if idx + 1 == len(all_channels): + RedisArchivist().set_message( + "message:rescan", message=message, expire=4 + ) + else: + RedisArchivist().set_message("message:rescan", message=message) + + return missing_videos + + @staticmethod + def change_subscribe(channel_id, channel_subscribed): + """subscribe or unsubscribe from channel and update""" + channel = YoutubeChannel(channel_id) + channel.build_json() + channel.json_data["channel_subscribed"] = channel_subscribed + channel.upload_to_es() + channel.sync_to_videos() + + +class PlaylistSubscription: + """manage the playlist download functionality""" + + def __init__(self): + self.config = AppConfig().config + + @staticmethod + def get_playlists(subscribed_only=True): + """get a list of all active playlists""" + data = { + "sort": [{"playlist_channel.keyword": {"order": "desc"}}], + } + data["query"] = { + "bool": {"must": [{"term": {"playlist_active": {"value": True}}}]} + } + if subscribed_only: + data["query"]["bool"]["must"].append( + {"term": {"playlist_subscribed": {"value": True}}} + ) + + all_playlists = IndexPaginate("ta_playlist", data).get_results() + + return all_playlists + + def process_url_str(self, new_playlists, subscribed=True): + """process playlist subscribe form url_str""" + all_indexed = queue.PendingList().get_all_indexed() + all_youtube_ids = [i["youtube_id"] for i in all_indexed] + + new_thumbs = [] + for idx, playlist in enumerate(new_playlists): + url_type = playlist["type"] + playlist_id = playlist["url"] + 
if not url_type == "playlist": + print(f"{playlist_id} not a playlist, skipping...") + continue + + playlist_h = YoutubePlaylist(playlist_id) + playlist_h.all_youtube_ids = all_youtube_ids + playlist_h.build_json() + playlist_h.json_data["playlist_subscribed"] = subscribed + playlist_h.upload_to_es() + playlist_h.add_vids_to_playlist() + self.channel_validate(playlist_h.json_data["playlist_channel_id"]) + thumb = playlist_h.json_data["playlist_thumbnail"] + new_thumbs.append((playlist_id, thumb)) + # notify + message = { + "status": "message:subplaylist", + "level": "info", + "title": "Subscribing to Playlists", + "message": f"Processing {idx + 1} of {len(new_playlists)}", + } + RedisArchivist().set_message( + "message:subplaylist", message=message + ) + + return new_thumbs + + @staticmethod + def channel_validate(channel_id): + """make sure channel of playlist is there""" + channel = YoutubeChannel(channel_id) + channel.build_json() + + @staticmethod + def change_subscribe(playlist_id, subscribe_status): + """change the subscribe status of a playlist""" + playlist = YoutubePlaylist(playlist_id) + playlist.build_json() + playlist.json_data["playlist_subscribed"] = subscribe_status + playlist.upload_to_es() + + @staticmethod + def get_to_ignore(): + """get all youtube_ids already downloaded or ignored""" + pending_handler = queue.PendingList() + all_pending, all_ignore = pending_handler.get_all_pending() + all_ids = [i["youtube_id"] for i in all_ignore + all_pending] + all_downloaded = pending_handler.get_all_downloaded() + to_ignore = all_ids + all_downloaded + return to_ignore + + def find_missing(self): + """find videos in subscribed playlists not downloaded yet""" + all_playlists = [i["playlist_id"] for i in self.get_playlists()] + to_ignore = self.get_to_ignore() + + missing_videos = [] + for idx, playlist_id in enumerate(all_playlists): + size_limit = self.config["subscriptions"]["channel_size"] + playlist = YoutubePlaylist(playlist_id) + playlist.update_playlist() + if not playlist: + playlist.deactivate() + continue + + playlist_entries = playlist.json_data["playlist_entries"] + if size_limit: + del playlist_entries[size_limit:] + + all_missing = [i for i in playlist_entries if not i["downloaded"]] + + message = { + "status": "message:rescan", + "level": "info", + "title": "Scanning playlists: Looking for new videos.", + "message": f"Progress: {idx + 1}/{len(all_playlists)}", + } + RedisArchivist().set_message("message:rescan", message=message) + + for video in all_missing: + youtube_id = video["youtube_id"] + if youtube_id not in to_ignore: + missing_videos.append(youtube_id) + + return missing_videos diff --git a/tubearchivist/home/src/thumbnails.py b/tubearchivist/home/src/download/thumbnails.py similarity index 94% rename from tubearchivist/home/src/thumbnails.py rename to tubearchivist/home/src/download/thumbnails.py index d9f6549..305bbd8 100644 --- a/tubearchivist/home/src/thumbnails.py +++ b/tubearchivist/home/src/download/thumbnails.py @@ -7,10 +7,12 @@ import os from collections import Counter from time import sleep -import home.src.download as download import requests -from home.src.config import AppConfig -from home.src.helper import RedisArchivist, ignore_filelist +from home.src.download import queue # partial import +from home.src.download import subscriptions # partial import +from home.src.ta.config import AppConfig +from home.src.ta.helper import ignore_filelist +from home.src.ta.ta_redis import RedisArchivist from mutagen.mp4 import MP4, MP4Cover from PIL import 
Image @@ -55,8 +57,8 @@ class ThumbManager: def get_needed_thumbs(self, missing_only=False): """get a list of all missing thumbnails""" all_thumbs = self.get_all_thumbs() - all_indexed = download.PendingList().get_all_indexed() - all_in_queue, all_ignored = download.PendingList().get_all_pending() + all_indexed = queue.PendingList().get_all_indexed() + all_in_queue, all_ignored = queue.PendingList().get_all_pending() needed_thumbs = [] for video in all_indexed: @@ -84,9 +86,8 @@ class ThumbManager: all_channel_art = os.listdir(self.CHANNEL_DIR) files = [i[0:24] for i in all_channel_art] cached_channel_ids = [k for (k, v) in Counter(files).items() if v > 1] - channels = download.ChannelSubscription().get_channels( - subscribed_only=False - ) + channel_sub = subscriptions.ChannelSubscription() + channels = channel_sub.get_channels(subscribed_only=False) missing_channels = [] for channel in channels: @@ -104,10 +105,8 @@ class ThumbManager: """get all missing playlist artwork""" all_downloaded = ignore_filelist(os.listdir(self.PLAYLIST_DIR)) all_ids_downloaded = [i.replace(".jpg", "") for i in all_downloaded] - - playlists = download.PlaylistSubscription().get_playlists( - subscribed_only=False - ) + playlist_sub = subscriptions.PlaylistSubscription() + playlists = playlist_sub.get_playlists(subscribed_only=False) missing_playlists = [] for playlist in playlists: @@ -276,7 +275,7 @@ class ThumbManager: def get_thumb_list(self): """get list of mediafiles and matching thumbnails""" - all_indexed = download.PendingList().get_all_indexed() + all_indexed = queue.PendingList().get_all_indexed() video_list = [] for video in all_indexed: youtube_id = video["youtube_id"] diff --git a/tubearchivist/home/src/download/yt_dlp_handler.py b/tubearchivist/home/src/download/yt_dlp_handler.py new file mode 100644 index 0000000..80c8b86 --- /dev/null +++ b/tubearchivist/home/src/download/yt_dlp_handler.py @@ -0,0 +1,295 @@ +"""handle yt_dlp downloads""" + +import os +import shutil +from datetime import datetime +from time import sleep + +import requests +import yt_dlp +from home.src.download.queue import PendingList +from home.src.download.subscriptions import PlaylistSubscription +from home.src.es.connect import IndexPaginate +from home.src.index.channel import YoutubeChannel +from home.src.index.playlist import YoutubePlaylist +from home.src.index.video import YoutubeVideo, index_new_video +from home.src.ta.config import AppConfig +from home.src.ta.helper import clean_string, ignore_filelist +from home.src.ta.ta_redis import RedisArchivist, RedisQueue + + +class VideoDownloader: + """ + handle the video download functionality + if not initiated with list, take from queue + """ + + def __init__(self, youtube_id_list=False): + self.youtube_id_list = youtube_id_list + self.config = AppConfig().config + self.channels = set() + + def run_queue(self): + """setup download queue in redis loop until no more items""" + queue = RedisQueue("dl_queue") + + limit_queue = self.config["downloads"]["limit_count"] + if limit_queue: + queue.trim(limit_queue - 1) + + while True: + youtube_id = queue.get_next() + if not youtube_id: + break + + try: + self.dl_single_vid(youtube_id) + except yt_dlp.utils.DownloadError: + print("failed to download " + youtube_id) + continue + vid_dict = index_new_video(youtube_id) + self.channels.add(vid_dict["channel"]["channel_id"]) + self.move_to_archive(vid_dict) + self.delete_from_pending(youtube_id) + + autodelete_days = self.config["downloads"]["autodelete_days"] + if autodelete_days: + 
print(f"auto delete older than {autodelete_days} days") + self.auto_delete_watched(autodelete_days) + + @staticmethod + def add_pending(): + """add pending videos to download queue""" + mess_dict = { + "status": "message:download", + "level": "info", + "title": "Looking for videos to download", + "message": "Scanning your download queue.", + } + RedisArchivist().set_message("message:download", mess_dict) + all_pending, _ = PendingList().get_all_pending() + to_add = [i["youtube_id"] for i in all_pending] + if not to_add: + # there is nothing pending + print("download queue is empty") + mess_dict = { + "status": "message:download", + "level": "error", + "title": "Download queue is empty", + "message": "Add some videos to the queue first.", + } + RedisArchivist().set_message("message:download", mess_dict) + return + + queue = RedisQueue("dl_queue") + queue.add_list(to_add) + + @staticmethod + def progress_hook(response): + """process the progress_hooks from yt_dlp""" + # title + path = os.path.split(response["filename"])[-1][12:] + filename = os.path.splitext(os.path.splitext(path)[0])[0] + filename_clean = filename.replace("_", " ") + title = "Downloading: " + filename_clean + # message + try: + percent = response["_percent_str"] + size = response["_total_bytes_str"] + speed = response["_speed_str"] + eta = response["_eta_str"] + message = f"{percent} of {size} at {speed} - time left: {eta}" + except KeyError: + message = "processing" + mess_dict = { + "status": "message:download", + "level": "info", + "title": title, + "message": message, + } + RedisArchivist().set_message("message:download", mess_dict) + + def build_obs(self): + """build obs dictionary for yt-dlp""" + obs = { + "default_search": "ytsearch", + "merge_output_format": "mp4", + "restrictfilenames": True, + "outtmpl": ( + self.config["application"]["cache_dir"] + + "/download/" + + self.config["application"]["file_template"] + ), + "progress_hooks": [self.progress_hook], + "noprogress": True, + "quiet": True, + "continuedl": True, + "retries": 3, + "writethumbnail": False, + "noplaylist": True, + "check_formats": "selected", + } + if self.config["downloads"]["format"]: + obs["format"] = self.config["downloads"]["format"] + if self.config["downloads"]["limit_speed"]: + obs["ratelimit"] = self.config["downloads"]["limit_speed"] * 1024 + + throttle = self.config["downloads"]["throttledratelimit"] + if throttle: + obs["throttledratelimit"] = throttle * 1024 + + postprocessors = [] + + if self.config["downloads"]["add_metadata"]: + postprocessors.append( + { + "key": "FFmpegMetadata", + "add_chapters": True, + "add_metadata": True, + } + ) + + if self.config["downloads"]["add_thumbnail"]: + postprocessors.append( + { + "key": "EmbedThumbnail", + "already_have_thumbnail": True, + } + ) + obs["writethumbnail"] = True + + obs["postprocessors"] = postprocessors + + return obs + + def dl_single_vid(self, youtube_id): + """download single video""" + dl_cache = self.config["application"]["cache_dir"] + "/download/" + obs = self.build_obs() + + # check if already in cache to continue from there + all_cached = ignore_filelist(os.listdir(dl_cache)) + for file_name in all_cached: + if youtube_id in file_name: + obs["outtmpl"] = os.path.join(dl_cache, file_name) + with yt_dlp.YoutubeDL(obs) as ydl: + try: + ydl.download([youtube_id]) + except yt_dlp.utils.DownloadError: + print("retry failed download: " + youtube_id) + sleep(10) + ydl.download([youtube_id]) + + if obs["writethumbnail"]: + # webp files don't get cleaned up automatically + 
all_cached = ignore_filelist(os.listdir(dl_cache)) + to_clean = [i for i in all_cached if not i.endswith(".mp4")] + for file_name in to_clean: + file_path = os.path.join(dl_cache, file_name) + os.remove(file_path) + + def move_to_archive(self, vid_dict): + """move downloaded video from cache to archive""" + videos = self.config["application"]["videos"] + host_uid = self.config["application"]["HOST_UID"] + host_gid = self.config["application"]["HOST_GID"] + channel_name = clean_string(vid_dict["channel"]["channel_name"]) + # make archive folder with correct permissions + new_folder = os.path.join(videos, channel_name) + if not os.path.exists(new_folder): + os.makedirs(new_folder) + if host_uid and host_gid: + os.chown(new_folder, host_uid, host_gid) + # find real filename + cache_dir = self.config["application"]["cache_dir"] + all_cached = ignore_filelist(os.listdir(cache_dir + "/download/")) + for file_str in all_cached: + if vid_dict["youtube_id"] in file_str: + old_file = file_str + old_file_path = os.path.join(cache_dir, "download", old_file) + new_file_path = os.path.join(videos, vid_dict["media_url"]) + # move media file and fix permission + shutil.move(old_file_path, new_file_path) + if host_uid and host_gid: + os.chown(new_file_path, host_uid, host_gid) + + def delete_from_pending(self, youtube_id): + """delete downloaded video from pending index if its there""" + es_url = self.config["application"]["es_url"] + es_auth = self.config["application"]["es_auth"] + url = f"{es_url}/ta_download/_doc/{youtube_id}" + response = requests.delete(url, auth=es_auth) + if not response.ok and not response.status_code == 404: + print(response.text) + + def add_subscribed_channels(self): + """add all channels subscribed to refresh""" + all_subscribed = PlaylistSubscription().get_playlists() + if not all_subscribed: + return + + channel_ids = [i["playlist_channel_id"] for i in all_subscribed] + for channel_id in channel_ids: + self.channels.add(channel_id) + + return + + def validate_playlists(self): + """look for playlist needing to update""" + print("sync playlists") + self.add_subscribed_channels() + all_indexed = PendingList().get_all_indexed() + all_youtube_ids = [i["youtube_id"] for i in all_indexed] + for id_c, channel_id in enumerate(self.channels): + playlists = YoutubeChannel(channel_id).get_indexed_playlists() + all_playlist_ids = [i["playlist_id"] for i in playlists] + for id_p, playlist_id in enumerate(all_playlist_ids): + playlist = YoutubePlaylist(playlist_id) + playlist.all_youtube_ids = all_youtube_ids + playlist.build_json(scrape=True) + if not playlist.json_data: + playlist.deactivate() + + playlist.add_vids_to_playlist() + playlist.upload_to_es() + + # notify + title = ( + "Processing playlists for channels: " + + f"{id_c + 1}/{len(self.channels)}" + ) + message = f"Progress: {id_p + 1}/{len(all_playlist_ids)}" + mess_dict = { + "status": "message:download", + "level": "info", + "title": title, + "message": message, + } + if id_p + 1 == len(all_playlist_ids): + RedisArchivist().set_message( + "message:download", mess_dict, expire=4 + ) + else: + RedisArchivist().set_message("message:download", mess_dict) + + @staticmethod + def auto_delete_watched(autodelete_days): + """delete watched videos after x days""" + now = int(datetime.now().strftime("%s")) + now_lte = now - autodelete_days * 24 * 60 * 60 + data = { + "query": {"range": {"player.watched_date": {"lte": now_lte}}}, + "sort": [{"player.watched_date": {"order": "asc"}}], + } + all_to_delete = IndexPaginate("ta_video", 
data).get_results() + all_youtube_ids = [i["youtube_id"] for i in all_to_delete] + if not all_youtube_ids: + return + + for youtube_id in all_youtube_ids: + print(f"autodelete {youtube_id}") + YoutubeVideo(youtube_id).delete_media_file() + + print("add deleted to ignore list") + pending_handler = PendingList() + pending_handler.add_to_pending(all_youtube_ids, ignore=True) diff --git a/tubearchivist/home/src/es/__init__.py b/tubearchivist/home/src/es/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tubearchivist/home/src/es.py b/tubearchivist/home/src/es/connect.py similarity index 99% rename from tubearchivist/home/src/es.py rename to tubearchivist/home/src/es/connect.py index ce72863..7cf7d8c 100644 --- a/tubearchivist/home/src/es.py +++ b/tubearchivist/home/src/es/connect.py @@ -3,7 +3,7 @@ import json import requests -from home.src.config import AppConfig +from home.src.ta.config import AppConfig class ElasticWrap: diff --git a/tubearchivist/home/src/es/index_mapping.json b/tubearchivist/home/src/es/index_mapping.json new file mode 100644 index 0000000..db413fb --- /dev/null +++ b/tubearchivist/home/src/es/index_mapping.json @@ -0,0 +1,274 @@ +{ + "index_config": [{ + "index_name": "channel", + "expected_map": { + "channel_id": { + "type": "keyword" + }, + "channel_name": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256, + "normalizer": "to_lower" + }, + "search_as_you_type": { + "type": "search_as_you_type", + "doc_values": false, + "max_shingle_size": 3 + } + } + }, + "channel_banner_url": { + "type": "keyword", + "index": false + }, + "channel_thumb_url": { + "type": "keyword", + "index": false + }, + "channel_description": { + "type": "text" + }, + "channel_last_refresh": { + "type": "date", + "format": "epoch_second" + } + }, + "expected_set": { + "analysis": { + "normalizer": { + "to_lower": { + "type": "custom", + "filter": ["lowercase"] + } + } + }, + "number_of_replicas": "0" + } + }, + { + "index_name": "video", + "expected_map": { + "vid_thumb_url": { + "type": "text", + "index": false + }, + "date_downloaded": { + "type": "date" + }, + "channel": { + "properties": { + "channel_id": { + "type": "keyword" + }, + "channel_name": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256, + "normalizer": "to_lower" + }, + "search_as_you_type": { + "type": "search_as_you_type", + "doc_values": false, + "max_shingle_size": 3 + } + } + }, + "channel_banner_url": { + "type": "keyword", + "index": false + }, + "channel_thumb_url": { + "type": "keyword", + "index": false + }, + "channel_description": { + "type": "text" + }, + "channel_last_refresh": { + "type": "date", + "format": "epoch_second" + } + } + }, + "description": { + "type": "text" + }, + "media_url": { + "type": "keyword", + "index": false + }, + "tags": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "title": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256, + "normalizer": "to_lower" + }, + "search_as_you_type": { + "type": "search_as_you_type", + "doc_values": false, + "max_shingle_size": 3 + } + } + }, + "vid_last_refresh": { + "type": "date" + }, + "youtube_id": { + "type": "keyword" + }, + "published": { + "type": "date" + }, + "playlist": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + 
"ignore_above": 256, + "normalizer": "to_lower" + } + } + } + }, + "expected_set": { + "analysis": { + "normalizer": { + "to_lower": { + "type": "custom", + "filter": ["lowercase"] + } + } + }, + "number_of_replicas": "0" + } + }, + { + "index_name": "download", + "expected_map": { + "timestamp": { + "type": "date" + }, + "channel_id": { + "type": "keyword" + }, + "channel_name": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256, + "normalizer": "to_lower" + } + } + }, + "status": { + "type": "keyword" + }, + "title": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256, + "normalizer": "to_lower" + } + } + }, + "vid_thumb_url": { + "type": "keyword" + }, + "youtube_id": { + "type": "keyword" + } + }, + "expected_set": { + "analysis": { + "normalizer": { + "to_lower": { + "type": "custom", + "filter": ["lowercase"] + } + } + }, + "number_of_replicas": "0" + } + }, + { + "index_name": "playlist", + "expected_map": { + "playlist_id": { + "type": "keyword" + }, + "playlist_description": { + "type": "text" + }, + "playlist_name": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256, + "normalizer": "to_lower" + }, + "search_as_you_type": { + "type": "search_as_you_type", + "doc_values": false, + "max_shingle_size": 3 + } + } + }, + "playlist_channel": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256, + "normalizer": "to_lower" + } + } + }, + "playlist_channel_id": { + "type": "keyword" + }, + "playlist_thumbnail": { + "type": "keyword" + }, + "playlist_last_refresh": { + "type": "date" + } + }, + "expected_set": { + "analysis": { + "normalizer": { + "to_lower": { + "type": "custom", + "filter": ["lowercase"] + } + } + }, + "number_of_replicas": "0" + } + } + ] +} \ No newline at end of file diff --git a/tubearchivist/home/src/index_management.py b/tubearchivist/home/src/es/index_setup.py similarity index 66% rename from tubearchivist/home/src/index_management.py rename to tubearchivist/home/src/es/index_setup.py index 63f0a10..6b12a30 100644 --- a/tubearchivist/home/src/index_management.py +++ b/tubearchivist/home/src/es/index_setup.py @@ -1,10 +1,4 @@ -""" -Functionality: -- initial elastic search setup -- index configuration is represented in INDEX_CONFIG -- index mapping and settings validation -- backup and restore -""" +"""setup and verify needed elastic indexes""" import json import os @@ -12,213 +6,8 @@ import zipfile from datetime import datetime import requests -from home.src.config import AppConfig -from home.src.helper import ignore_filelist - -# expected mapping and settings -INDEX_CONFIG = [ - { - "index_name": "channel", - "expected_map": { - "channel_id": { - "type": "keyword", - }, - "channel_name": { - "type": "text", - "analyzer": "english", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256, - "normalizer": "to_lower", - }, - "search_as_you_type": { - "type": "search_as_you_type", - "doc_values": False, - "max_shingle_size": 3, - }, - }, - }, - "channel_banner_url": {"type": "keyword", "index": False}, - "channel_thumb_url": {"type": "keyword", "index": False}, - "channel_description": {"type": "text"}, - "channel_last_refresh": {"type": "date", "format": "epoch_second"}, - }, - "expected_set": { - "analysis": { - "normalizer": { - "to_lower": {"type": "custom", "filter": ["lowercase"]} - } - }, - "number_of_replicas": "0", - }, - }, - { - "index_name": "video", - 
"expected_map": { - "vid_thumb_url": {"type": "text", "index": False}, - "date_downloaded": {"type": "date"}, - "channel": { - "properties": { - "channel_id": { - "type": "keyword", - }, - "channel_name": { - "type": "text", - "analyzer": "english", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256, - "normalizer": "to_lower", - }, - "search_as_you_type": { - "type": "search_as_you_type", - "doc_values": False, - "max_shingle_size": 3, - }, - }, - }, - "channel_banner_url": {"type": "keyword", "index": False}, - "channel_thumb_url": {"type": "keyword", "index": False}, - "channel_description": {"type": "text"}, - "channel_last_refresh": { - "type": "date", - "format": "epoch_second", - }, - } - }, - "description": {"type": "text"}, - "media_url": {"type": "keyword", "index": False}, - "tags": { - "type": "text", - "analyzer": "english", - "fields": { - "keyword": {"type": "keyword", "ignore_above": 256} - }, - }, - "title": { - "type": "text", - "analyzer": "english", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256, - "normalizer": "to_lower", - }, - "search_as_you_type": { - "type": "search_as_you_type", - "doc_values": False, - "max_shingle_size": 3, - }, - }, - }, - "vid_last_refresh": {"type": "date"}, - "youtube_id": {"type": "keyword"}, - "published": {"type": "date"}, - "playlist": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256, - "normalizer": "to_lower", - } - }, - }, - }, - "expected_set": { - "analysis": { - "normalizer": { - "to_lower": {"type": "custom", "filter": ["lowercase"]} - } - }, - "number_of_replicas": "0", - }, - }, - { - "index_name": "download", - "expected_map": { - "timestamp": {"type": "date"}, - "channel_id": {"type": "keyword"}, - "channel_name": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256, - "normalizer": "to_lower", - } - }, - }, - "status": {"type": "keyword"}, - "title": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256, - "normalizer": "to_lower", - } - }, - }, - "vid_thumb_url": {"type": "keyword"}, - "youtube_id": {"type": "keyword"}, - }, - "expected_set": { - "analysis": { - "normalizer": { - "to_lower": {"type": "custom", "filter": ["lowercase"]} - } - }, - "number_of_replicas": "0", - }, - }, - { - "index_name": "playlist", - "expected_map": { - "playlist_id": {"type": "keyword"}, - "playlist_description": {"type": "text"}, - "playlist_name": { - "type": "text", - "analyzer": "english", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256, - "normalizer": "to_lower", - }, - "search_as_you_type": { - "type": "search_as_you_type", - "doc_values": False, - "max_shingle_size": 3, - }, - }, - }, - "playlist_channel": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 256, - "normalizer": "to_lower", - } - }, - }, - "playlist_channel_id": {"type": "keyword"}, - "playlist_thumbnail": {"type": "keyword"}, - "playlist_last_refresh": {"type": "date"}, - }, - "expected_set": { - "analysis": { - "normalizer": { - "to_lower": {"type": "custom", "filter": ["lowercase"]} - } - }, - "number_of_replicas": "0", - }, - }, -] +from home.src.ta.config import AppConfig +from home.src.ta.helper import ignore_filelist class ElasticIndex: @@ -602,48 +391,22 @@ class ElasticBackup: os.remove(file_path) -def get_available_backups(): - """return dict of available backups for settings view""" - backup_handler = ElasticBackup(INDEX_CONFIG, 
reason=False) - all_backup_files = backup_handler.get_all_backup_files() - return all_backup_files +def get_mapping(): + """read index_mapping.json and get expected mapping and settings""" + with open("home/src/es/index_mapping.json", "r", encoding="utf-8") as f: + config_str = f.read() + index_config = json.loads(config_str).get("index_config") - -def backup_all_indexes(reason): - """backup all es indexes to disk""" - backup_handler = ElasticBackup(INDEX_CONFIG, reason) - - for index in backup_handler.index_config: - index_name = index["index_name"] - if not backup_handler.index_exists(index_name): - continue - all_results = backup_handler.get_all_documents(index_name) - file_content = backup_handler.build_bulk(all_results) - backup_handler.write_es_json(file_content, index_name) - backup_handler.write_ta_json(all_results, index_name) - - backup_handler.zip_it() - - if reason == "auto": - backup_handler.rotate_backup() - - -def restore_from_backup(filename): - """restore indexes from backup file""" - # delete - index_check(force_restore=True) - # recreate - backup_handler = ElasticBackup(INDEX_CONFIG, reason=False) - zip_content = backup_handler.unpack_zip_backup(filename) - backup_handler.restore_json_files(zip_content) + return index_config def index_check(force_restore=False): """check if all indexes are created and have correct mapping""" backed_up = False + index_config = get_mapping() - for index in INDEX_CONFIG: + for index in index_config: index_name = index["index_name"] expected_map = index["expected_map"] expected_set = index["expected_set"] @@ -675,3 +438,42 @@ def index_check(force_restore=False): # else all good print(f"ta_{index_name} index is created and up to date...") + + +def get_available_backups(): + """return dict of available backups for settings view""" + index_config = get_mapping() + backup_handler = ElasticBackup(index_config, reason=False) + all_backup_files = backup_handler.get_all_backup_files() + return all_backup_files + + +def backup_all_indexes(reason): + """backup all es indexes to disk""" + index_config = get_mapping() + backup_handler = ElasticBackup(index_config, reason) + + for index in backup_handler.index_config: + index_name = index["index_name"] + if not backup_handler.index_exists(index_name): + continue + all_results = backup_handler.get_all_documents(index_name) + file_content = backup_handler.build_bulk(all_results) + backup_handler.write_es_json(file_content, index_name) + backup_handler.write_ta_json(all_results, index_name) + + backup_handler.zip_it() + + if reason == "auto": + backup_handler.rotate_backup() + + +def restore_from_backup(filename): + """restore indexes from backup file""" + # delete + index_check(force_restore=True) + # recreate + index_config = get_mapping() + backup_handler = ElasticBackup(index_config, reason=False) + zip_content = backup_handler.unpack_zip_backup(filename) + backup_handler.restore_json_files(zip_content) diff --git a/tubearchivist/home/src/frontend/__init__.py b/tubearchivist/home/src/frontend/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tubearchivist/home/src/frontend.py b/tubearchivist/home/src/frontend/api_calls.py similarity index 96% rename from tubearchivist/home/src/frontend.py rename to tubearchivist/home/src/frontend/api_calls.py index b6451e6..dff7636 100644 --- a/tubearchivist/home/src/frontend.py +++ b/tubearchivist/home/src/frontend/api_calls.py @@ -4,19 +4,18 @@ Functionality: - called via user input """ -from home.src.download import ( +from home.src.download.queue 
import PendingList +from home.src.download.subscriptions import ( ChannelSubscription, - PendingList, PlaylistSubscription, ) -from home.src.helper import RedisArchivist, RedisQueue, UrlListParser -from home.src.index import ( - WatchState, - YoutubeChannel, - YoutubePlaylist, - YoutubeVideo, -) -from home.src.searching import SearchForm +from home.src.frontend.searching import SearchForm +from home.src.frontend.watched import WatchState +from home.src.index.channel import YoutubeChannel +from home.src.index.playlist import YoutubePlaylist +from home.src.index.video import YoutubeVideo +from home.src.ta.helper import UrlListParser +from home.src.ta.ta_redis import RedisArchivist, RedisQueue from home.tasks import ( download_pending, download_single, diff --git a/tubearchivist/home/forms.py b/tubearchivist/home/src/frontend/forms.py similarity index 100% rename from tubearchivist/home/forms.py rename to tubearchivist/home/src/frontend/forms.py diff --git a/tubearchivist/home/src/searching.py b/tubearchivist/home/src/frontend/searching.py similarity index 77% rename from tubearchivist/home/src/searching.py rename to tubearchivist/home/src/frontend/searching.py index 23c3ddd..bca2742 100644 --- a/tubearchivist/home/src/searching.py +++ b/tubearchivist/home/src/frontend/searching.py @@ -6,14 +6,12 @@ Functionality: - calculate pagination values """ -import math import urllib.parse from datetime import datetime -from home.src.config import AppConfig -from home.src.es import ElasticWrap -from home.src.helper import RedisArchivist -from home.src.thumbnails import ThumbManager +from home.src.download.thumbnails import ThumbManager +from home.src.es.connect import ElasticWrap +from home.src.ta.config import AppConfig class SearchHandler: @@ -203,62 +201,3 @@ class SearchForm: } return all_results - - -class Pagination: - """ - figure out the pagination based on page size and total_hits - """ - - def __init__(self, page_get, user_id, search_get=False): - self.user_id = user_id - self.page_size = self.get_page_size() - self.page_get = page_get - self.search_get = search_get - self.pagination = self.first_guess() - - def get_page_size(self): - """get default or user modified page_size""" - key = f"{self.user_id}:page_size" - page_size = RedisArchivist().get_message(key)["status"] - if not page_size: - config = AppConfig().config - page_size = config["archive"]["page_size"] - - return page_size - - def first_guess(self): - """build first guess before api call""" - page_get = self.page_get - if page_get in [0, 1]: - page_from = 0 - prev_pages = False - elif page_get > 1: - page_from = (page_get - 1) * self.page_size - prev_pages = [ - i for i in range(page_get - 1, page_get - 6, -1) if i > 1 - ] - prev_pages.reverse() - pagination = { - "page_size": self.page_size, - "page_from": page_from, - "prev_pages": prev_pages, - "current_page": page_get, - } - if self.search_get: - pagination.update({"search_get": self.search_get}) - return pagination - - def validate(self, total_hits): - """validate pagination with total_hits after making api call""" - page_get = self.page_get - max_pages = math.ceil(total_hits / self.page_size) - if page_get < max_pages and max_pages > 1: - self.pagination["last_page"] = max_pages - else: - self.pagination["last_page"] = False - next_pages = [ - i for i in range(page_get + 1, page_get + 6) if 1 < i < max_pages - ] - - self.pagination["next_pages"] = next_pages diff --git a/tubearchivist/home/src/frontend/watched.py b/tubearchivist/home/src/frontend/watched.py new file mode 
100644 index 0000000..0769232 --- /dev/null +++ b/tubearchivist/home/src/frontend/watched.py @@ -0,0 +1,125 @@ +"""handle watch state""" + +import json +from datetime import datetime + +import requests +from home.src.ta.config import AppConfig +from home.src.ta.helper import UrlListParser + + +class WatchState: + """handle watched checkbox for videos and channels""" + + CONFIG = AppConfig().config + ES_URL = CONFIG["application"]["es_url"] + ES_AUTH = CONFIG["application"]["es_auth"] + HEADERS = {"Content-type": "application/json"} + + def __init__(self, youtube_id): + self.youtube_id = youtube_id + self.stamp = int(datetime.now().strftime("%s")) + + def mark_as_watched(self): + """update es with new watched value""" + url_type = self.dedect_type() + if url_type == "video": + self.mark_vid_watched() + elif url_type == "channel": + self.mark_channel_watched() + elif url_type == "playlist": + self.mark_playlist_watched() + + print(f"marked {self.youtube_id} as watched") + + def mark_as_unwatched(self): + """revert watched state to false""" + url_type = self.dedect_type() + if url_type == "video": + self.mark_vid_watched(revert=True) + + print(f"revert {self.youtube_id} as unwatched") + + def dedect_type(self): + """find youtube id type""" + print(self.youtube_id) + url_process = UrlListParser(self.youtube_id).process_list() + url_type = url_process[0]["type"] + return url_type + + def mark_vid_watched(self, revert=False): + """change watched status of single video""" + url = self.ES_URL + "/ta_video/_update/" + self.youtube_id + data = { + "doc": {"player": {"watched": True, "watched_date": self.stamp}} + } + if revert: + data["doc"]["player"]["watched"] = False + + payload = json.dumps(data) + request = requests.post( + url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH + ) + if not request.ok: + print(request.text) + raise ValueError("failed to mark video as watched") + + def mark_channel_watched(self): + """change watched status of every video in channel""" + data = { + "query": { + "bool": { + "must": [ + { + "term": { + "channel.channel_id": { + "value": self.youtube_id + } + } + }, + {"term": {"player.watched": {"value": False}}}, + ] + } + }, + "script": { + "source": "ctx._source.player['watched'] = true", + "lang": "painless", + }, + } + payload = json.dumps(data) + url = f"{self.ES_URL}/ta_video/_update_by_query" + request = requests.post( + url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH + ) + if not request.ok: + print(request.text) + raise ValueError("failed mark channel as watched") + + def mark_playlist_watched(self): + """change watched state of all videos in playlist""" + data = { + "query": { + "bool": { + "must": [ + { + "term": { + "playlist.keyword": {"value": self.youtube_id} + } + }, + {"term": {"player.watched": {"value": False}}}, + ] + } + }, + "script": { + "source": "ctx._source.player['watched'] = true", + "lang": "painless", + }, + } + payload = json.dumps(data) + url = f"{self.ES_URL}/ta_video/_update_by_query" + request = requests.post( + url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH + ) + if not request.ok: + print(request.text) + raise ValueError("failed mark playlist as watched") diff --git a/tubearchivist/home/src/index.py b/tubearchivist/home/src/index.py deleted file mode 100644 index fd22f17..0000000 --- a/tubearchivist/home/src/index.py +++ /dev/null @@ -1,809 +0,0 @@ -""" -Functionality: -- index new videos into elastisearch -- extract video info with yt_dlp -- scrape youtube channel page if needed -""" - -import json 
-import os -import re -from datetime import datetime - -import requests -import yt_dlp -from bs4 import BeautifulSoup -from home.src.config import AppConfig -from home.src.es import ElasticWrap, IndexPaginate -from home.src.helper import DurationConverter, UrlListParser, clean_string -from home.src.thumbnails import ThumbManager -from ryd_client import ryd_client - - -class YouTubeItem: - """base class for youtube""" - - es_path = False - index_name = False - yt_base = False - yt_obs = { - "quiet": True, - "default_search": "ytsearch", - "skip_download": True, - "check_formats": "selected", - "noplaylist": True, - } - - def __init__(self, youtube_id): - self.youtube_id = youtube_id - self.config = False - self.app_conf = False - self.youtube_meta = False - self.json_data = False - self._get_conf() - - def _get_conf(self): - """read user conf""" - self.config = AppConfig().config - self.app_conf = self.config["application"] - - def get_from_youtube(self): - """use yt-dlp to get meta data from youtube""" - print(f"{self.youtube_id}: get metadata from youtube") - try: - yt_item = yt_dlp.YoutubeDL(self.yt_obs) - response = yt_item.extract_info(self.yt_base + self.youtube_id) - except ( - yt_dlp.utils.ExtractorError, - yt_dlp.utils.DownloadError, - ): - print(f"{self.youtube_id}: failed to get info from youtube") - self.youtube_meta = False - - self.youtube_meta = response - - def get_from_es(self): - """get indexed data from elastic search""" - print(f"{self.youtube_id}: get metadata from es") - response, _ = ElasticWrap(f"{self.es_path}").get() - source = response.get("_source") - self.json_data = source - - def upload_to_es(self): - """add json_data to elastic""" - _, _ = ElasticWrap(self.es_path).put(self.json_data, refresh=True) - - def deactivate(self): - """deactivate document in es""" - key_match = { - "video": "active", - "channel": "channel_active", - "playlist": "playlist_active", - } - update_path = f"{self.index_name}/_update/{self.youtube_id}" - data = { - "script": f"ctx._source.{key_match.get(self.index_name)} = false" - } - _, _ = ElasticWrap(update_path).post(data) - - def del_in_es(self): - """delete item from elastic search""" - print(f"{self.youtube_id}: delete from es") - _, _ = ElasticWrap(self.es_path).delete() - - -class YoutubeVideo(YouTubeItem): - """represents a single youtube video""" - - es_path = False - index_name = "ta_video" - yt_base = "https://www.youtube.com/watch?v=" - - def __init__(self, youtube_id): - super().__init__(youtube_id) - self.channel_id = False - self.es_path = f"{self.index_name}/_doc/{youtube_id}" - - def build_json(self): - """build json dict of video""" - self.get_from_youtube() - if not self.youtube_meta: - return - - self._process_youtube_meta() - self._add_channel() - self._add_stats() - self.add_file_path() - self.add_player() - if self.config["downloads"]["integrate_ryd"]: - self._get_ryd_stats() - - return - - def _process_youtube_meta(self): - """extract relevant fields from youtube""" - # extract - self.channel_id = self.youtube_meta["channel_id"] - upload_date = self.youtube_meta["upload_date"] - upload_date_time = datetime.strptime(upload_date, "%Y%m%d") - published = upload_date_time.strftime("%Y-%m-%d") - last_refresh = int(datetime.now().strftime("%s")) - # build json_data basics - self.json_data = { - "title": self.youtube_meta["title"], - "description": self.youtube_meta["description"], - "category": self.youtube_meta["categories"], - "vid_thumb_url": self.youtube_meta["thumbnail"], - "tags": self.youtube_meta["tags"], - 
"published": published, - "vid_last_refresh": last_refresh, - "date_downloaded": last_refresh, - "youtube_id": self.youtube_id, - "active": True, - } - - def _add_channel(self): - """add channel dict to video json_data""" - channel = YoutubeChannel(self.channel_id) - channel.build_json(upload=True) - self.json_data.update({"channel": channel.json_data}) - - def _add_stats(self): - """add stats dicst to json_data""" - # likes - like_count = self.youtube_meta.get("like_count", 0) - dislike_count = self.youtube_meta.get("dislike_count", 0) - self.json_data.update( - { - "stats": { - "view_count": self.youtube_meta["view_count"], - "like_count": like_count, - "dislike_count": dislike_count, - "average_rating": self.youtube_meta["average_rating"], - } - } - ) - - def build_dl_cache_path(self): - """find video path in dl cache""" - cache_dir = self.app_conf["cache_dir"] - cache_path = f"{cache_dir}/download/" - all_cached = os.listdir(cache_path) - for file_cached in all_cached: - if self.youtube_id in file_cached: - vid_path = os.path.join(cache_path, file_cached) - return vid_path - - return False - - def add_player(self): - """add player information for new videos""" - try: - # when indexing from download task - vid_path = self.build_dl_cache_path() - except FileNotFoundError: - # when reindexing - base = self.app_conf["videos"] - vid_path = os.path.join(base, self.json_data["media_url"]) - - duration_handler = DurationConverter() - duration = duration_handler.get_sec(vid_path) - duration_str = duration_handler.get_str(duration) - self.json_data.update( - { - "player": { - "watched": False, - "duration": duration, - "duration_str": duration_str, - } - } - ) - - def add_file_path(self): - """build media_url for where file will be located""" - channel_name = self.json_data["channel"]["channel_name"] - clean_channel_name = clean_string(channel_name) - timestamp = self.json_data["published"].replace("-", "") - youtube_id = self.json_data["youtube_id"] - title = self.json_data["title"] - clean_title = clean_string(title) - filename = f"{timestamp}_{youtube_id}_{clean_title}.mp4" - media_url = os.path.join(clean_channel_name, filename) - self.json_data["media_url"] = media_url - - def delete_media_file(self): - """delete video file, meta data""" - self.get_from_es() - video_base = self.app_conf["videos"] - media_url = self.json_data["media_url"] - print(f"{self.youtube_id}: delete {media_url} from file system") - to_delete = os.path.join(video_base, media_url) - os.remove(to_delete) - self.del_in_es() - - def _get_ryd_stats(self): - """get optional stats from returnyoutubedislikeapi.com""" - try: - print(f"{self.youtube_id}: get ryd stats") - result = ryd_client.get(self.youtube_id) - except requests.exceptions.ConnectionError: - print(f"{self.youtube_id}: failed to query ryd api, skipping") - return False - - if result["status"] == 404: - return False - - dislikes = { - "dislike_count": result["dislikes"], - "average_rating": result["rating"], - } - self.json_data["stats"].update(dislikes) - - return True - - -class ChannelScraper: - """custom scraper using bs4 to scrape channel about page - will be able to be integrated into yt-dlp - once #2237 and #2350 are merged upstream - """ - - def __init__(self, channel_id): - self.channel_id = channel_id - self.soup = False - self.yt_json = False - self.json_data = False - - def get_json(self): - """main method to return channel dict""" - self.get_soup() - self._extract_yt_json() - self._parse_channel_main() - self._parse_channel_meta() - return 
self.json_data - - def get_soup(self): - """return soup from youtube""" - print(f"{self.channel_id}: scrape channel data from youtube") - url = f"https://www.youtube.com/channel/{self.channel_id}/about?hl=en" - cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"} - response = requests.get(url, cookies=cookies) - if response.ok: - channel_page = response.text - else: - print(f"{self.channel_id}: failed to extract channel info") - raise ConnectionError - self.soup = BeautifulSoup(channel_page, "html.parser") - - def _extract_yt_json(self): - """parse soup and get ytInitialData json""" - all_scripts = self.soup.find("body").find_all("script") - for script in all_scripts: - if "var ytInitialData = " in str(script): - script_content = str(script) - break - # extract payload - script_content = script_content.split("var ytInitialData = ")[1] - json_raw = script_content.rstrip(";") - self.yt_json = json.loads(json_raw) - - def _parse_channel_main(self): - """extract maintab values from scraped channel json data""" - main_tab = self.yt_json["header"]["c4TabbedHeaderRenderer"] - # build and return dict - self.json_data = { - "channel_active": True, - "channel_last_refresh": int(datetime.now().strftime("%s")), - "channel_subs": self._get_channel_subs(main_tab), - "channel_name": main_tab["title"], - "channel_banner_url": self._get_thumbnails(main_tab, "banner"), - "channel_tvart_url": self._get_thumbnails(main_tab, "tvBanner"), - "channel_id": self.channel_id, - "channel_subscribed": False, - } - - @staticmethod - def _get_thumbnails(main_tab, thumb_name): - """extract banner url from main_tab""" - try: - all_banners = main_tab[thumb_name]["thumbnails"] - banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"] - except KeyError: - banner = False - - return banner - - @staticmethod - def _get_channel_subs(main_tab): - """process main_tab to get channel subs as int""" - try: - sub_text_simple = main_tab["subscriberCountText"]["simpleText"] - sub_text = sub_text_simple.split(" ")[0] - if sub_text[-1] == "K": - channel_subs = int(float(sub_text.replace("K", "")) * 1000) - elif sub_text[-1] == "M": - channel_subs = int(float(sub_text.replace("M", "")) * 1000000) - elif int(sub_text) >= 0: - channel_subs = int(sub_text) - else: - message = f"{sub_text} not dealt with" - print(message) - except KeyError: - channel_subs = 0 - - return channel_subs - - def _parse_channel_meta(self): - """extract meta tab values from channel payload""" - # meta tab - meta_tab = self.yt_json["metadata"]["channelMetadataRenderer"] - all_thumbs = meta_tab["avatar"]["thumbnails"] - thumb_url = sorted(all_thumbs, key=lambda k: k["width"])[-1]["url"] - # stats tab - renderer = "twoColumnBrowseResultsRenderer" - all_tabs = self.yt_json["contents"][renderer]["tabs"] - for tab in all_tabs: - if "tabRenderer" in tab.keys(): - if tab["tabRenderer"]["title"] == "About": - about_tab = tab["tabRenderer"]["content"][ - "sectionListRenderer" - ]["contents"][0]["itemSectionRenderer"]["contents"][0][ - "channelAboutFullMetadataRenderer" - ] - break - try: - channel_views_text = about_tab["viewCountText"]["simpleText"] - channel_views = int(re.sub(r"\D", "", channel_views_text)) - except KeyError: - channel_views = 0 - - self.json_data.update( - { - "channel_description": meta_tab["description"], - "channel_thumb_url": thumb_url, - "channel_views": channel_views, - } - ) - - -class YoutubeChannel(YouTubeItem): - """represents a single youtube channel""" - - es_path = False - index_name = "ta_channel" - yt_base = 
"https://www.youtube.com/channel/" - - def __init__(self, youtube_id): - super().__init__(youtube_id) - self.es_path = f"{self.index_name}/_doc/{youtube_id}" - - def build_json(self, upload=False): - """get from es or from youtube""" - self.get_from_es() - if self.json_data: - return - - self.get_from_youtube() - if upload: - self.upload_to_es() - return - - def get_from_youtube(self): - """use bs4 to scrape channel about page""" - self.json_data = ChannelScraper(self.youtube_id).get_json() - self.get_channel_art() - - def get_channel_art(self): - """download channel art for new channels""" - channel_id = self.youtube_id - channel_thumb = self.json_data["channel_thumb_url"] - channel_banner = self.json_data["channel_banner_url"] - ThumbManager().download_chan( - [(channel_id, channel_thumb, channel_banner)] - ) - - def sync_to_videos(self): - """sync new channel_dict to all videos of channel""" - # add ingest pipeline - processors = [] - for field, value in self.json_data.items(): - line = {"set": {"field": "channel." + field, "value": value}} - processors.append(line) - data = {"description": self.youtube_id, "processors": processors} - ingest_path = f"_ingest/pipeline/{self.youtube_id}" - _, _ = ElasticWrap(ingest_path).put(data) - # apply pipeline - data = {"query": {"match": {"channel.channel_id": self.youtube_id}}} - update_path = f"ta_video/_update_by_query?pipeline={self.youtube_id}" - _, _ = ElasticWrap(update_path).post(data) - - def get_folder_path(self): - """get folder where media files get stored""" - channel_name = self.json_data["channel_name"] - folder_name = clean_string(channel_name) - folder_path = os.path.join(self.app_conf["videos"], folder_name) - return folder_path - - def delete_es_videos(self): - """delete all channel documents from elasticsearch""" - data = { - "query": { - "term": {"channel.channel_id": {"value": self.youtube_id}} - } - } - _, _ = ElasticWrap("ta_video/_delete_by_query").post(data) - - def delete_playlists(self): - """delete all indexed playlist from es""" - all_playlists = self.get_indexed_playlists() - for playlist in all_playlists: - playlist_id = playlist["playlist_id"] - YoutubePlaylist(playlist_id).delete_metadata() - - def delete_channel(self): - """delete channel and all videos""" - print(f"{self.youtube_id}: delete channel") - self.get_from_es() - folder_path = self.get_folder_path() - print(f"{self.youtube_id}: delete all media files") - try: - all_videos = os.listdir(folder_path) - for video in all_videos: - video_path = os.path.join(folder_path, video) - os.remove(video_path) - os.rmdir(folder_path) - except FileNotFoundError: - print(f"no videos found for {folder_path}") - - print(f"{self.youtube_id}: delete indexed playlists") - self.delete_playlists() - print(f"{self.youtube_id}: delete indexed videos") - self.delete_es_videos() - self.del_in_es() - - def get_all_playlists(self): - """get all playlists owned by this channel""" - url = ( - f"https://www.youtube.com/channel/{self.youtube_id}" - + "/playlists?view=1&sort=dd&shelf_id=0" - ) - obs = { - "quiet": True, - "skip_download": True, - "extract_flat": True, - } - playlists = yt_dlp.YoutubeDL(obs).extract_info(url) - all_entries = [(i["id"], i["title"]) for i in playlists["entries"]] - - return all_entries - - def get_indexed_playlists(self): - """get all indexed playlists from channel""" - data = { - "query": { - "term": {"playlist_channel_id": {"value": self.youtube_id}} - }, - "sort": [{"playlist_channel.keyword": {"order": "desc"}}], - } - all_playlists = 
IndexPaginate("ta_playlist", data).get_results() - return all_playlists - - -class YoutubePlaylist(YouTubeItem): - """represents a single youtube playlist""" - - es_path = False - index_name = "ta_playlist" - yt_obs = { - "default_search": "ytsearch", - "quiet": True, - "skip_download": True, - "extract_flat": True, - } - yt_base = "https://www.youtube.com/playlist?list=" - - def __init__(self, youtube_id): - super().__init__(youtube_id) - self.es_path = f"{self.index_name}/_doc/{youtube_id}" - self.all_members = False - self.nav = False - self.all_youtube_ids = [] - - def build_json(self, scrape=False): - """collection to create json_data""" - if not scrape: - self.get_from_es() - - if scrape or not self.json_data: - self.get_from_youtube() - self.process_youtube_meta() - self.get_entries() - self.json_data["playlist_entries"] = self.all_members - self.get_playlist_art() - - def process_youtube_meta(self): - """extract relevant fields from youtube""" - self.json_data = { - "playlist_id": self.youtube_id, - "playlist_active": True, - "playlist_subscribed": False, - "playlist_name": self.youtube_meta["title"], - "playlist_channel": self.youtube_meta["channel"], - "playlist_channel_id": self.youtube_meta["channel_id"], - "playlist_thumbnail": self.youtube_meta["thumbnails"][-1]["url"], - "playlist_description": self.youtube_meta["description"] or False, - "playlist_last_refresh": int(datetime.now().strftime("%s")), - } - - def get_entries(self, playlistend=False): - """get all videos in playlist""" - if playlistend: - # implement playlist end - print(playlistend) - all_members = [] - for idx, entry in enumerate(self.youtube_meta["entries"]): - if self.all_youtube_ids: - downloaded = entry["id"] in self.all_youtube_ids - else: - downloaded = False - if not entry["uploader"]: - continue - to_append = { - "youtube_id": entry["id"], - "title": entry["title"], - "uploader": entry["uploader"], - "idx": idx, - "downloaded": downloaded, - } - all_members.append(to_append) - - self.all_members = all_members - - @staticmethod - def get_playlist_art(): - """download artwork of playlist""" - thumbnails = ThumbManager() - missing_playlists = thumbnails.get_missing_playlists() - thumbnails.download_playlist(missing_playlists) - - def add_vids_to_playlist(self): - """sync the playlist id to videos""" - script = ( - 'if (!ctx._source.containsKey("playlist")) ' - + "{ctx._source.playlist = [params.playlist]} " - + "else if (!ctx._source.playlist.contains(params.playlist)) " - + "{ctx._source.playlist.add(params.playlist)} " - + "else {ctx.op = 'none'}" - ) - - bulk_list = [] - for entry in self.json_data["playlist_entries"]: - video_id = entry["youtube_id"] - action = {"update": {"_id": video_id, "_index": "ta_video"}} - source = { - "script": { - "source": script, - "lang": "painless", - "params": {"playlist": self.youtube_id}, - } - } - bulk_list.append(json.dumps(action)) - bulk_list.append(json.dumps(source)) - - # add last newline - bulk_list.append("\n") - query_str = "\n".join(bulk_list) - - ElasticWrap("_bulk").post(query_str, ndjson=True) - - def update_playlist(self): - """update metadata for playlist with data from YouTube""" - self.get_from_es() - subscribed = self.json_data["playlist_subscribed"] - self.get_from_youtube() - if not self.json_data: - # return false to deactivate - return False - - self.json_data["playlist_subscribed"] = subscribed - self.upload_to_es() - return True - - def build_nav(self, youtube_id): - """find next and previous in playlist of a given youtube_id""" - 
all_entries_available = self.json_data["playlist_entries"] - all_entries = [i for i in all_entries_available if i["downloaded"]] - current = [i for i in all_entries if i["youtube_id"] == youtube_id] - # stop if not found or playlist of 1 - if not current or not len(all_entries) > 1: - return - - current_idx = all_entries.index(current[0]) - if current_idx == 0: - previous_item = False - else: - previous_item = all_entries[current_idx - 1] - prev_thumb = ThumbManager().vid_thumb_path( - previous_item["youtube_id"] - ) - previous_item["vid_thumb"] = prev_thumb - - if current_idx == len(all_entries) - 1: - next_item = False - else: - next_item = all_entries[current_idx + 1] - next_thumb = ThumbManager().vid_thumb_path(next_item["youtube_id"]) - next_item["vid_thumb"] = next_thumb - - self.nav = { - "playlist_meta": { - "current_idx": current[0]["idx"], - "playlist_id": self.youtube_id, - "playlist_name": self.json_data["playlist_name"], - "playlist_channel": self.json_data["playlist_channel"], - }, - "playlist_previous": previous_item, - "playlist_next": next_item, - } - return - - def delete_metadata(self): - """delete metadata for playlist""" - script = ( - "ctx._source.playlist.removeAll(" - + "Collections.singleton(params.playlist)) " - ) - data = { - "query": { - "term": {"playlist.keyword": {"value": self.youtube_id}} - }, - "script": { - "source": script, - "lang": "painless", - "params": {"playlist": self.youtube_id}, - }, - } - _, _ = ElasticWrap("ta_video/_update_by_query").post(data) - self.del_in_es() - - def delete_videos_playlist(self): - """delete playlist with all videos""" - print(f"{self.youtube_id}: delete playlist") - self.get_from_es() - all_youtube_id = [ - i["youtube_id"] - for i in self.json_data["playlist_entries"] - if i["downloaded"] - ] - for youtube_id in all_youtube_id: - YoutubeVideo(youtube_id).delete_media_file() - - self.delete_metadata() - - -class WatchState: - """handle watched checkbox for videos and channels""" - - CONFIG = AppConfig().config - ES_URL = CONFIG["application"]["es_url"] - ES_AUTH = CONFIG["application"]["es_auth"] - HEADERS = {"Content-type": "application/json"} - - def __init__(self, youtube_id): - self.youtube_id = youtube_id - self.stamp = int(datetime.now().strftime("%s")) - - def mark_as_watched(self): - """update es with new watched value""" - url_type = self.dedect_type() - if url_type == "video": - self.mark_vid_watched() - elif url_type == "channel": - self.mark_channel_watched() - elif url_type == "playlist": - self.mark_playlist_watched() - - print(f"marked {self.youtube_id} as watched") - - def mark_as_unwatched(self): - """revert watched state to false""" - url_type = self.dedect_type() - if url_type == "video": - self.mark_vid_watched(revert=True) - - print(f"revert {self.youtube_id} as unwatched") - - def dedect_type(self): - """find youtube id type""" - print(self.youtube_id) - url_process = UrlListParser(self.youtube_id).process_list() - url_type = url_process[0]["type"] - return url_type - - def mark_vid_watched(self, revert=False): - """change watched status of single video""" - url = self.ES_URL + "/ta_video/_update/" + self.youtube_id - data = { - "doc": {"player": {"watched": True, "watched_date": self.stamp}} - } - if revert: - data["doc"]["player"]["watched"] = False - - payload = json.dumps(data) - request = requests.post( - url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH - ) - if not request.ok: - print(request.text) - raise ValueError("failed to mark video as watched") - - def 
mark_channel_watched(self): - """change watched status of every video in channel""" - data = { - "query": { - "bool": { - "must": [ - { - "term": { - "channel.channel_id": { - "value": self.youtube_id - } - } - }, - {"term": {"player.watched": {"value": False}}}, - ] - } - }, - "script": { - "source": "ctx._source.player['watched'] = true", - "lang": "painless", - }, - } - payload = json.dumps(data) - url = f"{self.ES_URL}/ta_video/_update_by_query" - request = requests.post( - url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH - ) - if not request.ok: - print(request.text) - raise ValueError("failed mark channel as watched") - - def mark_playlist_watched(self): - """change watched state of all videos in playlist""" - data = { - "query": { - "bool": { - "must": [ - { - "term": { - "playlist.keyword": {"value": self.youtube_id} - } - }, - {"term": {"player.watched": {"value": False}}}, - ] - } - }, - "script": { - "source": "ctx._source.player['watched'] = true", - "lang": "painless", - }, - } - payload = json.dumps(data) - url = f"{self.ES_URL}/ta_video/_update_by_query" - request = requests.post( - url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH - ) - if not request.ok: - print(request.text) - raise ValueError("failed mark playlist as watched") - - -def index_new_video(youtube_id): - """combined classes to create new video in index""" - video = YoutubeVideo(youtube_id) - video.build_json() - if not video.json_data: - raise ValueError("failed to get metadata for " + youtube_id) - - video.upload_to_es() - return video.json_data diff --git a/tubearchivist/home/src/index/__init__.py b/tubearchivist/home/src/index/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tubearchivist/home/src/index/channel.py b/tubearchivist/home/src/index/channel.py new file mode 100644 index 0000000..a8b1525 --- /dev/null +++ b/tubearchivist/home/src/index/channel.py @@ -0,0 +1,262 @@ +"""handle single channel in index""" + +import json +import os +import re +from datetime import datetime + +import requests +import yt_dlp +from bs4 import BeautifulSoup +from home.src.download.thumbnails import ThumbManager +from home.src.es.connect import ElasticWrap, IndexPaginate +from home.src.index.generic import YouTubeItem +from home.src.index.playlist import YoutubePlaylist +from home.src.ta.helper import clean_string + + +class ChannelScraper: + """custom scraper using bs4 to scrape channel about page + will be able to be integrated into yt-dlp + once #2237 and #2350 are merged upstream + """ + + def __init__(self, channel_id): + self.channel_id = channel_id + self.soup = False + self.yt_json = False + self.json_data = False + + def get_json(self): + """main method to return channel dict""" + self.get_soup() + self._extract_yt_json() + self._parse_channel_main() + self._parse_channel_meta() + return self.json_data + + def get_soup(self): + """return soup from youtube""" + print(f"{self.channel_id}: scrape channel data from youtube") + url = f"https://www.youtube.com/channel/{self.channel_id}/about?hl=en" + cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"} + response = requests.get(url, cookies=cookies) + if response.ok: + channel_page = response.text + else: + print(f"{self.channel_id}: failed to extract channel info") + raise ConnectionError + self.soup = BeautifulSoup(channel_page, "html.parser") + + def _extract_yt_json(self): + """parse soup and get ytInitialData json""" + all_scripts = self.soup.find("body").find_all("script") + for script in all_scripts: + if "var 
ytInitialData = " in str(script): + script_content = str(script) + break + # extract payload + script_content = script_content.split("var ytInitialData = ")[1] + json_raw = script_content.rstrip(";") + self.yt_json = json.loads(json_raw) + + def _parse_channel_main(self): + """extract maintab values from scraped channel json data""" + main_tab = self.yt_json["header"]["c4TabbedHeaderRenderer"] + # build and return dict + self.json_data = { + "channel_active": True, + "channel_last_refresh": int(datetime.now().strftime("%s")), + "channel_subs": self._get_channel_subs(main_tab), + "channel_name": main_tab["title"], + "channel_banner_url": self._get_thumbnails(main_tab, "banner"), + "channel_tvart_url": self._get_thumbnails(main_tab, "tvBanner"), + "channel_id": self.channel_id, + "channel_subscribed": False, + } + + @staticmethod + def _get_thumbnails(main_tab, thumb_name): + """extract banner url from main_tab""" + try: + all_banners = main_tab[thumb_name]["thumbnails"] + banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"] + except KeyError: + banner = False + + return banner + + @staticmethod + def _get_channel_subs(main_tab): + """process main_tab to get channel subs as int""" + try: + sub_text_simple = main_tab["subscriberCountText"]["simpleText"] + sub_text = sub_text_simple.split(" ")[0] + if sub_text[-1] == "K": + channel_subs = int(float(sub_text.replace("K", "")) * 1000) + elif sub_text[-1] == "M": + channel_subs = int(float(sub_text.replace("M", "")) * 1000000) + elif int(sub_text) >= 0: + channel_subs = int(sub_text) + else: + message = f"{sub_text} not dealt with" + print(message) + except KeyError: + channel_subs = 0 + + return channel_subs + + def _parse_channel_meta(self): + """extract meta tab values from channel payload""" + # meta tab + meta_tab = self.yt_json["metadata"]["channelMetadataRenderer"] + all_thumbs = meta_tab["avatar"]["thumbnails"] + thumb_url = sorted(all_thumbs, key=lambda k: k["width"])[-1]["url"] + # stats tab + renderer = "twoColumnBrowseResultsRenderer" + all_tabs = self.yt_json["contents"][renderer]["tabs"] + for tab in all_tabs: + if "tabRenderer" in tab.keys(): + if tab["tabRenderer"]["title"] == "About": + about_tab = tab["tabRenderer"]["content"][ + "sectionListRenderer" + ]["contents"][0]["itemSectionRenderer"]["contents"][0][ + "channelAboutFullMetadataRenderer" + ] + break + try: + channel_views_text = about_tab["viewCountText"]["simpleText"] + channel_views = int(re.sub(r"\D", "", channel_views_text)) + except KeyError: + channel_views = 0 + + self.json_data.update( + { + "channel_description": meta_tab["description"], + "channel_thumb_url": thumb_url, + "channel_views": channel_views, + } + ) + + +class YoutubeChannel(YouTubeItem): + """represents a single youtube channel""" + + es_path = False + index_name = "ta_channel" + yt_base = "https://www.youtube.com/channel/" + + def __init__(self, youtube_id): + super().__init__(youtube_id) + self.es_path = f"{self.index_name}/_doc/{youtube_id}" + + def build_json(self, upload=False): + """get from es or from youtube""" + self.get_from_es() + if self.json_data: + return + + self.get_from_youtube() + if upload: + self.upload_to_es() + return + + def get_from_youtube(self): + """use bs4 to scrape channel about page""" + self.json_data = ChannelScraper(self.youtube_id).get_json() + self.get_channel_art() + + def get_channel_art(self): + """download channel art for new channels""" + channel_id = self.youtube_id + channel_thumb = self.json_data["channel_thumb_url"] + channel_banner = 
self.json_data["channel_banner_url"] + ThumbManager().download_chan( + [(channel_id, channel_thumb, channel_banner)] + ) + + def sync_to_videos(self): + """sync new channel_dict to all videos of channel""" + # add ingest pipeline + processors = [] + for field, value in self.json_data.items(): + line = {"set": {"field": "channel." + field, "value": value}} + processors.append(line) + data = {"description": self.youtube_id, "processors": processors} + ingest_path = f"_ingest/pipeline/{self.youtube_id}" + _, _ = ElasticWrap(ingest_path).put(data) + # apply pipeline + data = {"query": {"match": {"channel.channel_id": self.youtube_id}}} + update_path = f"ta_video/_update_by_query?pipeline={self.youtube_id}" + _, _ = ElasticWrap(update_path).post(data) + + def get_folder_path(self): + """get folder where media files get stored""" + channel_name = self.json_data["channel_name"] + folder_name = clean_string(channel_name) + folder_path = os.path.join(self.app_conf["videos"], folder_name) + return folder_path + + def delete_es_videos(self): + """delete all channel documents from elasticsearch""" + data = { + "query": { + "term": {"channel.channel_id": {"value": self.youtube_id}} + } + } + _, _ = ElasticWrap("ta_video/_delete_by_query").post(data) + + def delete_playlists(self): + """delete all indexed playlist from es""" + all_playlists = self.get_indexed_playlists() + for playlist in all_playlists: + playlist_id = playlist["playlist_id"] + YoutubePlaylist(playlist_id).delete_metadata() + + def delete_channel(self): + """delete channel and all videos""" + print(f"{self.youtube_id}: delete channel") + self.get_from_es() + folder_path = self.get_folder_path() + print(f"{self.youtube_id}: delete all media files") + try: + all_videos = os.listdir(folder_path) + for video in all_videos: + video_path = os.path.join(folder_path, video) + os.remove(video_path) + os.rmdir(folder_path) + except FileNotFoundError: + print(f"no videos found for {folder_path}") + + print(f"{self.youtube_id}: delete indexed playlists") + self.delete_playlists() + print(f"{self.youtube_id}: delete indexed videos") + self.delete_es_videos() + self.del_in_es() + + def get_all_playlists(self): + """get all playlists owned by this channel""" + url = ( + f"https://www.youtube.com/channel/{self.youtube_id}" + + "/playlists?view=1&sort=dd&shelf_id=0" + ) + obs = { + "quiet": True, + "skip_download": True, + "extract_flat": True, + } + playlists = yt_dlp.YoutubeDL(obs).extract_info(url) + all_entries = [(i["id"], i["title"]) for i in playlists["entries"]] + + return all_entries + + def get_indexed_playlists(self): + """get all indexed playlists from channel""" + data = { + "query": { + "term": {"playlist_channel_id": {"value": self.youtube_id}} + }, + "sort": [{"playlist_channel.keyword": {"order": "desc"}}], + } + all_playlists = IndexPaginate("ta_playlist", data).get_results() + return all_playlists diff --git a/tubearchivist/home/src/reindex.py b/tubearchivist/home/src/index/filesystem.py similarity index 52% rename from tubearchivist/home/src/reindex.py rename to tubearchivist/home/src/index/filesystem.py index bb93cae..5a33501 100644 --- a/tubearchivist/home/src/reindex.py +++ b/tubearchivist/home/src/index/filesystem.py @@ -11,276 +11,15 @@ import re import shutil import subprocess from datetime import datetime -from math import ceil -from time import sleep import requests -from home.src.config import AppConfig -from home.src.download import ChannelSubscription, PendingList, VideoDownloader -from home.src.helper import ( - 
RedisArchivist, - clean_string, - get_total_hits, - ignore_filelist, -) -from home.src.index import ( - YoutubeChannel, - YoutubePlaylist, - YoutubeVideo, - index_new_video, -) -from home.src.thumbnails import ThumbManager - - -class Reindex: - """check for outdated documents and refresh data from youtube""" - - def __init__(self): - # config - config = AppConfig().config - self.sleep_interval = config["downloads"]["sleep_interval"] - self.es_url = config["application"]["es_url"] - self.es_auth = config["application"]["es_auth"] - self.refresh_interval = config["scheduler"]["check_reindex_days"] - self.integrate_ryd = config["downloads"]["integrate_ryd"] - # scan - self.all_youtube_ids = False - self.all_channel_ids = False - self.all_playlist_ids = False - - def get_daily(self): - """get daily refresh values""" - total_videos = get_total_hits( - "ta_video", self.es_url, self.es_auth, "active" - ) - video_daily = ceil(total_videos / self.refresh_interval * 1.2) - total_channels = get_total_hits( - "ta_channel", self.es_url, self.es_auth, "channel_active" - ) - channel_daily = ceil(total_channels / self.refresh_interval * 1.2) - total_playlists = get_total_hits( - "ta_playlist", self.es_url, self.es_auth, "playlist_active" - ) - playlist_daily = ceil(total_playlists / self.refresh_interval * 1.2) - return (video_daily, channel_daily, playlist_daily) - - def get_outdated_vids(self, size): - """get daily videos to refresh""" - headers = {"Content-type": "application/json"} - now = int(datetime.now().strftime("%s")) - now_lte = now - self.refresh_interval * 24 * 60 * 60 - data = { - "size": size, - "query": { - "bool": { - "must": [ - {"match": {"active": True}}, - {"range": {"vid_last_refresh": {"lte": now_lte}}}, - ] - } - }, - "sort": [{"vid_last_refresh": {"order": "asc"}}], - "_source": False, - } - query_str = json.dumps(data) - url = self.es_url + "/ta_video/_search" - response = requests.get( - url, data=query_str, headers=headers, auth=self.es_auth - ) - if not response.ok: - print(response.text) - response_dict = json.loads(response.text) - all_youtube_ids = [i["_id"] for i in response_dict["hits"]["hits"]] - return all_youtube_ids - - def get_unrated_vids(self): - """get all videos without rating if ryd integration is enabled""" - headers = {"Content-type": "application/json"} - data = { - "size": 200, - "query": { - "bool": { - "must_not": [{"exists": {"field": "stats.average_rating"}}] - } - }, - } - query_str = json.dumps(data) - url = self.es_url + "/ta_video/_search" - response = requests.get( - url, data=query_str, headers=headers, auth=self.es_auth - ) - if not response.ok: - print(response.text) - response_dict = json.loads(response.text) - missing_rating = [i["_id"] for i in response_dict["hits"]["hits"]] - self.all_youtube_ids = self.all_youtube_ids + missing_rating - - def get_outdated_channels(self, size): - """get daily channels to refresh""" - headers = {"Content-type": "application/json"} - now = int(datetime.now().strftime("%s")) - now_lte = now - self.refresh_interval * 24 * 60 * 60 - data = { - "size": size, - "query": { - "bool": { - "must": [ - {"match": {"channel_active": True}}, - {"range": {"channel_last_refresh": {"lte": now_lte}}}, - ] - } - }, - "sort": [{"channel_last_refresh": {"order": "asc"}}], - "_source": False, - } - query_str = json.dumps(data) - url = self.es_url + "/ta_channel/_search" - response = requests.get( - url, data=query_str, headers=headers, auth=self.es_auth - ) - if not response.ok: - print(response.text) - response_dict = 
json.loads(response.text) - all_channel_ids = [i["_id"] for i in response_dict["hits"]["hits"]] - return all_channel_ids - - def get_outdated_playlists(self, size): - """get daily outdated playlists to refresh""" - headers = {"Content-type": "application/json"} - now = int(datetime.now().strftime("%s")) - now_lte = now - self.refresh_interval * 24 * 60 * 60 - data = { - "size": size, - "query": { - "bool": { - "must": [ - {"match": {"playlist_active": True}}, - {"range": {"playlist_last_refresh": {"lte": now_lte}}}, - ] - } - }, - "sort": [{"playlist_last_refresh": {"order": "asc"}}], - "_source": False, - } - query_str = json.dumps(data) - url = self.es_url + "/ta_playlist/_search" - response = requests.get( - url, data=query_str, headers=headers, auth=self.es_auth - ) - if not response.ok: - print(response.text) - response_dict = json.loads(response.text) - all_playlist_ids = [i["_id"] for i in response_dict["hits"]["hits"]] - return all_playlist_ids - - def check_outdated(self): - """add missing vids and channels""" - video_daily, channel_daily, playlist_daily = self.get_daily() - self.all_youtube_ids = self.get_outdated_vids(video_daily) - self.all_channel_ids = self.get_outdated_channels(channel_daily) - self.all_playlist_ids = self.get_outdated_playlists(playlist_daily) - if self.integrate_ryd: - self.get_unrated_vids() - - def rescrape_all_channels(self): - """sync new data from channel to all matching videos""" - sleep_interval = self.sleep_interval - channel_sub_handler = ChannelSubscription() - all_channels = channel_sub_handler.get_channels(subscribed_only=False) - all_channel_ids = [i["channel_id"] for i in all_channels] - - for channel_id in all_channel_ids: - channel = YoutubeChannel(channel_id) - subscribed = channel.json_data["channel_subscribed"] - channel.get_from_youtube() - channel.json_data["channel_subscribed"] = subscribed - channel.upload_to_es() - channel.sync_to_videos() - - if sleep_interval: - sleep(sleep_interval) - - @staticmethod - def reindex_single_video(youtube_id): - """refresh data for single video""" - video = YoutubeVideo(youtube_id) - - # read current state - video.get_from_es() - player = video.json_data["player"] - date_downloaded = video.json_data["date_downloaded"] - channel_dict = video.json_data["channel"] - playlist = video.json_data.get("playlist") - - # get new - video.build_json() - if not video.json_data: - video.deactivate() - - # add back - video.json_data["player"] = player - video.json_data["date_downloaded"] = date_downloaded - video.json_data["channel"] = channel_dict - if playlist: - video.json_data["playlist"] = playlist - - video.upload_to_es() - - thumb_handler = ThumbManager() - thumb_handler.delete_vid_thumb(youtube_id) - to_download = (youtube_id, video.json_data["vid_thumb_url"]) - thumb_handler.download_vid([to_download], notify=False) - - @staticmethod - def reindex_single_channel(channel_id): - """refresh channel data and sync to videos""" - channel = YoutubeChannel(channel_id) - channel.get_from_es() - subscribed = channel.json_data["channel_subscribed"] - channel.get_from_youtube() - channel.json_data["channel_subscribed"] = subscribed - channel.upload_to_es() - channel.sync_to_videos() - - @staticmethod - def reindex_single_playlist(playlist_id, all_indexed_ids): - """refresh playlist data""" - playlist = YoutubePlaylist(playlist_id) - playlist.get_from_es() - subscribed = playlist.json_data["playlist_subscribed"] - playlist.all_youtube_ids = all_indexed_ids - playlist.build_json(scrape=True) - if not playlist.json_data: 
- playlist.deactivate() - return - - playlist.json_data["playlist_subscribed"] = subscribed - playlist.upload_to_es() - return - - def reindex(self): - """reindex what's needed""" - # videos - print(f"reindexing {len(self.all_youtube_ids)} videos") - for youtube_id in self.all_youtube_ids: - self.reindex_single_video(youtube_id) - if self.sleep_interval: - sleep(self.sleep_interval) - # channels - print(f"reindexing {len(self.all_channel_ids)} channels") - for channel_id in self.all_channel_ids: - self.reindex_single_channel(channel_id) - if self.sleep_interval: - sleep(self.sleep_interval) - # playlist - print(f"reindexing {len(self.all_playlist_ids)} playlists") - if self.all_playlist_ids: - all_indexed = PendingList().get_all_indexed() - all_indexed_ids = [i["youtube_id"] for i in all_indexed] - for playlist_id in self.all_playlist_ids: - self.reindex_single_playlist(playlist_id, all_indexed_ids) - if self.sleep_interval: - sleep(self.sleep_interval) +from home.src.download.queue import PendingList +from home.src.download.yt_dlp_handler import VideoDownloader +from home.src.index.reindex import Reindex +from home.src.index.video import index_new_video +from home.src.ta.config import AppConfig +from home.src.ta.helper import clean_string, ignore_filelist +from home.src.ta.ta_redis import RedisArchivist class FilesystemScanner: diff --git a/tubearchivist/home/src/index/generic.py b/tubearchivist/home/src/index/generic.py new file mode 100644 index 0000000..6f88e37 --- /dev/null +++ b/tubearchivist/home/src/index/generic.py @@ -0,0 +1,139 @@ +"""generic base class for indexing documents""" + +import math + +import yt_dlp +from home.src.es.connect import ElasticWrap +from home.src.ta.config import AppConfig +from home.src.ta.ta_redis import RedisArchivist + + +class YouTubeItem: + """base class for youtube""" + + es_path = False + index_name = False + yt_base = False + yt_obs = { + "quiet": True, + "default_search": "ytsearch", + "skip_download": True, + "check_formats": "selected", + "noplaylist": True, + } + + def __init__(self, youtube_id): + self.youtube_id = youtube_id + self.config = False + self.app_conf = False + self.youtube_meta = False + self.json_data = False + self._get_conf() + + def _get_conf(self): + """read user conf""" + self.config = AppConfig().config + self.app_conf = self.config["application"] + + def get_from_youtube(self): + """use yt-dlp to get meta data from youtube""" + print(f"{self.youtube_id}: get metadata from youtube") + try: + yt_item = yt_dlp.YoutubeDL(self.yt_obs) + response = yt_item.extract_info(self.yt_base + self.youtube_id) + except ( + yt_dlp.utils.ExtractorError, + yt_dlp.utils.DownloadError, + ): + print(f"{self.youtube_id}: failed to get info from youtube") + self.youtube_meta = False + + self.youtube_meta = response + + def get_from_es(self): + """get indexed data from elastic search""" + print(f"{self.youtube_id}: get metadata from es") + response, _ = ElasticWrap(f"{self.es_path}").get() + source = response.get("_source") + self.json_data = source + + def upload_to_es(self): + """add json_data to elastic""" + _, _ = ElasticWrap(self.es_path).put(self.json_data, refresh=True) + + def deactivate(self): + """deactivate document in es""" + key_match = { + "video": "active", + "channel": "channel_active", + "playlist": "playlist_active", + } + update_path = f"{self.index_name}/_update/{self.youtube_id}" + data = { + "script": f"ctx._source.{key_match.get(self.index_name)} = false" + } + _, _ = ElasticWrap(update_path).post(data) + + def 
del_in_es(self): + """delete item from elastic search""" + print(f"{self.youtube_id}: delete from es") + _, _ = ElasticWrap(self.es_path).delete() + + +class Pagination: + """ + figure out the pagination based on page size and total_hits + """ + + def __init__(self, page_get, user_id, search_get=False): + self.user_id = user_id + self.page_size = self.get_page_size() + self.page_get = page_get + self.search_get = search_get + self.pagination = self.first_guess() + + def get_page_size(self): + """get default or user modified page_size""" + key = f"{self.user_id}:page_size" + page_size = RedisArchivist().get_message(key)["status"] + if not page_size: + config = AppConfig().config + page_size = config["archive"]["page_size"] + + return page_size + + def first_guess(self): + """build first guess before api call""" + page_get = self.page_get + if page_get in [0, 1]: + page_from = 0 + prev_pages = False + elif page_get > 1: + page_from = (page_get - 1) * self.page_size + prev_pages = [ + i for i in range(page_get - 1, page_get - 6, -1) if i > 1 + ] + prev_pages.reverse() + pagination = { + "page_size": self.page_size, + "page_from": page_from, + "prev_pages": prev_pages, + "current_page": page_get, + } + if self.search_get: + pagination.update({"search_get": self.search_get}) + return pagination + + def validate(self, total_hits): + """validate pagination with total_hits after making api call""" + page_get = self.page_get + max_pages = math.ceil(total_hits / self.page_size) + if page_get < max_pages and max_pages > 1: + self.pagination["last_page"] = max_pages + else: + self.pagination["last_page"] = False + next_pages = [ + i for i in range(page_get + 1, page_get + 6) if 1 < i < max_pages + ] + + self.pagination["next_pages"] = next_pages diff --git a/tubearchivist/home/src/index/playlist.py b/tubearchivist/home/src/index/playlist.py new file mode 100644 index 0000000..2bd2d3a --- /dev/null +++ b/tubearchivist/home/src/index/playlist.py @@ -0,0 +1,201 @@ +"""handle playlist""" + +import json +from datetime import datetime + +from home.src.download.thumbnails import ThumbManager +from home.src.es.connect import ElasticWrap +from home.src.index.generic import YouTubeItem +from home.src.index.video import YoutubeVideo + + +class YoutubePlaylist(YouTubeItem): + """represents a single youtube playlist""" + + es_path = False + index_name = "ta_playlist" + yt_obs = { + "default_search": "ytsearch", + "quiet": True, + "skip_download": True, + "extract_flat": True, + } + yt_base = "https://www.youtube.com/playlist?list=" + + def __init__(self, youtube_id): + super().__init__(youtube_id) + self.es_path = f"{self.index_name}/_doc/{youtube_id}" + self.all_members = False + self.nav = False + self.all_youtube_ids = [] + + def build_json(self, scrape=False): + """collection to create json_data""" + if not scrape: + self.get_from_es() + + if scrape or not self.json_data: + self.get_from_youtube() + self.process_youtube_meta() + self.get_entries() + self.json_data["playlist_entries"] = self.all_members + self.get_playlist_art() + + def process_youtube_meta(self): + """extract relevant fields from youtube""" + self.json_data = { + "playlist_id": self.youtube_id, + "playlist_active": True, + "playlist_subscribed": False, + "playlist_name": self.youtube_meta["title"], + "playlist_channel": self.youtube_meta["channel"], + "playlist_channel_id": self.youtube_meta["channel_id"], + "playlist_thumbnail": self.youtube_meta["thumbnails"][-1]["url"], + "playlist_description": self.youtube_meta["description"] or False, + 
"playlist_last_refresh": int(datetime.now().strftime("%s")), + } + + def get_entries(self, playlistend=False): + """get all videos in playlist""" + if playlistend: + # implement playlist end + print(playlistend) + all_members = [] + for idx, entry in enumerate(self.youtube_meta["entries"]): + if self.all_youtube_ids: + downloaded = entry["id"] in self.all_youtube_ids + else: + downloaded = False + if not entry["uploader"]: + continue + to_append = { + "youtube_id": entry["id"], + "title": entry["title"], + "uploader": entry["uploader"], + "idx": idx, + "downloaded": downloaded, + } + all_members.append(to_append) + + self.all_members = all_members + + @staticmethod + def get_playlist_art(): + """download artwork of playlist""" + thumbnails = ThumbManager() + missing_playlists = thumbnails.get_missing_playlists() + thumbnails.download_playlist(missing_playlists) + + def add_vids_to_playlist(self): + """sync the playlist id to videos""" + script = ( + 'if (!ctx._source.containsKey("playlist")) ' + + "{ctx._source.playlist = [params.playlist]} " + + "else if (!ctx._source.playlist.contains(params.playlist)) " + + "{ctx._source.playlist.add(params.playlist)} " + + "else {ctx.op = 'none'}" + ) + + bulk_list = [] + for entry in self.json_data["playlist_entries"]: + video_id = entry["youtube_id"] + action = {"update": {"_id": video_id, "_index": "ta_video"}} + source = { + "script": { + "source": script, + "lang": "painless", + "params": {"playlist": self.youtube_id}, + } + } + bulk_list.append(json.dumps(action)) + bulk_list.append(json.dumps(source)) + + # add last newline + bulk_list.append("\n") + query_str = "\n".join(bulk_list) + + ElasticWrap("_bulk").post(query_str, ndjson=True) + + def update_playlist(self): + """update metadata for playlist with data from YouTube""" + self.get_from_es() + subscribed = self.json_data["playlist_subscribed"] + self.get_from_youtube() + if not self.json_data: + # return false to deactivate + return False + + self.json_data["playlist_subscribed"] = subscribed + self.upload_to_es() + return True + + def build_nav(self, youtube_id): + """find next and previous in playlist of a given youtube_id""" + all_entries_available = self.json_data["playlist_entries"] + all_entries = [i for i in all_entries_available if i["downloaded"]] + current = [i for i in all_entries if i["youtube_id"] == youtube_id] + # stop if not found or playlist of 1 + if not current or not len(all_entries) > 1: + return + + current_idx = all_entries.index(current[0]) + if current_idx == 0: + previous_item = False + else: + previous_item = all_entries[current_idx - 1] + prev_thumb = ThumbManager().vid_thumb_path( + previous_item["youtube_id"] + ) + previous_item["vid_thumb"] = prev_thumb + + if current_idx == len(all_entries) - 1: + next_item = False + else: + next_item = all_entries[current_idx + 1] + next_thumb = ThumbManager().vid_thumb_path(next_item["youtube_id"]) + next_item["vid_thumb"] = next_thumb + + self.nav = { + "playlist_meta": { + "current_idx": current[0]["idx"], + "playlist_id": self.youtube_id, + "playlist_name": self.json_data["playlist_name"], + "playlist_channel": self.json_data["playlist_channel"], + }, + "playlist_previous": previous_item, + "playlist_next": next_item, + } + return + + def delete_metadata(self): + """delete metadata for playlist""" + script = ( + "ctx._source.playlist.removeAll(" + + "Collections.singleton(params.playlist)) " + ) + data = { + "query": { + "term": {"playlist.keyword": {"value": self.youtube_id}} + }, + "script": { + "source": script, + 
"lang": "painless", + "params": {"playlist": self.youtube_id}, + }, + } + _, _ = ElasticWrap("ta_video/_update_by_query").post(data) + self.del_in_es() + + def delete_videos_playlist(self): + """delete playlist with all videos""" + print(f"{self.youtube_id}: delete playlist") + self.get_from_es() + all_youtube_id = [ + i["youtube_id"] + for i in self.json_data["playlist_entries"] + if i["downloaded"] + ] + for youtube_id in all_youtube_id: + YoutubeVideo(youtube_id).delete_media_file() + + self.delete_metadata() diff --git a/tubearchivist/home/src/index/reindex.py b/tubearchivist/home/src/index/reindex.py new file mode 100644 index 0000000..b59d5a5 --- /dev/null +++ b/tubearchivist/home/src/index/reindex.py @@ -0,0 +1,267 @@ +"""periodically refresh documents""" + +import json +from datetime import datetime +from math import ceil +from time import sleep + +import requests +from home.src.download.queue import PendingList +from home.src.download.subscriptions import ChannelSubscription +from home.src.download.thumbnails import ThumbManager +from home.src.index.channel import YoutubeChannel +from home.src.index.playlist import YoutubePlaylist +from home.src.index.video import YoutubeVideo +from home.src.ta.config import AppConfig +from home.src.ta.helper import get_total_hits + + +class Reindex: + """check for outdated documents and refresh data from youtube""" + + def __init__(self): + # config + config = AppConfig().config + self.sleep_interval = config["downloads"]["sleep_interval"] + self.es_url = config["application"]["es_url"] + self.es_auth = config["application"]["es_auth"] + self.refresh_interval = config["scheduler"]["check_reindex_days"] + self.integrate_ryd = config["downloads"]["integrate_ryd"] + # scan + self.all_youtube_ids = False + self.all_channel_ids = False + self.all_playlist_ids = False + + def get_daily(self): + """get daily refresh values""" + total_videos = get_total_hits( + "ta_video", self.es_url, self.es_auth, "active" + ) + video_daily = ceil(total_videos / self.refresh_interval * 1.2) + total_channels = get_total_hits( + "ta_channel", self.es_url, self.es_auth, "channel_active" + ) + channel_daily = ceil(total_channels / self.refresh_interval * 1.2) + total_playlists = get_total_hits( + "ta_playlist", self.es_url, self.es_auth, "playlist_active" + ) + playlist_daily = ceil(total_playlists / self.refresh_interval * 1.2) + return (video_daily, channel_daily, playlist_daily) + + def get_outdated_vids(self, size): + """get daily videos to refresh""" + headers = {"Content-type": "application/json"} + now = int(datetime.now().strftime("%s")) + now_lte = now - self.refresh_interval * 24 * 60 * 60 + data = { + "size": size, + "query": { + "bool": { + "must": [ + {"match": {"active": True}}, + {"range": {"vid_last_refresh": {"lte": now_lte}}}, + ] + } + }, + "sort": [{"vid_last_refresh": {"order": "asc"}}], + "_source": False, + } + query_str = json.dumps(data) + url = self.es_url + "/ta_video/_search" + response = requests.get( + url, data=query_str, headers=headers, auth=self.es_auth + ) + if not response.ok: + print(response.text) + response_dict = json.loads(response.text) + all_youtube_ids = [i["_id"] for i in response_dict["hits"]["hits"]] + return all_youtube_ids + + def get_unrated_vids(self): + """get all videos without rating if ryd integration is enabled""" + headers = {"Content-type": "application/json"} + data = { + "size": 200, + "query": { + "bool": { + "must_not": [{"exists": {"field": "stats.average_rating"}}] + } + }, + } + query_str = json.dumps(data) + 
url = self.es_url + "/ta_video/_search" + response = requests.get( + url, data=query_str, headers=headers, auth=self.es_auth + ) + if not response.ok: + print(response.text) + response_dict = json.loads(response.text) + missing_rating = [i["_id"] for i in response_dict["hits"]["hits"]] + self.all_youtube_ids = self.all_youtube_ids + missing_rating + + def get_outdated_channels(self, size): + """get daily channels to refresh""" + headers = {"Content-type": "application/json"} + now = int(datetime.now().strftime("%s")) + now_lte = now - self.refresh_interval * 24 * 60 * 60 + data = { + "size": size, + "query": { + "bool": { + "must": [ + {"match": {"channel_active": True}}, + {"range": {"channel_last_refresh": {"lte": now_lte}}}, + ] + } + }, + "sort": [{"channel_last_refresh": {"order": "asc"}}], + "_source": False, + } + query_str = json.dumps(data) + url = self.es_url + "/ta_channel/_search" + response = requests.get( + url, data=query_str, headers=headers, auth=self.es_auth + ) + if not response.ok: + print(response.text) + response_dict = json.loads(response.text) + all_channel_ids = [i["_id"] for i in response_dict["hits"]["hits"]] + return all_channel_ids + + def get_outdated_playlists(self, size): + """get daily outdated playlists to refresh""" + headers = {"Content-type": "application/json"} + now = int(datetime.now().strftime("%s")) + now_lte = now - self.refresh_interval * 24 * 60 * 60 + data = { + "size": size, + "query": { + "bool": { + "must": [ + {"match": {"playlist_active": True}}, + {"range": {"playlist_last_refresh": {"lte": now_lte}}}, + ] + } + }, + "sort": [{"playlist_last_refresh": {"order": "asc"}}], + "_source": False, + } + query_str = json.dumps(data) + url = self.es_url + "/ta_playlist/_search" + response = requests.get( + url, data=query_str, headers=headers, auth=self.es_auth + ) + if not response.ok: + print(response.text) + response_dict = json.loads(response.text) + all_playlist_ids = [i["_id"] for i in response_dict["hits"]["hits"]] + return all_playlist_ids + + def check_outdated(self): + """add missing vids and channels""" + video_daily, channel_daily, playlist_daily = self.get_daily() + self.all_youtube_ids = self.get_outdated_vids(video_daily) + self.all_channel_ids = self.get_outdated_channels(channel_daily) + self.all_playlist_ids = self.get_outdated_playlists(playlist_daily) + if self.integrate_ryd: + self.get_unrated_vids() + + def rescrape_all_channels(self): + """sync new data from channel to all matching videos""" + sleep_interval = self.sleep_interval + channel_sub_handler = ChannelSubscription() + all_channels = channel_sub_handler.get_channels(subscribed_only=False) + all_channel_ids = [i["channel_id"] for i in all_channels] + + for channel_id in all_channel_ids: + channel = YoutubeChannel(channel_id) + subscribed = channel.json_data["channel_subscribed"] + channel.get_from_youtube() + channel.json_data["channel_subscribed"] = subscribed + channel.upload_to_es() + channel.sync_to_videos() + + if sleep_interval: + sleep(sleep_interval) + + @staticmethod + def reindex_single_video(youtube_id): + """refresh data for single video""" + video = YoutubeVideo(youtube_id) + + # read current state + video.get_from_es() + player = video.json_data["player"] + date_downloaded = video.json_data["date_downloaded"] + channel_dict = video.json_data["channel"] + playlist = video.json_data.get("playlist") + + # get new + video.build_json() + if not video.json_data: + video.deactivate() + + # add back + video.json_data["player"] = player + 
video.json_data["date_downloaded"] = date_downloaded + video.json_data["channel"] = channel_dict + if playlist: + video.json_data["playlist"] = playlist + + video.upload_to_es() + + thumb_handler = ThumbManager() + thumb_handler.delete_vid_thumb(youtube_id) + to_download = (youtube_id, video.json_data["vid_thumb_url"]) + thumb_handler.download_vid([to_download], notify=False) + + @staticmethod + def reindex_single_channel(channel_id): + """refresh channel data and sync to videos""" + channel = YoutubeChannel(channel_id) + channel.get_from_es() + subscribed = channel.json_data["channel_subscribed"] + channel.get_from_youtube() + channel.json_data["channel_subscribed"] = subscribed + channel.upload_to_es() + channel.sync_to_videos() + + @staticmethod + def reindex_single_playlist(playlist_id, all_indexed_ids): + """refresh playlist data""" + playlist = YoutubePlaylist(playlist_id) + playlist.get_from_es() + subscribed = playlist.json_data["playlist_subscribed"] + playlist.all_youtube_ids = all_indexed_ids + playlist.build_json(scrape=True) + if not playlist.json_data: + playlist.deactivate() + return + + playlist.json_data["playlist_subscribed"] = subscribed + playlist.upload_to_es() + return + + def reindex(self): + """reindex what's needed""" + # videos + print(f"reindexing {len(self.all_youtube_ids)} videos") + for youtube_id in self.all_youtube_ids: + self.reindex_single_video(youtube_id) + if self.sleep_interval: + sleep(self.sleep_interval) + # channels + print(f"reindexing {len(self.all_channel_ids)} channels") + for channel_id in self.all_channel_ids: + self.reindex_single_channel(channel_id) + if self.sleep_interval: + sleep(self.sleep_interval) + # playlist + print(f"reindexing {len(self.all_playlist_ids)} playlists") + if self.all_playlist_ids: + all_indexed = PendingList().get_all_indexed() + all_indexed_ids = [i["youtube_id"] for i in all_indexed] + for playlist_id in self.all_playlist_ids: + self.reindex_single_playlist(playlist_id, all_indexed_ids) + if self.sleep_interval: + sleep(self.sleep_interval) diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py new file mode 100644 index 0000000..7b71e36 --- /dev/null +++ b/tubearchivist/home/src/index/video.py @@ -0,0 +1,171 @@ +"""handle single video index""" + +import os +from datetime import datetime + +import requests +from home.src.index import channel as ta_channel +from home.src.index.generic import YouTubeItem +from home.src.ta.helper import DurationConverter, clean_string +from ryd_client import ryd_client + + +class YoutubeVideo(YouTubeItem): + """represents a single youtube video""" + + es_path = False + index_name = "ta_video" + yt_base = "https://www.youtube.com/watch?v=" + + def __init__(self, youtube_id): + super().__init__(youtube_id) + self.channel_id = False + self.es_path = f"{self.index_name}/_doc/{youtube_id}" + + def build_json(self): + """build json dict of video""" + self.get_from_youtube() + if not self.youtube_meta: + return + + self._process_youtube_meta() + self._add_channel() + self._add_stats() + self.add_file_path() + self.add_player() + if self.config["downloads"]["integrate_ryd"]: + self._get_ryd_stats() + + return + + def _process_youtube_meta(self): + """extract relevant fields from youtube""" + # extract + self.channel_id = self.youtube_meta["channel_id"] + upload_date = self.youtube_meta["upload_date"] + upload_date_time = datetime.strptime(upload_date, "%Y%m%d") + published = upload_date_time.strftime("%Y-%m-%d") + last_refresh = 
int(datetime.now().strftime("%s")) + # build json_data basics + self.json_data = { + "title": self.youtube_meta["title"], + "description": self.youtube_meta["description"], + "category": self.youtube_meta["categories"], + "vid_thumb_url": self.youtube_meta["thumbnail"], + "tags": self.youtube_meta["tags"], + "published": published, + "vid_last_refresh": last_refresh, + "date_downloaded": last_refresh, + "youtube_id": self.youtube_id, + "active": True, + } + + def _add_channel(self): + """add channel dict to video json_data""" + channel = ta_channel.YoutubeChannel(self.channel_id) + channel.build_json(upload=True) + self.json_data.update({"channel": channel.json_data}) + + def _add_stats(self): + """add stats dicst to json_data""" + # likes + like_count = self.youtube_meta.get("like_count", 0) + dislike_count = self.youtube_meta.get("dislike_count", 0) + self.json_data.update( + { + "stats": { + "view_count": self.youtube_meta["view_count"], + "like_count": like_count, + "dislike_count": dislike_count, + "average_rating": self.youtube_meta["average_rating"], + } + } + ) + + def build_dl_cache_path(self): + """find video path in dl cache""" + cache_dir = self.app_conf["cache_dir"] + cache_path = f"{cache_dir}/download/" + all_cached = os.listdir(cache_path) + for file_cached in all_cached: + if self.youtube_id in file_cached: + vid_path = os.path.join(cache_path, file_cached) + return vid_path + + return False + + def add_player(self): + """add player information for new videos""" + try: + # when indexing from download task + vid_path = self.build_dl_cache_path() + except FileNotFoundError: + # when reindexing + base = self.app_conf["videos"] + vid_path = os.path.join(base, self.json_data["media_url"]) + + duration_handler = DurationConverter() + duration = duration_handler.get_sec(vid_path) + duration_str = duration_handler.get_str(duration) + self.json_data.update( + { + "player": { + "watched": False, + "duration": duration, + "duration_str": duration_str, + } + } + ) + + def add_file_path(self): + """build media_url for where file will be located""" + channel_name = self.json_data["channel"]["channel_name"] + clean_channel_name = clean_string(channel_name) + timestamp = self.json_data["published"].replace("-", "") + youtube_id = self.json_data["youtube_id"] + title = self.json_data["title"] + clean_title = clean_string(title) + filename = f"{timestamp}_{youtube_id}_{clean_title}.mp4" + media_url = os.path.join(clean_channel_name, filename) + self.json_data["media_url"] = media_url + + def delete_media_file(self): + """delete video file, meta data""" + self.get_from_es() + video_base = self.app_conf["videos"] + media_url = self.json_data["media_url"] + print(f"{self.youtube_id}: delete {media_url} from file system") + to_delete = os.path.join(video_base, media_url) + os.remove(to_delete) + self.del_in_es() + + def _get_ryd_stats(self): + """get optional stats from returnyoutubedislikeapi.com""" + try: + print(f"{self.youtube_id}: get ryd stats") + result = ryd_client.get(self.youtube_id) + except requests.exceptions.ConnectionError: + print(f"{self.youtube_id}: failed to query ryd api, skipping") + return False + + if result["status"] == 404: + return False + + dislikes = { + "dislike_count": result["dislikes"], + "average_rating": result["rating"], + } + self.json_data["stats"].update(dislikes) + + return True + + +def index_new_video(youtube_id): + """combined classes to create new video in index""" + video = YoutubeVideo(youtube_id) + video.build_json() + if not video.json_data: + 
raise ValueError("failed to get metadata for " + youtube_id) + + video.upload_to_es() + return video.json_data diff --git a/tubearchivist/home/src/ta/__init__.py b/tubearchivist/home/src/ta/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tubearchivist/home/src/config.py b/tubearchivist/home/src/ta/config.py similarity index 99% rename from tubearchivist/home/src/config.py rename to tubearchivist/home/src/ta/config.py index 5690716..569b230 100644 --- a/tubearchivist/home/src/config.py +++ b/tubearchivist/home/src/ta/config.py @@ -10,7 +10,7 @@ import os import re from celery.schedules import crontab -from home.src.helper import RedisArchivist +from home.src.ta.ta_redis import RedisArchivist class AppConfig: diff --git a/tubearchivist/home/src/helper.py b/tubearchivist/home/src/ta/helper.py similarity index 57% rename from tubearchivist/home/src/helper.py rename to tubearchivist/home/src/ta/helper.py index 61f698e..4788636 100644 --- a/tubearchivist/home/src/helper.py +++ b/tubearchivist/home/src/ta/helper.py @@ -4,14 +4,12 @@ Loose collection of helper functions """ import json -import os import re import string import subprocess import unicodedata from urllib.parse import parse_qs, urlparse -import redis import requests import yt_dlp @@ -149,153 +147,6 @@ class UrlListParser: return channel_id -class RedisArchivist: - """collection of methods to interact with redis""" - - REDIS_HOST = os.environ.get("REDIS_HOST") - REDIS_PORT = os.environ.get("REDIS_PORT") or 6379 - NAME_SPACE = "ta:" - CHANNELS = [ - "download", - "add", - "rescan", - "subchannel", - "subplaylist", - "playlistscan", - "setting", - ] - - def __init__(self): - self.redis_connection = redis.Redis( - host=self.REDIS_HOST, port=self.REDIS_PORT - ) - - def set_message(self, key, message, expire=True): - """write new message to redis""" - self.redis_connection.execute_command( - "JSON.SET", self.NAME_SPACE + key, ".", json.dumps(message) - ) - - if expire: - if isinstance(expire, bool): - secs = 20 - else: - secs = expire - self.redis_connection.execute_command( - "EXPIRE", self.NAME_SPACE + key, secs - ) - - def get_message(self, key): - """get message dict from redis""" - reply = self.redis_connection.execute_command( - "JSON.GET", self.NAME_SPACE + key - ) - if reply: - json_str = json.loads(reply) - else: - json_str = {"status": False} - - return json_str - - def del_message(self, key): - """delete key from redis""" - response = self.redis_connection.execute_command( - "DEL", self.NAME_SPACE + key - ) - return response - - def get_lock(self, lock_key): - """handle lock for task management""" - redis_lock = self.redis_connection.lock(self.NAME_SPACE + lock_key) - return redis_lock - - def get_progress(self): - """get a list of all progress messages""" - all_messages = [] - for channel in self.CHANNELS: - key = "message:" + channel - reply = self.redis_connection.execute_command( - "JSON.GET", self.NAME_SPACE + key - ) - if reply: - json_str = json.loads(reply) - all_messages.append(json_str) - - return all_messages - - @staticmethod - def monitor_cache_dir(cache_dir): - """ - look at download cache dir directly as alternative progress info - """ - dl_cache = os.path.join(cache_dir, "download") - all_cache_file = os.listdir(dl_cache) - cache_file = ignore_filelist(all_cache_file) - if cache_file: - filename = cache_file[0][12:].replace("_", " ").split(".")[0] - mess_dict = { - "status": "message:download", - "level": "info", - "title": "Downloading: " + filename, - "message": "", - } - else: - return 
False - - return mess_dict - - -class RedisQueue: - """dynamically interact with the download queue in redis""" - - REDIS_HOST = os.environ.get("REDIS_HOST") - REDIS_PORT = os.environ.get("REDIS_PORT") - NAME_SPACE = "ta:" - - if not REDIS_PORT: - REDIS_PORT = 6379 - - def __init__(self, key): - self.key = self.NAME_SPACE + key - self.conn = redis.Redis(host=self.REDIS_HOST, port=self.REDIS_PORT) - - def get_all(self): - """return all elements in list""" - result = self.conn.execute_command("LRANGE", self.key, 0, -1) - all_elements = [i.decode() for i in result] - return all_elements - - def add_list(self, to_add): - """add list to queue""" - self.conn.execute_command("RPUSH", self.key, *to_add) - - def add_priority(self, to_add): - """add single video to front of queue""" - self.clear_item(to_add) - self.conn.execute_command("LPUSH", self.key, to_add) - - def get_next(self): - """return next element in the queue, False if none""" - result = self.conn.execute_command("LPOP", self.key) - if not result: - return False - - next_element = result.decode() - return next_element - - def clear(self): - """delete list from redis""" - self.conn.execute_command("DEL", self.key) - - def clear_item(self, to_clear): - """remove single item from list if it's there""" - self.conn.execute_command("LREM", self.key, 0, to_clear) - - def trim(self, size): - """trim the queue based on settings amount""" - self.conn.execute_command("LTRIM", self.key, 0, size) - - class DurationConverter: """ using ffmpeg to get and parse duration from filepath diff --git a/tubearchivist/home/src/ta/ta_redis.py b/tubearchivist/home/src/ta/ta_redis.py new file mode 100644 index 0000000..60b8e0e --- /dev/null +++ b/tubearchivist/home/src/ta/ta_redis.py @@ -0,0 +1,154 @@ +"""interact with redis""" + +import json +import os + +import redis +from home.src.ta.helper import ignore_filelist + + +class RedisArchivist: + """collection of methods to interact with redis""" + + REDIS_HOST = os.environ.get("REDIS_HOST") + REDIS_PORT = os.environ.get("REDIS_PORT") or 6379 + NAME_SPACE = "ta:" + CHANNELS = [ + "download", + "add", + "rescan", + "subchannel", + "subplaylist", + "playlistscan", + "setting", + ] + + def __init__(self): + self.redis_connection = redis.Redis( + host=self.REDIS_HOST, port=self.REDIS_PORT + ) + + def set_message(self, key, message, expire=True): + """write new message to redis""" + self.redis_connection.execute_command( + "JSON.SET", self.NAME_SPACE + key, ".", json.dumps(message) + ) + + if expire: + if isinstance(expire, bool): + secs = 20 + else: + secs = expire + self.redis_connection.execute_command( + "EXPIRE", self.NAME_SPACE + key, secs + ) + + def get_message(self, key): + """get message dict from redis""" + reply = self.redis_connection.execute_command( + "JSON.GET", self.NAME_SPACE + key + ) + if reply: + json_str = json.loads(reply) + else: + json_str = {"status": False} + + return json_str + + def del_message(self, key): + """delete key from redis""" + response = self.redis_connection.execute_command( + "DEL", self.NAME_SPACE + key + ) + return response + + def get_lock(self, lock_key): + """handle lock for task management""" + redis_lock = self.redis_connection.lock(self.NAME_SPACE + lock_key) + return redis_lock + + def get_progress(self): + """get a list of all progress messages""" + all_messages = [] + for channel in self.CHANNELS: + key = "message:" + channel + reply = self.redis_connection.execute_command( + "JSON.GET", self.NAME_SPACE + key + ) + if reply: + json_str = json.loads(reply) + 
all_messages.append(json_str) + + return all_messages + + @staticmethod + def monitor_cache_dir(cache_dir): + """ + look at download cache dir directly as alternative progress info + """ + dl_cache = os.path.join(cache_dir, "download") + all_cache_file = os.listdir(dl_cache) + cache_file = ignore_filelist(all_cache_file) + if cache_file: + filename = cache_file[0][12:].replace("_", " ").split(".")[0] + mess_dict = { + "status": "message:download", + "level": "info", + "title": "Downloading: " + filename, + "message": "", + } + else: + return False + + return mess_dict + + +class RedisQueue: + """dynamically interact with the download queue in redis""" + + REDIS_HOST = os.environ.get("REDIS_HOST") + REDIS_PORT = os.environ.get("REDIS_PORT") + NAME_SPACE = "ta:" + + if not REDIS_PORT: + REDIS_PORT = 6379 + + def __init__(self, key): + self.key = self.NAME_SPACE + key + self.conn = redis.Redis(host=self.REDIS_HOST, port=self.REDIS_PORT) + + def get_all(self): + """return all elements in list""" + result = self.conn.execute_command("LRANGE", self.key, 0, -1) + all_elements = [i.decode() for i in result] + return all_elements + + def add_list(self, to_add): + """add list to queue""" + self.conn.execute_command("RPUSH", self.key, *to_add) + + def add_priority(self, to_add): + """add single video to front of queue""" + self.clear_item(to_add) + self.conn.execute_command("LPUSH", self.key, to_add) + + def get_next(self): + """return next element in the queue, False if none""" + result = self.conn.execute_command("LPOP", self.key) + if not result: + return False + + next_element = result.decode() + return next_element + + def clear(self): + """delete list from redis""" + self.conn.execute_command("DEL", self.key) + + def clear_item(self, to_clear): + """remove single item from list if it's there""" + self.conn.execute_command("LREM", self.key, 0, to_clear) + + def trim(self, size): + """trim the queue based on settings amount""" + self.conn.execute_command("LTRIM", self.key, 0, size) diff --git a/tubearchivist/home/tasks.py b/tubearchivist/home/tasks.py index 995cf8f..63d84b2 100644 --- a/tubearchivist/home/tasks.py +++ b/tubearchivist/home/tasks.py @@ -10,22 +10,24 @@ import os import home.apps as startup_apps from celery import Celery, shared_task -from home.src.config import AppConfig, ScheduleBuilder -from home.src.download import ( +from home.src.download.queue import PendingList +from home.src.download.subscriptions import ( ChannelSubscription, - PendingList, PlaylistSubscription, - VideoDownloader, ) -from home.src.helper import RedisArchivist, RedisQueue, UrlListParser -from home.src.index import YoutubeChannel, YoutubePlaylist -from home.src.index_management import backup_all_indexes, restore_from_backup -from home.src.reindex import ( +from home.src.download.thumbnails import ThumbManager, validate_thumbnails +from home.src.download.yt_dlp_handler import VideoDownloader +from home.src.es.index_setup import backup_all_indexes, restore_from_backup +from home.src.index.channel import YoutubeChannel +from home.src.index.filesystem import ( ManualImport, reindex_old_documents, scan_filesystem, ) -from home.src.thumbnails import ThumbManager, validate_thumbnails +from home.src.index.playlist import YoutubePlaylist +from home.src.ta.config import AppConfig, ScheduleBuilder +from home.src.ta.helper import UrlListParser +from home.src.ta.ta_redis import RedisArchivist, RedisQueue CONFIG = AppConfig().config REDIS_HOST = os.environ.get("REDIS_HOST") diff --git a/tubearchivist/home/views.py 
b/tubearchivist/home/views.py index 687c0aa..006a944 100644 --- a/tubearchivist/home/views.py +++ b/tubearchivist/home/views.py @@ -14,7 +14,9 @@ from django.contrib.auth.forms import AuthenticationForm from django.http import JsonResponse from django.shortcuts import redirect, render from django.views import View -from home.forms import ( +from home.src.es.index_setup import get_available_backups +from home.src.frontend.api_calls import PostData +from home.src.frontend.forms import ( AddToQueueForm, ApplicationSettingsForm, CustomAuthForm, @@ -24,12 +26,12 @@ from home.forms import ( SubscribeToPlaylistForm, UserSettingsForm, ) -from home.src.config import AppConfig, ScheduleBuilder -from home.src.frontend import PostData -from home.src.helper import RedisArchivist, UrlListParser -from home.src.index import YoutubePlaylist -from home.src.index_management import get_available_backups -from home.src.searching import Pagination, SearchHandler +from home.src.frontend.searching import SearchHandler +from home.src.index.generic import Pagination +from home.src.index.playlist import YoutubePlaylist +from home.src.ta.config import AppConfig, ScheduleBuilder +from home.src.ta.helper import UrlListParser +from home.src.ta.ta_redis import RedisArchivist from home.tasks import extrac_dl, subscribe_to from rest_framework.authtoken.models import Token From 2fc0cbacf56311947773e975bb4e32d8f562cec1 Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 23 Jan 2022 19:32:08 +0700 Subject: [PATCH 11/18] update doc strings to represent new module structure --- tubearchivist/api/views.py | 2 +- tubearchivist/home/src/download/queue.py | 6 +++++- tubearchivist/home/src/download/subscriptions.py | 8 ++++++-- tubearchivist/home/src/download/thumbnails.py | 1 + tubearchivist/home/src/download/yt_dlp_handler.py | 8 +++++++- tubearchivist/home/src/es/connect.py | 6 +++++- tubearchivist/home/src/es/index_setup.py | 7 ++++++- tubearchivist/home/src/frontend/watched.py | 5 ++++- tubearchivist/home/src/index/channel.py | 6 +++++- tubearchivist/home/src/index/generic.py | 5 ++++- tubearchivist/home/src/index/playlist.py | 6 +++++- tubearchivist/home/src/index/reindex.py | 6 +++++- tubearchivist/home/src/index/video.py | 6 +++++- tubearchivist/home/src/ta/config.py | 1 - tubearchivist/home/src/ta/ta_redis.py | 6 +++++- tubearchivist/home/views.py | 2 +- 16 files changed, 65 insertions(+), 16 deletions(-) diff --git a/tubearchivist/api/views.py b/tubearchivist/api/views.py index c4bcb08..7ab8b47 100644 --- a/tubearchivist/api/views.py +++ b/tubearchivist/api/views.py @@ -1,9 +1,9 @@ """all API views""" import requests +from home.src.download.thumbnails import ThumbManager from home.src.ta.config import AppConfig from home.src.ta.helper import UrlListParser -from home.src.download.thumbnails import ThumbManager from home.tasks import extrac_dl, subscribe_to from rest_framework.authentication import ( SessionAuthentication, diff --git a/tubearchivist/home/src/download/queue.py b/tubearchivist/home/src/download/queue.py index c8748e6..272c34d 100644 --- a/tubearchivist/home/src/download/queue.py +++ b/tubearchivist/home/src/download/queue.py @@ -1,4 +1,8 @@ -"""handle download queue""" +""" +Functionality: +- handle download queue +- linked with ta_dowload index +""" import json import os diff --git a/tubearchivist/home/src/download/subscriptions.py b/tubearchivist/home/src/download/subscriptions.py index d610137..2e4a29f 100644 --- a/tubearchivist/home/src/download/subscriptions.py +++ 
b/tubearchivist/home/src/download/subscriptions.py @@ -1,7 +1,11 @@ -"""handle subscriptions""" +""" +Functionality: +- handle channel subscriptions +- handle playlist subscriptions +""" import yt_dlp -from home.src.download import queue # partial import +from home.src.download import queue # partial import from home.src.es.connect import IndexPaginate from home.src.index.channel import YoutubeChannel from home.src.index.playlist import YoutubePlaylist diff --git a/tubearchivist/home/src/download/thumbnails.py b/tubearchivist/home/src/download/thumbnails.py index 305bbd8..2ea8477 100644 --- a/tubearchivist/home/src/download/thumbnails.py +++ b/tubearchivist/home/src/download/thumbnails.py @@ -1,6 +1,7 @@ """ functionality: - handle download and caching for thumbnails +- check for missing thumbnails """ import os diff --git a/tubearchivist/home/src/download/yt_dlp_handler.py b/tubearchivist/home/src/download/yt_dlp_handler.py index 80c8b86..671c5df 100644 --- a/tubearchivist/home/src/download/yt_dlp_handler.py +++ b/tubearchivist/home/src/download/yt_dlp_handler.py @@ -1,4 +1,10 @@ -"""handle yt_dlp downloads""" +""" +functionality: +- handle yt_dlp +- build options and post processor +- download video files +- move to archive +""" import os import shutil diff --git a/tubearchivist/home/src/es/connect.py b/tubearchivist/home/src/es/connect.py index 7cf7d8c..79fc0bd 100644 --- a/tubearchivist/home/src/es/connect.py +++ b/tubearchivist/home/src/es/connect.py @@ -1,4 +1,8 @@ -"""holds es connection manager""" +""" +functionality: +- wrapper around requests to call elastic search +- reusable search_after to extract total index +""" import json diff --git a/tubearchivist/home/src/es/index_setup.py b/tubearchivist/home/src/es/index_setup.py index 6b12a30..4bb980b 100644 --- a/tubearchivist/home/src/es/index_setup.py +++ b/tubearchivist/home/src/es/index_setup.py @@ -1,4 +1,9 @@ -"""setup and verify needed elastic indexes""" +""" +functionality: +- setup elastic index at first start +- verify and update index mapping and settings if needed +- backup and restore metadata +""" import json import os diff --git a/tubearchivist/home/src/frontend/watched.py b/tubearchivist/home/src/frontend/watched.py index 0769232..36ed072 100644 --- a/tubearchivist/home/src/frontend/watched.py +++ b/tubearchivist/home/src/frontend/watched.py @@ -1,4 +1,7 @@ -"""handle watch state""" +""" +functionality: +- handle watched state for videos, channels and playlists +""" import json from datetime import datetime diff --git a/tubearchivist/home/src/index/channel.py b/tubearchivist/home/src/index/channel.py index a8b1525..50a0696 100644 --- a/tubearchivist/home/src/index/channel.py +++ b/tubearchivist/home/src/index/channel.py @@ -1,4 +1,8 @@ -"""handle single channel in index""" +""" +functionality: +- get metadata from youtube for a channel +- index and update in es +""" import json import os diff --git a/tubearchivist/home/src/index/generic.py b/tubearchivist/home/src/index/generic.py index 6f88e37..af96abf 100644 --- a/tubearchivist/home/src/index/generic.py +++ b/tubearchivist/home/src/index/generic.py @@ -1,4 +1,7 @@ -"""generic base class for indexing documents""" +""" +functionality: +- generic base class to inherit from for video, channel and playlist +""" import math diff --git a/tubearchivist/home/src/index/playlist.py b/tubearchivist/home/src/index/playlist.py index 2bd2d3a..a9964a0 100644 --- a/tubearchivist/home/src/index/playlist.py +++ b/tubearchivist/home/src/index/playlist.py @@ -1,4 +1,8 @@ -"""handle 
playlist""" +""" +functionality: +- get metadata from youtube for a playlist +- index and update in es +""" import json from datetime import datetime diff --git a/tubearchivist/home/src/index/reindex.py b/tubearchivist/home/src/index/reindex.py index b59d5a5..0eea2e3 100644 --- a/tubearchivist/home/src/index/reindex.py +++ b/tubearchivist/home/src/index/reindex.py @@ -1,4 +1,8 @@ -"""periodically refresh documents""" +""" +functionality: +- periodically refresh documents +- index and update in es +""" import json from datetime import datetime diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index 7b71e36..240a1b3 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -1,4 +1,8 @@ -"""handle single video index""" +""" +functionality: +- get metadata from youtube for a video +- index and update in es +""" import os from datetime import datetime diff --git a/tubearchivist/home/src/ta/config.py b/tubearchivist/home/src/ta/config.py index 569b230..509e651 100644 --- a/tubearchivist/home/src/ta/config.py +++ b/tubearchivist/home/src/ta/config.py @@ -2,7 +2,6 @@ Functionality: - read and write config - load config variables into redis -- needs to be a separate module to avoid circular import """ import json diff --git a/tubearchivist/home/src/ta/ta_redis.py b/tubearchivist/home/src/ta/ta_redis.py index 60b8e0e..d131c96 100644 --- a/tubearchivist/home/src/ta/ta_redis.py +++ b/tubearchivist/home/src/ta/ta_redis.py @@ -1,4 +1,8 @@ -"""interact with redis""" +""" +functionality: +- interact with redis +- hold temporary download queue in redis +""" import json import os diff --git a/tubearchivist/home/views.py b/tubearchivist/home/views.py index 006a944..a0cac87 100644 --- a/tubearchivist/home/views.py +++ b/tubearchivist/home/views.py @@ -1,7 +1,7 @@ """ Functionality: - all views for home app -- process post data received from frontend via ajax +- holds base classes to inherit from """ import json From b6f9fb58adfea67bbb4bf27208ed9fb279ce0d30 Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 26 Jan 2022 19:32:30 +0700 Subject: [PATCH 12/18] refactor VideoDownloader class, better obs builder --- .../home/src/download/yt_dlp_handler.py | 56 +++++++++++-------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/tubearchivist/home/src/download/yt_dlp_handler.py b/tubearchivist/home/src/download/yt_dlp_handler.py index 671c5df..6d17a33 100644 --- a/tubearchivist/home/src/download/yt_dlp_handler.py +++ b/tubearchivist/home/src/download/yt_dlp_handler.py @@ -31,8 +31,10 @@ class VideoDownloader: """ def __init__(self, youtube_id_list=False): + self.obs = False self.youtube_id_list = youtube_id_list self.config = AppConfig().config + self._build_obs() self.channels = set() def run_queue(self): @@ -49,14 +51,14 @@ class VideoDownloader: break try: - self.dl_single_vid(youtube_id) + self._dl_single_vid(youtube_id) except yt_dlp.utils.DownloadError: print("failed to download " + youtube_id) continue vid_dict = index_new_video(youtube_id) self.channels.add(vid_dict["channel"]["channel_id"]) self.move_to_archive(vid_dict) - self.delete_from_pending(youtube_id) + self._delete_from_pending(youtube_id) autodelete_days = self.config["downloads"]["autodelete_days"] if autodelete_days: @@ -91,7 +93,7 @@ class VideoDownloader: queue.add_list(to_add) @staticmethod - def progress_hook(response): + def _progress_hook(response): """process the progress_hooks from yt_dlp""" # title path = 
os.path.split(response["filename"])[-1][12:] @@ -115,9 +117,15 @@ class VideoDownloader: } RedisArchivist().set_message("message:download", mess_dict) - def build_obs(self): - """build obs dictionary for yt-dlp""" - obs = { + def _build_obs(self): + """collection to build all obs passed to yt-dlp""" + self._build_obs_basic() + self._build_obs_user() + self._build_obs_postprocessors() + + def _build_obs_basic(self): + """initial obs""" + self.obs = { "default_search": "ytsearch", "merge_output_format": "mp4", "restrictfilenames": True, @@ -126,7 +134,7 @@ class VideoDownloader: + "/download/" + self.config["application"]["file_template"] ), - "progress_hooks": [self.progress_hook], + "progress_hooks": [self._progress_hook], "noprogress": True, "quiet": True, "continuedl": True, @@ -135,15 +143,22 @@ class VideoDownloader: "noplaylist": True, "check_formats": "selected", } + + def _build_obs_user(self): + """build user customized options""" if self.config["downloads"]["format"]: - obs["format"] = self.config["downloads"]["format"] + self.obs["format"] = self.config["downloads"]["format"] if self.config["downloads"]["limit_speed"]: - obs["ratelimit"] = self.config["downloads"]["limit_speed"] * 1024 + self.obs["ratelimit"] = ( + self.config["downloads"]["limit_speed"] * 1024 + ) throttle = self.config["downloads"]["throttledratelimit"] if throttle: - obs["throttledratelimit"] = throttle * 1024 + self.obs["throttledratelimit"] = throttle * 1024 + def _build_obs_postprocessors(self): + """add postprocessor to obs""" postprocessors = [] if self.config["downloads"]["add_metadata"]: @@ -162,23 +177,20 @@ class VideoDownloader: "already_have_thumbnail": True, } ) - obs["writethumbnail"] = True + self.obs["writethumbnail"] = True - obs["postprocessors"] = postprocessors + self.obs["postprocessors"] = postprocessors - return obs - - def dl_single_vid(self, youtube_id): + def _dl_single_vid(self, youtube_id): """download single video""" dl_cache = self.config["application"]["cache_dir"] + "/download/" - obs = self.build_obs() # check if already in cache to continue from there all_cached = ignore_filelist(os.listdir(dl_cache)) for file_name in all_cached: if youtube_id in file_name: - obs["outtmpl"] = os.path.join(dl_cache, file_name) - with yt_dlp.YoutubeDL(obs) as ydl: + self.obs["outtmpl"] = os.path.join(dl_cache, file_name) + with yt_dlp.YoutubeDL(self.obs) as ydl: try: ydl.download([youtube_id]) except yt_dlp.utils.DownloadError: @@ -186,7 +198,7 @@ class VideoDownloader: sleep(10) ydl.download([youtube_id]) - if obs["writethumbnail"]: + if self.obs["writethumbnail"]: # webp files don't get cleaned up automatically all_cached = ignore_filelist(os.listdir(dl_cache)) to_clean = [i for i in all_cached if not i.endswith(".mp4")] @@ -219,7 +231,7 @@ class VideoDownloader: if host_uid and host_gid: os.chown(new_file_path, host_uid, host_gid) - def delete_from_pending(self, youtube_id): + def _delete_from_pending(self, youtube_id): """delete downloaded video from pending index if its there""" es_url = self.config["application"]["es_url"] es_auth = self.config["application"]["es_auth"] @@ -228,7 +240,7 @@ class VideoDownloader: if not response.ok and not response.status_code == 404: print(response.text) - def add_subscribed_channels(self): + def _add_subscribed_channels(self): """add all channels subscribed to refresh""" all_subscribed = PlaylistSubscription().get_playlists() if not all_subscribed: @@ -243,7 +255,7 @@ class VideoDownloader: def validate_playlists(self): """look for playlist needing to 
From 2eea07c85e0df0e973f13c7007a7b516f1ff1e3c Mon Sep 17 00:00:00 2001
From: simon
Date: Wed, 26 Jan 2022 20:05:52 +0700
Subject: [PATCH 13/18] organize docker conf files

---
 Dockerfile                             | 6 +++---
 nginx.conf => docker_assets/nginx.conf | 0
 run.sh => docker_assets/run.sh         | 6 +++---
 uwsgi.ini => docker_assets/uwsgi.ini   | 0
 4 files changed, 6 insertions(+), 6 deletions(-)
 rename nginx.conf => docker_assets/nginx.conf (100%)
 rename run.sh => docker_assets/run.sh (95%)
 rename uwsgi.ini => docker_assets/uwsgi.ini (100%)

diff --git a/Dockerfile b/Dockerfile
index f196318..bc3f927 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -35,12 +35,12 @@ COPY ./tubearchivist/requirements.txt /requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt --src /usr/local/src
 
 # copy config files
-COPY nginx.conf /etc/nginx/conf.d/
+COPY docker_assets/nginx.conf /etc/nginx/conf.d/
 
 # copy application into container
 COPY ./tubearchivist /app
-COPY ./run.sh /app
-COPY ./uwsgi.ini /app
+COPY ./docker_assets/run.sh /app
+COPY ./docker_assets/uwsgi.ini /app
 
 # volumes
 VOLUME /cache
diff --git a/nginx.conf b/docker_assets/nginx.conf
similarity index 100%
rename from nginx.conf
rename to docker_assets/nginx.conf
diff --git a/run.sh b/docker_assets/run.sh
similarity index 95%
rename from run.sh
rename to docker_assets/run.sh
index 7f25cc9..235f823 100644
--- a/run.sh
+++ b/docker_assets/run.sh
@@ -2,9 +2,9 @@
 # startup script inside the container for tubearchivist
 
 # check environment
-if [[ -z "$DJANGO_DEBUG" ]]; then
-    export DJANGO_DEBUG=False
-fi
+# if [[ -z "$DJANGO_DEBUG" ]]; then
+#     export DJANGO_DEBUG=False
+# fi
 
 if [[ -z "$ELASTIC_USER" ]]; then
     export ELASTIC_USER=elastic
 fi
diff --git a/uwsgi.ini b/docker_assets/uwsgi.ini
similarity index 100%
rename from uwsgi.ini
rename to docker_assets/uwsgi.ini

From b2f69d14335d9761e6b23fcfaf4da31ecb40836f Mon Sep 17 00:00:00 2001
From: simon
Date: Wed, 26 Jan 2022 20:21:08 +0700
Subject: [PATCH 14/18] untrack vscode folder

---
 .gitignore            | 3 +++
 .vscode/settings.json | 5 -----
 2 files changed, 3 insertions(+), 5 deletions(-)
 delete mode 100644 .vscode/settings.json

diff --git a/.gitignore b/.gitignore
index 484bd71..5445a53 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,6 @@ __pycache__
 
 # django testing db
 db.sqlite3
+
+# vscode custom conf
+.vscode
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
deleted file mode 100644
index 97635f3..0000000
--- a/.vscode/settings.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-    "python.linting.pylintEnabled": true,
-    "python.linting.pycodestyleEnabled": false,
-    "python.linting.enabled": true
-}
\ No newline at end of file

From b93a6f689bd3114f3f9f7008ac9b1ce857c1a9a5 Mon Sep 17 00:00:00 2001
From: simon
Date: Thu, 27 Jan 2022 14:49:22 +0700
Subject: [PATCH 15/18] remove previous bug workaround to set django debug env, #159

---
 docker_assets/run.sh | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/docker_assets/run.sh b/docker_assets/run.sh
index 235f823..747548c 100644
--- a/docker_assets/run.sh
+++ b/docker_assets/run.sh
@@ -1,11 +1,6 @@
 #!/bin/bash
 # startup script inside the container for tubearchivist
 
-# check environment
-# if [[ -z "$DJANGO_DEBUG" ]]; then
-#     export DJANGO_DEBUG=False
-# fi
-
 if [[ -z "$ELASTIC_USER" ]]; then
     export ELASTIC_USER=elastic
 fi

From 00e9e4bc5361ccf7564267609438b9b6c03d68cf Mon Sep 17 00:00:00 2001
From: simon
Date: Thu, 27 Jan 2022 15:08:26 +0700
Subject: [PATCH 16/18] bump python version

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index bc3f927..b831247 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 # build the tube archivist image from default python slim image
 
-FROM python:3.10.1-slim-bullseye
+FROM python:3.10.2-slim-bullseye
 ARG TARGETPLATFORM
 ENV PYTHONUNBUFFERED 1

From 87b72a571d06a72c1217eb375f29a56cfc361673 Mon Sep 17 00:00:00 2001
From: simon
Date: Thu, 27 Jan 2022 15:32:58 +0700
Subject: [PATCH 17/18] simplify reading json files

---
 tubearchivist/home/src/es/index_setup.py | 3 +--
 tubearchivist/home/src/ta/config.py      | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/tubearchivist/home/src/es/index_setup.py b/tubearchivist/home/src/es/index_setup.py
index 4bb980b..8b0ed97 100644
--- a/tubearchivist/home/src/es/index_setup.py
+++ b/tubearchivist/home/src/es/index_setup.py
@@ -399,8 +399,7 @@
     def get_mapping():
         """read index_mapping.json and get expected mapping and settings"""
         with open("home/src/es/index_mapping.json", "r", encoding="utf-8") as f:
-            config_str = f.read()
-            index_config = json.loads(config_str).get("index_config")
+            index_config = json.load(f).get("index_config")
 
         return index_config
 
diff --git a/tubearchivist/home/src/ta/config.py b/tubearchivist/home/src/ta/config.py
index 509e651..3258ed0 100644
--- a/tubearchivist/home/src/ta/config.py
+++ b/tubearchivist/home/src/ta/config.py
@@ -38,8 +38,7 @@
     def get_config_file(self):
         """read the defaults from config.json"""
         with open("home/config.json", "r", encoding="utf-8") as f:
-            config_str = f.read()
-            config_file = json.loads(config_str)
+            config_file = json.load(f)
 
         config_file["application"].update(self.get_config_env())
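
For reference, the simplification above works because json.load parses an open file object in one step, which is equivalent to reading the whole file into a string and passing it to json.loads; a small sketch against a hypothetical JSON file:

    import json

    # hypothetical example file; any JSON document behaves the same way
    with open("example_config.json", "r", encoding="utf-8") as f:
        config_file = json.load(f)  # parse straight from the file object

    # equivalent two-step variant that the patch removes
    with open("example_config.json", "r", encoding="utf-8") as f:
        config_file = json.loads(f.read())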
From 2b2ff814e3c7f13555ac783da0120c4520ec2e3b Mon Sep 17 00:00:00 2001
From: simon
Date: Thu, 27 Jan 2022 16:04:04 +0700
Subject: [PATCH 18/18] red hover logout button

---
 tubearchivist/home/templates/home/base.html | 2 +-
 tubearchivist/static/css/style.css          | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tubearchivist/home/templates/home/base.html b/tubearchivist/home/templates/home/base.html
index 4525dd2..3bd4899 100644
--- a/tubearchivist/home/templates/home/base.html
+++ b/tubearchivist/home/templates/home/base.html
@@ -69,7 +69,7 @@
             gear-icon
-            exit-icon
+            exit-icon
diff --git a/tubearchivist/static/css/style.css b/tubearchivist/static/css/style.css
index d6f535c..40b845e 100644
--- a/tubearchivist/static/css/style.css
+++ b/tubearchivist/static/css/style.css
@@ -286,6 +286,10 @@ button:hover {
     --connected-color: var(--accent-font-light);
 }
 
+.alert-hover:hover {
+    filter: var(--img-filter-error);
+}
+
 /* top of page */
 .title-bar {
     padding-top: 30px;