mirror of https://github.com/tubearchivist/tubearchivist-frontend.git
synced 2024-11-04 19:30:13 +00:00

refactor and consolidate Reindex class

parent 1f7d6871cf
commit fda520ad44
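
Summary of the refactor, as visible in the hunks below: the module-level reindex_old_documents() glue shrinks to a plain Reindex() call sequence that reuses handler.now for the last_reindex timestamp, the per-index Elasticsearch plumbing moves behind the new _get_total_hits() and the shared ElasticWrap helper, internal methods get a leading underscore, and the hard-coded 1.2 headroom factor becomes the MULTIPLY class constant. As a quick sanity check of the daily refresh budget ceil(total / check_reindex_days * MULTIPLY), using made-up numbers rather than anything from the commit:

from math import ceil

# hypothetical numbers, not from the commit: 1000 active videos,
# check_reindex_days = 90, and the new MULTIPLY = 1.2 headroom factor
total_videos = 1000
interval = 90
MULTIPLY = 1.2

video_daily = ceil(total_videos / interval * MULTIPLY)
print(video_daily)  # 14: roughly 1/90th of the library per day, padded by 20%
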
@@ -10,7 +10,6 @@ import os
 import re
 import shutil
 import subprocess
-from datetime import datetime
 
 import requests
 from home.src.download.queue import PendingList
@@ -319,10 +318,7 @@ def scan_filesystem():
 
 def reindex_old_documents():
     """daily refresh of old documents"""
-    # continue if needed
-    reindex_handler = Reindex()
-    reindex_handler.check_outdated()
-    reindex_handler.reindex()
-    # set timestamp
-    now = int(datetime.now().strftime("%s"))
-    RedisArchivist().set_message("last_reindex", now, expire=False)
+    handler = Reindex()
+    handler.check_outdated()
+    handler.reindex()
+    RedisArchivist().set_message("last_reindex", handler.now, expire=False)
|
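Note on the timestamp change above: last_reindex now stores handler.now, which Reindex.__init__ captures once at the start of the run, instead of a value computed here after the run finishes. The expression int(datetime.now().strftime("%s")) relies on a platform-specific strftime extension; a portable equivalent, shown only for illustration and not part of this commit, would be:

from datetime import datetime

# same epoch-seconds value without the non-standard "%s" format code
now = int(datetime.now().timestamp())
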
@@ -4,85 +4,60 @@ functionality:
 - index and update in es
 """
 
-import json
 from datetime import datetime
 from math import ceil
 from time import sleep
 
-import requests
 from home.src.download.queue import PendingList
 from home.src.download.thumbnails import ThumbManager
+from home.src.es.connect import ElasticWrap
 from home.src.index.channel import YoutubeChannel
 from home.src.index.playlist import YoutubePlaylist
 from home.src.index.video import YoutubeVideo
 from home.src.ta.config import AppConfig
-from home.src.ta.helper import get_total_hits
 
 
 class Reindex:
     """check for outdated documents and refresh data from youtube"""
 
+    MATCH_FIELD = {
+        "ta_video": "active",
+        "ta_channel": "channel_active",
+        "ta_playlist": "playlist_active",
+    }
+    MULTIPLY = 1.2
+
     def __init__(self):
         # config
-        config = AppConfig().config
-        self.sleep_interval = config["downloads"]["sleep_interval"]
-        self.es_url = config["application"]["es_url"]
-        self.es_auth = config["application"]["es_auth"]
-        self.refresh_interval = config["scheduler"]["check_reindex_days"]
-        self.integrate_ryd = config["downloads"]["integrate_ryd"]
+        self.now = int(datetime.now().strftime("%s"))
+        self.config = AppConfig().config
+        self.interval = self.config["scheduler"]["check_reindex_days"]
         # scan
         self.all_youtube_ids = False
         self.all_channel_ids = False
         self.all_playlist_ids = False
 
-    def get_daily(self):
+    def _get_daily(self):
         """get daily refresh values"""
-        total_videos = get_total_hits(
-            "ta_video", self.es_url, self.es_auth, "active"
-        )
-        video_daily = ceil(total_videos / self.refresh_interval * 1.2)
-        total_channels = get_total_hits(
-            "ta_channel", self.es_url, self.es_auth, "channel_active"
-        )
-        channel_daily = ceil(total_channels / self.refresh_interval * 1.2)
-        total_playlists = get_total_hits(
-            "ta_playlist", self.es_url, self.es_auth, "playlist_active"
-        )
-        playlist_daily = ceil(total_playlists / self.refresh_interval * 1.2)
+        total_videos = self._get_total_hits("ta_video")
+        video_daily = ceil(total_videos / self.interval * self.MULTIPLY)
+        total_channels = self._get_total_hits("ta_channel")
+        channel_daily = ceil(total_channels / self.interval * self.MULTIPLY)
+        total_playlists = self._get_total_hits("ta_playlist")
+        playlist_daily = ceil(total_playlists / self.interval * self.MULTIPLY)
        return (video_daily, channel_daily, playlist_daily)
 
-    def get_outdated_vids(self, size):
-        """get daily videos to refresh"""
-        headers = {"Content-type": "application/json"}
-        now = int(datetime.now().strftime("%s"))
-        now_lte = now - self.refresh_interval * 24 * 60 * 60
-        data = {
-            "size": size,
-            "query": {
-                "bool": {
-                    "must": [
-                        {"match": {"active": True}},
-                        {"range": {"vid_last_refresh": {"lte": now_lte}}},
-                    ]
-                }
-            },
-            "sort": [{"vid_last_refresh": {"order": "asc"}}],
-            "_source": False,
-        }
-        query_str = json.dumps(data)
-        url = self.es_url + "/ta_video/_search"
-        response = requests.get(
-            url, data=query_str, headers=headers, auth=self.es_auth
-        )
-        if not response.ok:
-            print(response.text)
-        response_dict = json.loads(response.text)
-        all_youtube_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
-        return all_youtube_ids
+    def _get_total_hits(self, index):
+        """get total hits from index"""
+        match_field = self.MATCH_FIELD[index]
+        path = f"{index}/_search?filter_path=hits.total"
+        data = {"query": {"match": {match_field: True}}}
+        response, _ = ElasticWrap(path).post(data=data)
+        total_hits = response["hits"]["total"]["value"]
+        return total_hits
 
-    def get_unrated_vids(self):
-        """get all videos without rating if ryd integration is enabled"""
-        headers = {"Content-type": "application/json"}
+    def _get_unrated_vids(self):
+        """get max 200 videos without rating if ryd integration is enabled"""
         data = {
             "size": 200,
             "query": {
@@ -91,86 +66,78 @@ class Reindex:
                 }
             },
         }
-        query_str = json.dumps(data)
-        url = self.es_url + "/ta_video/_search"
-        response = requests.get(
-            url, data=query_str, headers=headers, auth=self.es_auth
-        )
-        if not response.ok:
-            print(response.text)
-        response_dict = json.loads(response.text)
-        missing_rating = [i["_id"] for i in response_dict["hits"]["hits"]]
+        response, _ = ElasticWrap("ta_video/_search").get(data=data)
+
+        missing_rating = [i["_id"] for i in response["hits"]["hits"]]
         self.all_youtube_ids = self.all_youtube_ids + missing_rating
 
-    def get_outdated_channels(self, size):
-        """get daily channels to refresh"""
-        headers = {"Content-type": "application/json"}
-        now = int(datetime.now().strftime("%s"))
-        now_lte = now - self.refresh_interval * 24 * 60 * 60
-        data = {
-            "size": size,
-            "query": {
-                "bool": {
-                    "must": [
-                        {"match": {"channel_active": True}},
-                        {"range": {"channel_last_refresh": {"lte": now_lte}}},
-                    ]
-                }
-            },
+    def _get_outdated_vids(self, size):
+        """get daily videos to refresh"""
+        now_lte = self.now - self.interval * 24 * 60 * 60
+        must_list = [
+            {"match": {"active": True}},
+            {"range": {"vid_last_refresh": {"lte": now_lte}}},
+        ]
+        data = {
+            "size": size,
+            "query": {"bool": {"must": must_list}},
+            "sort": [{"vid_last_refresh": {"order": "asc"}}],
+            "_source": False,
+        }
+        response, _ = ElasticWrap("ta_video/_search").get(data=data)
+
+        all_youtube_ids = [i["_id"] for i in response["hits"]["hits"]]
+        return all_youtube_ids
+
+    def _get_outdated_channels(self, size):
+        """get daily channels to refresh"""
+        now_lte = self.now - self.interval * 24 * 60 * 60
+        must_list = [
+            {"match": {"channel_active": True}},
+            {"range": {"channel_last_refresh": {"lte": now_lte}}},
+        ]
+        data = {
+            "size": size,
+            "query": {"bool": {"must": must_list}},
             "sort": [{"channel_last_refresh": {"order": "asc"}}],
             "_source": False,
         }
-        query_str = json.dumps(data)
-        url = self.es_url + "/ta_channel/_search"
-        response = requests.get(
-            url, data=query_str, headers=headers, auth=self.es_auth
-        )
-        if not response.ok:
-            print(response.text)
-        response_dict = json.loads(response.text)
-        all_channel_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
+        response, _ = ElasticWrap("ta_channel/_search").get(data=data)
+
+        all_channel_ids = [i["_id"] for i in response["hits"]["hits"]]
         return all_channel_ids
 
-    def get_outdated_playlists(self, size):
+    def _get_outdated_playlists(self, size):
         """get daily outdated playlists to refresh"""
-        headers = {"Content-type": "application/json"}
-        now = int(datetime.now().strftime("%s"))
-        now_lte = now - self.refresh_interval * 24 * 60 * 60
+        now_lte = self.now - self.interval * 24 * 60 * 60
+        must_list = [
+            {"match": {"playlist_active": True}},
+            {"range": {"playlist_last_refresh": {"lte": now_lte}}},
+        ]
         data = {
             "size": size,
-            "query": {
-                "bool": {
-                    "must": [
-                        {"match": {"playlist_active": True}},
-                        {"range": {"playlist_last_refresh": {"lte": now_lte}}},
-                    ]
-                }
-            },
+            "query": {"bool": {"must": must_list}},
             "sort": [{"playlist_last_refresh": {"order": "asc"}}],
             "_source": False,
         }
-        query_str = json.dumps(data)
-        url = self.es_url + "/ta_playlist/_search"
-        response = requests.get(
-            url, data=query_str, headers=headers, auth=self.es_auth
-        )
-        if not response.ok:
-            print(response.text)
-        response_dict = json.loads(response.text)
-        all_playlist_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
+        response, _ = ElasticWrap("ta_playlist/_search").get(data=data)
+
+        all_playlist_ids = [i["_id"] for i in response["hits"]["hits"]]
         return all_playlist_ids
 
     def check_outdated(self):
         """add missing vids and channels"""
-        video_daily, channel_daily, playlist_daily = self.get_daily()
-        self.all_youtube_ids = self.get_outdated_vids(video_daily)
-        self.all_channel_ids = self.get_outdated_channels(channel_daily)
-        self.all_playlist_ids = self.get_outdated_playlists(playlist_daily)
-        if self.integrate_ryd:
-            self.get_unrated_vids()
+        video_daily, channel_daily, playlist_daily = self._get_daily()
+        self.all_youtube_ids = self._get_outdated_vids(video_daily)
+        self.all_channel_ids = self._get_outdated_channels(channel_daily)
+        self.all_playlist_ids = self._get_outdated_playlists(playlist_daily)
+
+        integrate_ryd = self.config["downloads"]["integrate_ryd"]
+        if integrate_ryd:
+            self._get_unrated_vids()
 
     @staticmethod
-    def reindex_single_video(youtube_id):
+    def _reindex_single_video(youtube_id):
         """refresh data for single video"""
         video = YoutubeVideo(youtube_id)
 
@@ -204,20 +171,21 @@ class Reindex:
             return
 
     @staticmethod
-    def reindex_single_channel(channel_id):
+    def _reindex_single_channel(channel_id):
         """refresh channel data and sync to videos"""
         channel = YoutubeChannel(channel_id)
         channel.get_from_es()
         subscribed = channel.json_data["channel_subscribed"]
-        overwrites = channel.json_data["channel_overwrites"]
+        overwrites = channel.json_data.get("channel_overwrites", False)
         channel.get_from_youtube()
         channel.json_data["channel_subscribed"] = subscribed
-        channel.json_data["channel_overwrites"] = overwrites
+        if overwrites:
+            channel.json_data["channel_overwrites"] = overwrites
         channel.upload_to_es()
         channel.sync_to_videos()
 
     @staticmethod
-    def reindex_single_playlist(playlist_id, all_indexed_ids):
+    def _reindex_single_playlist(playlist_id, all_indexed_ids):
         """refresh playlist data"""
         playlist = YoutubePlaylist(playlist_id)
         playlist.get_from_es()
@@ -234,18 +202,19 @@ class Reindex:
 
     def reindex(self):
         """reindex what's needed"""
+        sleep_interval = self.config["downloads"]["sleep_interval"]
         # videos
         print(f"reindexing {len(self.all_youtube_ids)} videos")
         for youtube_id in self.all_youtube_ids:
-            self.reindex_single_video(youtube_id)
-            if self.sleep_interval:
-                sleep(self.sleep_interval)
+            self._reindex_single_video(youtube_id)
+            if sleep_interval:
+                sleep(sleep_interval)
         # channels
         print(f"reindexing {len(self.all_channel_ids)} channels")
         for channel_id in self.all_channel_ids:
-            self.reindex_single_channel(channel_id)
-            if self.sleep_interval:
-                sleep(self.sleep_interval)
+            self._reindex_single_channel(channel_id)
+            if sleep_interval:
+                sleep(sleep_interval)
         # playlist
         print(f"reindexing {len(self.all_playlist_ids)} playlists")
         if self.all_playlist_ids:
@@ -253,6 +222,6 @@ class Reindex:
             handler.get_indexed()
             all_indexed_ids = [i["youtube_id"] for i in handler.all_videos]
             for playlist_id in self.all_playlist_ids:
-                self.reindex_single_playlist(playlist_id, all_indexed_ids)
-                if self.sleep_interval:
-                    sleep(self.sleep_interval)
+                self._reindex_single_playlist(playlist_id, all_indexed_ids)
+                if sleep_interval:
+                    sleep(sleep_interval)
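
The requests and json.dumps boilerplate that the old query methods carried is gone; every Elasticsearch call in the class now goes through the shared ElasticWrap helper from home.src.es.connect. Its implementation is not part of this diff. Judging only from the call sites above, where ElasticWrap(path).get(data=...) and .post(data=...) return a (parsed_json, status_code) pair, a minimal stand-in could look like the sketch below; the class name ElasticWrapSketch and the constructor defaults are assumptions, and the real helper resolves es_url and auth from AppConfig.

import requests


class ElasticWrapSketch:
    """rough stand-in for home.src.es.connect.ElasticWrap, inferred from call sites"""

    def __init__(self, path, es_url="http://localhost:9200", auth=None):
        # the real helper reads the base URL and credentials from AppConfig
        self.url = f"{es_url}/{path}"
        self.auth = auth

    def get(self, data=None):
        """GET with a JSON body, as used by the _search queries above"""
        response = requests.get(self.url, json=data, auth=self.auth)
        return response.json(), response.status_code

    def post(self, data=None):
        """POST with a JSON body, as used by _get_total_hits"""
        response = requests.post(self.url, json=data, auth=self.auth)
        return response.json(), response.status_code


# usage mirroring the new code, e.g. inside _get_outdated_vids:
# response, _ = ElasticWrapSketch("ta_video/_search").get(data={"size": 10})
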
@@ -3,31 +3,15 @@ Loose collection of helper functions
 - don't import AppConfig class here to avoid circular imports
 """
 
-import json
 import re
 import string
 import subprocess
 import unicodedata
 from urllib.parse import parse_qs, urlparse
 
-import requests
 import yt_dlp
 
 
-def get_total_hits(index, es_url, es_auth, match_field):
-    """get total hits from index"""
-    headers = {"Content-type": "application/json"}
-    data = {"query": {"match": {match_field: True}}}
-    payload = json.dumps(data)
-    url = f"{es_url}/{index}/_search?filter_path=hits.total"
-    request = requests.post(url, data=payload, headers=headers, auth=es_auth)
-    if not request.ok:
-        print(request.text)
-    total_json = json.loads(request.text)
-    total_hits = total_json["hits"]["total"]["value"]
-    return total_hits
-
-
 def clean_string(file_name):
     """clean string to only asci characters"""
     whitelist = "-_.() " + string.ascii_letters + string.digits
|