dedicated search_after class to scroll through index

2025-08-04 06:58:18 +00:00 · 2021-11-17 17:54:47 +07:00 · 2021-11-17 17:54:47 +07:00 · c4b0f900f8
commit c4b0f900f8
parent 0e9c0d9f6b
1 changed files with 78 additions and 0 deletions
--- a/tubearchivist/home/src/index.py
+++ b/tubearchivist/home/src/index.py
@ -707,6 +707,84 @@ class WatchState:
            print(request.text)
 class IndexPaginate:
    """use search_after to go through whole index"""
    CONFIG = AppConfig().config
    ES_URL = CONFIG["application"]["es_url"]
    ES_AUTH = CONFIG["application"]["es_auth"]
    HEADERS = {"Content-type": "application/json"}
    DEFAULT_SIZE = 500
    def __init__(self, index_name, data, size=False):
        self.index_name = index_name
        self.data = data
        self.pit_id = False
        self.size = size
    def get_results(self):
        """get all results"""
        self.get_pit()
        self.validate_data()
        all_results = self.run_loop()
        self.clean_pit()
        return all_results
    def get_pit(self):
        """get pit for index"""
        url = f"{self.ES_URL}/{self.index_name}/_pit?keep_alive=10m"
        response = requests.post(url, auth=self.ES_AUTH)
        json_data = json.loads(response.text)
        self.pit_id = json_data["id"]
    def validate_data(self):
        """add pit and size to data"""
        if "sort" not in self.data.keys():
            print(self.data)
            raise ValueError("missing sort key in data")
        size = self.size or self.DEFAULT_SIZE
        self.data["size"] = size
        self.data["pit"] = {"id": self.pit_id, "keep_alive": "10m"}
    def run_loop(self):
        """loop through results until last hit"""
        query_str = json.dumps(self.data)
        url = self.ES_URL + "/_search"
        all_results = []
        while True:
            response = requests.get(
                url, data=query_str, headers=self.HEADERS, auth=self.ES_AUTH
            )
            json_data = json.loads(response.text)
            all_hits = json_data["hits"]["hits"]
            if all_hits:
                for hit in all_hits:
                    source = hit["_source"]
                    search_after = hit["sort"]
                    all_results.append(source)
                # update search_after with last hit data
                self.data["search_after"] = search_after
                query_str = json.dumps(self.data)
            else:
                break
        return all_results
    def clean_pit(self):
        """delete pit from elastic search"""
        query_str = json.dumps({"id": self.pit_id})
        requests.delete(
            self.ES_URL + "/_pit",
            data=query_str,
            headers=self.HEADERS,
            auth=self.ES_AUTH,
        )
 def index_new_video(youtube_id, missing_vid=False):
    """combine video and channel classes for new video index"""
    vid_handler = YoutubeVideo(youtube_id)