mirror of
https://github.com/tubearchivist/tubearchivist.git
synced 2025-07-05 16:51:10 +00:00
* ES Client must bootstrap itself to be the source of config

  If this is not done, a cyclic loop is created between the config loader and the ES client. This lays the groundwork for ES being the source of all app config.

* auto_download is not used anymore

* Add UserConfig class that encapsulates user config storage

  This class will allow the rest of the code to 'not care' about how user properties are stored. This requires the addition of a ta_users index in ES.

* Create migration task for user config transfer

* Replace getters and setters for each property

  Strongly type the user configuration. Migrate the missed sponsorblock ID.

* Other DB settings will be another PR
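A minimal sketch of what such a UserConfig wrapper might look like, built on the ElasticWrap helper shown below; the method names, the document layout, and the use of the _update endpoint are assumptions for illustration, not the actual PR code:

    class UserConfig:
        """encapsulate user properties stored in the ta_users index"""

        def __init__(self, user_id: str):
            self.user_id = user_id

        def get_value(self, key: str):
            """read a single property, None if unset"""
            response, _ = ElasticWrap(f"ta_users/_doc/{self.user_id}").get()
            return response.get("_source", {}).get(key)

        def set_value(self, key: str, value) -> None:
            """persist a single property via a partial update"""
            data = {"doc": {key: value}}
            ElasticWrap(f"ta_users/_update/{self.user_id}").post(data)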
175 lines
5.4 KiB
Python
"""
|
|
functionality:
|
|
- wrapper around requests to call elastic search
|
|
- reusable search_after to extract total index
|
|
"""
|
|
# pylint: disable=missing-timeout
|
|
|
|
import json
|
|
import os
|
|
|
|
import requests
|
|
|
|
|
|
class ElasticWrap:
    """makes all calls to elastic search
    returns response json and status code tuple
    """

    ES_URL: str = str(os.environ.get("ES_URL"))
    ES_PASS: str = str(os.environ.get("ELASTIC_PASSWORD"))
    ES_USER: str = str(os.environ.get("ELASTIC_USER") or "elastic")

    def __init__(self, path):
        self.url = f"{self.ES_URL}/{path}"
        self.auth = (self.ES_USER, self.ES_PASS)

    def get(self, data=False, timeout=10, print_error=True):
        """get data from es"""
        if data:
            response = requests.get(
                self.url, json=data, auth=self.auth, timeout=timeout
            )
        else:
            response = requests.get(self.url, auth=self.auth, timeout=timeout)
        if print_error and not response.ok:
            print(response.text)

        return response.json(), response.status_code

    def post(self, data=False, ndjson=False):
        """post data to es"""
        if ndjson:
            headers = {"Content-type": "application/x-ndjson"}
            payload = data
        else:
            headers = {"Content-type": "application/json"}
            payload = json.dumps(data)

        if data:
            response = requests.post(
                self.url, data=payload, headers=headers, auth=self.auth
            )
        else:
            response = requests.post(self.url, headers=headers, auth=self.auth)

        if not response.ok:
            print(response.text)

        return response.json(), response.status_code

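    # Usage sketch (illustrative, not part of the original file): ndjson=True
    # matches Elasticsearch's _bulk API, which expects newline-delimited JSON
    # passed through as-is; the index and id below are assumed examples.
    #
    #   bulk = (
    #       '{"index": {"_index": "ta_video", "_id": "abc123"}}\n'
    #       '{"title": "some video"}\n'
    #   )
    #   response, status_code = ElasticWrap("_bulk").post(bulk, ndjson=True)
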
    def put(self, data, refresh=False):
        """put data to es"""
        if refresh:
            self.url = f"{self.url}/?refresh=true"
        response = requests.put(self.url, json=data, auth=self.auth)
        if not response.ok:
            print(response.text)
            print(data)
            raise ValueError("failed to add item to index")

        return response.json(), response.status_code

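    # Usage sketch (illustrative, not part of the original file): index a
    # document and make it searchable right away; names are assumed examples.
    #
    #   ElasticWrap("ta_video/_doc/abc123").put(video_dict, refresh=True)
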
    def delete(self, data=False, refresh=False):
        """delete document from es"""
        if refresh:
            self.url = f"{self.url}/?refresh=true"
        if data:
            response = requests.delete(self.url, json=data, auth=self.auth)
        else:
            response = requests.delete(self.url, auth=self.auth)

        if not response.ok:
            print(response.text)

        return response.json(), response.status_code

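# Usage sketch (illustrative, not part of the original file): every method
# returns a (json, status_code) tuple; the index and id are assumed examples.
#
#   response, status_code = ElasticWrap("ta_video/_doc/abc123").get()
#   if status_code == 200:
#       print(response["_source"])
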
class IndexPaginate:
    """use search_after to go through whole index
    kwargs:
    - size: int, override DEFAULT_SIZE
    - keep_source: bool, keep _source key from es results
    - callback: obj, class implementing a run method, called for every page
    - task: task object to send notifications to
    - total: int, total items in index for progress message
    """

    DEFAULT_SIZE = 500

    def __init__(self, index_name, data, **kwargs):
        self.index_name = index_name
        self.data = data
        self.pit_id = False
        self.kwargs = kwargs

    def get_results(self):
        """get all results, add task and total for notifications"""
        self.get_pit()
        self.validate_data()
        all_results = self.run_loop()
        self.clean_pit()
        return all_results

    def get_pit(self):
        """get pit for index"""
        path = f"{self.index_name}/_pit?keep_alive=10m"
        response, _ = ElasticWrap(path).post()
        self.pit_id = response["id"]

    def validate_data(self):
        """add pit and size to data"""
        if not self.data:
            self.data = {}

        if "query" not in self.data.keys():
            self.data.update({"query": {"match_all": {}}})

        if "sort" not in self.data.keys():
            self.data.update({"sort": [{"_doc": {"order": "desc"}}]})

        self.data["size"] = self.kwargs.get("size") or self.DEFAULT_SIZE
        self.data["pit"] = {"id": self.pit_id, "keep_alive": "10m"}

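    # Illustrative note (not part of the original file): with empty input,
    # validate_data builds a request body like the one below; run_loop sends
    # it to the bare _search endpoint, since a search using a PIT must not
    # name an index in the URL.
    #
    #   {
    #       "query": {"match_all": {}},
    #       "sort": [{"_doc": {"order": "desc"}}],
    #       "size": 500,
    #       "pit": {"id": "<pit-id>", "keep_alive": "10m"},
    #   }
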
    def run_loop(self):
        """loop through results until last hit"""
        all_results = []
        counter = 0
        while True:
            response, _ = ElasticWrap("_search").get(data=self.data)
            all_hits = response["hits"]["hits"]
            if not all_hits:
                break

            for hit in all_hits:
                if self.kwargs.get("keep_source"):
                    all_results.append(hit)
                else:
                    all_results.append(hit["_source"])

            if self.kwargs.get("callback"):
                self.kwargs.get("callback")(all_hits, self.index_name).run()

            if self.kwargs.get("task"):
                print(f"{self.index_name}: processing page {counter}")
                self._notify(len(all_results))

            counter += 1

            # update search_after with last hit data
            self.data["search_after"] = all_hits[-1]["sort"]

        return all_results

    def _notify(self, processed):
        """send notification on task"""
        total = self.kwargs.get("total")
        progress = processed / total
        # removeprefix strips the exact "ta_" prefix; lstrip("ta_") would
        # instead strip any leading run of the characters t, a and _
        index_clean = self.index_name.removeprefix("ta_").title()
        message = [f"Processing {index_clean}s {processed}/{total}"]
        self.kwargs.get("task").send_progress(message, progress=progress)

    def clean_pit(self):
        """delete pit from elastic search"""
        ElasticWrap("_pit").delete(data={"id": self.pit_id})
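
# Usage sketch (illustrative, not part of the original file): fetch every
# document in an index; the index name is an assumed example, and passing
# False as data lets validate_data fill in the match_all defaults.
#
#   all_docs = IndexPaginate("ta_video", False).get_results()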