tubearchivist/tubearchivist/home/src/es/backup.py


"""
Functionality:
- Handle json zip file based backup
- create backup
- restore backup
"""

import json
import os
import zipfile
from datetime import datetime

from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.ta.config import AppConfig
from home.src.ta.helper import get_mapping, ignore_filelist
from home.src.ta.settings import EnvironmentSettings
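
# Usage sketch (illustrative filename and date; "task" is any object exposing
# a send_progress() method, as assumed by the methods below):
#   ElasticBackup(reason="auto", task=task).backup_all_indexes()
#   ElasticBackup(task=task).restore("ta_backup-20240101-auto.zip")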


class ElasticBackup:
    """dump index to nd-json files for later bulk import"""

    INDEX_SPLIT = ["comment"]
    CACHE_DIR = EnvironmentSettings.CACHE_DIR
    BACKUP_DIR = os.path.join(CACHE_DIR, "backup")

    def __init__(self, reason=False, task=False):
        self.config = AppConfig().config
        self.timestamp = datetime.now().strftime("%Y%m%d")
        self.index_config = get_mapping()
        self.reason = reason
        self.task = task

    def backup_all_indexes(self):
        """backup all indexes, add reason to init"""
        print("backup all indexes")
        if not self.reason:
            raise ValueError("missing backup reason in ElasticBackup")

        if self.task:
            self.task.send_progress(["Scanning your index."])

        for index in self.index_config:
            index_name = index["index_name"]
            print(f"backup: export in progress for {index_name}")
            if not self.index_exists(index_name):
                print(f"skip backup for not yet existing index {index_name}")
                continue

            self.backup_index(index_name)

        if self.task:
            self.task.send_progress(["Compress files to zip archive."])

        self.zip_it()
        if self.reason == "auto":
            self.rotate_backup()
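
    # Export sketch: IndexPaginate pages through the index and hands each page
    # of hits to BackupCallback (below), which appends an nd-json chunk file
    # under BACKUP_DIR. Indexes listed in INDEX_SPLIT get a smaller page size,
    # presumably because their documents are large.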

    def backup_index(self, index_name):
        """export all documents of a single index"""
        paginate_kwargs = {
            "data": {"query": {"match_all": {}}},
            "keep_source": True,
            "callback": BackupCallback,
            "task": self.task,
            "total": self._get_total(index_name),
        }
        if index_name in self.INDEX_SPLIT:
            paginate_kwargs.update({"size": 200})

        paginate = IndexPaginate(f"ta_{index_name}", **paginate_kwargs)
        _ = paginate.get_results()

    @staticmethod
    def _get_total(index_name):
        """get total documents in index"""
        path = f"ta_{index_name}/_count"
        response, _ = ElasticWrap(path).get()

        return response.get("count")
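
    # Resulting archive layout (illustrative index names and dates):
    #   ta_backup-20240101-auto.zip
    #     es_video-20240101-0.json
    #     es_channel-20240101-0.json
    #     ...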

    def zip_it(self):
        """pack it up into single zip file"""
        file_name = f"ta_backup-{self.timestamp}-{self.reason}.zip"

        to_backup = []
        for file in os.listdir(self.BACKUP_DIR):
            if file.endswith(".json"):
                to_backup.append(os.path.join(self.BACKUP_DIR, file))

        backup_file = os.path.join(self.BACKUP_DIR, file_name)

        comp = zipfile.ZIP_DEFLATED
        with zipfile.ZipFile(backup_file, "w", compression=comp) as zip_f:
            for json_file in to_backup:
                zip_f.write(json_file, os.path.basename(json_file))

        # cleanup the now archived chunk files
        for json_file in to_backup:
            os.remove(json_file)

    def post_bulk_restore(self, file_name):
        """send bulk to es"""
        file_path = os.path.join(self.CACHE_DIR, file_name)
        with open(file_path, "r", encoding="utf-8") as f:
            data = f.read()

        if not data.strip():
            return

        _, _ = ElasticWrap("_bulk").post(data=data, ndjson=True)

    def get_all_backup_files(self):
        """build all available backup files for view"""
        all_backup_files = ignore_filelist(os.listdir(self.BACKUP_DIR))
        all_available_backups = [
            i
            for i in all_backup_files
            if i.startswith("ta_") and i.endswith(".zip")
        ]
        all_available_backups.sort(reverse=True)

        backup_dicts = []
        for filename in all_available_backups:
            data = self.build_backup_file_data(filename)
            backup_dicts.append(data)

        return backup_dicts
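
    # Expected archive names: ta_backup-<timestamp>.zip (no reason recorded)
    # or ta_backup-<timestamp>-<reason>.zip; the split below parses both.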

    def build_backup_file_data(self, filename):
        """build metadata of single backup file"""
        file_path = os.path.join(self.BACKUP_DIR, filename)
        if not os.path.exists(file_path):
            return False

        file_split = filename.split("-")
        if len(file_split) == 2:
            timestamp = file_split[1].removesuffix(".zip")
            reason = False
        elif len(file_split) == 3:
            timestamp = file_split[1]
            reason = file_split[2].removesuffix(".zip")
        else:
            # unexpected filename pattern, nothing to parse
            return False

        data = {
            "filename": filename,
            "file_path": file_path,
            "file_size": os.path.getsize(file_path),
            "timestamp": timestamp,
            "reason": reason,
        }

        return data

    def restore(self, filename):
        """
        restore from backup zip file
        call reset from ElastIndexWrap first to start blank
        """
        zip_content = self._unpack_zip_backup(filename)
        self._restore_json_files(zip_content)

    def _unpack_zip_backup(self, filename):
        """extract backup zip and return filelist"""
        file_path = os.path.join(self.BACKUP_DIR, filename)

        with zipfile.ZipFile(file_path, "r") as z:
            zip_content = z.namelist()
            z.extractall(self.BACKUP_DIR)

        return zip_content

    def _restore_json_files(self, zip_content):
        """go through the unpacked files and restore"""
        for idx, json_f in enumerate(zip_content):
            self._notify_restore(idx, json_f, len(zip_content))
            file_name = os.path.join(self.BACKUP_DIR, json_f)

            if not json_f.startswith("es_") or not json_f.endswith(".json"):
                os.remove(file_name)
                continue

            print("restoring: " + json_f)
            self.post_bulk_restore(file_name)
            os.remove(file_name)

    def _notify_restore(self, idx, json_f, total_files):
        """notify restore progress"""
        message = [f"Restore index from json backup file {json_f}."]
        progress = (idx + 1) / total_files
        self.task.send_progress(message_lines=message, progress=progress)

    @staticmethod
    def index_exists(index_name):
        """check if index already exists to skip"""
        _, status_code = ElasticWrap(f"ta_{index_name}").get()
        exists = status_code == 200

        return exists
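
    # Rotation keeps the newest "run_backup_rotate" automatic backups, e.g. a
    # value of 3 keeps the three most recent "auto" archives and deletes the
    # rest; manual backups are never rotated.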

    def rotate_backup(self):
        """delete old backups if needed"""
        rotate = self.config["scheduler"]["run_backup_rotate"]
        if not rotate:
            return

        all_backup_files = self.get_all_backup_files()
        auto = [i for i in all_backup_files if i["reason"] == "auto"]

        if len(auto) <= rotate:
            print("no backup files to rotate")
            return

        all_to_delete = auto[rotate:]
        for to_delete in all_to_delete:
            self.delete_file(to_delete["filename"])

    def delete_file(self, filename):
        """delete backup file"""
        file_path = os.path.join(self.BACKUP_DIR, filename)
        if not os.path.exists(file_path):
            print(f"backup file not found: {filename}")
            return False

        print(f"remove old backup file: {file_path}")
        os.remove(file_path)

        return file_path
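
# BackupCallback is used by IndexPaginate as its callback: each invocation
# writes one es_<index>-<date>-<counter>.json chunk to the backup cache
# directory (see _write_es_json below).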


class BackupCallback:
    """handle backup ndjson writer as callback for IndexPaginate"""

    def __init__(self, source, index_name, counter=0):
        self.source = source
        self.index_name = index_name
        self.counter = counter
        self.timestamp = datetime.now().strftime("%Y%m%d")
        self.cache_dir = EnvironmentSettings.CACHE_DIR

    def run(self):
        """run the chunk export task"""
        file_content = self._build_bulk()
        self._write_es_json(file_content)
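
    # The bulk body pairs an action line with the document source, one pair
    # per hit, newline separated, e.g. (hypothetical document):
    #   {"index": {"_index": "ta_video", "_id": "abc123"}}
    #   {"title": "some video", "published": "2024-01-01"}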

    def _build_bulk(self):
        """build bulk query data from all_results"""
        bulk_list = []

        for document in self.source:
            document_id = document["_id"]
            es_index = document["_index"]
            action = {"index": {"_index": es_index, "_id": document_id}}
            source = document["_source"]
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(source))

        # add last newline
        bulk_list.append("\n")
        file_content = "\n".join(bulk_list)

        return file_content

    def _write_es_json(self, file_content):
        """write nd-json file for es _bulk API to disk"""
        index = self.index_name.removeprefix("ta_")
        file_name = f"es_{index}-{self.timestamp}-{self.counter}.json"
        file_path = os.path.join(self.cache_dir, "backup", file_name)
        with open(file_path, "a+", encoding="utf-8") as f:
            f.write(file_content)