"""
Functionality:
- initial elastic search setup
- index configuration is represented in INDEX_CONFIG
- index mapping and settings validation
- backup and restore
"""
import json
import os
from datetime import datetime

import requests

from home.src.config import AppConfig

# expected mapping and settings
INDEX_CONFIG = [
    {
        'index_name': 'channel',
        'expected_map': {
            "channel_id": {
                "type": "keyword"
            },
            "channel_name": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256,
                        "normalizer": "to_lower"
                    },
                    "search_as_you_type": {
                        "type": "search_as_you_type",
                        "doc_values": False,
                        "max_shingle_size": 3
                    }
                }
            },
            "channel_banner_url": {
                "type": "keyword",
                "index": False
            },
            "channel_thumb_url": {
                "type": "keyword",
                "index": False
            },
            "channel_description": {
                "type": "text"
            },
            "channel_last_refresh": {
                "type": "date",
                "format": "epoch_second"
            }
        },
        'expected_set': {
            "analysis": {
                "normalizer": {
                    "to_lower": {
                        "type": "custom",
                        "filter": ["lowercase"]
                    }
                }
            },
            "number_of_replicas": 0
        }
    },
    {
        'index_name': 'video',
        'expected_map': {
            "vid_thumb_url": {
                "type": "text",
                "index": False
            },
            "date_downloaded": {
                "type": "date"
            },
            "channel": {
                "properties": {
                    "channel_id": {
                        "type": "keyword"
                    },
                    "channel_name": {
                        "type": "text",
                        "fields": {
                            "keyword": {
                                "type": "keyword",
                                "ignore_above": 256,
                                "normalizer": "to_lower"
                            },
                            "search_as_you_type": {
                                "type": "search_as_you_type",
                                "doc_values": False,
                                "max_shingle_size": 3
                            }
                        }
                    },
                    "channel_banner_url": {
                        "type": "keyword",
                        "index": False
                    },
                    "channel_thumb_url": {
                        "type": "keyword",
                        "index": False
                    },
                    "channel_description": {
                        "type": "text"
                    },
                    "channel_last_refresh": {
                        "type": "date",
                        "format": "epoch_second"
                    }
                }
            },
            "description": {
                "type": "text"
            },
            "media_url": {
                "type": "keyword",
                "index": False
            },
            "title": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256,
                        "normalizer": "to_lower"
                    },
                    "search_as_you_type": {
                        "type": "search_as_you_type",
                        "doc_values": False,
                        "max_shingle_size": 3
                    }
                }
            },
            "vid_last_refresh": {
                "type": "date"
            },
            "youtube_id": {
                "type": "keyword"
            },
            "published": {
                "type": "date"
            }
        },
        'expected_set': {
            "analysis": {
                "normalizer": {
                    "to_lower": {
                        "type": "custom",
                        "filter": ["lowercase"]
                    }
                }
            },
            "number_of_replicas": 0
        }
    },
    {
        'index_name': 'download',
        'expected_map': {
            "timestamp": {
                "type": "date"
            },
            "channel_id": {
                "type": "keyword"
            },
            "channel_name": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256,
                        "normalizer": "to_lower"
                    }
                }
            },
            "status": {
                "type": "keyword"
            },
            "title": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256,
                        "normalizer": "to_lower"
                    }
                }
            },
            "vid_thumb_url": {
                "type": "keyword"
            },
            "youtube_id": {
                "type": "keyword"
            }
        },
        'expected_set': {
            "analysis": {
                "normalizer": {
                    "to_lower": {
                        "type": "custom",
                        "filter": ["lowercase"]
                    }
                }
            },
            "number_of_replicas": 0
        }
    }
]
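
# each entry above describes one index: 'expected_map' becomes the
# "mappings.properties" body and 'expected_set' the "settings" body of the
# PUT request that ElasticIndex.create_blank sends when creating the index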


class ElasticIndex:
    """
    handle mapping and settings on elastic search for a given index
    """

    CONFIG = AppConfig().config
    ES_URL = CONFIG['application']['es_url']
    HEADERS = {'Content-type': 'application/json'}

    def __init__(self, index_name, expected_map, expected_set):
        self.index_name = index_name
        self.expected_map = expected_map
        self.expected_set = expected_set
        self.exists, self.details = self.index_exists()

    def index_exists(self):
        """ check if index already exists and return mapping if it does """
        index_name = self.index_name
        url = f'{self.ES_URL}/ta_{index_name}'
        response = requests.get(url)
        exists = response.ok

        if exists:
            details = response.json()[f'ta_{index_name}']
        else:
            details = False

        return exists, details

    def validate(self):
        """
        check if all expected mappings and settings match
        returns True when rebuild is needed
        """
        if self.expected_map:
            rebuild = self.validate_mappings()
            if rebuild:
                return rebuild

        if self.expected_set:
            rebuild = self.validate_settings()
            if rebuild:
                return rebuild

        return False

    def validate_mappings(self):
        """ check if all mappings are as expected """
        expected_map = self.expected_map
        now_map = self.details['mappings']['properties']

        for key, value in expected_map.items():
            # check the key itself first to avoid a KeyError on new fields
            if key not in now_map:
                print(key, value)
                return True
            # nested
            if list(value.keys()) == ['properties']:
                for key_n, value_n in value['properties'].items():
                    if key_n not in now_map[key]['properties']:
                        print(key_n, value_n)
                        return True
                    if value_n != now_map[key]['properties'][key_n]:
                        print(key_n, value_n)
                        return True

                continue
            # not nested
            if value != now_map[key]:
                print(key, value)
                return True

        return False

    def validate_settings(self):
        """ check if all settings are as expected """
        now_set = self.details['settings']['index']

        for key, value in self.expected_set.items():
            if key not in now_set:
                print(key, value)
                return True
            if value != now_set[key]:
                print(key, value)
                return True

        return False
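
    # note: the rebuild is not atomic, between the delete and the restore
    # step the data only exists in the temporary ta_<index>_backup index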
    def rebuild_index(self):
        """ rebuild with new mapping """
        # backup
        self.reindex('backup')
        # delete original
        self.delete_index(backup=False)
        # create new
        self.create_blank()
        self.reindex('restore')
        # delete backup
        self.delete_index()

    def reindex(self, method):
        """ create on elastic search """
        index_name = self.index_name
        if method == 'backup':
            source = f'ta_{index_name}'
            destination = f'ta_{index_name}_backup'
        elif method == 'restore':
            source = f'ta_{index_name}_backup'
            destination = f'ta_{index_name}'
        else:
            raise ValueError(f'invalid reindex method: {method}')

        query = {
            "source": {
                "index": source
            },
            "dest": {
                "index": destination
            }
        }
        data = json.dumps(query)
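        # refresh=true makes the copied documents searchable right away so
        # the following steps of the rebuild see a complete index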
        url = self.ES_URL + '/_reindex?refresh=true'
        response = requests.post(url=url, data=data, headers=self.HEADERS)
        if not response.ok:
            print(response.text)

    def delete_index(self, backup=True):
        """ delete the backup index, or the original one if backup is False """
        if backup:
            url = f'{self.ES_URL}/ta_{self.index_name}_backup'
        else:
            url = f'{self.ES_URL}/ta_{self.index_name}'

        response = requests.delete(url)
        if not response.ok:
            print(response.text)

    def create_blank(self):
        """ apply new mapping and settings for blank new index """
        expected_map = self.expected_map
        expected_set = self.expected_set
        # stitch payload
        payload = {}
        if expected_set:
            payload.update({"settings": expected_set})
        if expected_map:
            payload.update({"mappings": {"properties": expected_map}})
        # create
        url = f'{self.ES_URL}/ta_{self.index_name}'
        data = json.dumps(payload)
        response = requests.put(url=url, data=data, headers=self.HEADERS)
        if not response.ok:
            print(response.text)


class ElasticBackup:
    """ dump index to nd-json files for later bulk import """

    def __init__(self, index_config):
        self.config = AppConfig().config
        self.index_config = index_config
        self.timestamp = datetime.now().strftime('%Y%m%d')

    def get_all_documents(self, index_name):
        """ export all documents of a single index """
        headers = {'Content-type': 'application/json'}
        es_url = self.config['application']['es_url']
        # get PIT ID
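        # a point in time (PIT) keeps a consistent view of the index, so
        # paging is not disturbed by writes happening during the export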
        url = f'{es_url}/ta_{index_name}/_pit?keep_alive=1m'
        response = requests.post(url)
        json_data = json.loads(response.text)
        pit_id = json_data['id']
        # build query
        data = {
            "query": {"match_all": {}},
            "size": 100, "pit": {"id": pit_id, "keep_alive": "1m"},
            "sort": [{"_id": {"order": "asc"}}]
        }
        query_str = json.dumps(data)
        url = es_url + '/_search'
        # loop until nothing left
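        # each pass takes the sort value of the last hit and feeds it back
        # as search_after, so the next page continues where this one ended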
        all_results = []
        while True:
            response = requests.get(url, data=query_str, headers=headers)
            json_data = json.loads(response.text)
            all_hits = json_data['hits']['hits']
            if all_hits:
                for hit in all_hits:
                    search_after = hit['sort']
                    all_results.append(hit)
                # update search_after with last hit data
                data['search_after'] = search_after
                query_str = json.dumps(data)
            else:
                break
        # clean up PIT
        query_str = json.dumps({"id": pit_id})
        requests.delete(es_url + '/_pit', data=query_str, headers=headers)

        return all_results

    @staticmethod
    def build_bulk(all_results):
        """ build bulk query data from all_results """
        bulk_list = []
        for document in all_results:
            document_id = document['_id']
            es_index = document['_index']
            action = {"index": {"_index": es_index, "_id": document_id}}
            source = document['_source']
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(source))
        # add last newline
        bulk_list.append('\n')
        file_content = '\n'.join(bulk_list)

        return file_content

    def write_json_file(self, file_content, index_name):
        """ write json file to disk """
        cache_dir = self.config['application']['cache_dir']
        file_name = f'ta_{index_name}-{self.timestamp}.json'
        file_path = os.path.join(cache_dir, file_name)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(file_content)

    def post_bulk_restore(self, file_name):
        """ send bulk to es """
        cache_dir = self.config['application']['cache_dir']
        es_url = self.config['application']['es_url']
        headers = {'Content-type': 'application/x-ndjson'}
        file_path = os.path.join(cache_dir, file_name)

        with open(file_path, 'r', encoding='utf-8') as f:
            query_str = f.read()

        url = es_url + '/_bulk'
        request = requests.post(url, data=query_str, headers=headers)
        if not request.ok:
            print(request.text)

    def restore_from_file(self):
        """ restore all available backup files """
        cache_dir = self.config['application']['cache_dir']
        all_available_backups = [
            i for i in os.listdir(cache_dir) if
            i.startswith('ta_') and i.endswith('.json')
        ]
        for file_name in all_available_backups:
            self.post_bulk_restore(file_name)


def backup_all_indexes():
    """ backup all es indexes to disk """
    backup_handler = ElasticBackup(INDEX_CONFIG)

    for index in backup_handler.index_config:
        index_name = index['index_name']
        all_results = backup_handler.get_all_documents(index_name)
        file_content = backup_handler.build_bulk(all_results)
        backup_handler.write_json_file(file_content, index_name)


def restore_from_backup():
    """ restore indexes from backup file """
    # delete
    index_check(force_restore=True)
    # recreate
    backup_handler = ElasticBackup(INDEX_CONFIG)
    backup_handler.restore_from_file()


def index_check(force_restore=False):
    """ check if all indexes are created and have correct mapping """
    for index in INDEX_CONFIG:
        index_name = index['index_name']
        expected_map = index['expected_map']
        expected_set = index['expected_set']
        handler = ElasticIndex(index_name, expected_map, expected_set)
        # force restore
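        # delete and recreate blank, so restore_from_backup can bulk import
        # into a clean index afterwards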
        if force_restore:
            handler.delete_index(backup=False)
            handler.create_blank()
            continue
        # create new
        if not handler.exists:
            print(f'create new blank index with name ta_{index_name}...')
            handler.create_blank()
            continue
        # validate index
        rebuild = handler.validate()
        if rebuild:
            print(f'applying new mappings to index ta_{index_name}...')
            handler.rebuild_index()
            continue
        # else all good
        print(f'ta_{index_name} index is created and up to date...')
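

# a minimal usage sketch, assuming a reachable elastic search instance at the
# configured es_url:
#
#   from home.src.index_management import index_check, backup_all_indexes
#
#   index_check()          # create or migrate all ta_* indexes
#   backup_all_indexes()   # dump every index as ndjson into the cache dir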