diff --git a/Dockerfile b/Dockerfile index 0ea95d4..07b5cc5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,7 +34,7 @@ VOLUME /youtube # start WORKDIR /app -EXPOSE 80 +EXPOSE 8000 RUN chmod +x ./run.sh diff --git a/README.md b/README.md index 1cb13fc..b3d10cd 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,8 @@ Detect the YouTube ID from filename, this accepts the default yt-dlp naming conv ## Potential pitfalls -**Elastic Search** in Docker requires the kernel setting of the host machine `vm.max_map_count` to be set to least 262144. +### vm.max_map_count +**Elastic Search** in Docker requires the kernel setting of the host machine `vm.max_map_count` to be set to at least 262144. To temporary set the value run: ``` @@ -94,6 +95,13 @@ To apply the change permanently depends on your host operating system: - On Arch based systems create a file */etc/sysctl.d/max_map_count.conf* with the content `vm.max_map_count = 262144`. - On any other platform look up in the documentation on how to pass kernel parameters. +### Permissions for elasticsearch +If you see a message similar to `AccessDeniedException[/usr/share/elasticsearch/data/nodes]` when initially starting elasticsearch, that means the container is not allowed to write files to the volume. +That's most likely the case when you run `docker-compose` as an unprivileged user. To fix that issue, shut down the container and on your host machine run: +``` +chown 1000:0 /path/to/mount/point +``` +This will match the permissions with the **UID** and **GID** of elasticsearch within the container and should fix the issue. ## Roadmap This should be considered as a **minimal viable product**, there is an extensive list of future functions and improvements planned. 
diff --git a/run.sh b/run.sh index 14a3663..e1eb4b7 100644 --- a/run.sh +++ b/run.sh @@ -14,7 +14,7 @@ until curl "$ES_URL" -fs; do done python manage.py migrate -python manage.py collectstatic +python manage.py collectstatic --noinput -c nginx & celery -A home.tasks worker --loglevel=INFO & uwsgi --ini uwsgi.ini diff --git a/tubearchivist/home/__init__.py b/tubearchivist/home/__init__.py index 8eb7ad5..d1e3e4e 100644 --- a/tubearchivist/home/__init__.py +++ b/tubearchivist/home/__init__.py @@ -22,8 +22,8 @@ def sync_redis_state(): def make_folders(): - """ make needed folders here to avoid letting docker messing it up """ - folders = ['download', 'channels', 'videos', 'import'] + """ make needed cache folders here so docker doesn't mess it up """ + folders = ['download', 'channels', 'videos', 'import', 'backup'] config = AppConfig().config cache_dir = config['application']['cache_dir'] for folder in folders: diff --git a/tubearchivist/home/src/index_management.py b/tubearchivist/home/src/index_management.py index 819ecce..533e379 100644 --- a/tubearchivist/home/src/index_management.py +++ b/tubearchivist/home/src/index_management.py @@ -8,6 +8,7 @@ Functionality: import json import os +import zipfile from datetime import datetime @@ -375,6 +376,7 @@ class ElasticBackup: self.config = AppConfig().config self.index_config = index_config self.timestamp = datetime.now().strftime('%Y%m%d') + self.backup_files = [] def get_all_documents(self, index_name): """ export all documents of a single index """ @@ -389,7 +391,7 @@ class ElasticBackup: data = { "query": {"match_all": {}}, "size": 100, "pit": {"id": pit_id, "keep_alive": "1m"}, - "sort": [ {"_id": {"order": "asc"}} ] + "sort": [{"_id": {"order": "asc"}}] } query_str = json.dumps(data) url = es_url + '/_search' @@ -422,7 +424,7 @@ class ElasticBackup: for document in all_results: document_id = document['_id'] es_index = document['_index'] - action = { "index" : { "_index": es_index, "_id": document_id } } + action = 
{"index": {"_index": es_index, "_id": document_id}} source = document['_source'] bulk_list.append(json.dumps(action)) bulk_list.append(json.dumps(source)) @@ -433,14 +435,44 @@ class ElasticBackup: return file_content - def write_json_file(self, file_content, index_name): - """ write json file to disk """ + def write_es_json(self, file_content, index_name): + """ write nd json file for es _bulk API to disk """ cache_dir = self.config['application']['cache_dir'] - file_name = f'ta_{index_name}-{self.timestamp}.json' - file_path = os.path.join(cache_dir, file_name) + file_name = f'es_{index_name}-{self.timestamp}.json' + file_path = os.path.join(cache_dir, 'backup', file_name) with open(file_path, 'w', encoding='utf-8') as f: f.write(file_content) + self.backup_files.append(file_path) + + def write_ta_json(self, all_results, index_name): + """ write generic json file to disk """ + cache_dir = self.config['application']['cache_dir'] + file_name = f'ta_{index_name}-{self.timestamp}.json' + file_path = os.path.join(cache_dir, 'backup', file_name) + to_write = [i['_source'] for i in all_results] + file_content = json.dumps(to_write) + with open(file_path, 'w', encoding='utf-8') as f: + f.write(file_content) + + self.backup_files.append(file_path) + + def zip_it(self): + """ pack it up into single zip file """ + cache_dir = self.config['application']['cache_dir'] + file_name = f'ta_backup-{self.timestamp}.zip' + backup_file = os.path.join(cache_dir, 'backup', file_name) + + with zipfile.ZipFile( + backup_file, 'w', compression=zipfile.ZIP_DEFLATED + ) as zip_f: + for backup_file in self.backup_files: + zip_f.write(backup_file) + + # cleanup + for backup_file in self.backup_files: + os.remove(backup_file) + def post_bulk_restore(self, file_name): """ send bulk to es """ cache_dir = self.config['application']['cache_dir'] @@ -475,7 +507,10 @@ def backup_all_indexes(): index_name = index['index_name'] all_results = backup_handler.get_all_documents(index_name) file_content = 
backup_handler.build_bulk(all_results) - backup_handler.write_json_file(file_content, index_name) + backup_handler.write_es_json(file_content, index_name) + backup_handler.write_ta_json(all_results, index_name) + + backup_handler.zip_it() def restore_from_backup(): diff --git a/tubearchivist/home/tasks.py b/tubearchivist/home/tasks.py index b24751c..ed6a6f3 100644 --- a/tubearchivist/home/tasks.py +++ b/tubearchivist/home/tasks.py @@ -15,6 +15,7 @@ from home.src.download import ( ) from home.src.config import AppConfig from home.src.reindex import reindex_old_documents, ManualImport +from home.src.index_management import backup_all_indexes from home.src.helper import get_lock @@ -54,8 +55,7 @@ def download_pending(): @shared_task def download_single(youtube_id): """ start download single video now """ - to_download = [youtube_id] - download_handler = VideoDownloader(to_download) + download_handler = VideoDownloader([youtube_id]) download_handler.download_list() @@ -93,3 +93,9 @@ def run_manual_import(): finally: if have_lock: my_lock.release() + +@shared_task +def run_backup(): + """ called from settings page, dump backup to zip file """ + backup_all_indexes() + print('backup finished') diff --git a/tubearchivist/home/templates/home/downloads.html b/tubearchivist/home/templates/home/downloads.html index db6621e..cd96bfb 100644 --- a/tubearchivist/home/templates/home/downloads.html +++ b/tubearchivist/home/templates/home/downloads.html @@ -29,7 +29,7 @@
Published: {{ video.published }} | Duration: {{ video.duration }} | {{ video.youtube_id }}
- +Add files to the cache/import folder. Make sure to follow the instructions on GitHub.
+Add files to the cache/import folder. Make sure to follow the instructions on GitHub.
Rescan filesystem.
+Export your database to a zip file stored at cache/backup.
+Restore from backup.
Coming soonBackup database.
+Rescan filesystem.
Coming soon