tubearchivist/tubearchivist/home/src/index/filesystem.py

206 lines
7.2 KiB
Python
Raw Normal View History

2021-09-05 17:10:14 +00:00
"""
Functionality:
- reindexing old documents
- syncing updated values between indexes
- scan the filesystem to delete or index
"""
import json
import os
2022-01-22 15:13:37 +00:00
from home.src.download.queue import PendingList
2022-03-23 08:56:53 +00:00
from home.src.es.connect import ElasticWrap
from home.src.index.comments import CommentList
from home.src.index.video import index_new_video
2022-01-22 15:13:37 +00:00
from home.src.ta.config import AppConfig
from home.src.ta.helper import clean_string, ignore_filelist
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
2021-09-05 17:10:14 +00:00
2023-03-16 06:18:26 +00:00
class ScannerBase:
    """scan the filesystem base class

    Compare media files on disk against documents in the index and
    record what needs indexing, deleting, renaming or url-fixing.
    Results are stored on the instance in to_index, to_delete,
    mismatch and to_rename.
    """

    CONFIG = AppConfig().config
    VIDEOS = CONFIG["application"]["videos"]

    def __init__(self):
        # each attribute becomes a list after scan();
        # False means "scan() has not run yet"
        self.to_index = False
        self.to_delete = False
        self.mismatch = False
        self.to_rename = False

    def scan(self):
        """entry point, scan and compare"""
        all_downloaded = self._get_all_downloaded()
        all_indexed = self._get_all_indexed()
        self.list_comparison(all_downloaded, all_indexed)

    def _get_all_downloaded(self):
        """get a list of all video files downloaded

        Returns a list of (channel_name, filename, youtube_id) tuples.
        """
        all_channels = ignore_filelist(os.listdir(self.VIDEOS))
        all_channels.sort()
        all_downloaded = []
        for channel_name in all_channels:
            channel_path = os.path.join(self.VIDEOS, channel_name)
            channel_files_clean = ignore_filelist(os.listdir(channel_path))
            for video in channel_files_clean:
                if not video.endswith(".mp4"):
                    continue
                # filename convention: <8 char date>_<11 char youtube_id>_...
                youtube_id = video[9:20]
                all_downloaded.append((channel_name, video, youtube_id))

        return all_downloaded

    @staticmethod
    def _get_all_indexed():
        """get a list of all indexed videos

        Returns a list of (youtube_id, media_url, published, title) tuples.
        """
        index_handler = PendingList()
        index_handler.get_download()
        index_handler.get_indexed()

        all_indexed = []
        for video in index_handler.all_videos:
            all_indexed.append(
                (
                    video["youtube_id"],
                    video["media_url"],
                    video["published"],
                    video["title"],
                )
            )

        return all_indexed

    def list_comparison(self, all_downloaded, all_indexed):
        """compare the lists to figure out what to do"""
        self._find_unindexed(all_downloaded, all_indexed)
        self._find_missing(all_downloaded, all_indexed)
        self._find_bad_media_url(all_downloaded, all_indexed)

    # keep the original misspelled name as an alias so existing
    # callers of list_comarison() keep working
    list_comarison = list_comparison

    def _find_unindexed(self, all_downloaded, all_indexed):
        """find video files without a matching document indexed"""
        # set for O(1) membership instead of scanning a list per file
        all_indexed_ids = {i[0] for i in all_indexed}
        self.to_index = [
            i for i in all_downloaded if i[2] not in all_indexed_ids
        ]

    def _find_missing(self, all_downloaded, all_indexed):
        """find indexed videos without matching media file"""
        all_downloaded_ids = {i[2] for i in all_downloaded}
        self.to_delete = [
            i for i in all_indexed if i[0] not in all_downloaded_ids
        ]

    def _find_bad_media_url(self, all_downloaded, all_indexed):
        """rename media files not matching the indexed title"""
        self.mismatch = []
        self.to_rename = []
        # build the lookup once instead of rescanning all_indexed for
        # every downloaded file; setdefault keeps the first occurrence,
        # matching the original first-match-then-break behavior
        indexed_by_id = {}
        for indexed_id, media_url, published, title in all_indexed:
            indexed_by_id.setdefault(indexed_id, (media_url, published, title))

        for channel, filename, downloaded_id in all_downloaded:
            if downloaded_id not in indexed_by_id:
                continue
            media_url, published, title = indexed_by_id[downloaded_id]
            pub = published.replace("-", "")
            expected = f"{pub}_{downloaded_id}_{clean_string(title)}.mp4"
            new_url = os.path.join(channel, expected)
            if expected != filename:
                # file to rename on disk
                self.to_rename.append((channel, filename, expected))
            if media_url != new_url:
                # media_url to update in es
                self.mismatch.append((downloaded_id, new_url))
2023-03-16 06:18:26 +00:00
class Filesystem(ScannerBase):
    """handle scanning and fixing from filesystem

    Runs the base-class scan, then applies all four fix-ups:
    renaming files, fixing media_url in es, deleting orphaned
    documents and indexing missing videos.
    """

    def __init__(self, task=False):
        super().__init__()
        # optional task object used only to send progress messages
        self.task = task

    def process(self):
        """entry point: scan, then apply all fixes"""
        if self.task:
            self.task.send_progress(["Scanning your archive and index."])
        self.scan()
        self.rename_files()
        self.send_mismatch_bulk()
        self.delete_from_index()
        self.add_missing()

    def rename_files(self):
        """rename media files as identified by find_bad_media_url"""
        if not self.to_rename:
            return

        total = len(self.to_rename)
        if self.task:
            self.task.send_progress([f"Rename {total} media files."])
        for channel, filename, expected_filename in self.to_rename:
            # fixed: log the actual old filename instead of a placeholder
            print(f"renaming [{filename}] to [{expected_filename}]")
            old_path = os.path.join(self.VIDEOS, channel, filename)
            new_path = os.path.join(self.VIDEOS, channel, expected_filename)
            os.rename(old_path, new_path)

    def send_mismatch_bulk(self):
        """build bulk update to fix media_url in es"""
        if not self.mismatch:
            return

        total = len(self.mismatch)
        if self.task:
            self.task.send_progress([f"Fix media urls for {total} files"])
        bulk_list = []
        for youtube_id, media_url in self.mismatch:
            print(f"{youtube_id}: fixing media url {media_url}")
            action = {"update": {"_id": youtube_id, "_index": "ta_video"}}
            source = {"doc": {"media_url": media_url}}
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(source))
        # add last newline, required by the es bulk api
        bulk_list.append("\n")
        data = "\n".join(bulk_list)
        _, _ = ElasticWrap("_bulk").post(data=data, ndjson=True)

    def delete_from_index(self):
        """find indexed but deleted mediafile and remove the document"""
        if not self.to_delete:
            return

        total = len(self.to_delete)
        if self.task:
            self.task.send_progress([f"Clean up {total} items from index."])
        for indexed in self.to_delete:
            youtube_id = indexed[0]
            print(f"deleting {youtube_id} from index")
            path = f"ta_video/_doc/{youtube_id}"
            _, _ = ElasticWrap(path).delete()

    def add_missing(self):
        """add missing videos to index, with per-video progress"""
        video_ids = [i[2] for i in self.to_index]
        if not video_ids:
            return

        total = len(video_ids)
        for idx, youtube_id in enumerate(video_ids):
            if self.task:
                self.task.send_progress(
                    message_lines=[
                        f"Index missing video {youtube_id}, {idx}/{total}"
                    ],
                    progress=(idx + 1) / total,
                )
            index_new_video(youtube_id)

        CommentList(video_ids, task=self.task).index()