2021-09-05 17:10:14 +00:00
|
|
|
"""
|
|
|
|
Functionality:
|
|
|
|
- reindexing old documents
|
|
|
|
- syncing updated values between indexes
|
|
|
|
- scanning the filesystem to delete or index
|
|
|
|
"""
|
|
|
|
|
|
|
|
import json
|
|
|
|
import os
|
|
|
|
|
2022-01-22 15:13:37 +00:00
|
|
|
from home.src.download.queue import PendingList
|
2022-03-23 08:56:53 +00:00
|
|
|
from home.src.es.connect import ElasticWrap
|
2022-12-23 15:34:25 +00:00
|
|
|
from home.src.index.comments import CommentList
|
2023-03-16 03:59:47 +00:00
|
|
|
from home.src.index.video import index_new_video
|
2022-01-22 15:13:37 +00:00
|
|
|
from home.src.ta.config import AppConfig
|
|
|
|
from home.src.ta.helper import clean_string, ignore_filelist
|
2023-03-16 03:59:47 +00:00
|
|
|
from PIL import ImageFile
|
2022-08-08 07:52:33 +00:00
|
|
|
|
|
|
|
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
2021-09-05 17:10:14 +00:00
|
|
|
|
|
|
|
|
2023-03-16 06:18:26 +00:00
|
|
|
class ScannerBase:
    """Scan the filesystem and compare it against the index (base class).

    Run scan() to populate the comparison results:
    - to_index: (channel, filename, youtube_id) files on disk, not indexed
    - to_delete: (youtube_id, media_url, published, title) indexed, no file
    - mismatch: (youtube_id, new_media_url) documents to update in ES
    - to_rename: (channel, filename, expected_filename) files to rename
    """

    CONFIG = AppConfig().config
    # base folder holding one subfolder per channel
    VIDEOS = CONFIG["application"]["videos"]

    def __init__(self):
        # populated by scan(); False means "not scanned yet"
        self.to_index = False
        self.to_delete = False
        self.mismatch = False
        self.to_rename = False

    def scan(self):
        """entry point, scan and compare"""
        all_downloaded = self._get_all_downloaded()
        all_indexed = self._get_all_indexed()
        self.list_comarison(all_downloaded, all_indexed)

    def _get_all_downloaded(self):
        """get a list of all video files downloaded, as
        (channel_name, filename, youtube_id) tuples"""
        all_channels = ignore_filelist(os.listdir(self.VIDEOS))
        all_channels.sort()
        all_downloaded = []
        for channel_name in all_channels:
            channel_path = os.path.join(self.VIDEOS, channel_name)
            channel_files_clean = ignore_filelist(os.listdir(channel_path))
            all_videos = [i for i in channel_files_clean if i.endswith(".mp4")]
            for video in all_videos:
                # filename convention: <8-digit-date>_<11-char-id>_<title>.mp4,
                # so the youtube_id sits at chars 9-19
                youtube_id = video[9:20]
                all_downloaded.append((channel_name, video, youtube_id))

        return all_downloaded

    @staticmethod
    def _get_all_indexed():
        """get a list of all indexed videos, as
        (youtube_id, media_url, published, title) tuples"""
        index_handler = PendingList()
        index_handler.get_download()
        index_handler.get_indexed()

        all_indexed = []
        for video in index_handler.all_videos:
            all_indexed.append(
                (
                    video["youtube_id"],
                    video["media_url"],
                    video["published"],
                    video["title"],
                )
            )

        return all_indexed

    # NOTE(review): name keeps the historic typo ("comarison") so existing
    # external callers keep working
    def list_comarison(self, all_downloaded, all_indexed):
        """compare the lists to figure out what to do"""
        self._find_unindexed(all_downloaded, all_indexed)
        self._find_missing(all_downloaded, all_indexed)
        self._find_bad_media_url(all_downloaded, all_indexed)

    def _find_unindexed(self, all_downloaded, all_indexed):
        """find video files without a matching document indexed"""
        # set membership is O(1) vs an O(n) list scan per downloaded file
        all_indexed_ids = {i[0] for i in all_indexed}
        self.to_index = [
            downloaded
            for downloaded in all_downloaded
            if downloaded[2] not in all_indexed_ids
        ]

    def _find_missing(self, all_downloaded, all_indexed):
        """find indexed videos without matching media file"""
        all_downloaded_ids = {i[2] for i in all_downloaded}
        self.to_delete = [
            video for video in all_indexed if video[0] not in all_downloaded_ids
        ]

    def _find_bad_media_url(self, all_downloaded, all_indexed):
        """find media files to rename and media_urls to update so that
        both match the expected <pub>_<id>_<clean title>.mp4 pattern"""
        self.mismatch = []
        self.to_rename = []

        # index once by youtube_id instead of rescanning all_indexed per
        # file; setdefault keeps the first occurrence, matching the old
        # break-on-first-match behavior
        indexed_by_id = {}
        for indexed in all_indexed:
            indexed_by_id.setdefault(indexed[0], indexed)

        for channel, filename, downloaded_id in all_downloaded:
            indexed = indexed_by_id.get(downloaded_id)
            if indexed is None:
                # no indexed document, handled by _find_unindexed
                continue

            indexed_id, media_url, published, title = indexed
            pub = published.replace("-", "")
            expected = f"{pub}_{indexed_id}_{clean_string(title)}.mp4"
            new_url = os.path.join(channel, expected)
            if expected != filename:
                # file to rename
                self.to_rename.append((channel, filename, expected))

            if media_url != new_url:
                # media_url to update in es
                self.mismatch.append((indexed_id, new_url))
|
2023-03-16 06:18:26 +00:00
|
|
|
|
|
|
|
class Filesystem(ScannerBase):
    """handle scanning and fixing from filesystem"""

    def __init__(self, task=False):
        super().__init__()
        # optional task for progress reporting; False when run without one
        self.task = task

    def process(self):
        """entry point: scan, then apply every fix category"""
        if self.task:
            self.task.send_progress(["Scanning your archive and index."])
        self.scan()
        self.rename_files()
        self.send_mismatch_bulk()
        self.delete_from_index()
        self.add_missing()

    def rename_files(self):
        """rename media files as identified by find_bad_media_url"""
        if not self.to_rename:
            return

        total = len(self.to_rename)
        # guard task like add_missing does, so running without a task works
        if self.task:
            self.task.send_progress([f"Rename {total} media files."])
        for channel, filename, expected_filename in self.to_rename:
            # log old and new name; original message had a placeholder artifact
            print(f"renaming [{filename}] to [{expected_filename}]")
            old_path = os.path.join(self.VIDEOS, channel, filename)
            new_path = os.path.join(self.VIDEOS, channel, expected_filename)
            os.rename(old_path, new_path)

    def send_mismatch_bulk(self):
        """build bulk update fixing media_url fields in ES"""
        if not self.mismatch:
            return

        total = len(self.mismatch)
        if self.task:
            self.task.send_progress([f"Fix media urls for {total} files"])
        bulk_list = []
        for youtube_id, media_url in self.mismatch:
            print(f"{youtube_id}: fixing media url {media_url}")
            action = {"update": {"_id": youtube_id, "_index": "ta_video"}}
            source = {"doc": {"media_url": media_url}}
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(source))

        # add last newline, required by the ES bulk NDJSON format
        bulk_list.append("\n")
        data = "\n".join(bulk_list)
        _, _ = ElasticWrap("_bulk").post(data=data, ndjson=True)

    def delete_from_index(self):
        """remove indexed documents whose media file is gone"""
        if not self.to_delete:
            return

        total = len(self.to_delete)
        if self.task:
            self.task.send_progress([f"Clean up {total} items from index."])
        for indexed in self.to_delete:
            youtube_id = indexed[0]
            print(f"deleting {youtube_id} from index")
            path = f"ta_video/_doc/{youtube_id}"
            _, _ = ElasticWrap(path).delete()

    def add_missing(self):
        """add missing videos to index"""
        video_ids = [i[2] for i in self.to_index]
        if not video_ids:
            return

        total = len(video_ids)
        for idx, youtube_id in enumerate(video_ids):
            if self.task:
                self.task.send_progress(
                    message_lines=[
                        f"Index missing video {youtube_id}, {idx}/{total}"
                    ],
                    progress=(idx + 1) / total,
                )
            index_new_video(youtube_id)

        CommentList(video_ids, task=self.task).index()
|