offline import, #build

Changed:
- added offline import for videos and channels
- thumbnail manager rewrite
This commit is contained in:
simon 2022-08-12 05:36:54 +07:00
commit c39ce61b2c
No known key found for this signature in database
GPG Key ID: 2C15AA5E89985DD4
13 changed files with 712 additions and 404 deletions

View File

@ -220,3 +220,12 @@ Second best way to support the development is to provide for caffeinated beverag
* [Paypal.me](https://paypal.me/bbilly1) for a one time coffee
* [Paypal Subscription](https://www.paypal.com/webapps/billing/plans/subscribe?plan_id=P-03770005GR991451KMFGVPMQ) for a monthly coffee
* [ko-fi.com](https://ko-fi.com/bbilly1) for an alternative platform
## Sponsor
Big thank you to [DigitalOcean](https://www.digitalocean.com/) for generously donating credit for the tubearchivist.com VPS and build server.
<p>
<a href="https://www.digitalocean.com/">
<img src="https://opensource.nyc3.cdn.digitaloceanspaces.com/attribution/assets/PoweredByDO/DO_Powered_by_Badge_blue.svg" width="201px">
</a>
</p>

View File

@ -74,7 +74,7 @@ class SearchProcess:
media_url = urllib.parse.quote(video_dict["media_url"])
vid_last_refresh = date_praser(video_dict["vid_last_refresh"])
published = date_praser(video_dict["published"])
vid_thumb_url = ThumbManager().vid_thumb_path(video_id)
vid_thumb_url = ThumbManager(video_id).vid_thumb_path()
channel = self._process_channel(video_dict["channel"])
if "subtitles" in video_dict:
@ -113,7 +113,7 @@ class SearchProcess:
def _process_download(self, download_dict):
"""run on single download item"""
video_id = download_dict["youtube_id"]
vid_thumb_url = ThumbManager().vid_thumb_path(video_id)
vid_thumb_url = ThumbManager(video_id).vid_thumb_path()
published = date_praser(download_dict["published"])
download_dict.update(

View File

@ -161,10 +161,7 @@ class PendingList(PendingIndex):
self._parse_channel(entry["url"])
elif entry["type"] == "playlist":
self._parse_playlist(entry["url"])
new_thumbs = PlaylistSubscription().process_url_str(
[entry], subscribed=False
)
ThumbManager().download_playlist(new_thumbs)
PlaylistSubscription().process_url_str([entry], subscribed=False)
else:
raise ValueError(f"invalid url_type: {entry}")
@ -198,7 +195,6 @@ class PendingList(PendingIndex):
self.get_channels()
bulk_list = []
thumb_handler = ThumbManager()
for idx, youtube_id in enumerate(self.missing_videos):
video_details = self.get_youtube_details(youtube_id)
if not video_details:
@ -209,8 +205,9 @@ class PendingList(PendingIndex):
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(video_details))
thumb_needed = [(youtube_id, video_details["vid_thumb_url"])]
thumb_handler.download_vid(thumb_needed)
url = video_details["vid_thumb_url"]
ThumbManager(youtube_id).download_video_thumb(url)
self._notify_add(idx)
if bulk_list:

View File

@ -5,6 +5,7 @@ Functionality:
"""
from home.src.download import queue # partial import
from home.src.download.thumbnails import ThumbManager
from home.src.download.yt_dlp_base import YtWrap
from home.src.es.connect import IndexPaginate
from home.src.index.channel import YoutubeChannel
@ -129,11 +130,9 @@ class PlaylistSubscription:
all_indexed = IndexPaginate("ta_video", data).get_results()
all_youtube_ids = [i["youtube_id"] for i in all_indexed]
new_thumbs = []
for idx, playlist in enumerate(new_playlists):
url_type = playlist["type"]
playlist_id = playlist["url"]
if not url_type == "playlist":
if not playlist["type"] == "playlist":
print(f"{playlist_id} not a playlist, skipping...")
continue
@ -144,8 +143,11 @@ class PlaylistSubscription:
playlist_h.upload_to_es()
playlist_h.add_vids_to_playlist()
self.channel_validate(playlist_h.json_data["playlist_channel_id"])
thumb = playlist_h.json_data["playlist_thumbnail"]
new_thumbs.append((playlist_id, thumb))
url = playlist_h.json_data["playlist_thumbnail"]
thumb = ThumbManager(playlist_id, item_type="playlist")
thumb.download_playlist_thumb(url)
# notify
message = {
"status": "message:subplaylist",
@ -157,8 +159,6 @@ class PlaylistSubscription:
"message:subplaylist", message=message, expire=True
)
return new_thumbs
@staticmethod
def channel_validate(channel_id):
"""make sure channel of playlist is there"""

View File

@ -6,136 +6,66 @@ functionality:
import base64
import os
from collections import Counter
from io import BytesIO
from time import sleep
import requests
from home.src.download import queue # partial import
from home.src.download import subscriptions # partial import
from home.src.es.connect import IndexPaginate
from home.src.ta.config import AppConfig
from home.src.ta.helper import ignore_filelist
from home.src.ta.ta_redis import RedisArchivist
from mutagen.mp4 import MP4, MP4Cover
from PIL import Image, ImageFile, ImageFilter
ImageFile.LOAD_TRUNCATED_IMAGES = True
class ThumbManager:
"""handle thumbnails related functions"""
class ThumbManagerBase:
"""base class for thumbnail management"""
CONFIG = AppConfig().config
MEDIA_DIR = CONFIG["application"]["videos"]
CACHE_DIR = CONFIG["application"]["cache_dir"]
VIDEO_DIR = os.path.join(CACHE_DIR, "videos")
CHANNEL_DIR = os.path.join(CACHE_DIR, "channels")
PLAYLIST_DIR = os.path.join(CACHE_DIR, "playlists")
def get_all_thumbs(self):
"""get all video artwork already downloaded"""
all_thumb_folders = ignore_filelist(os.listdir(self.VIDEO_DIR))
all_thumbs = []
for folder in all_thumb_folders:
folder_path = os.path.join(self.VIDEO_DIR, folder)
if os.path.isfile(folder_path):
self.update_path(folder)
all_thumbs.append(folder_path)
continue
# raise exemption here in a future version
# raise FileExistsError("video cache dir has files inside")
def __init__(self, item_id, item_type, fallback=False):
self.item_id = item_id
self.item_type = item_type
self.fallback = fallback
all_folder_thumbs = ignore_filelist(os.listdir(folder_path))
all_thumbs.extend(all_folder_thumbs)
def download_raw(self, url):
"""download thumbnail for video"""
if not url:
return self.get_fallback()
return all_thumbs
for i in range(3):
try:
response = requests.get(url, stream=True)
if response.ok:
return Image.open(response.raw)
if response.status_code == 404:
return self.get_fallback()
def update_path(self, file_name):
"""reorganize thumbnails into folders as update path from v0.0.5"""
folder_name = file_name[0].lower()
folder_path = os.path.join(self.VIDEO_DIR, folder_name)
old_file = os.path.join(self.VIDEO_DIR, file_name)
new_file = os.path.join(folder_path, file_name)
os.makedirs(folder_path, exist_ok=True)
os.rename(old_file, new_file)
except ConnectionError:
print(f"{self.item_id}: retry thumbnail download {url}")
sleep((i + 1) ** i)
def get_needed_thumbs(self, missing_only=False):
"""get a list of all missing thumbnails"""
all_thumbs = self.get_all_thumbs()
return False
pending = queue.PendingList()
pending.get_download()
pending.get_indexed()
def get_fallback(self):
"""get fallback thumbnail if not available"""
if self.fallback:
img_raw = Image.open(self.fallback)
return img_raw
needed_thumbs = []
for video in pending.all_videos:
youtube_id = video["youtube_id"]
thumb_url = video["vid_thumb_url"]
if missing_only:
if youtube_id + ".jpg" not in all_thumbs:
needed_thumbs.append((youtube_id, thumb_url))
else:
needed_thumbs.append((youtube_id, thumb_url))
for video in pending.all_pending + pending.all_ignored:
youtube_id = video["youtube_id"]
thumb_url = video["vid_thumb_url"]
if missing_only:
if youtube_id + ".jpg" not in all_thumbs:
needed_thumbs.append((youtube_id, thumb_url))
else:
needed_thumbs.append((youtube_id, thumb_url))
return needed_thumbs
def get_missing_channels(self):
"""get all channel artwork"""
all_channel_art = os.listdir(self.CHANNEL_DIR)
files = [i[0:24] for i in all_channel_art]
cached_channel_ids = [k for (k, v) in Counter(files).items() if v > 1]
channel_sub = subscriptions.ChannelSubscription()
channels = channel_sub.get_channels(subscribed_only=False)
missing_channels = []
for channel in channels:
channel_id = channel["channel_id"]
if channel_id not in cached_channel_ids:
channel_banner = channel["channel_banner_url"]
channel_thumb = channel["channel_thumb_url"]
missing_channels.append(
(channel_id, channel_thumb, channel_banner)
)
return missing_channels
def get_missing_playlists(self):
"""get all missing playlist artwork"""
all_downloaded = ignore_filelist(os.listdir(self.PLAYLIST_DIR))
all_ids_downloaded = [i.replace(".jpg", "") for i in all_downloaded]
playlist_sub = subscriptions.PlaylistSubscription()
playlists = playlist_sub.get_playlists(subscribed_only=False)
missing_playlists = []
for playlist in playlists:
playlist_id = playlist["playlist_id"]
if playlist_id not in all_ids_downloaded:
playlist_thumb = playlist["playlist_thumbnail"]
missing_playlists.append((playlist_id, playlist_thumb))
return missing_playlists
def get_raw_img(self, img_url, thumb_type):
"""get raw image from youtube and handle 404"""
try:
app_root = self.CONFIG["application"]["app_root"]
except KeyError:
# lazy keyerror fix to not have to deal with a strange startup
# racing contition between the threads in HomeConfig.ready()
app_root = "/app"
app_root = self.CONFIG["application"]["app_root"]
default_map = {
"video": os.path.join(
app_root, "static/img/default-video-thumb.jpg"
),
"playlist": os.path.join(
app_root, "static/img/default-video-thumb.jpg"
),
"icon": os.path.join(
app_root, "static/img/default-channel-icon.jpg"
),
@ -143,116 +73,134 @@ class ThumbManager:
app_root, "static/img/default-channel-banner.jpg"
),
}
if img_url:
try:
response = requests.get(img_url, stream=True)
except ConnectionError:
sleep(5)
response = requests.get(img_url, stream=True)
if not response.ok and not response.status_code == 404:
print("retry thumbnail download for " + img_url)
sleep(5)
response = requests.get(img_url, stream=True)
else:
response = False
if not response or response.status_code == 404:
# use default
img_raw = Image.open(default_map[thumb_type])
else:
# use response
img_obj = response.raw
img_raw = Image.open(img_obj)
img_raw = Image.open(default_map[self.item_type])
return img_raw
def download_vid(self, missing_thumbs, notify=True):
"""download all missing thumbnails from list"""
print(f"downloading {len(missing_thumbs)} thumbnails")
for idx, (youtube_id, thumb_url) in enumerate(missing_thumbs):
folder_path = os.path.join(self.VIDEO_DIR, youtube_id[0].lower())
thumb_path = os.path.join(
self.CACHE_DIR, self.vid_thumb_path(youtube_id)
)
os.makedirs(folder_path, exist_ok=True)
img_raw = self.get_raw_img(thumb_url, "video")
class ThumbManager(ThumbManagerBase):
"""handle thumbnails related functions"""
width, height = img_raw.size
if not width / height == 16 / 9:
new_height = width / 16 * 9
offset = (height - new_height) / 2
img_raw = img_raw.crop((0, offset, width, height - offset))
img_raw.convert("RGB").save(thumb_path)
def __init__(self, item_id, item_type="video", fallback=False):
super().__init__(item_id, item_type, fallback=fallback)
progress = f"{idx + 1}/{len(missing_thumbs)}"
if notify:
mess_dict = {
"status": "message:add",
"level": "info",
"title": "Processing Videos",
"message": "Downloading Thumbnails, Progress: " + progress,
}
if idx + 1 == len(missing_thumbs):
expire = 4
else:
expire = True
def download(self, url):
"""download thumbnail"""
print(f"{self.item_id}: download {self.item_type} thumbnail")
if self.item_type == "video":
self.download_video_thumb(url)
elif self.item_type == "channel":
self.download_channel_art(url)
elif self.item_type == "playlist":
self.download_playlist_thumb(url)
RedisArchivist().set_message(
"message:add", mess_dict, expire=expire
)
def delete(self):
"""delete thumbnail file"""
print(f"{self.item_id}: delete {self.item_type} thumbnail")
if self.item_type == "video":
self.delete_video_thumb()
elif self.item_type == "channel":
self.delete_channel_thumb()
elif self.item_type == "playlist":
self.delete_playlist_thumb()
if idx + 1 % 25 == 0:
print("thumbnail progress: " + progress)
def download_video_thumb(self, url, skip_existing=False):
"""pass url for video thumbnail"""
folder_path = os.path.join(self.VIDEO_DIR, self.item_id[0].lower())
thumb_path = self.vid_thumb_path(absolute=True)
def download_chan(self, missing_channels):
"""download needed artwork for channels"""
print(f"downloading {len(missing_channels)} channel artwork")
for channel in missing_channels:
channel_id, channel_thumb, channel_banner = channel
if skip_existing and os.path.exists(thumb_path):
return
thumb_path = os.path.join(
self.CHANNEL_DIR, channel_id + "_thumb.jpg"
)
img_raw = self.get_raw_img(channel_thumb, "icon")
img_raw.convert("RGB").save(thumb_path)
os.makedirs(folder_path, exist_ok=True)
img_raw = self.download_raw(url)
width, height = img_raw.size
banner_path = os.path.join(
self.CHANNEL_DIR, channel_id + "_banner.jpg"
)
img_raw = self.get_raw_img(channel_banner, "banner")
img_raw.convert("RGB").save(banner_path)
if not width / height == 16 / 9:
new_height = width / 16 * 9
offset = (height - new_height) / 2
img_raw = img_raw.crop((0, offset, width, height - offset))
mess_dict = {
"status": "message:download",
"level": "info",
"title": "Processing Channels",
"message": "Downloading Channel Art.",
}
key = "message:download"
RedisArchivist().set_message(key, mess_dict, expire=True)
img_raw.convert("RGB").save(thumb_path)
def download_playlist(self, missing_playlists):
"""download needed artwork for playlists"""
print(f"downloading {len(missing_playlists)} playlist artwork")
for playlist in missing_playlists:
playlist_id, playlist_thumb_url = playlist
thumb_path = os.path.join(self.PLAYLIST_DIR, playlist_id + ".jpg")
img_raw = self.get_raw_img(playlist_thumb_url, "video")
img_raw.convert("RGB").save(thumb_path)
def vid_thumb_path(self, absolute=False):
"""build expected path for video thumbnail from youtube_id"""
folder_name = self.item_id[0].lower()
folder_path = os.path.join("videos", folder_name)
thumb_path = os.path.join(folder_path, f"{self.item_id}.jpg")
if absolute:
thumb_path = os.path.join(self.CACHE_DIR, thumb_path)
mess_dict = {
"status": "message:download",
"level": "info",
"title": "Processing Playlists",
"message": "Downloading Playlist Art.",
}
key = "message:download"
RedisArchivist().set_message(key, mess_dict, expire=True)
return thumb_path
def get_base64_blur(self, youtube_id):
def download_channel_art(self, urls, skip_existing=False):
    """download icon and banner of a channel

    urls: two item sequence, thumbnail url first, banner url second
    skip_existing: handed on unchanged to both download steps
    """
    icon_url, banner_url = urls
    steps = (
        (self._download_channel_thumb, icon_url),
        (self._download_channel_banner, banner_url),
    )
    for step, url in steps:
        step(url, skip_existing)
def _download_channel_thumb(self, channel_thumb, skip_existing):
    """download channel thumbnail to <CHANNEL_DIR>/<item_id>_thumb.jpg

    channel_thumb: url of the thumbnail, passed to download_raw
    skip_existing: if true, leave a file that is already on disk alone
    """
    thumb_path = os.path.join(
        self.CHANNEL_DIR, f"{self.item_id}_thumb.jpg"
    )
    # set before downloading, the fallback image is looked up by item_type
    self.item_type = "icon"
    if skip_existing and os.path.exists(thumb_path):
        return

    img_raw = self.download_raw(channel_thumb)
    if not img_raw:
        # download_raw returns False once all retries are used up
        print(f"{self.item_id}: failed to download channel thumbnail")
        return

    img_raw.convert("RGB").save(thumb_path)
def _download_channel_banner(self, channel_banner, skip_existing):
    """download channel banner to <CHANNEL_DIR>/<item_id>_banner.jpg

    channel_banner: url of the banner, passed to download_raw
    skip_existing: if true, leave a file that is already on disk alone
    """
    banner_path = os.path.join(
        self.CHANNEL_DIR, f"{self.item_id}_banner.jpg"
    )
    # set before downloading, the fallback image is looked up by item_type
    self.item_type = "banner"
    if skip_existing and os.path.exists(banner_path):
        return

    img_raw = self.download_raw(channel_banner)
    if not img_raw:
        # download_raw returns False once all retries are used up
        print(f"{self.item_id}: failed to download channel banner")
        return

    img_raw.convert("RGB").save(banner_path)
def download_playlist_thumb(self, url, skip_existing=False):
    """download playlist thumbnail to <PLAYLIST_DIR>/<item_id>.jpg

    url: thumbnail url, passed to download_raw
    skip_existing: if true, leave a file that is already on disk alone
    """
    thumb_path = os.path.join(self.PLAYLIST_DIR, f"{self.item_id}.jpg")
    if skip_existing and os.path.exists(thumb_path):
        return

    img_raw = self.download_raw(url)
    if not img_raw:
        # download_raw returns False once all retries are used up
        print(f"{self.item_id}: failed to download playlist thumbnail")
        return

    img_raw.convert("RGB").save(thumb_path)
def delete_video_thumb(self):
    """remove the cached thumbnail of this video, nothing to do if missing"""
    target = os.path.join(self.CACHE_DIR, self.vid_thumb_path())
    if not os.path.exists(target):
        return

    os.remove(target)
def delete_channel_thumb(self):
    """remove thumbnail and banner of this channel, skip what is missing"""
    for suffix in ("_thumb.jpg", "_banner.jpg"):
        art_path = os.path.join(self.CHANNEL_DIR, f"{self.item_id}{suffix}")
        if os.path.exists(art_path):
            os.remove(art_path)
def delete_playlist_thumb(self):
    """remove the thumbnail of this playlist, nothing to do if missing"""
    target = os.path.join(self.PLAYLIST_DIR, f"{self.item_id}.jpg")
    if not os.path.exists(target):
        return

    os.remove(target)
def get_vid_base64_blur(self):
"""return base64 encoded placeholder"""
img_path = self.vid_thumb_path(youtube_id)
file_path = os.path.join(self.CACHE_DIR, img_path)
file_path = os.path.join(self.CACHE_DIR, self.vid_thumb_path())
img_raw = Image.open(file_path)
img_raw.thumbnail((img_raw.width // 20, img_raw.height // 20))
img_blur = img_raw.filter(ImageFilter.BLUR)
@ -264,40 +212,109 @@ class ThumbManager:
return data_url
@staticmethod
def vid_thumb_path(youtube_id):
"""build expected path for video thumbnail from youtube_id"""
folder_name = youtube_id[0].lower()
folder_path = os.path.join("videos", folder_name)
thumb_path = os.path.join(folder_path, youtube_id + ".jpg")
return thumb_path
def delete_vid_thumb(self, youtube_id):
"""delete video thumbnail if exists"""
thumb_path = self.vid_thumb_path(youtube_id)
to_delete = os.path.join(self.CACHE_DIR, thumb_path)
if os.path.exists(to_delete):
os.remove(to_delete)
class ValidatorCallback:
"""handle callback validate thumbnails page by page"""
def delete_chan_thumb(self, channel_id):
"""delete all artwork of channel"""
thumb = os.path.join(self.CHANNEL_DIR, channel_id + "_thumb.jpg")
banner = os.path.join(self.CHANNEL_DIR, channel_id + "_banner.jpg")
if os.path.exists(thumb):
os.remove(thumb)
if os.path.exists(banner):
os.remove(banner)
def __init__(self, source, index_name):
self.source = source
self.index_name = index_name
def cleanup_downloaded(self):
"""find downloaded thumbnails without video indexed"""
all_thumbs = self.get_all_thumbs()
all_indexed = self.get_needed_thumbs()
all_needed_thumbs = [i[0] + ".jpg" for i in all_indexed]
for thumb in all_thumbs:
if thumb not in all_needed_thumbs:
# cleanup
youtube_id = thumb.rstrip(".jpg")
self.delete_vid_thumb(youtube_id)
def run(self):
    """validate the artwork of one page, picked by index name

    an index name other than ta_video, ta_channel or ta_playlist
    only prints the log line
    """
    print(f"{self.index_name}: validate artwork")
    validators = {
        "ta_video": "_validate_videos",
        "ta_channel": "_validate_channels",
        "ta_playlist": "_validate_playlists",
    }
    method_name = validators.get(self.index_name)
    if method_name is not None:
        getattr(self, method_name)()
def _validate_videos(self):
    """download the thumbnail of every video on this page if missing"""
    for hit in self.source:
        document = hit["_source"]
        thumb_url = document["vid_thumb_url"]
        # skip_existing keeps thumbnails that are already on disk
        ThumbManager(document["youtube_id"]).download_video_thumb(
            thumb_url, skip_existing=True
        )
def _validate_channels(self):
    """download thumbnail and banner of every channel on this page

    files that are already on disk are kept, see skip_existing
    """
    for channel in self.source:
        document = channel["_source"]
        urls = (
            document["channel_thumb_url"],
            document["channel_banner_url"],
        )
        # state the item type, the default of ThumbManager is "video"
        handler = ThumbManager(document["channel_id"], item_type="channel")
        handler.download_channel_art(urls, skip_existing=True)
def _validate_playlists(self):
    """download the thumbnail of every playlist on this page

    files that are already on disk are kept, see skip_existing
    """
    for playlist in self.source:
        document = playlist["_source"]
        url = document["playlist_thumbnail"]
        # state the item type, the default of ThumbManager is "video"
        handler = ThumbManager(document["playlist_id"], item_type="playlist")
        handler.download_playlist_thumb(url, skip_existing=True)
class ThumbValidator:
    """make sure artwork of all active items is on disk"""

    def download_missing(self):
        """run the three passes in order: videos, channels, playlists"""
        passes = (
            self.download_missing_videos,
            self.download_missing_channels,
            self.download_missing_playlists,
        )
        for validate in passes:
            validate()

    def download_missing_videos(self):
        """paginate over active videos, 5000 per page"""
        query = {
            "query": {"term": {"active": {"value": True}}},
            "sort": [{"youtube_id": {"order": "asc"}}],
            "_source": ["vid_thumb_url", "youtube_id"],
        }
        self._paginate("ta_video", query, size=5000)

    def download_missing_channels(self):
        """paginate over active channels, default page size"""
        query = {
            "query": {"term": {"channel_active": {"value": True}}},
            "sort": [{"channel_id": {"order": "asc"}}],
            "_source": {
                "excludes": ["channel_description", "channel_overwrites"]
            },
        }
        self._paginate("ta_channel", query)

    def download_missing_playlists(self):
        """paginate over active playlists, default page size"""
        query = {
            "query": {"term": {"playlist_active": {"value": True}}},
            "sort": [{"playlist_id": {"order": "asc"}}],
            "_source": ["playlist_id", "playlist_thumbnail"],
        }
        self._paginate("ta_playlist", query)

    @staticmethod
    def _paginate(index_name, data, **kwargs):
        """run the query with ValidatorCallback, the result is discarded"""
        # presumably IndexPaginate calls the callback for every page of
        # results, confirm against home.src.es.connect
        paginate = IndexPaginate(
            index_name, data, callback=ValidatorCallback, **kwargs
        )
        _ = paginate.get_results()
class ThumbFilesystem:
"""filesystem tasks for thumbnails"""
CONFIG = AppConfig().config
CACHE_DIR = CONFIG["application"]["cache_dir"]
MEDIA_DIR = CONFIG["application"]["videos"]
VIDEO_DIR = os.path.join(CACHE_DIR, "videos")
def sync(self):
"""embed thumbnails to mediafiles"""
video_list = self.get_thumb_list()
self._embed_thumbs(video_list)
def get_thumb_list(self):
"""get list of mediafiles and matching thumbnails"""
@ -307,10 +324,10 @@ class ThumbManager:
video_list = []
for video in pending.all_videos:
youtube_id = video["youtube_id"]
video_id = video["youtube_id"]
media_url = os.path.join(self.MEDIA_DIR, video["media_url"])
thumb_path = os.path.join(
self.CACHE_DIR, self.vid_thumb_path(youtube_id)
self.CACHE_DIR, ThumbManager(video_id).vid_thumb_path()
)
video_list.append(
{
@ -322,7 +339,7 @@ class ThumbManager:
return video_list
@staticmethod
def write_all_thumbs(video_list):
def _embed_thumbs(video_list):
"""rewrite the thumbnail into media file"""
counter = 1
@ -340,15 +357,3 @@ class ThumbManager:
if counter % 50 == 0:
print(f"thumbnail write progress {counter}/{len(video_list)}")
counter = counter + 1
def validate_thumbnails():
"""check if all thumbnails are there and organized correctly"""
handler = ThumbManager()
thumbs_to_download = handler.get_needed_thumbs(missing_only=True)
handler.download_vid(thumbs_to_download)
missing_channels = handler.get_missing_channels()
handler.download_chan(missing_channels)
missing_playlists = handler.get_missing_playlists()
handler.download_playlist(missing_playlists)
handler.cleanup_downloaded()

View File

@ -119,7 +119,7 @@ class SearchHandler:
if "vid_thumb_url" in hit_keys:
youtube_id = hit["source"]["youtube_id"]
thumb_path = ThumbManager().vid_thumb_path(youtube_id)
thumb_path = ThumbManager(youtube_id).vid_thumb_path()
hit["source"]["vid_thumb_url"] = thumb_path
if "channel_last_refresh" in hit_keys:
@ -138,7 +138,7 @@ class SearchHandler:
if "subtitle_fragment_id" in hit_keys:
youtube_id = hit["source"]["youtube_id"]
thumb_path = ThumbManager().vid_thumb_path(youtube_id)
thumb_path = ThumbManager(youtube_id).vid_thumb_path()
hit["source"]["vid_thumb_url"] = f"/cache/{thumb_path}"
return hit

View File

@ -173,30 +173,71 @@ class YoutubeChannel(YouTubeItem):
self.es_path = f"{self.index_name}/_doc/{youtube_id}"
self.all_playlists = False
def build_json(self, upload=False):
def build_json(self, upload=False, fallback=False):
"""get from es or from youtube"""
self.get_from_es()
if self.json_data:
return
self.get_from_youtube()
self.get_from_youtube(fallback)
if upload:
self.upload_to_es()
return
def get_from_youtube(self):
def get_from_youtube(self, fallback=False):
"""use bs4 to scrape channel about page"""
self.json_data = ChannelScraper(self.youtube_id).get_json()
if not self.json_data and fallback:
self._video_fallback(fallback)
self.get_channel_art()
def _video_fallback(self, fallback):
    """build minimal channel metadata from the metadata of a video

    fallback: dict of video metadata, the key "uploader" is required,
    "channel_follower_count" is optional and defaults to 0
    """
    print(f"{self.youtube_id}: fallback to video metadata")
    refreshed = int(datetime.now().strftime("%s"))
    follower_count = fallback.get("channel_follower_count", 0)
    channel_data = {
        "channel_active": False,
        "channel_last_refresh": refreshed,
        "channel_subs": follower_count,
        "channel_name": fallback["uploader"],
        "channel_banner_url": False,
        "channel_tvart_url": False,
        "channel_id": self.youtube_id,
        "channel_subscribed": False,
        "channel_description": False,
        "channel_thumb_url": False,
        "channel_views": 0,
    }
    self.json_data = channel_data
    # an info.json in the import folder may add to these values
    self._info_json_fallback()
def _info_json_fallback(self):
    """read channel info.json for additional metadata

    looks for <cache_dir>/import/<youtube_id>.info.json, does nothing
    if the file is missing. keys that are not in the file leave the
    values already in json_data untouched.
    """
    info_json = os.path.join(
        self.config["application"]["cache_dir"],
        "import",
        f"{self.youtube_id}.info.json",
    )
    if not os.path.exists(info_json):
        return

    print(f"{self.youtube_id}: read info.json file")
    with open(info_json, "r", encoding="utf-8") as f:
        content = json.load(f)

    # json_data key -> info.json key
    key_map = (
        ("channel_subs", "channel_follower_count"),
        ("channel_description", "description"),
    )
    for target_key, source_key in key_map:
        # a partial info.json must not abort the import with a KeyError
        if source_key in content:
            self.json_data[target_key] = content[source_key]
def get_channel_art(self):
"""download channel art for new channels"""
channel_id = self.youtube_id
channel_thumb = self.json_data["channel_thumb_url"]
channel_banner = self.json_data["channel_banner_url"]
ThumbManager().download_chan(
[(channel_id, channel_thumb, channel_banner)]
urls = (
self.json_data["channel_thumb_url"],
self.json_data["channel_banner_url"],
)
ThumbManager(self.youtube_id, item_type="channel").download(urls)
def sync_to_videos(self):
"""sync new channel_dict to all videos of channel"""

View File

@ -12,13 +12,16 @@ import shutil
import subprocess
from home.src.download.queue import PendingList
from home.src.download.yt_dlp_handler import VideoDownloader
from home.src.download.thumbnails import ThumbManager
from home.src.es.connect import ElasticWrap
from home.src.index.reindex import Reindex
from home.src.index.video import index_new_video
from home.src.index.video import YoutubeVideo, index_new_video
from home.src.ta.config import AppConfig
from home.src.ta.helper import clean_string, ignore_filelist
from home.src.ta.ta_redis import RedisArchivist
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
class FilesystemScanner:
@ -157,63 +160,140 @@ class FilesystemScanner:
_, _ = ElasticWrap(path).delete()
class ManualImport:
"""import and indexing existing video files"""
class ImportFolderScanner:
"""import and indexing existing video files
- identify all media files belonging to a video
- identify youtube id
- convert if needed
"""
CONFIG = AppConfig().config
CACHE_DIR = CONFIG["application"]["cache_dir"]
IMPORT_DIR = os.path.join(CACHE_DIR, "import")
EXT_MAP = {
"media": [".mp4", ".mkv", ".webm"],
"metadata": [".json"],
"thumb": [".jpg", ".png", ".webp"],
"subtitle": [".vtt"],
}
def __init__(self):
self.identified = self.import_folder_parser()
self.to_import = False
def import_folder_parser(self):
"""detect files in import folder"""
import_files = os.listdir(self.IMPORT_DIR)
to_import = ignore_filelist(import_files)
to_import.sort()
video_files = [i for i in to_import if not i.endswith(".json")]
def scan(self):
"""scan and match media files"""
all_files = self.get_all_files()
self.match_files(all_files)
self.process_videos()
identified = []
return self.to_import
for file_path in video_files:
def get_all_files(self):
"""get all files in /import"""
rel_paths = ignore_filelist(os.listdir(self.IMPORT_DIR))
all_files = [os.path.join(self.IMPORT_DIR, i) for i in rel_paths]
all_files.sort()
file_dict = {"video_file": file_path}
file_name, _ = os.path.splitext(file_path)
matching_json = [
i
for i in to_import
if i.startswith(file_name) and i.endswith(".json")
]
if matching_json:
json_file = matching_json[0]
youtube_id = self.extract_id_from_json(json_file)
file_dict.update({"json_file": json_file})
else:
youtube_id = self.extract_id_from_filename(file_name)
file_dict.update({"json_file": False})
file_dict.update({"youtube_id": youtube_id})
identified.append(file_dict)
return identified
return all_files
@staticmethod
def extract_id_from_filename(file_name):
def _get_template():
"""base dict for video"""
return {
"media": False,
"video_id": False,
"metadata": False,
"thumb": False,
"subtitle": [],
}
def match_files(self, all_files):
    """group files into one dict per video, stored in self.to_import

    all_files: list of file paths, files of the same video have to
    follow each other, a change of the base name starts a new video
    """
    self.to_import = []

    video = self._get_template()
    previous_base = False

    for path in all_files:
        stem, ext = os.path.splitext(path)
        # strip a second suffix as well, presumably so that names like
        # <name>.info.json or <name>.en.vtt share the base of <name>.mp4
        base = os.path.splitext(stem)[0]

        key, path = self._detect_type(path, ext)
        if not (key and path):
            # extension is not in EXT_MAP
            continue

        if base != previous_base:
            if previous_base:
                self.to_import.append(video)

            video = self._get_template()
            previous_base = base

        if key == "subtitle":
            video["subtitle"].append(path)
            continue

        video[key] = path

    # the last group is only kept if it has a media file
    if video.get("media"):
        self.to_import.append(video)
def _detect_type(self, file_path, ext):
    """look up the EXT_MAP key for a file extension

    returns (key, file_path) for a known extension,
    (False, False) for everything else
    """
    matching_keys = (
        key for key, extensions in self.EXT_MAP.items() if ext in extensions
    )
    key = next(matching_keys, None)
    if key is None:
        return False, False

    return key, file_path
def process_videos(self):
    """prepare and import every video in self.to_import

    raises ValueError on the first entry without a media file
    """
    for video in self.to_import:
        if not video["media"]:
            print(f"{video}: no matching media file found.")
            raise ValueError

        prepare_steps = (
            self._detect_youtube_id,
            self._dump_thumb,
            self._convert_thumb,
            self._convert_video,
        )
        for step in prepare_steps:
            step(video)

        ManualImport(video, self.CONFIG).run()
def _detect_youtube_id(self, current_video):
    """find video id from filename or json

    the file name is tried first, the metadata file only if the name
    has no id and a metadata file was matched. the id is stored in
    current_video["video_id"].

    raises ValueError if no id is found
    """
    youtube_id = self._extract_id_from_filename(current_video["media"])

    metadata_file = current_video["metadata"]
    if not youtube_id and metadata_file:
        # metadata is False without a json file, passing that on would
        # fail while building the path instead of reporting a missing id
        youtube_id = self._extract_id_from_json(metadata_file)

    if not youtube_id:
        print(current_video["media"])
        raise ValueError("failed to find video id")

    current_video["video_id"] = youtube_id
@staticmethod
def _extract_id_from_filename(file_name):
"""
look at the file name for the youtube id
expects filename ending in [<youtube_id>].<ext>
"""
id_search = re.search(r"\[([a-zA-Z0-9_-]{11})\]$", file_name)
base_name, _ = os.path.splitext(file_name)
id_search = re.search(r"\[([a-zA-Z0-9_-]{11})\]$", base_name)
if id_search:
youtube_id = id_search.group(1)
return youtube_id
print("failed to extract youtube id for: " + file_name)
raise Exception
print(f"id extraction failed from filename: {file_name}")
def extract_id_from_json(self, json_file):
return False
def _extract_id_from_json(self, json_file):
"""open json file and extract id"""
json_path = os.path.join(self.CACHE_DIR, "import", json_file)
with open(json_path, "r", encoding="utf-8") as f:
@ -223,66 +303,239 @@ class ManualImport:
return youtube_id
def process_import(self):
"""go through identified media files"""
def _dump_thumb(self, current_video):
"""extract embedded thumb before converting"""
if current_video["thumb"]:
return
all_videos_added = []
media_path = current_video["media"]
_, ext = os.path.splitext(media_path)
for media_file in self.identified:
json_file = media_file["json_file"]
video_file = media_file["video_file"]
youtube_id = media_file["youtube_id"]
new_path = False
if ext == ".mkv":
idx, thumb_type = self._get_mkv_thumb_stream(media_path)
if idx:
new_path = self.dump_mpv_thumb(media_path, idx, thumb_type)
video_path = os.path.join(self.CACHE_DIR, "import", video_file)
elif ext == ".mp4":
thumb_type = self.get_mp4_thumb_type(media_path)
if thumb_type:
new_path = self.dump_mp4_thumb(media_path, thumb_type)
self.move_to_cache(video_path, youtube_id)
if new_path:
current_video["thumb"] = new_path
# identify and archive
vid_dict = index_new_video(youtube_id)
VideoDownloader([youtube_id]).move_to_archive(vid_dict)
youtube_id = vid_dict["youtube_id"]
thumb_url = vid_dict["vid_thumb_url"]
all_videos_added.append((youtube_id, thumb_url))
def _get_mkv_thumb_stream(self, media_path):
"""get stream idx of thumbnail for mkv files"""
streams = self._get_streams(media_path)
attachments = [
i for i in streams["streams"] if i["codec_type"] == "attachment"
]
# cleanup
if os.path.exists(video_path):
os.remove(video_path)
if json_file:
json_path = os.path.join(self.CACHE_DIR, "import", json_file)
os.remove(json_path)
for idx, stream in enumerate(attachments):
tags = stream["tags"]
if "mimetype" in tags and tags["filename"].startswith("cover"):
_, ext = os.path.splitext(tags["filename"])
return idx, ext
return all_videos_added
return False, False
def move_to_cache(self, video_path, youtube_id):
"""move identified video file to cache, convert to mp4"""
file_name = os.path.split(video_path)[-1]
video_file, ext = os.path.splitext(file_name)
@staticmethod
def dump_mpv_thumb(media_path, idx, thumb_type):
    """write cover to disk for mkv

    media_path: path of the mkv file holding the attachment
    idx: index of the cover among the attachment streams
    thumb_type: extension for the cover file, including the leading dot
    returns the path the cover is dumped to, media_path with its
    extension replaced by thumb_type
    """
    # splitext removes only the extension; str.rstrip(ext) would strip
    # every trailing character contained in ext and mangle the base name
    base_path, _ = os.path.splitext(media_path)
    new_path = f"{base_path}{thumb_type}"
    # check=False: a non-zero ffmpeg exit status does not raise here
    subprocess.run(
        [
            "ffmpeg",
            "-v",
            "quiet",
            f"-dump_attachment:t:{idx}",
            new_path,
            "-i",
            media_path,
        ],
        check=False,
    )

    return new_path
# move, convert if needed
def get_mp4_thumb_type(self, media_path):
    """detect filetype of embedded thumbnail

    Returns the file extension (without dot) to use for the embedded
    cover of media_path, or False when no matching stream is found.
    """
    streams = self._get_streams(media_path)

    for stream in streams["streams"]:
        # not every stream reports a codec_name, don't raise on those
        codec_name = stream.get("codec_name")
        if codec_name in ["png", "jpg"]:
            return codec_name

        # ffprobe reports jpeg covers as "mjpeg"; require the
        # attached_pic disposition so a regular mjpeg video track is
        # not mistaken for a cover
        is_cover = stream.get("disposition", {}).get("attached_pic")
        if codec_name == "mjpeg" and is_cover:
            return "jpg"

    return False
def _convert_thumb(self, current_video):
    """convert all thumbnails to jpg

    Does nothing when current_video["thumb"] is not set or already has
    the .jpg extension. Otherwise writes an RGB jpg next to the
    original, removes the original file and points
    current_video["thumb"] to the new file.
    """
    thumb_path = current_video["thumb"]
    if not thumb_path:
        return

    base_path, ext = os.path.splitext(thumb_path)
    if ext == ".jpg":
        return

    new_path = f"{base_path}.jpg"
    # context manager closes the source file handle before it is removed
    with Image.open(thumb_path) as img_raw:
        img_raw.convert("RGB").save(new_path)

    os.remove(thumb_path)
    current_video["thumb"] = new_path
@staticmethod
def _get_streams(media_path):
    """probe media_path with ffprobe and return the parsed json report"""
    command = [
        "ffprobe",
        "-v",
        "error",
        "-show_streams",
        "-print_format",
        "json",
        media_path,
    ]
    # check=True raises CalledProcessError on a non-zero exit status
    probe = subprocess.run(command, capture_output=True, check=True)

    return json.loads(probe.stdout.decode())
@staticmethod
def dump_mp4_thumb(media_path, thumb_type):
    """save cover to disk

    media_path: path of the mp4 file holding the embedded cover
    thumb_type: extension for the cover file, without the leading dot
    returns the path the cover is written to, media_path with its
    extension replaced by thumb_type
    raises subprocess.CalledProcessError when ffmpeg exits non-zero
    """
    # splitext removes only the extension; str.rstrip(ext) would strip
    # every trailing character contained in ext and mangle the base name
    base_path, _ = os.path.splitext(media_path)
    new_path = f"{base_path}.{thumb_type}"
    subprocess.run(
        [
            "ffmpeg",
            "-i",
            media_path,
            "-map",
            "0:v",
            "-map",
            "-0:V",
            "-c",
            "copy",
            new_path,
        ],
        check=True,
    )

    return new_path
def _convert_video(self, current_video):
    """transcode the media file to mp4 unless it already has that extension

    After a successful ffmpeg run current_video["media"] points to the
    new mp4 file and the source file is removed.
    """
    source_path = current_video["media"]
    stem, suffix = os.path.splitext(source_path)
    if suffix == ".mp4":
        return

    target_path = stem + ".mp4"
    command = [
        "ffmpeg",
        "-i",
        source_path,
        target_path,
        "-loglevel",
        "warning",
        "-stats",
    ]
    # check=True: a failed conversion raises before the source is removed
    subprocess.run(command, check=True)
    current_video["media"] = target_path
    os.remove(source_path)
class ManualImport:
    """import single identified video

    current_video: dict describing the files of one video, the keys read
        here are "video_id", "media", "metadata", "thumb" and "subtitle"
    config: application config, read for application.videos and
        application.cache_dir
    """

    def __init__(self, current_video, config):
        self.current_video = current_video
        self.config = config

    def run(self):
        """run all"""
        json_data = self.index_metadata()
        self._move_to_archive(json_data)
        self._cleanup(json_data)

    def index_metadata(self):
        """get metadata from yt or json

        Indexes the video and puts its thumbnail in place, returns the
        json_data of the indexed video.
        raises ValueError when no metadata could be built
        """
        video_id = self.current_video["video_id"]
        video = YoutubeVideo(video_id)
        video.build_json(
            youtube_meta_overwrite=self._get_info_json(),
            media_path=self.current_video["media"],
        )
        if not video.json_data:
            message = f"{video_id}: manual import failed, and no metadata found."
            print(message)
            raise ValueError(message)

        video.check_subtitles()
        video.upload_to_es()

        if video.offline_import and self.current_video["thumb"]:
            # offline import with a local thumbnail: move it into place
            old_path = self.current_video["thumb"]
            new_path = ThumbManager(video_id).vid_thumb_path(absolute=True)
            shutil.move(old_path, new_path, copy_function=shutil.copyfile)
        else:
            url = video.json_data["vid_thumb_url"]
            ThumbManager(video_id).download_video_thumb(url)

        return video.json_data

    def _get_info_json(self):
        """read info_json from file, False when no metadata file is set"""
        if not self.current_video["metadata"]:
            return False

        with open(self.current_video["metadata"], "r", encoding="utf-8") as f:
            info_json = json.load(f)

        return info_json

    def _move_to_archive(self, json_data):
        """move identified media file to archive"""
        videos = self.config["application"]["videos"]

        channel, file = os.path.split(json_data["media_url"])
        channel_folder = os.path.join(videos, channel)
        os.makedirs(channel_folder, exist_ok=True)

        old_path = self.current_video["media"]
        new_path = os.path.join(channel_folder, file)
        shutil.move(old_path, new_path, copy_function=shutil.copyfile)

    def _cleanup(self, json_data):
        """cleanup leftover files"""
        channel_info = os.path.join(
            self.config["application"]["cache_dir"],
            "import",
            f"{json_data['channel']['channel_id']}.info.json",
        )
        leftovers = [
            self.current_video["metadata"],
            self.current_video["thumb"],
            *self.current_video["subtitle"],
            channel_info,
        ]
        for leftover in leftovers:
            # metadata and thumb are falsy when no such file was found;
            # os.path.exists(False) would test file descriptor 0 and the
            # following os.remove(False) would raise TypeError
            if leftover and os.path.exists(leftover):
                os.remove(leftover)
def scan_filesystem():

View File

@ -41,7 +41,6 @@ class YoutubePlaylist(YouTubeItem):
self.process_youtube_meta()
self.get_entries()
self.json_data["playlist_entries"] = self.all_members
self.get_playlist_art()
self.json_data["playlist_subscribed"] = subscribed
def process_youtube_meta(self):
@ -81,12 +80,10 @@ class YoutubePlaylist(YouTubeItem):
self.all_members = all_members
@staticmethod
def get_playlist_art():
def get_playlist_art(self):
    """fetch the playlist thumbnail through the thumbnail manager"""
    thumb_url = self.json_data["playlist_thumbnail"]
    manager = ThumbManager(self.youtube_id, item_type="playlist")
    manager.download(thumb_url)
def add_vids_to_playlist(self):
"""sync the playlist id to videos"""
@ -145,17 +142,15 @@ class YoutubePlaylist(YouTubeItem):
previous_item = False
else:
previous_item = all_entries[current_idx - 1]
prev_thumb = ThumbManager().vid_thumb_path(
previous_item["youtube_id"]
)
previous_item["vid_thumb"] = prev_thumb
prev_id = previous_item["youtube_id"]
previous_item["vid_thumb"] = ThumbManager(prev_id).vid_thumb_path()
if current_idx == len(all_entries) - 1:
next_item = False
else:
next_item = all_entries[current_idx + 1]
next_thumb = ThumbManager().vid_thumb_path(next_item["youtube_id"])
next_item["vid_thumb"] = next_thumb
next_id = next_item["youtube_id"]
next_item["vid_thumb"] = ThumbManager(next_id).vid_thumb_path()
self.nav = {
"playlist_meta": {

View File

@ -181,10 +181,10 @@ class Reindex:
video.upload_to_es()
thumb_handler = ThumbManager()
thumb_handler.delete_vid_thumb(youtube_id)
to_download = (youtube_id, video.json_data["vid_thumb_url"])
thumb_handler.download_vid([to_download], notify=False)
thumb_handler = ThumbManager(youtube_id)
thumb_handler.delete_video_thumb()
thumb_handler.download_video_thumb(video.json_data["vid_thumb_url"])
return
@staticmethod