tubearchivist/tubearchivist/home/src/download/thumbnails.py

411 lines
13 KiB
Python
Raw Normal View History

"""
functionality:
- handle download and caching for thumbnails
- check for missing thumbnails
"""
2022-04-07 17:29:09 +00:00
import base64
import os
2022-04-07 17:29:09 +00:00
from io import BytesIO
from time import sleep
import requests
2023-03-16 10:12:50 +00:00
from home.src.es.connect import ElasticWrap, IndexPaginate
2022-01-22 15:13:37 +00:00
from home.src.ta.config import AppConfig
from mutagen.mp4 import MP4, MP4Cover
from PIL import Image, ImageFile, ImageFilter, UnidentifiedImageError
2022-06-01 02:04:55 +00:00
ImageFile.LOAD_TRUNCATED_IMAGES = True
2022-08-10 14:03:54 +00:00
class ThumbManagerBase:
    """base class for thumbnail management

    Reads the cache folder from the application config when the class is
    created and offers the download / fallback logic shared by subclasses.
    """

    CONFIG = AppConfig().config
    CACHE_DIR = CONFIG["application"]["cache_dir"]
    VIDEO_DIR = os.path.join(CACHE_DIR, "videos")
    CHANNEL_DIR = os.path.join(CACHE_DIR, "channels")
    PLAYLIST_DIR = os.path.join(CACHE_DIR, "playlists")

    def __init__(self, item_id, item_type, fallback=False):
        """store the item to handle

        item_id: id used in log messages and file names
        item_type: key selecting the default image in get_fallback
        fallback: optional image file to open instead of the default image
        """
        self.item_id = item_id
        self.item_type = item_type
        self.fallback = fallback

    def download_raw(self, url):
        """download thumbnail from url, return an opened PIL image

        Makes up to three attempts. A request error is retried after a
        pause of 1s, then 2s, a non OK response other than 404 is retried
        right away. The fallback image is returned when url is empty, the
        server answers 404, the payload can not be opened as image or all
        attempts failed.
        """
        if not url:
            return self.get_fallback()

        max_attempts = 3
        for attempt in range(max_attempts):
            try:
                response = requests.get(url, stream=True, timeout=5)
            except requests.exceptions.RequestException:
                # ReadTimeout is a subclass of RequestException
                print(f"{self.item_id}: retry thumbnail download {url}")
                if attempt < max_attempts - 1:
                    # no point in waiting when no attempt is left
                    sleep((attempt + 1) ** attempt)
                continue

            if response.ok:
                try:
                    img = Image.open(response.raw)
                except (UnidentifiedImageError, OSError):
                    print(f"failed to open thumbnail: {url}")
                    response.close()
                    return self.get_fallback()

                if isinstance(img, Image.Image):
                    # image reads lazily from the stream, keep it open
                    return img

                response.close()
                return self.get_fallback()

            # body of a failed streamed response is never read,
            # close it to release the connection
            status_code = response.status_code
            response.close()
            if status_code == 404:
                return self.get_fallback()

        return self.get_fallback()

    def get_fallback(self):
        """get fallback thumbnail if not available

        Opens self.fallback when set, otherwise the default image for
        self.item_type from the static folder below app_root. Raises
        KeyError if item_type has no default image.
        """
        print(f"{self.item_id}: failed to extract thumbnail, use fallback")
        if self.fallback:
            img_raw = Image.open(self.fallback)
            return img_raw

        app_root = self.CONFIG["application"]["app_root"]
        default_map = {
            "video": os.path.join(
                app_root, "static/img/default-video-thumb.jpg"
            ),
            # playlists share the default image with videos
            "playlist": os.path.join(
                app_root, "static/img/default-video-thumb.jpg"
            ),
            "icon": os.path.join(
                app_root, "static/img/default-channel-icon.jpg"
            ),
            "banner": os.path.join(
                app_root, "static/img/default-channel-banner.jpg"
            ),
            "tvart": os.path.join(
                app_root, "static/img/default-channel-art.jpg"
            ),
        }

        img_raw = Image.open(default_map[self.item_type])

        return img_raw
2022-08-10 14:03:54 +00:00
class ThumbManager(ThumbManagerBase):
    """handle thumbnails related functions"""

    def __init__(self, item_id, item_type="video", fallback=False):
        super().__init__(item_id, item_type, fallback=fallback)

    def download(self, url):
        """download thumbnail

        Dispatches on item_type: "video" and "playlist" expect a single
        url, "channel" expects the tuple taken by download_channel_art.
        Any other item_type only logs.
        """
        print(f"{self.item_id}: download {self.item_type} thumbnail")
        if self.item_type == "video":
            self.download_video_thumb(url)
        elif self.item_type == "channel":
            self.download_channel_art(url)
        elif self.item_type == "playlist":
            self.download_playlist_thumb(url)

    def delete(self):
        """delete thumbnail file(s) matching item_type"""
        print(f"{self.item_id}: delete {self.item_type} thumbnail")
        if self.item_type == "video":
            self.delete_video_thumb()
        elif self.item_type == "channel":
            self.delete_channel_thumb()
        elif self.item_type == "playlist":
            self.delete_playlist_thumb()

    def download_video_thumb(self, url, skip_existing=False):
        """pass url for video thumbnail

        Stores the image as jpg, cropped to 16:9 around the vertical
        center. With skip_existing an already cached file is left alone.
        """
        folder_path = os.path.join(self.VIDEO_DIR, self.item_id[0].lower())
        thumb_path = self.vid_thumb_path(absolute=True)

        if skip_existing and os.path.exists(thumb_path):
            return

        os.makedirs(folder_path, exist_ok=True)
        img_raw = self.download_raw(url)
        width, height = img_raw.size

        # integer form of width / height != 16 / 9
        if width * 9 != height * 16:
            new_height = width / 16 * 9
            offset = (height - new_height) / 2
            img_raw = img_raw.crop((0, offset, width, height - offset))

        img_raw.convert("RGB").save(thumb_path)

    def vid_thumb_path(self, absolute=False, create_folder=False):
        """build expected path for video thumbnail from youtube_id

        Returns videos/<first char of id, lowercase>/<id>.jpg, prefixed
        with CACHE_DIR if absolute is set. create_folder makes sure the
        folder exists below CACHE_DIR.
        """
        folder_name = self.item_id[0].lower()
        folder_path = os.path.join("videos", folder_name)
        thumb_path = os.path.join(folder_path, f"{self.item_id}.jpg")
        if absolute:
            thumb_path = os.path.join(self.CACHE_DIR, thumb_path)

        if create_folder:
            folder_path = os.path.join(self.CACHE_DIR, folder_path)
            os.makedirs(folder_path, exist_ok=True)

        return thumb_path

    def download_channel_art(self, urls, skip_existing=False):
        """pass tuple of channel thumbnails

        urls: (thumb url, banner url, tv art url)
        """
        channel_thumb, channel_banner, channel_tv = urls
        # the helpers switch item_type to pick the matching fallback,
        # restore it so a later delete() still dispatches correctly
        initial_type = self.item_type
        try:
            self._download_channel_thumb(channel_thumb, skip_existing)
            self._download_channel_banner(channel_banner, skip_existing)
            self._download_channel_tv(channel_tv, skip_existing)
        finally:
            self.item_type = initial_type

    def _save_channel_art(self, url, suffix, fallback_type, skip_existing):
        """download single channel artwork to <id>_<suffix>.jpg"""
        art_path = os.path.join(
            self.CHANNEL_DIR, f"{self.item_id}_{suffix}.jpg"
        )
        # get_fallback selects the default image by item_type
        self.item_type = fallback_type
        if skip_existing and os.path.exists(art_path):
            return

        img_raw = self.download_raw(url)
        img_raw.convert("RGB").save(art_path)

    def _download_channel_thumb(self, channel_thumb, skip_existing):
        """download channel thumbnail"""
        self._save_channel_art(channel_thumb, "thumb", "icon", skip_existing)

    def _download_channel_banner(self, channel_banner, skip_existing):
        """download channel banner"""
        self._save_channel_art(
            channel_banner, "banner", "banner", skip_existing
        )

    def _download_channel_tv(self, channel_tv, skip_existing):
        """download channel tv art"""
        self._save_channel_art(channel_tv, "tvart", "tvart", skip_existing)

    def download_playlist_thumb(self, url, skip_existing=False):
        """pass thumbnail url"""
        thumb_path = os.path.join(self.PLAYLIST_DIR, f"{self.item_id}.jpg")
        if skip_existing and os.path.exists(thumb_path):
            return

        img_raw = self.download_raw(url)
        img_raw.convert("RGB").save(thumb_path)

    def delete_video_thumb(self):
        """delete video thumbnail if exists"""
        to_delete = self.vid_thumb_path(absolute=True)
        if os.path.exists(to_delete):
            os.remove(to_delete)

    def delete_channel_thumb(self):
        """delete all artwork of channel"""
        # every file written by download_channel_art, including tv art
        for suffix in ("thumb", "banner", "tvart"):
            art_path = os.path.join(
                self.CHANNEL_DIR, f"{self.item_id}_{suffix}.jpg"
            )
            if os.path.exists(art_path):
                os.remove(art_path)

    def delete_playlist_thumb(self):
        """delete playlist thumbnail"""
        thumb_path = os.path.join(self.PLAYLIST_DIR, f"{self.item_id}.jpg")
        if os.path.exists(thumb_path):
            os.remove(thumb_path)

    def get_vid_base64_blur(self):
        """return base64 encoded placeholder

        Shrinks the cached video thumbnail to a twentieth of its size,
        blurs it and returns the jpg as data url.
        """
        file_path = os.path.join(self.CACHE_DIR, self.vid_thumb_path())
        # context manager closes the file handle once the pixels are read
        with Image.open(file_path) as img_raw:
            img_raw.thumbnail((img_raw.width // 20, img_raw.height // 20))
            img_blur = img_raw.filter(ImageFilter.BLUR)

        buffer = BytesIO()
        img_blur.save(buffer, format="JPEG")
        img_data = buffer.getvalue()
        img_base64 = base64.b64encode(img_data).decode()
        data_url = f"data:image/jpg;base64,{img_base64}"

        return data_url
class ValidatorCallback:
    """page callback making sure artwork exists for each document"""

    def __init__(self, source, index_name):
        self.source = source
        self.index_name = index_name

    def run(self):
        """validate the page with the handler matching the index"""
        print(f"{self.index_name}: validate artwork")
        handlers = {
            "ta_video": self._validate_videos,
            "ta_channel": self._validate_channels,
            "ta_playlist": self._validate_playlists,
        }
        validate_page = handlers.get(self.index_name)
        if validate_page is not None:
            validate_page()

    def _validate_videos(self):
        """download video thumbnails not yet cached"""
        for hit in self.source:
            thumb_url = hit["_source"]["vid_thumb_url"]
            manager = ThumbManager(hit["_source"]["youtube_id"])
            manager.download_video_thumb(thumb_url, skip_existing=True)

    def _validate_channels(self):
        """download channel artwork not yet cached"""
        for hit in self.source:
            # tv art url is optional in the document, False if missing
            art_urls = (
                hit["_source"]["channel_thumb_url"],
                hit["_source"]["channel_banner_url"],
                hit["_source"].get("channel_tvart_url", False),
            )
            manager = ThumbManager(hit["_source"]["channel_id"])
            manager.download_channel_art(art_urls, skip_existing=True)

    def _validate_playlists(self):
        """download playlist thumbnails not yet cached"""
        for hit in self.source:
            thumb_url = hit["_source"]["playlist_thumbnail"]
            manager = ThumbManager(hit["_source"]["playlist_id"])
            manager.download_playlist_thumb(thumb_url, skip_existing=True)
class ThumbValidator:
    """validate thumbnails"""

    # one entry per index: query body sent to the paginator and index name
    INDEX = [
        {
            "data": {
                "query": {"term": {"active": {"value": True}}},
                "_source": ["vid_thumb_url", "youtube_id"],
            },
            "name": "ta_video",
        },
        {
            "data": {
                "query": {"term": {"channel_active": {"value": True}}},
                "_source": {
                    "excludes": ["channel_description", "channel_overwrites"]
                },
            },
            "name": "ta_channel",
        },
        {
            "data": {
                "query": {"term": {"playlist_active": {"value": True}}},
                "_source": ["playlist_id", "playlist_thumbnail"],
            },
            "name": "ta_playlist",
        },
    ]

    def __init__(self, task):
        self.task = task

    def validate(self):
        """page through every index, ValidatorCallback handles each page"""
        for index_conf in self.INDEX:
            index_name = index_conf["name"]
            doc_count = self._get_total(index_name)
            if doc_count:
                # indexes without documents are skipped
                pages = IndexPaginate(
                    index_name=index_name,
                    data=index_conf["data"],
                    size=1000,
                    callback=ValidatorCallback,
                    task=self.task,
                    total=doc_count,
                )
                pages.get_results()

    @staticmethod
    def _get_total(index_name):
        """return the count value of the index _count response"""
        count_response, _ = ElasticWrap(f"{index_name}/_count").get()
        return count_response.get("count")
2022-08-10 14:03:54 +00:00
class ThumbFilesystem:
    """sync thumbnail files to media files"""

    INDEX_NAME = "ta_video"

    def __init__(self, task=False):
        self.task = task

    def embed(self):
        """page through all videos, EmbedCallback handles each page"""
        query = {
            "query": {"match_all": {}},
            "_source": ["media_url", "youtube_id"],
        }
        pages = IndexPaginate(
            index_name=self.INDEX_NAME,
            data=query,
            size=200,
            callback=EmbedCallback,
            task=self.task,
            total=self._get_total(),
        )
        pages.get_results()

    def _get_total(self):
        """return the count value of the index _count response"""
        count_response, _ = ElasticWrap(f"{self.INDEX_NAME}/_count").get()
        return count_response.get("count")
class EmbedCallback:
    """page callback writing cached thumbnails into the media files"""

    CONFIG = AppConfig().config
    CACHE_DIR = CONFIG["application"]["cache_dir"]
    MEDIA_DIR = CONFIG["application"]["videos"]
    FORMAT = MP4Cover.FORMAT_JPEG

    def __init__(self, source, index_name):
        self.source = source
        self.index_name = index_name

    def run(self):
        """embed the thumbnail of each video on the page, if cached"""
        for hit in self.source:
            doc = hit["_source"]
            youtube_id = doc["youtube_id"]
            media_path = os.path.join(self.MEDIA_DIR, doc["media_url"])
            cached_thumb = os.path.join(
                self.CACHE_DIR, ThumbManager(youtube_id).vid_thumb_path()
            )
            # videos without a cached thumbnail are skipped
            if not os.path.exists(cached_thumb):
                continue

            self.embed(media_path, cached_thumb)

    def embed(self, media_url, thumb_path):
        """set the jpg at thumb_path as cover of the file at media_url"""
        media_file = MP4(media_url)
        with open(thumb_path, "rb") as thumb_file:
            cover = MP4Cover(thumb_file.read(), imageformat=self.FORMAT)

        media_file["covr"] = [cover]
        media_file.save()