major refactor, split up modules

This commit is contained in:
simon 2022-01-22 22:13:37 +07:00
parent f3efca7464
commit 0fc0cc8e87
No known key found for this signature in database
GPG Key ID: 2C15AA5E89985DD4
32 changed files with 2467 additions and 2340 deletions

View File

@ -1,9 +1,9 @@
"""all API views"""
import requests
from home.src.config import AppConfig
from home.src.helper import UrlListParser
from home.src.thumbnails import ThumbManager
from home.src.ta.config import AppConfig
from home.src.ta.helper import UrlListParser
from home.src.download.thumbnails import ThumbManager
from home.tasks import extrac_dl, subscribe_to
from rest_framework.authentication import (
SessionAuthentication,

View File

@ -15,7 +15,7 @@ from os import environ, path
from pathlib import Path
from corsheaders.defaults import default_headers
from home.src.config import AppConfig
from home.src.ta.config import AppConfig
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent

View File

@ -3,9 +3,9 @@
import os
from django.apps import AppConfig
from home.src.config import AppConfig as ArchivistConfig
from home.src.helper import RedisArchivist
from home.src.index_management import index_check
from home.src.es.index_setup import index_check
from home.src.ta.config import AppConfig as ArchivistConfig
from home.src.ta.ta_redis import RedisArchivist
def sync_redis_state():

View File

@ -1,754 +0,0 @@
"""
Functionality:
- handele the download queue
- manage subscriptions to channels
- manage subscriptions to playlists
- downloading videos
"""
import json
import os
import shutil
from datetime import datetime
from time import sleep
import requests
import yt_dlp
from home.src.config import AppConfig
from home.src.es import IndexPaginate
from home.src.helper import (
DurationConverter,
RedisArchivist,
RedisQueue,
clean_string,
ignore_filelist,
)
from home.src.index import (
YoutubeChannel,
YoutubePlaylist,
YoutubeVideo,
index_new_video,
)
class PendingList:
"""manage the pending videos list"""
CONFIG = AppConfig().config
ES_URL = CONFIG["application"]["es_url"]
ES_AUTH = CONFIG["application"]["es_auth"]
VIDEOS = CONFIG["application"]["videos"]
def __init__(self):
self.all_channel_ids = False
self.all_downloaded = False
self.missing_from_playlists = []
def parse_url_list(self, youtube_ids):
"""extract youtube ids from list"""
missing_videos = []
for entry in youtube_ids:
# notify
mess_dict = {
"status": "message:add",
"level": "info",
"title": "Adding to download queue.",
"message": "Extracting lists",
}
RedisArchivist().set_message("message:add", mess_dict)
# extract
url = entry["url"]
url_type = entry["type"]
if url_type == "video":
missing_videos.append(url)
elif url_type == "channel":
video_results = ChannelSubscription().get_last_youtube_videos(
url, limit=False
)
youtube_ids = [i[0] for i in video_results]
missing_videos = missing_videos + youtube_ids
elif url_type == "playlist":
self.missing_from_playlists.append(entry)
playlist = YoutubePlaylist(url)
playlist.build_json()
video_results = playlist.json_data.get("playlist_entries")
youtube_ids = [i["youtube_id"] for i in video_results]
missing_videos = missing_videos + youtube_ids
return missing_videos
def add_to_pending(self, missing_videos, ignore=False):
"""build the bulk json data from pending"""
# check if channel is indexed
channel_handler = ChannelSubscription()
all_indexed = channel_handler.get_channels(subscribed_only=False)
self.all_channel_ids = [i["channel_id"] for i in all_indexed]
# check if already there
self.all_downloaded = self.get_all_downloaded()
bulk_list, all_videos_added = self.build_bulk(missing_videos, ignore)
# add last newline
bulk_list.append("\n")
query_str = "\n".join(bulk_list)
headers = {"Content-type": "application/x-ndjson"}
url = self.ES_URL + "/_bulk"
request = requests.post(
url, data=query_str, headers=headers, auth=self.ES_AUTH
)
if not request.ok:
print(request)
raise ValueError("failed to add video to download queue")
return all_videos_added
def build_bulk(self, missing_videos, ignore=False):
"""build the bulk lists"""
bulk_list = []
all_videos_added = []
for idx, youtube_id in enumerate(missing_videos):
# check if already downloaded
if youtube_id in self.all_downloaded:
continue
video = self.get_youtube_details(youtube_id)
# skip on download error
if not video:
continue
channel_indexed = video["channel_id"] in self.all_channel_ids
video["channel_indexed"] = channel_indexed
if ignore:
video["status"] = "ignore"
else:
video["status"] = "pending"
action = {"create": {"_id": youtube_id, "_index": "ta_download"}}
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(video))
all_videos_added.append((youtube_id, video["vid_thumb_url"]))
# notify
progress = f"{idx + 1}/{len(missing_videos)}"
mess_dict = {
"status": "message:add",
"level": "info",
"title": "Adding new videos to download queue.",
"message": "Progress: " + progress,
}
if idx + 1 == len(missing_videos):
RedisArchivist().set_message(
"message:add", mess_dict, expire=4
)
else:
RedisArchivist().set_message("message:add", mess_dict)
if idx + 1 % 25 == 0:
print("adding to queue progress: " + progress)
return bulk_list, all_videos_added
@staticmethod
def get_youtube_details(youtube_id):
"""get details from youtubedl for single pending video"""
obs = {
"default_search": "ytsearch",
"quiet": True,
"check_formats": "selected",
"noplaylist": True,
"writethumbnail": True,
"simulate": True,
}
try:
vid = yt_dlp.YoutubeDL(obs).extract_info(youtube_id)
except yt_dlp.utils.DownloadError:
print("failed to extract info for: " + youtube_id)
return False
# stop if video is streaming live now
if vid["is_live"]:
return False
# parse response
seconds = vid["duration"]
duration_str = DurationConverter.get_str(seconds)
if duration_str == "NA":
print(f"skip extracting duration for: {youtube_id}")
upload_date = vid["upload_date"]
upload_dt = datetime.strptime(upload_date, "%Y%m%d")
published = upload_dt.strftime("%Y-%m-%d")
# build dict
youtube_details = {
"youtube_id": youtube_id,
"channel_name": vid["channel"],
"vid_thumb_url": vid["thumbnail"],
"title": vid["title"],
"channel_id": vid["channel_id"],
"duration": duration_str,
"published": published,
"timestamp": int(datetime.now().strftime("%s")),
}
return youtube_details
@staticmethod
def get_all_pending():
"""get a list of all pending videos in ta_download"""
data = {
"query": {"match_all": {}},
"sort": [{"timestamp": {"order": "asc"}}],
}
all_results = IndexPaginate("ta_download", data).get_results()
all_pending = []
all_ignore = []
for result in all_results:
if result["status"] == "pending":
all_pending.append(result)
elif result["status"] == "ignore":
all_ignore.append(result)
return all_pending, all_ignore
@staticmethod
def get_all_indexed():
"""get a list of all videos indexed"""
data = {
"query": {"match_all": {}},
"sort": [{"published": {"order": "desc"}}],
}
all_indexed = IndexPaginate("ta_video", data).get_results()
return all_indexed
def get_all_downloaded(self):
"""get a list of all videos in archive"""
channel_folders = os.listdir(self.VIDEOS)
all_channel_folders = ignore_filelist(channel_folders)
all_downloaded = []
for channel_folder in all_channel_folders:
channel_path = os.path.join(self.VIDEOS, channel_folder)
videos = os.listdir(channel_path)
all_videos = ignore_filelist(videos)
youtube_vids = [i[9:20] for i in all_videos]
for youtube_id in youtube_vids:
all_downloaded.append(youtube_id)
return all_downloaded
def delete_from_pending(self, youtube_id):
"""delete the youtube_id from ta_download"""
url = f"{self.ES_URL}/ta_download/_doc/{youtube_id}"
response = requests.delete(url, auth=self.ES_AUTH)
if not response.ok:
print(response.text)
def delete_pending(self, status):
"""delete download queue based on status value"""
data = {"query": {"term": {"status": {"value": status}}}}
payload = json.dumps(data)
url = self.ES_URL + "/ta_download/_delete_by_query"
headers = {"Content-type": "application/json"}
response = requests.post(
url, data=payload, headers=headers, auth=self.ES_AUTH
)
if not response.ok:
print(response.text)
def ignore_from_pending(self, ignore_list):
"""build the bulk query string"""
stamp = int(datetime.now().strftime("%s"))
bulk_list = []
for youtube_id in ignore_list:
action = {"update": {"_id": youtube_id, "_index": "ta_download"}}
source = {"doc": {"status": "ignore", "timestamp": stamp}}
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(source))
# add last newline
bulk_list.append("\n")
query_str = "\n".join(bulk_list)
headers = {"Content-type": "application/x-ndjson"}
url = self.ES_URL + "/_bulk"
request = requests.post(
url, data=query_str, headers=headers, auth=self.ES_AUTH
)
if not request.ok:
print(request)
raise ValueError("failed to set video to ignore")
class ChannelSubscription:
"""manage the list of channels subscribed"""
def __init__(self):
config = AppConfig().config
self.es_url = config["application"]["es_url"]
self.es_auth = config["application"]["es_auth"]
self.channel_size = config["subscriptions"]["channel_size"]
@staticmethod
def get_channels(subscribed_only=True):
"""get a list of all channels subscribed to"""
data = {
"sort": [{"channel_name.keyword": {"order": "asc"}}],
}
if subscribed_only:
data["query"] = {"term": {"channel_subscribed": {"value": True}}}
else:
data["query"] = {"match_all": {}}
all_channels = IndexPaginate("ta_channel", data).get_results()
return all_channels
def get_last_youtube_videos(self, channel_id, limit=True):
"""get a list of last videos from channel"""
url = f"https://www.youtube.com/channel/{channel_id}/videos"
obs = {
"default_search": "ytsearch",
"quiet": True,
"skip_download": True,
"extract_flat": True,
}
if limit:
obs["playlistend"] = self.channel_size
chan = yt_dlp.YoutubeDL(obs).extract_info(url, download=False)
last_videos = [(i["id"], i["title"]) for i in chan["entries"]]
return last_videos
def find_missing(self):
"""add missing videos from subscribed channels to pending"""
all_channels = self.get_channels()
pending_handler = PendingList()
all_pending, all_ignore = pending_handler.get_all_pending()
all_ids = [i["youtube_id"] for i in all_ignore + all_pending]
all_downloaded = pending_handler.get_all_downloaded()
to_ignore = all_ids + all_downloaded
missing_videos = []
for idx, channel in enumerate(all_channels):
channel_id = channel["channel_id"]
last_videos = self.get_last_youtube_videos(channel_id)
for video in last_videos:
if video[0] not in to_ignore:
missing_videos.append(video[0])
# notify
message = {
"status": "message:rescan",
"level": "info",
"title": "Scanning channels: Looking for new videos.",
"message": f"Progress: {idx + 1}/{len(all_channels)}",
}
if idx + 1 == len(all_channels):
RedisArchivist().set_message(
"message:rescan", message=message, expire=4
)
else:
RedisArchivist().set_message("message:rescan", message=message)
return missing_videos
@staticmethod
def change_subscribe(channel_id, channel_subscribed):
"""subscribe or unsubscribe from channel and update"""
channel = YoutubeChannel(channel_id)
channel.build_json()
channel.json_data["channel_subscribed"] = channel_subscribed
channel.upload_to_es()
channel.sync_to_videos()
class PlaylistSubscription:
"""manage the playlist download functionality"""
def __init__(self):
self.config = AppConfig().config
@staticmethod
def get_playlists(subscribed_only=True):
"""get a list of all active playlists"""
data = {
"sort": [{"playlist_channel.keyword": {"order": "desc"}}],
}
data["query"] = {
"bool": {"must": [{"term": {"playlist_active": {"value": True}}}]}
}
if subscribed_only:
data["query"]["bool"]["must"].append(
{"term": {"playlist_subscribed": {"value": True}}}
)
all_playlists = IndexPaginate("ta_playlist", data).get_results()
return all_playlists
def process_url_str(self, new_playlists, subscribed=True):
"""process playlist subscribe form url_str"""
all_indexed = PendingList().get_all_indexed()
all_youtube_ids = [i["youtube_id"] for i in all_indexed]
new_thumbs = []
for idx, playlist in enumerate(new_playlists):
url_type = playlist["type"]
playlist_id = playlist["url"]
if not url_type == "playlist":
print(f"{playlist_id} not a playlist, skipping...")
continue
playlist_h = YoutubePlaylist(playlist_id)
playlist_h.all_youtube_ids = all_youtube_ids
playlist_h.build_json()
playlist_h.json_data["playlist_subscribed"] = subscribed
playlist_h.upload_to_es()
playlist_h.add_vids_to_playlist()
self.channel_validate(playlist_h.json_data["playlist_channel_id"])
thumb = playlist_h.json_data["playlist_thumbnail"]
new_thumbs.append((playlist_id, thumb))
# notify
message = {
"status": "message:subplaylist",
"level": "info",
"title": "Subscribing to Playlists",
"message": f"Processing {idx + 1} of {len(new_playlists)}",
}
RedisArchivist().set_message(
"message:subplaylist", message=message
)
return new_thumbs
@staticmethod
def channel_validate(channel_id):
"""make sure channel of playlist is there"""
channel = YoutubeChannel(channel_id)
channel.build_json()
@staticmethod
def change_subscribe(playlist_id, subscribe_status):
"""change the subscribe status of a playlist"""
playlist = YoutubePlaylist(playlist_id)
playlist.build_json()
playlist.json_data["playlist_subscribed"] = subscribe_status
playlist.upload_to_es()
@staticmethod
def get_to_ignore():
"""get all youtube_ids already downloaded or ignored"""
pending_handler = PendingList()
all_pending, all_ignore = pending_handler.get_all_pending()
all_ids = [i["youtube_id"] for i in all_ignore + all_pending]
all_downloaded = pending_handler.get_all_downloaded()
to_ignore = all_ids + all_downloaded
return to_ignore
def find_missing(self):
"""find videos in subscribed playlists not downloaded yet"""
all_playlists = [i["playlist_id"] for i in self.get_playlists()]
to_ignore = self.get_to_ignore()
missing_videos = []
for idx, playlist_id in enumerate(all_playlists):
size_limit = self.config["subscriptions"]["channel_size"]
playlist = YoutubePlaylist(playlist_id)
playlist.update_playlist()
if not playlist:
playlist.deactivate()
continue
playlist_entries = playlist.json_data["playlist_entries"]
if size_limit:
del playlist_entries[size_limit:]
all_missing = [i for i in playlist_entries if not i["downloaded"]]
message = {
"status": "message:rescan",
"level": "info",
"title": "Scanning playlists: Looking for new videos.",
"message": f"Progress: {idx + 1}/{len(all_playlists)}",
}
RedisArchivist().set_message("message:rescan", message=message)
for video in all_missing:
youtube_id = video["youtube_id"]
if youtube_id not in to_ignore:
missing_videos.append(youtube_id)
return missing_videos
class VideoDownloader:
"""
handle the video download functionality
if not initiated with list, take from queue
"""
def __init__(self, youtube_id_list=False):
self.youtube_id_list = youtube_id_list
self.config = AppConfig().config
self.channels = set()
def run_queue(self):
"""setup download queue in redis loop until no more items"""
queue = RedisQueue("dl_queue")
limit_queue = self.config["downloads"]["limit_count"]
if limit_queue:
queue.trim(limit_queue - 1)
while True:
youtube_id = queue.get_next()
if not youtube_id:
break
try:
self.dl_single_vid(youtube_id)
except yt_dlp.utils.DownloadError:
print("failed to download " + youtube_id)
continue
vid_dict = index_new_video(youtube_id)
self.channels.add(vid_dict["channel"]["channel_id"])
self.move_to_archive(vid_dict)
self.delete_from_pending(youtube_id)
autodelete_days = self.config["downloads"]["autodelete_days"]
if autodelete_days:
print(f"auto delete older than {autodelete_days} days")
self.auto_delete_watched(autodelete_days)
@staticmethod
def add_pending():
"""add pending videos to download queue"""
mess_dict = {
"status": "message:download",
"level": "info",
"title": "Looking for videos to download",
"message": "Scanning your download queue.",
}
RedisArchivist().set_message("message:download", mess_dict)
all_pending, _ = PendingList().get_all_pending()
to_add = [i["youtube_id"] for i in all_pending]
if not to_add:
# there is nothing pending
print("download queue is empty")
mess_dict = {
"status": "message:download",
"level": "error",
"title": "Download queue is empty",
"message": "Add some videos to the queue first.",
}
RedisArchivist().set_message("message:download", mess_dict)
return
queue = RedisQueue("dl_queue")
queue.add_list(to_add)
@staticmethod
def progress_hook(response):
"""process the progress_hooks from yt_dlp"""
# title
path = os.path.split(response["filename"])[-1][12:]
filename = os.path.splitext(os.path.splitext(path)[0])[0]
filename_clean = filename.replace("_", " ")
title = "Downloading: " + filename_clean
# message
try:
percent = response["_percent_str"]
size = response["_total_bytes_str"]
speed = response["_speed_str"]
eta = response["_eta_str"]
message = f"{percent} of {size} at {speed} - time left: {eta}"
except KeyError:
message = "processing"
mess_dict = {
"status": "message:download",
"level": "info",
"title": title,
"message": message,
}
RedisArchivist().set_message("message:download", mess_dict)
def build_obs(self):
"""build obs dictionary for yt-dlp"""
obs = {
"default_search": "ytsearch",
"merge_output_format": "mp4",
"restrictfilenames": True,
"outtmpl": (
self.config["application"]["cache_dir"]
+ "/download/"
+ self.config["application"]["file_template"]
),
"progress_hooks": [self.progress_hook],
"noprogress": True,
"quiet": True,
"continuedl": True,
"retries": 3,
"writethumbnail": False,
"noplaylist": True,
"check_formats": "selected",
}
if self.config["downloads"]["format"]:
obs["format"] = self.config["downloads"]["format"]
if self.config["downloads"]["limit_speed"]:
obs["ratelimit"] = self.config["downloads"]["limit_speed"] * 1024
throttle = self.config["downloads"]["throttledratelimit"]
if throttle:
obs["throttledratelimit"] = throttle * 1024
postprocessors = []
if self.config["downloads"]["add_metadata"]:
postprocessors.append(
{
"key": "FFmpegMetadata",
"add_chapters": True,
"add_metadata": True,
}
)
if self.config["downloads"]["add_thumbnail"]:
postprocessors.append(
{
"key": "EmbedThumbnail",
"already_have_thumbnail": True,
}
)
obs["writethumbnail"] = True
obs["postprocessors"] = postprocessors
return obs
def dl_single_vid(self, youtube_id):
"""download single video"""
dl_cache = self.config["application"]["cache_dir"] + "/download/"
obs = self.build_obs()
# check if already in cache to continue from there
all_cached = ignore_filelist(os.listdir(dl_cache))
for file_name in all_cached:
if youtube_id in file_name:
obs["outtmpl"] = os.path.join(dl_cache, file_name)
with yt_dlp.YoutubeDL(obs) as ydl:
try:
ydl.download([youtube_id])
except yt_dlp.utils.DownloadError:
print("retry failed download: " + youtube_id)
sleep(10)
ydl.download([youtube_id])
if obs["writethumbnail"]:
# webp files don't get cleaned up automatically
all_cached = ignore_filelist(os.listdir(dl_cache))
to_clean = [i for i in all_cached if not i.endswith(".mp4")]
for file_name in to_clean:
file_path = os.path.join(dl_cache, file_name)
os.remove(file_path)
def move_to_archive(self, vid_dict):
"""move downloaded video from cache to archive"""
videos = self.config["application"]["videos"]
host_uid = self.config["application"]["HOST_UID"]
host_gid = self.config["application"]["HOST_GID"]
channel_name = clean_string(vid_dict["channel"]["channel_name"])
# make archive folder with correct permissions
new_folder = os.path.join(videos, channel_name)
if not os.path.exists(new_folder):
os.makedirs(new_folder)
if host_uid and host_gid:
os.chown(new_folder, host_uid, host_gid)
# find real filename
cache_dir = self.config["application"]["cache_dir"]
all_cached = ignore_filelist(os.listdir(cache_dir + "/download/"))
for file_str in all_cached:
if vid_dict["youtube_id"] in file_str:
old_file = file_str
old_file_path = os.path.join(cache_dir, "download", old_file)
new_file_path = os.path.join(videos, vid_dict["media_url"])
# move media file and fix permission
shutil.move(old_file_path, new_file_path)
if host_uid and host_gid:
os.chown(new_file_path, host_uid, host_gid)
def delete_from_pending(self, youtube_id):
"""delete downloaded video from pending index if its there"""
es_url = self.config["application"]["es_url"]
es_auth = self.config["application"]["es_auth"]
url = f"{es_url}/ta_download/_doc/{youtube_id}"
response = requests.delete(url, auth=es_auth)
if not response.ok and not response.status_code == 404:
print(response.text)
def add_subscribed_channels(self):
"""add all channels subscribed to refresh"""
all_subscribed = PlaylistSubscription().get_playlists()
if not all_subscribed:
return
channel_ids = [i["playlist_channel_id"] for i in all_subscribed]
for channel_id in channel_ids:
self.channels.add(channel_id)
return
def validate_playlists(self):
"""look for playlist needing to update"""
print("sync playlists")
self.add_subscribed_channels()
all_indexed = PendingList().get_all_indexed()
all_youtube_ids = [i["youtube_id"] for i in all_indexed]
for id_c, channel_id in enumerate(self.channels):
playlists = YoutubeChannel(channel_id).get_indexed_playlists()
all_playlist_ids = [i["playlist_id"] for i in playlists]
for id_p, playlist_id in enumerate(all_playlist_ids):
playlist = YoutubePlaylist(playlist_id)
playlist.all_youtube_ids = all_youtube_ids
playlist.build_json(scrape=True)
if not playlist.json_data:
playlist.deactivate()
playlist.add_vids_to_playlist()
playlist.upload_to_es()
# notify
title = (
"Processing playlists for channels: "
+ f"{id_c + 1}/{len(self.channels)}"
)
message = f"Progress: {id_p + 1}/{len(all_playlist_ids)}"
mess_dict = {
"status": "message:download",
"level": "info",
"title": title,
"message": message,
}
if id_p + 1 == len(all_playlist_ids):
RedisArchivist().set_message(
"message:download", mess_dict, expire=4
)
else:
RedisArchivist().set_message("message:download", mess_dict)
@staticmethod
def auto_delete_watched(autodelete_days):
"""delete watched videos after x days"""
now = int(datetime.now().strftime("%s"))
now_lte = now - autodelete_days * 24 * 60 * 60
data = {
"query": {"range": {"player.watched_date": {"lte": now_lte}}},
"sort": [{"player.watched_date": {"order": "asc"}}],
}
all_to_delete = IndexPaginate("ta_video", data).get_results()
all_youtube_ids = [i["youtube_id"] for i in all_to_delete]
if not all_youtube_ids:
return
for youtube_id in all_youtube_ids:
print(f"autodelete {youtube_id}")
YoutubeVideo(youtube_id).delete_media_file()
print("add deleted to ignore list")
pending_handler = PendingList()
pending_handler.add_to_pending(all_youtube_ids, ignore=True)

View File

@ -0,0 +1,259 @@
"""handle download queue"""
import json
import os
from datetime import datetime
import requests
import yt_dlp
from home.src.download.subscriptions import ChannelSubscription
from home.src.es.connect import IndexPaginate
from home.src.index.playlist import YoutubePlaylist
from home.src.ta.config import AppConfig
from home.src.ta.helper import DurationConverter, ignore_filelist
from home.src.ta.ta_redis import RedisArchivist
class PendingList:
"""manage the pending videos list"""
CONFIG = AppConfig().config
ES_URL = CONFIG["application"]["es_url"]
ES_AUTH = CONFIG["application"]["es_auth"]
VIDEOS = CONFIG["application"]["videos"]
def __init__(self):
self.all_channel_ids = False
self.all_downloaded = False
self.missing_from_playlists = []
def parse_url_list(self, youtube_ids):
"""extract youtube ids from list"""
missing_videos = []
for entry in youtube_ids:
# notify
mess_dict = {
"status": "message:add",
"level": "info",
"title": "Adding to download queue.",
"message": "Extracting lists",
}
RedisArchivist().set_message("message:add", mess_dict)
# extract
url = entry["url"]
url_type = entry["type"]
if url_type == "video":
missing_videos.append(url)
elif url_type == "channel":
video_results = ChannelSubscription().get_last_youtube_videos(
url, limit=False
)
youtube_ids = [i[0] for i in video_results]
missing_videos = missing_videos + youtube_ids
elif url_type == "playlist":
self.missing_from_playlists.append(entry)
playlist = YoutubePlaylist(url)
playlist.build_json()
video_results = playlist.json_data.get("playlist_entries")
youtube_ids = [i["youtube_id"] for i in video_results]
missing_videos = missing_videos + youtube_ids
return missing_videos
def add_to_pending(self, missing_videos, ignore=False):
"""build the bulk json data from pending"""
# check if channel is indexed
channel_handler = ChannelSubscription()
all_indexed = channel_handler.get_channels(subscribed_only=False)
self.all_channel_ids = [i["channel_id"] for i in all_indexed]
# check if already there
self.all_downloaded = self.get_all_downloaded()
bulk_list, all_videos_added = self.build_bulk(missing_videos, ignore)
# add last newline
bulk_list.append("\n")
query_str = "\n".join(bulk_list)
headers = {"Content-type": "application/x-ndjson"}
url = self.ES_URL + "/_bulk"
request = requests.post(
url, data=query_str, headers=headers, auth=self.ES_AUTH
)
if not request.ok:
print(request)
raise ValueError("failed to add video to download queue")
return all_videos_added
def build_bulk(self, missing_videos, ignore=False):
"""build the bulk lists"""
bulk_list = []
all_videos_added = []
for idx, youtube_id in enumerate(missing_videos):
# check if already downloaded
if youtube_id in self.all_downloaded:
continue
video = self.get_youtube_details(youtube_id)
# skip on download error
if not video:
continue
channel_indexed = video["channel_id"] in self.all_channel_ids
video["channel_indexed"] = channel_indexed
if ignore:
video["status"] = "ignore"
else:
video["status"] = "pending"
action = {"create": {"_id": youtube_id, "_index": "ta_download"}}
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(video))
all_videos_added.append((youtube_id, video["vid_thumb_url"]))
# notify
progress = f"{idx + 1}/{len(missing_videos)}"
mess_dict = {
"status": "message:add",
"level": "info",
"title": "Adding new videos to download queue.",
"message": "Progress: " + progress,
}
if idx + 1 == len(missing_videos):
RedisArchivist().set_message(
"message:add", mess_dict, expire=4
)
else:
RedisArchivist().set_message("message:add", mess_dict)
if idx + 1 % 25 == 0:
print("adding to queue progress: " + progress)
return bulk_list, all_videos_added
@staticmethod
def get_youtube_details(youtube_id):
"""get details from youtubedl for single pending video"""
obs = {
"default_search": "ytsearch",
"quiet": True,
"check_formats": "selected",
"noplaylist": True,
"writethumbnail": True,
"simulate": True,
}
try:
vid = yt_dlp.YoutubeDL(obs).extract_info(youtube_id)
except yt_dlp.utils.DownloadError:
print("failed to extract info for: " + youtube_id)
return False
# stop if video is streaming live now
if vid["is_live"]:
return False
# parse response
seconds = vid["duration"]
duration_str = DurationConverter.get_str(seconds)
if duration_str == "NA":
print(f"skip extracting duration for: {youtube_id}")
upload_date = vid["upload_date"]
upload_dt = datetime.strptime(upload_date, "%Y%m%d")
published = upload_dt.strftime("%Y-%m-%d")
# build dict
youtube_details = {
"youtube_id": youtube_id,
"channel_name": vid["channel"],
"vid_thumb_url": vid["thumbnail"],
"title": vid["title"],
"channel_id": vid["channel_id"],
"duration": duration_str,
"published": published,
"timestamp": int(datetime.now().strftime("%s")),
}
return youtube_details
@staticmethod
def get_all_pending():
"""get a list of all pending videos in ta_download"""
data = {
"query": {"match_all": {}},
"sort": [{"timestamp": {"order": "asc"}}],
}
all_results = IndexPaginate("ta_download", data).get_results()
all_pending = []
all_ignore = []
for result in all_results:
if result["status"] == "pending":
all_pending.append(result)
elif result["status"] == "ignore":
all_ignore.append(result)
return all_pending, all_ignore
@staticmethod
def get_all_indexed():
"""get a list of all videos indexed"""
data = {
"query": {"match_all": {}},
"sort": [{"published": {"order": "desc"}}],
}
all_indexed = IndexPaginate("ta_video", data).get_results()
return all_indexed
def get_all_downloaded(self):
"""get a list of all videos in archive"""
channel_folders = os.listdir(self.VIDEOS)
all_channel_folders = ignore_filelist(channel_folders)
all_downloaded = []
for channel_folder in all_channel_folders:
channel_path = os.path.join(self.VIDEOS, channel_folder)
videos = os.listdir(channel_path)
all_videos = ignore_filelist(videos)
youtube_vids = [i[9:20] for i in all_videos]
for youtube_id in youtube_vids:
all_downloaded.append(youtube_id)
return all_downloaded
def delete_from_pending(self, youtube_id):
"""delete the youtube_id from ta_download"""
url = f"{self.ES_URL}/ta_download/_doc/{youtube_id}"
response = requests.delete(url, auth=self.ES_AUTH)
if not response.ok:
print(response.text)
def delete_pending(self, status):
"""delete download queue based on status value"""
data = {"query": {"term": {"status": {"value": status}}}}
payload = json.dumps(data)
url = self.ES_URL + "/ta_download/_delete_by_query"
headers = {"Content-type": "application/json"}
response = requests.post(
url, data=payload, headers=headers, auth=self.ES_AUTH
)
if not response.ok:
print(response.text)
def ignore_from_pending(self, ignore_list):
"""build the bulk query string"""
stamp = int(datetime.now().strftime("%s"))
bulk_list = []
for youtube_id in ignore_list:
action = {"update": {"_id": youtube_id, "_index": "ta_download"}}
source = {"doc": {"status": "ignore", "timestamp": stamp}}
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(source))
# add last newline
bulk_list.append("\n")
query_str = "\n".join(bulk_list)
headers = {"Content-type": "application/x-ndjson"}
url = self.ES_URL + "/_bulk"
request = requests.post(
url, data=query_str, headers=headers, auth=self.ES_AUTH
)
if not request.ok:
print(request)
raise ValueError("failed to set video to ignore")

View File

@ -0,0 +1,210 @@
"""handle subscriptions"""
import yt_dlp
from home.src.download import queue # partial import
from home.src.es.connect import IndexPaginate
from home.src.index.channel import YoutubeChannel
from home.src.index.playlist import YoutubePlaylist
from home.src.ta.config import AppConfig
from home.src.ta.ta_redis import RedisArchivist
class ChannelSubscription:
"""manage the list of channels subscribed"""
def __init__(self):
config = AppConfig().config
self.es_url = config["application"]["es_url"]
self.es_auth = config["application"]["es_auth"]
self.channel_size = config["subscriptions"]["channel_size"]
@staticmethod
def get_channels(subscribed_only=True):
"""get a list of all channels subscribed to"""
data = {
"sort": [{"channel_name.keyword": {"order": "asc"}}],
}
if subscribed_only:
data["query"] = {"term": {"channel_subscribed": {"value": True}}}
else:
data["query"] = {"match_all": {}}
all_channels = IndexPaginate("ta_channel", data).get_results()
return all_channels
def get_last_youtube_videos(self, channel_id, limit=True):
"""get a list of last videos from channel"""
url = f"https://www.youtube.com/channel/{channel_id}/videos"
obs = {
"default_search": "ytsearch",
"quiet": True,
"skip_download": True,
"extract_flat": True,
}
if limit:
obs["playlistend"] = self.channel_size
chan = yt_dlp.YoutubeDL(obs).extract_info(url, download=False)
last_videos = [(i["id"], i["title"]) for i in chan["entries"]]
return last_videos
def find_missing(self):
"""add missing videos from subscribed channels to pending"""
all_channels = self.get_channels()
pending_handler = queue.PendingList()
all_pending, all_ignore = pending_handler.get_all_pending()
all_ids = [i["youtube_id"] for i in all_ignore + all_pending]
all_downloaded = pending_handler.get_all_downloaded()
to_ignore = all_ids + all_downloaded
missing_videos = []
for idx, channel in enumerate(all_channels):
channel_id = channel["channel_id"]
last_videos = self.get_last_youtube_videos(channel_id)
for video in last_videos:
if video[0] not in to_ignore:
missing_videos.append(video[0])
# notify
message = {
"status": "message:rescan",
"level": "info",
"title": "Scanning channels: Looking for new videos.",
"message": f"Progress: {idx + 1}/{len(all_channels)}",
}
if idx + 1 == len(all_channels):
RedisArchivist().set_message(
"message:rescan", message=message, expire=4
)
else:
RedisArchivist().set_message("message:rescan", message=message)
return missing_videos
@staticmethod
def change_subscribe(channel_id, channel_subscribed):
"""subscribe or unsubscribe from channel and update"""
channel = YoutubeChannel(channel_id)
channel.build_json()
channel.json_data["channel_subscribed"] = channel_subscribed
channel.upload_to_es()
channel.sync_to_videos()
class PlaylistSubscription:
"""manage the playlist download functionality"""
def __init__(self):
self.config = AppConfig().config
@staticmethod
def get_playlists(subscribed_only=True):
"""get a list of all active playlists"""
data = {
"sort": [{"playlist_channel.keyword": {"order": "desc"}}],
}
data["query"] = {
"bool": {"must": [{"term": {"playlist_active": {"value": True}}}]}
}
if subscribed_only:
data["query"]["bool"]["must"].append(
{"term": {"playlist_subscribed": {"value": True}}}
)
all_playlists = IndexPaginate("ta_playlist", data).get_results()
return all_playlists
def process_url_str(self, new_playlists, subscribed=True):
"""process playlist subscribe form url_str"""
all_indexed = queue.PendingList().get_all_indexed()
all_youtube_ids = [i["youtube_id"] for i in all_indexed]
new_thumbs = []
for idx, playlist in enumerate(new_playlists):
url_type = playlist["type"]
playlist_id = playlist["url"]
if not url_type == "playlist":
print(f"{playlist_id} not a playlist, skipping...")
continue
playlist_h = YoutubePlaylist(playlist_id)
playlist_h.all_youtube_ids = all_youtube_ids
playlist_h.build_json()
playlist_h.json_data["playlist_subscribed"] = subscribed
playlist_h.upload_to_es()
playlist_h.add_vids_to_playlist()
self.channel_validate(playlist_h.json_data["playlist_channel_id"])
thumb = playlist_h.json_data["playlist_thumbnail"]
new_thumbs.append((playlist_id, thumb))
# notify
message = {
"status": "message:subplaylist",
"level": "info",
"title": "Subscribing to Playlists",
"message": f"Processing {idx + 1} of {len(new_playlists)}",
}
RedisArchivist().set_message(
"message:subplaylist", message=message
)
return new_thumbs
@staticmethod
def channel_validate(channel_id):
"""make sure channel of playlist is there"""
channel = YoutubeChannel(channel_id)
channel.build_json()
@staticmethod
def change_subscribe(playlist_id, subscribe_status):
"""change the subscribe status of a playlist"""
playlist = YoutubePlaylist(playlist_id)
playlist.build_json()
playlist.json_data["playlist_subscribed"] = subscribe_status
playlist.upload_to_es()
@staticmethod
def get_to_ignore():
"""get all youtube_ids already downloaded or ignored"""
pending_handler = queue.PendingList()
all_pending, all_ignore = pending_handler.get_all_pending()
all_ids = [i["youtube_id"] for i in all_ignore + all_pending]
all_downloaded = pending_handler.get_all_downloaded()
to_ignore = all_ids + all_downloaded
return to_ignore
def find_missing(self):
"""find videos in subscribed playlists not downloaded yet"""
all_playlists = [i["playlist_id"] for i in self.get_playlists()]
to_ignore = self.get_to_ignore()
missing_videos = []
for idx, playlist_id in enumerate(all_playlists):
size_limit = self.config["subscriptions"]["channel_size"]
playlist = YoutubePlaylist(playlist_id)
playlist.update_playlist()
if not playlist:
playlist.deactivate()
continue
playlist_entries = playlist.json_data["playlist_entries"]
if size_limit:
del playlist_entries[size_limit:]
all_missing = [i for i in playlist_entries if not i["downloaded"]]
message = {
"status": "message:rescan",
"level": "info",
"title": "Scanning playlists: Looking for new videos.",
"message": f"Progress: {idx + 1}/{len(all_playlists)}",
}
RedisArchivist().set_message("message:rescan", message=message)
for video in all_missing:
youtube_id = video["youtube_id"]
if youtube_id not in to_ignore:
missing_videos.append(youtube_id)
return missing_videos

View File

@ -7,10 +7,12 @@ import os
from collections import Counter
from time import sleep
import home.src.download as download
import requests
from home.src.config import AppConfig
from home.src.helper import RedisArchivist, ignore_filelist
from home.src.download import queue # partial import
from home.src.download import subscriptions # partial import
from home.src.ta.config import AppConfig
from home.src.ta.helper import ignore_filelist
from home.src.ta.ta_redis import RedisArchivist
from mutagen.mp4 import MP4, MP4Cover
from PIL import Image
@ -55,8 +57,8 @@ class ThumbManager:
def get_needed_thumbs(self, missing_only=False):
"""get a list of all missing thumbnails"""
all_thumbs = self.get_all_thumbs()
all_indexed = download.PendingList().get_all_indexed()
all_in_queue, all_ignored = download.PendingList().get_all_pending()
all_indexed = queue.PendingList().get_all_indexed()
all_in_queue, all_ignored = queue.PendingList().get_all_pending()
needed_thumbs = []
for video in all_indexed:
@ -84,9 +86,8 @@ class ThumbManager:
all_channel_art = os.listdir(self.CHANNEL_DIR)
files = [i[0:24] for i in all_channel_art]
cached_channel_ids = [k for (k, v) in Counter(files).items() if v > 1]
channels = download.ChannelSubscription().get_channels(
subscribed_only=False
)
channel_sub = subscriptions.ChannelSubscription()
channels = channel_sub.get_channels(subscribed_only=False)
missing_channels = []
for channel in channels:
@ -104,10 +105,8 @@ class ThumbManager:
"""get all missing playlist artwork"""
all_downloaded = ignore_filelist(os.listdir(self.PLAYLIST_DIR))
all_ids_downloaded = [i.replace(".jpg", "") for i in all_downloaded]
playlists = download.PlaylistSubscription().get_playlists(
subscribed_only=False
)
playlist_sub = subscriptions.PlaylistSubscription()
playlists = playlist_sub.get_playlists(subscribed_only=False)
missing_playlists = []
for playlist in playlists:
@ -276,7 +275,7 @@ class ThumbManager:
def get_thumb_list(self):
"""get list of mediafiles and matching thumbnails"""
all_indexed = download.PendingList().get_all_indexed()
all_indexed = queue.PendingList().get_all_indexed()
video_list = []
for video in all_indexed:
youtube_id = video["youtube_id"]

View File

@ -0,0 +1,295 @@
"""handle yt_dlp downloads"""
import os
import shutil
from datetime import datetime
from time import sleep
import requests
import yt_dlp
from home.src.download.queue import PendingList
from home.src.download.subscriptions import PlaylistSubscription
from home.src.es.connect import IndexPaginate
from home.src.index.channel import YoutubeChannel
from home.src.index.playlist import YoutubePlaylist
from home.src.index.video import YoutubeVideo, index_new_video
from home.src.ta.config import AppConfig
from home.src.ta.helper import clean_string, ignore_filelist
from home.src.ta.ta_redis import RedisArchivist, RedisQueue
class VideoDownloader:
"""
handle the video download functionality
if not initiated with list, take from queue
"""
def __init__(self, youtube_id_list=False):
self.youtube_id_list = youtube_id_list
self.config = AppConfig().config
self.channels = set()
def run_queue(self):
"""setup download queue in redis loop until no more items"""
queue = RedisQueue("dl_queue")
limit_queue = self.config["downloads"]["limit_count"]
if limit_queue:
queue.trim(limit_queue - 1)
while True:
youtube_id = queue.get_next()
if not youtube_id:
break
try:
self.dl_single_vid(youtube_id)
except yt_dlp.utils.DownloadError:
print("failed to download " + youtube_id)
continue
vid_dict = index_new_video(youtube_id)
self.channels.add(vid_dict["channel"]["channel_id"])
self.move_to_archive(vid_dict)