# tubearchivist/tubearchivist/home/src/download.py

"""
Functionality:
- handle the download queue
- manage subscriptions to channels
- download videos
"""
import json
import os
import shutil
from datetime import datetime
from time import sleep

import requests
import yt_dlp as youtube_dl

from home.src.config import AppConfig
from home.src.helper import DurationConverter, clean_string, set_message
from home.src.index import YoutubeChannel, index_new_video


class PendingList:
""" manage the pending videos list """
CONFIG = AppConfig().config
ES_URL = CONFIG['application']['es_url']
VIDEOS = CONFIG['application']['videos']

    @staticmethod
    def parse_url_list(youtube_ids):
        """ extract youtube ids from list """
        missing_videos = []
        for entry in youtube_ids:
            url = entry['url']
            url_type = entry['type']
            if url_type == 'video':
                missing_videos.append(url)
            elif url_type == 'channel':
                # expand the channel into all its videos
                channel_videos = ChannelSubscription().get_last_youtube_videos(
                    url, limit=False
                )
                missing_videos.extend(channel_videos)
        return missing_videos
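
    # Illustrative input for parse_url_list as posted by the frontend,
    # ids here are hypothetical placeholders:
    #   PendingList.parse_url_list([
    #       {"url": "some_video_id", "type": "video"},
    #       {"url": "some_channel_id", "type": "channel"},
    #   ])
    # channels get expanded to (youtube_id, title) tuples, videos stay
    # plain id strings; add_to_pending below accepts both.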
def add_to_pending(self, missing_videos):
""" build the bulk json data from pending """
# check if channel is indexed
channel_handler = ChannelSubscription()
all_indexed = channel_handler.get_channels(subscribed_only=False)
all_channel_ids = [i['channel_id'] for i in all_indexed]
# check if already there
all_downloaded = self.get_all_downloaded()
# loop
bulk_list = []
        for video in missing_videos:
            if isinstance(video, str):
                youtube_id = video
            elif isinstance(video, tuple):
                youtube_id = video[0]
            else:
                # skip unexpected entries to keep youtube_id bound
                continue
            if youtube_id in all_downloaded:
                # skip already downloaded
                continue
            video_details = self.get_youtube_details(youtube_id)
            # skip on download error
            if not video_details:
                continue
            video_details['channel_indexed'] = (
                video_details['channel_id'] in all_channel_ids
            )
            video_details['status'] = "pending"
            action = {"create": {"_id": youtube_id, "_index": "ta_download"}}
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(video_details))
# add last newline
bulk_list.append('\n')
query_str = '\n'.join(bulk_list)
headers = {'Content-type': 'application/x-ndjson'}
url = self.ES_URL + '/_bulk'
        response = requests.post(url, data=query_str, headers=headers)
# notify
mess_dict = {
"status": "pending",
"level": "info",
"title": "Adding to download queue.",
"message": 'Processing IDs...'
}
set_message('progress:download', mess_dict)
        if not response.ok:
            print(response.text)
@staticmethod
    def get_youtube_details(youtube_id):
        """ get details from yt_dlp for a single pending video """
obs = {
'default_search': 'ytsearch',
'quiet': True,
'skip_download': True,
}
try:
vid = youtube_dl.YoutubeDL(obs).extract_info(youtube_id)
except youtube_dl.utils.DownloadError:
print('failed to extract info for: ' + youtube_id)
return False
# parse response
seconds = vid['duration']
duration_str = DurationConverter.get_str(seconds)
upload_date = vid['upload_date']
upload_dt = datetime.strptime(upload_date, "%Y%m%d")
published = upload_dt.strftime("%Y-%m-%d")
# build dict
youtube_details = {
"youtube_id": youtube_id,
"channel_name": vid['channel'],
"vid_thumb_url": vid['thumbnail'],
"title": vid['title'],
"channel_id": vid['channel_id'],
"duration": duration_str,
"published": published,
"timestamp": int(datetime.now().strftime("%s"))
}
return youtube_details
def get_all_pending(self):
""" get a list of all pending videos in ta_download """
headers = {'Content-type': 'application/json'}
# get PIT ID
url = self.ES_URL + '/ta_download/_pit?keep_alive=1m'
response = requests.post(url)
json_data = json.loads(response.text)
pit_id = json_data['id']
# query
data = {
"size": 50, "query": {"match_all": {}},
"pit": {"id": pit_id, "keep_alive": "1m"},
"sort": [{"timestamp": {"order": "desc"}}]
}
query_str = json.dumps(data)
url = self.ES_URL + '/_search'
all_pending = []
all_ignore = []
while True:
response = requests.get(url, data=query_str, headers=headers)
json_data = json.loads(response.text)
all_hits = json_data['hits']['hits']
if all_hits:
for hit in all_hits:
youtube_id = hit['_source']['youtube_id']
status = hit['_source']['status']
if status == 'pending':
all_pending.append(hit['_source'])
elif status == 'ignore':
all_ignore.append(youtube_id)
search_after = hit['sort']
# update search_after with last hit data
data['search_after'] = search_after
query_str = json.dumps(data)
else:
break
# clean up PIT
query_str = json.dumps({"id": pit_id})
requests.delete(self.ES_URL + '/_pit', data=query_str, headers=headers)
return all_pending, all_ignore
def get_all_indexed(self):
""" get a list of all videos indexed """
headers = {'Content-type': 'application/json'}
# get PIT ID
url = self.ES_URL + '/ta_video/_pit?keep_alive=1m'
response = requests.post(url)
json_data = json.loads(response.text)
pit_id = json_data['id']
# query
data = {
"size": 500, "query": {"match_all": {}},
"pit": {"id": pit_id, "keep_alive": "1m"},
"sort": [{"published": {"order": "desc"}}]
}
query_str = json.dumps(data)
url = self.ES_URL + '/_search'
all_indexed = []
while True:
response = requests.get(url, data=query_str, headers=headers)
json_data = json.loads(response.text)
all_hits = json_data['hits']['hits']
if all_hits:
for hit in all_hits:
all_indexed.append(hit)
search_after = hit['sort']
# update search_after with last hit data
data['search_after'] = search_after
query_str = json.dumps(data)
else:
break
# clean up PIT
query_str = json.dumps({"id": pit_id})
requests.delete(self.ES_URL + '/_pit', data=query_str, headers=headers)
return all_indexed
def get_all_downloaded(self):
""" get a list of all videos in archive """
all_channel_folders = os.listdir(self.VIDEOS)
all_downloaded = []
for channel_folder in all_channel_folders:
channel_path = os.path.join(self.VIDEOS, channel_folder)
all_videos = os.listdir(channel_path)
            # filenames are expected to start with <YYYYMMDD>_<youtube_id>_,
            # so the eleven character id sits at slice 9:20
            youtube_vids = [i[9:20] for i in all_videos]
for youtube_id in youtube_vids:
all_downloaded.append(youtube_id)
return all_downloaded
def delete_from_pending(self, youtube_id):
""" delete the youtube_id from ta_download """
url = f'{self.ES_URL}/ta_download/_doc/{youtube_id}'
response = requests.delete(url)
if not response.ok:
print(response.text)
def ignore_from_pending(self, ignore_list):
""" build the bulk query string """
        stamp = int(datetime.now().timestamp())
bulk_list = []
for youtube_id in ignore_list:
action = {"update": {"_id": youtube_id, "_index": "ta_download"}}
source = {"doc": {"status": 'ignore', "timestamp": stamp}}
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(source))
# add last newline
bulk_list.append('\n')
query_str = '\n'.join(bulk_list)
headers = {'Content-type': 'application/x-ndjson'}
url = self.ES_URL + '/_bulk'
        response = requests.post(url, data=query_str, headers=headers)
mess_dict = {
"status": "ignore",
"level": "info",
"title": "Added to ignore list",
"message": ''
}
set_message('progress:download', mess_dict)
        if not response.ok:
            print(response.text)
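

# The point in time scroll above repeats in get_all_pending,
# get_all_indexed and ChannelSubscription.get_channels below. This is a
# minimal sketch of a shared helper they could delegate to, not wired in
# anywhere yet; the caller supplies size, query and sort in data.
def scroll_all_hits(es_url, index, data, keep_alive='1m'):
    """ sketch: page through all hits of an index via point in time """
    headers = {'Content-type': 'application/json'}
    response = requests.post(f'{es_url}/{index}/_pit?keep_alive={keep_alive}')
    pit_id = json.loads(response.text)['id']
    data['pit'] = {"id": pit_id, "keep_alive": keep_alive}
    all_hits = []
    while True:
        response = requests.get(
            f'{es_url}/_search', data=json.dumps(data), headers=headers
        )
        hits = json.loads(response.text)['hits']['hits']
        if not hits:
            break
        all_hits.extend(hits)
        # continue the next page after the sort values of the last hit
        data['search_after'] = hits[-1]['sort']
    # close the point in time when done
    requests.delete(
        es_url + '/_pit', data=json.dumps({"id": pit_id}), headers=headers
    )
    return all_hits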


class ChannelSubscription:
    """ manage the list of channels subscribed """

def __init__(self):
config = AppConfig().config
self.es_url = config['application']['es_url']
self.channel_size = config['subscriptions']['channel_size']
def get_channels(self, subscribed_only=True):
""" get a list of all channels subscribed to """
headers = {'Content-type': 'application/json'}
# get PIT ID
url = self.es_url + '/ta_channel/_pit?keep_alive=1m'
response = requests.post(url)
json_data = json.loads(response.text)
pit_id = json_data['id']
# query
        if subscribed_only:
            query = {"term": {"channel_subscribed": {"value": True}}}
        else:
            query = {"match_all": {}}
        data = {
            "query": query,
            "size": 50, "pit": {"id": pit_id, "keep_alive": "1m"},
            "sort": [{"channel_name.keyword": {"order": "asc"}}]
        }
query_str = json.dumps(data)
url = self.es_url + '/_search'
all_channels = []
while True:
response = requests.get(url, data=query_str, headers=headers)
json_data = json.loads(response.text)
all_hits = json_data['hits']['hits']
if all_hits:
for hit in all_hits:
source = hit['_source']
search_after = hit['sort']
all_channels.append(source)
# update search_after with last hit data
data['search_after'] = search_after
query_str = json.dumps(data)
else:
break
# clean up PIT
query_str = json.dumps({"id": pit_id})
requests.delete(self.es_url + '/_pit', data=query_str, headers=headers)
return all_channels
def get_last_youtube_videos(self, channel_id, limit=True):
""" get a list of last videos from channel """
url = f'https://www.youtube.com/channel/{channel_id}/videos'
obs = {
'default_search': 'ytsearch', 'quiet': True,
'skip_download': True, 'extract_flat': True
}
if limit:
obs['playlistend'] = self.channel_size
        chan = youtube_dl.YoutubeDL(obs).extract_info(url, download=False)
        # extract_flat entries only carry basic fields like id and title
        last_videos = [(i['id'], i['title']) for i in chan['entries']]
        return last_videos
def find_missing(self):
""" add missing videos from subscribed channels to pending """
all_channels = self.get_channels()
pending_handler = PendingList()
all_pending, all_ignore = pending_handler.get_all_pending()
all_pending_ids = [i['youtube_id'] for i in all_pending]
all_downloaded = pending_handler.get_all_downloaded()
        # use a set for fast membership checks
        to_ignore = set(all_pending_ids + all_ignore + all_downloaded)
missing_videos = []
counter = 1
for channel in all_channels:
channel_id = channel['channel_id']
last_videos = self.get_last_youtube_videos(channel_id)
            mess_dict = {
                "status": "rescan",
                "level": "info",
                "title": "Rescanning: Looking for new videos.",
                "message": f'Progress: {counter}/{len(all_channels)}'
            }
            set_message('progress:download', mess_dict)
for video in last_videos:
youtube_id = video[0]
if youtube_id not in to_ignore:
missing_videos.append(youtube_id)
counter = counter + 1
return missing_videos
def change_subscribe(self, channel_id, channel_subscribed):
""" subscribe or unsubscribe from channel and update """
if not isinstance(channel_subscribed, bool):
print('invalid status, should be bool')
return
headers = {'Content-type': 'application/json'}
channel_handler = YoutubeChannel(channel_id)
channel_dict = channel_handler.channel_dict
channel_dict['channel_subscribed'] = channel_subscribed
        if channel_subscribed:
            # handle subscribe: index the full channel dict
            url = self.es_url + '/ta_channel/_doc/' + channel_id
            payload = json.dumps(channel_dict)
else:
url = self.es_url + '/ta_channel/_update/' + channel_id
payload = json.dumps({'doc': channel_dict})
# update channel
        response = requests.post(url, data=payload, headers=headers)
        if not response.ok:
            print(response.text)
# sync to videos
channel_handler.sync_to_videos()
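

# Sketch of the rescan flow the two classes above implement together,
# assuming a fully configured environment:
#   channels = ChannelSubscription()
#   missing = channels.find_missing()
#   PendingList().add_to_pending(missing)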


class VideoDownloader:
    """ handle the video download functionality """

    def __init__(self, youtube_id_list):
self.youtube_id_list = youtube_id_list
self.config = AppConfig().config
def download_list(self):
""" download the list of youtube_ids """
for youtube_id in self.youtube_id_list:
try:
self.dl_single_vid(youtube_id)
except youtube_dl.utils.DownloadError:
print('failed to download ' + youtube_id)
continue
vid_dict = index_new_video(youtube_id)
self.move_to_archive(vid_dict)
self.delete_from_pending(youtube_id)
if self.config['downloads']['sleep_interval']:
sleep(self.config['downloads']['sleep_interval'])
@staticmethod
def progress_hook(response):
""" process the progress_hooks from youtube_dl """
        # title: strip the leading cache path from the filename
        filename = response['filename'][12:].replace('_', ' ')
        title = "Downloading: " + os.path.split(filename)[-1]
# message
try:
percent = response['_percent_str']
size = response['_total_bytes_str']
speed = response['_speed_str']
eta = response['_eta_str']
message = f'{percent} of {size} at {speed} - time left: {eta}'
except KeyError:
message = ''
mess_dict = {
"status": "downloading",
"level": "info",
"title": title,
"message": message
}
set_message('progress:download', mess_dict)
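
    # Illustrative progress_hooks payload from yt_dlp, values are
    # placeholders; only the keys used above are shown:
    #   {'status': 'downloading', 'filename': '<cache path>',
    #    '_percent_str': ' 42.0%', '_total_bytes_str': '100.00MiB',
    #    '_speed_str': '5.00MiB/s', '_eta_str': '00:42'}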
def dl_single_vid(self, youtube_id):
""" download single video """
obs = {
'default_search': 'ytsearch',
'merge_output_format': 'mp4', 'restrictfilenames': True,
'outtmpl': (self.config['application']['cache_dir'] +
'/download/' +
self.config['application']['file_template']),
'progress_hooks': [self.progress_hook],
'quiet': True, 'continuedl': True, 'retries': 3
}
if self.config['downloads']['format']:
obs['format'] = self.config['downloads']['format']
if self.config['downloads']['limit_speed']:
obs['ratelimit'] = self.config['downloads']['limit_speed'] * 1024
        # stub: optional external downloader, not yet configurable
        external = False
        if external:
            obs['external_downloader'] = 'aria2c'
# check if already in cache to continue from there
cache_dir = self.config['application']['cache_dir']
all_cached = os.listdir(cache_dir + '/download/')
for file_name in all_cached:
if youtube_id in file_name:
obs['outtmpl'] = cache_dir + '/download/' + file_name
with youtube_dl.YoutubeDL(obs) as ydl:
try:
ydl.download([youtube_id])
except youtube_dl.utils.DownloadError:
print('retry failed download: ' + youtube_id)
sleep(10)
ydl.download([youtube_id])
def move_to_archive(self, vid_dict):
""" move downloaded video from cache to archive """
videos = self.config['application']['videos']
channel_name = vid_dict['channel']['channel_name']
channel_name_clean = clean_string(channel_name)
media_url = vid_dict['media_url']
youtube_id = vid_dict['youtube_id']
# make archive folder
new_folder = os.path.join(videos, channel_name_clean)
os.makedirs(new_folder, exist_ok=True)
        # find real filename in cache
        cache_dir = self.config['application']['cache_dir']
        old_file = None
        for file_str in os.listdir(cache_dir + '/download'):
            if youtube_id in file_str:
                old_file = file_str
        if not old_file:
            raise FileNotFoundError(f'no cache file for {youtube_id}')
        old_file_path = os.path.join(cache_dir, 'download', old_file)
new_file_path = os.path.join(videos, media_url)
# move and fix permission
shutil.move(old_file_path, new_file_path)
os.chown(
new_file_path,
self.config['application']['HOST_UID'],
self.config['application']['HOST_GID']
)

    def delete_from_pending(self, youtube_id):
        """ delete downloaded video from pending index if it's there """
        es_url = self.config['application']['es_url']
        url = f'{es_url}/ta_download/_doc/{youtube_id}'
        response = requests.delete(url)
        if not response.ok and response.status_code != 404:
            print(response.text)
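

# End to end sketch, assuming Elasticsearch and the message backend are
# reachable and configured; the url value is a hypothetical video id:
#   pending = PendingList()
#   missing = pending.parse_url_list(
#       [{"url": "some_video_id", "type": "video"}]
#   )
#   pending.add_to_pending(missing)
#   queued, _ = pending.get_all_pending()
#   VideoDownloader([i['youtube_id'] for i in queued]).download_list()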