tubearchivist/tubearchivist/home/src/download.py

"""
Functionality:
- handele the download queue
- manage subscriptions to channels
- downloading videos
"""
import json
import os
2021-09-18 13:02:54 +00:00
import shutil
2021-09-05 17:10:14 +00:00
from datetime import datetime
from time import sleep
import requests
import yt_dlp as youtube_dl
from home.src.config import AppConfig
from home.src.helper import (
DurationConverter,
RedisArchivist,
RedisQueue,
clean_string,
ignore_filelist,
)
2021-09-18 13:02:54 +00:00
from home.src.index import YoutubeChannel, index_new_video
2021-09-05 17:10:14 +00:00


class PendingList:
    """manage the pending videos list"""

    CONFIG = AppConfig().config
    ES_URL = CONFIG["application"]["es_url"]
    VIDEOS = CONFIG["application"]["videos"]

    @staticmethod
    def parse_url_list(youtube_ids):
        """extract youtube ids from list"""
        missing_videos = []
        for entry in youtube_ids:
            # notify
            mess_dict = {
                "status": "pending",
                "level": "info",
                "title": "Adding to download queue.",
                "message": "Extracting lists",
            }
            RedisArchivist().set_message("progress:download", mess_dict)
            # extract
            url = entry["url"]
            url_type = entry["type"]
            if url_type == "video":
                missing_videos.append(url)
            elif url_type == "channel":
                youtube_ids = ChannelSubscription().get_last_youtube_videos(
                    url, limit=False
                )
                missing_videos = missing_videos + youtube_ids
            elif url_type == "playlist":
                youtube_ids = playlist_extractor(url)
                missing_videos = missing_videos + youtube_ids

        return missing_videos

    def add_to_pending(self, missing_videos):
        """build the bulk json data from pending"""
        # check if channel is indexed
        channel_handler = ChannelSubscription()
        all_indexed = channel_handler.get_channels(subscribed_only=False)
        all_channel_ids = [i["channel_id"] for i in all_indexed]
        # check if already there
        all_downloaded = self.get_all_downloaded()
        # loop
        bulk_list = []
        all_videos_added = []
        for video in missing_videos:
            if isinstance(video, str):
                youtube_id = video
            elif isinstance(video, tuple):
                youtube_id = video[0]
            if youtube_id in all_downloaded:
                # skip already downloaded
                continue
            video = self.get_youtube_details(youtube_id)
            # skip on download error
            if not video:
                continue
            if video["channel_id"] in all_channel_ids:
                video["channel_indexed"] = True
            else:
                video["channel_indexed"] = False
            thumb_url = video["vid_thumb_url"]
            video["status"] = "pending"
            action = {"create": {"_id": youtube_id, "_index": "ta_download"}}
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(video))
            all_videos_added.append((youtube_id, thumb_url))
            # notify
            mess_dict = {
                "status": "pending",
                "level": "info",
                "title": "Adding to download queue.",
                "message": "Processing IDs...",
            }
            RedisArchivist().set_message("progress:download", mess_dict)
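        # the payload sent below is Elasticsearch bulk (NDJSON) format: one
        # action line followed by one document line per video, e.g.
        # (illustrative):
        #   {"create": {"_id": "<youtube_id>", "_index": "ta_download"}}
        #   {"youtube_id": "<youtube_id>", "status": "pending", ...}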
        # add last newline
        bulk_list.append("\n")
        query_str = "\n".join(bulk_list)
        headers = {"Content-type": "application/x-ndjson"}
        url = self.ES_URL + "/_bulk"
        request = requests.post(url, data=query_str, headers=headers)
        if not request.ok:
            print(request)

        return all_videos_added

    @staticmethod
    def get_youtube_details(youtube_id):
        """get details from youtubedl for single pending video"""
        obs = {
            "default_search": "ytsearch",
            "quiet": True,
            "skip_download": True,
        }
        try:
            vid = youtube_dl.YoutubeDL(obs).extract_info(youtube_id)
        except youtube_dl.utils.DownloadError:
            print("failed to extract info for: " + youtube_id)
            return False
        # stop if video is streaming live now
        if vid["is_live"]:
            return False
        # parse response
        seconds = vid["duration"]
        duration_str = DurationConverter.get_str(seconds)
        if duration_str == "NA":
            print(f"skip extracting duration for: {youtube_id}")
        upload_date = vid["upload_date"]
        upload_dt = datetime.strptime(upload_date, "%Y%m%d")
        published = upload_dt.strftime("%Y-%m-%d")
        # build dict
        youtube_details = {
            "youtube_id": youtube_id,
            "channel_name": vid["channel"],
            "vid_thumb_url": vid["thumbnail"],
            "title": vid["title"],
            "channel_id": vid["channel_id"],
            "duration": duration_str,
            "published": published,
            "timestamp": int(datetime.now().strftime("%s")),
        }
        return youtube_details

    def get_all_pending(self):
        """get a list of all pending videos in ta_download"""
        headers = {"Content-type": "application/json"}
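        # pagination pattern: open a point in time (PIT), page through the
        # index with search_after until a page comes back empty, then delete
        # the PIT again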
        # get PIT ID
        url = self.ES_URL + "/ta_download/_pit?keep_alive=1m"
        response = requests.post(url)
        json_data = json.loads(response.text)
        pit_id = json_data["id"]
        # query
        data = {
            "size": 50,
            "query": {"match_all": {}},
            "pit": {"id": pit_id, "keep_alive": "1m"},
            "sort": [{"timestamp": {"order": "asc"}}],
        }
        query_str = json.dumps(data)
        url = self.ES_URL + "/_search"
        all_pending = []
        all_ignore = []
        while True:
            response = requests.get(url, data=query_str, headers=headers)
            json_data = json.loads(response.text)
            all_hits = json_data["hits"]["hits"]
            if all_hits:
                for hit in all_hits:
                    status = hit["_source"]["status"]
                    if status == "pending":
                        all_pending.append(hit["_source"])
                    elif status == "ignore":
                        all_ignore.append(hit["_source"])
                    search_after = hit["sort"]
                # update search_after with last hit data
                data["search_after"] = search_after
                query_str = json.dumps(data)
            else:
                break
        # clean up PIT
        query_str = json.dumps({"id": pit_id})
        requests.delete(self.ES_URL + "/_pit", data=query_str, headers=headers)
        return all_pending, all_ignore

    def get_all_indexed(self):
        """get a list of all videos indexed"""
        headers = {"Content-type": "application/json"}
        # get PIT ID
        url = self.ES_URL + "/ta_video/_pit?keep_alive=1m"
        response = requests.post(url)
        json_data = json.loads(response.text)
        pit_id = json_data["id"]
        # query
        data = {
            "size": 500,
            "query": {"match_all": {}},
            "pit": {"id": pit_id, "keep_alive": "1m"},
            "sort": [{"published": {"order": "desc"}}],
        }
        query_str = json.dumps(data)
        url = self.ES_URL + "/_search"
        all_indexed = []
        while True:
            response = requests.get(url, data=query_str, headers=headers)
            json_data = json.loads(response.text)
            all_hits = json_data["hits"]["hits"]
            if all_hits:
                for hit in all_hits:
                    all_indexed.append(hit)
                    search_after = hit["sort"]
                # update search_after with last hit data
                data["search_after"] = search_after
                query_str = json.dumps(data)
            else:
                break
        # clean up PIT
        query_str = json.dumps({"id": pit_id})
        requests.delete(self.ES_URL + "/_pit", data=query_str, headers=headers)
        return all_indexed

    def get_all_downloaded(self):
        """get a list of all videos in archive"""
        channel_folders = os.listdir(self.VIDEOS)
        all_channel_folders = ignore_filelist(channel_folders)
        all_downloaded = []
        for channel_folder in all_channel_folders:
            channel_path = os.path.join(self.VIDEOS, channel_folder)
            videos = os.listdir(channel_path)
            all_videos = ignore_filelist(videos)
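            # assuming the configured file_template prefixes filenames with
            # the upload date ("YYYYMMDD_"), the slice [9:20] picks out the
            # 11 character youtube_id that follows it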
            youtube_vids = [i[9:20] for i in all_videos]
            for youtube_id in youtube_vids:
                all_downloaded.append(youtube_id)
        return all_downloaded

    def delete_from_pending(self, youtube_id):
        """delete the youtube_id from ta_download"""
        url = f"{self.ES_URL}/ta_download/_doc/{youtube_id}"
        response = requests.delete(url)
        if not response.ok:
            print(response.text)

    def ignore_from_pending(self, ignore_list):
        """build the bulk query string"""
        stamp = int(datetime.now().strftime("%s"))
        bulk_list = []
        for youtube_id in ignore_list:
            action = {"update": {"_id": youtube_id, "_index": "ta_download"}}
            source = {"doc": {"status": "ignore", "timestamp": stamp}}
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(source))
        # add last newline
        bulk_list.append("\n")
        query_str = "\n".join(bulk_list)

        headers = {"Content-type": "application/x-ndjson"}
        url = self.ES_URL + "/_bulk"
        request = requests.post(url, data=query_str, headers=headers)
        mess_dict = {
            "status": "ignore",
            "level": "info",
            "title": "Added to ignore list",
            "message": "",
        }
        RedisArchivist().set_message("progress:download", mess_dict)
        if not request.ok:
            print(request)


class ChannelSubscription:
    """manage the list of channels subscribed"""

    def __init__(self):
        config = AppConfig().config
        self.es_url = config["application"]["es_url"]
        self.channel_size = config["subscriptions"]["channel_size"]

    def get_channels(self, subscribed_only=True):
        """get a list of all channels subscribed to"""
        headers = {"Content-type": "application/json"}
        # get PIT ID
        url = self.es_url + "/ta_channel/_pit?keep_alive=1m"
        response = requests.post(url)
        json_data = json.loads(response.text)
        pit_id = json_data["id"]
        # query
        if subscribed_only:
            data = {
                "query": {"term": {"channel_subscribed": {"value": True}}},
                "size": 50,
                "pit": {"id": pit_id, "keep_alive": "1m"},
                "sort": [{"channel_name.keyword": {"order": "asc"}}],
            }
        else:
            data = {
                "query": {"match_all": {}},
                "size": 50,
                "pit": {"id": pit_id, "keep_alive": "1m"},
                "sort": [{"channel_name.keyword": {"order": "asc"}}],
            }
        query_str = json.dumps(data)
        url = self.es_url + "/_search"
        all_channels = []
        while True:
            response = requests.get(url, data=query_str, headers=headers)
            json_data = json.loads(response.text)
            all_hits = json_data["hits"]["hits"]
            if all_hits:
                for hit in all_hits:
                    source = hit["_source"]
                    search_after = hit["sort"]
                    all_channels.append(source)
                # update search_after with last hit data
                data["search_after"] = search_after
                query_str = json.dumps(data)
            else:
                break
        # clean up PIT
        query_str = json.dumps({"id": pit_id})
        requests.delete(self.es_url + "/_pit", data=query_str, headers=headers)
        return all_channels

    def get_last_youtube_videos(self, channel_id, limit=True):
        """get a list of last videos from channel"""
        url = f"https://www.youtube.com/channel/{channel_id}/videos"
        obs = {
            "default_search": "ytsearch",
            "quiet": True,
            "skip_download": True,
            "extract_flat": True,
        }
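        # extract_flat only takes the basic per-entry metadata from the
        # channel page instead of resolving every single video, which keeps
        # the rescan cheap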
        if limit:
            obs["playlistend"] = self.channel_size
        chan = youtube_dl.YoutubeDL(obs).extract_info(url, download=False)
        last_videos = [(i["id"], i["title"]) for i in chan["entries"]]
        return last_videos

    def find_missing(self):
        """add missing videos from subscribed channels to pending"""
        all_channels = self.get_channels()
        pending_handler = PendingList()
        all_pending, all_ignore = pending_handler.get_all_pending()
        all_ids = [i["youtube_id"] for i in all_ignore + all_pending]
        all_downloaded = pending_handler.get_all_downloaded()
        to_ignore = all_ids + all_downloaded
        missing_videos = []
        counter = 1
        for channel in all_channels:
            channel_id = channel["channel_id"]
            last_videos = self.get_last_youtube_videos(channel_id)
            RedisArchivist().set_message(
                "progress:download",
                {
                    "status": "rescan",
                    "level": "info",
                    "title": "Rescanning: Looking for new videos.",
                    "message": f"Progress: {counter}/{len(all_channels)}",
                },
            )
            for video in last_videos:
                youtube_id = video[0]
                if youtube_id not in to_ignore:
                    missing_videos.append(youtube_id)
            counter = counter + 1

        return missing_videos

    def change_subscribe(self, channel_id, channel_subscribed):
        """subscribe or unsubscribe from channel and update"""
        if not isinstance(channel_subscribed, bool):
            print("invalid status, should be bool")
            return
        headers = {"Content-type": "application/json"}
        channel_handler = YoutubeChannel(channel_id)
        channel_dict = channel_handler.channel_dict
        channel_dict["channel_subscribed"] = channel_subscribed
        if channel_subscribed:
            # handle subscribe
            url = self.es_url + "/ta_channel/_doc/" + channel_id
            payload = json.dumps(channel_dict)
            print(channel_dict)
        else:
            url = self.es_url + "/ta_channel/_update/" + channel_id
            payload = json.dumps({"doc": channel_dict})
        # update channel
        request = requests.post(url, data=payload, headers=headers)
        if not request.ok:
            print(request.text)
        # sync to videos
        channel_handler.sync_to_videos()
        if channel_handler.source == "scraped":
            channel_handler.get_channel_art()


def playlist_extractor(playlist_id):
    """return youtube_ids from a playlist_id"""
    url = "https://www.youtube.com/playlist?list=" + playlist_id
    obs = {
        "default_search": "ytsearch",
        "quiet": True,
        "ignoreerrors": True,
        "skip_download": True,
        "extract_flat": True,
    }
    playlist = youtube_dl.YoutubeDL(obs).extract_info(url, download=False)
    playlist_vids = [(i["id"], i["title"]) for i in playlist["entries"]]
    return playlist_vids


class VideoDownloader:
    """
    handle the video download functionality
    if not initiated with list, take from queue
    """
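    # typical flow (illustrative): add_pending() fills the "dl_queue" Redis
    # queue from the ta_download index, then run_queue() works it off one
    # video at a time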

    def __init__(self, youtube_id_list=False):
        self.youtube_id_list = youtube_id_list
        self.config = AppConfig().config

    def run_queue(self):
        """process the download queue in redis, loop until no more items"""
        queue = RedisQueue("dl_queue")
        limit_queue = self.config["downloads"]["limit_count"]
        if limit_queue:
            queue.trim(limit_queue - 1)
        while True:
            youtube_id = queue.get_next()
            if not youtube_id:
                break
            try:
                self.dl_single_vid(youtube_id)
            except youtube_dl.utils.DownloadError:
                print("failed to download " + youtube_id)
                continue
            vid_dict = index_new_video(youtube_id)
            self.move_to_archive(vid_dict)
            self.delete_from_pending(youtube_id)

    @staticmethod
    def add_pending():
        """add pending videos to download queue"""
        all_pending, _ = PendingList().get_all_pending()
        to_add = [i["youtube_id"] for i in all_pending]
        if not to_add:
            # there is nothing pending
            print("download queue is empty")
            mess_dict = {
                "status": "downloading",
                "level": "error",
                "title": "Download queue is empty",
                "message": "",
            }
            RedisArchivist().set_message("progress:download", mess_dict)
            return

        queue = RedisQueue("dl_queue")
        queue.add_list(to_add)

    @staticmethod
    def progress_hook(response):
        """process the progress_hooks from youtube_dl"""
        # title
        filename = response["filename"][12:].replace("_", " ")
        title = "Downloading: " + os.path.split(filename)[-1]
        # message
        try:
            percent = response["_percent_str"]
            size = response["_total_bytes_str"]
            speed = response["_speed_str"]
            eta = response["_eta_str"]
            message = f"{percent} of {size} at {speed} - time left: {eta}"
        except KeyError:
            message = ""
        mess_dict = {
            "status": "downloading",
            "level": "info",
            "title": title,
            "message": message,
        }
        RedisArchivist().set_message("progress:download", mess_dict)

    def build_obs(self):
        """build obs dictionary for yt-dlp"""
        obs = {
            "default_search": "ytsearch",
            "merge_output_format": "mp4",
            "restrictfilenames": True,
            "outtmpl": (
                self.config["application"]["cache_dir"]
                + "/download/"
                + self.config["application"]["file_template"]
            ),
            "progress_hooks": [self.progress_hook],
            "noprogress": True,
            "quiet": True,
            "continuedl": True,
            "retries": 3,
            "writethumbnail": False,
        }
        if self.config["downloads"]["format"]:
            obs["format"] = self.config["downloads"]["format"]
        if self.config["downloads"]["limit_speed"]:
            obs["ratelimit"] = self.config["downloads"]["limit_speed"] * 1024
        external = False
        if external:
            obs["external_downloader"] = "aria2c"

        postprocessors = []
        if self.config["downloads"]["add_metadata"]:
            postprocessors.append(
                {
                    "key": "FFmpegMetadata",
                    "add_chapters": True,
                    "add_metadata": True,
                }
            )
        if self.config["downloads"]["add_thumbnail"]:
            postprocessors.append(
                {
                    "key": "EmbedThumbnail",
                    "already_have_thumbnail": True,
                }
            )
            obs["writethumbnail"] = True
        obs["postprocessors"] = postprocessors

        return obs

    def dl_single_vid(self, youtube_id):
        """download single video"""
        dl_cache = self.config["application"]["cache_dir"] + "/download/"
        obs = self.build_obs()
        # check if already in cache to continue from there
        all_cached = ignore_filelist(os.listdir(dl_cache))
        for file_name in all_cached:
            if youtube_id in file_name:
                obs["outtmpl"] = os.path.join(dl_cache, file_name)
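        # pointing outtmpl at the existing partial file lets yt-dlp resume
        # the download (continuedl is enabled in build_obs)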
        with youtube_dl.YoutubeDL(obs) as ydl:
            try:
                ydl.download([youtube_id])
            except youtube_dl.utils.DownloadError:
                print("retry failed download: " + youtube_id)
                sleep(10)
                ydl.download([youtube_id])

        if obs["writethumbnail"]:
            # webp files don't get cleaned up automatically
            all_cached = ignore_filelist(os.listdir(dl_cache))
            to_clean = [i for i in all_cached if not i.endswith(".mp4")]
            for file_name in to_clean:
                file_path = os.path.join(dl_cache, file_name)
                os.remove(file_path)

    def move_to_archive(self, vid_dict):
        """move downloaded video from cache to archive"""
        videos = self.config["application"]["videos"]
        host_uid = self.config["application"]["HOST_UID"]
        host_gid = self.config["application"]["HOST_GID"]
        channel_name = clean_string(vid_dict["channel"]["channel_name"])
        # make archive folder with correct permissions
        new_folder = os.path.join(videos, channel_name)
        if not os.path.exists(new_folder):
            os.makedirs(new_folder)
            if host_uid and host_gid:
                os.chown(new_folder, host_uid, host_gid)
        # find real filename
        cache_dir = self.config["application"]["cache_dir"]
        all_cached = ignore_filelist(os.listdir(cache_dir + "/download/"))
        for file_str in all_cached:
            if vid_dict["youtube_id"] in file_str:
                old_file = file_str
        old_file_path = os.path.join(cache_dir, "download", old_file)
        new_file_path = os.path.join(videos, vid_dict["media_url"])
        # move media file and fix permission
        shutil.move(old_file_path, new_file_path)
        if host_uid and host_gid:
            os.chown(new_file_path, host_uid, host_gid)

    def delete_from_pending(self, youtube_id):
        """delete downloaded video from pending index if it's there"""
        es_url = self.config["application"]["es_url"]
        url = f"{es_url}/ta_download/_doc/{youtube_id}"
        response = requests.delete(url)
        if not response.ok and not response.status_code == 404:
            print(response.text)