major refactor, #build

Changes:
- merges the new restructured and split-up modules (import-path changes are sketched below)
- merges the refactored channel, video and playlist index classes
- merges code cleanup and readability improvements
simon 2022-01-27 16:08:16 +07:00
commit 8591c44ef2
45 changed files with 3080 additions and 2981 deletions
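Most of the diff below is the move from the flat home/src/ package to topical sub-packages (home/src/download, home/src/es, home/src/index, home/src/ta). As an orientation aid only, and not part of the commit itself, a minimal Python sketch of how import lines change, using the paths from the API views diff below:

# old layout (before this commit):
# from home.src.config import AppConfig
# from home.src.helper import UrlListParser
# from home.src.thumbnails import ThumbManager

# new layout (after this commit):
from home.src.download.thumbnails import ThumbManager
from home.src.ta.config import AppConfig
from home.src.ta.helper import UrlListParser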

View File

@@ -9,7 +9,6 @@ jobs:
 - run: pip install --upgrade pip wheel
 - run: pip install bandit black codespell flake8 flake8-bugbear
   flake8-comprehensions isort
-- run: bandit --recursive --skip B105,B108,B404,B603,B607 .
 - run: black --check --diff --line-length 79 .
 - run: codespell
 - run: flake8 . --count --max-complexity=12 --max-line-length=79

.gitignore vendored
View File

@@ -3,3 +3,6 @@ __pycache__
 # django testing db
 db.sqlite3
+# vscode custom conf
+.vscode

View File

@ -1,5 +0,0 @@
{
"python.linting.pylintEnabled": true,
"python.linting.pycodestyleEnabled": false,
"python.linting.enabled": true
}

View File

@@ -1,6 +1,6 @@
 # build the tube archivist image from default python slim image
-FROM python:3.10.1-slim-bullseye
+FROM python:3.10.2-slim-bullseye
 ARG TARGETPLATFORM
 ENV PYTHONUNBUFFERED 1
@@ -35,12 +35,12 @@ COPY ./tubearchivist/requirements.txt /requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt --src /usr/local/src
 # copy config files
-COPY nginx.conf /etc/nginx/conf.d/
+COPY docker_assets/nginx.conf /etc/nginx/conf.d/
 # copy application into container
 COPY ./tubearchivist /app
-COPY ./run.sh /app
-COPY ./uwsgi.ini /app
+COPY ./docker_assets/run.sh /app
+COPY ./docker_assets/uwsgi.ini /app
 # volumes
 VOLUME /cache

View File

@@ -85,9 +85,7 @@ function validate {
 fi
 echo "run validate on $check_path"
-echo "running bandit"
-bandit --recursive --skip B105,B108,B404,B603,B607 "$check_path"
 echo "running black"
 black --diff --color --check -l 79 "$check_path"
 echo "running codespell"

View File

@@ -1,11 +1,6 @@
 #!/bin/bash
 # startup script inside the container for tubearchivist
-# check environment
-if [[ -z "$DJANGO_DEBUG" ]]; then
-export DJANGO_DEBUG=False
-fi
 if [[ -z "$ELASTIC_USER" ]]; then
 export ELASTIC_USER=elastic
 fi

View File

@@ -1,9 +1,9 @@
 """all API views"""
 import requests
-from home.src.config import AppConfig
-from home.src.helper import UrlListParser
-from home.src.thumbnails import ThumbManager
+from home.src.download.thumbnails import ThumbManager
+from home.src.ta.config import AppConfig
+from home.src.ta.helper import UrlListParser
 from home.tasks import extrac_dl, subscribe_to
 from rest_framework.authentication import (
 SessionAuthentication,

View File

@@ -15,7 +15,7 @@ from os import environ, path
 from pathlib import Path
 from corsheaders.defaults import default_headers
-from home.src.config import AppConfig
+from home.src.ta.config import AppConfig
 # Build paths inside the project like this: BASE_DIR / 'subdir'.
 BASE_DIR = Path(__file__).resolve().parent.parent

View File

@@ -3,9 +3,9 @@
 import os
 from django.apps import AppConfig
-from home.src.config import AppConfig as ArchivistConfig
-from home.src.helper import RedisArchivist
-from home.src.index_management import index_check
+from home.src.es.index_setup import index_check
+from home.src.ta.config import AppConfig as ArchivistConfig
+from home.src.ta.ta_redis import RedisArchivist
 def sync_redis_state():

View File

@ -1,802 +0,0 @@
"""
Functionality:
- handle the download queue
- manage subscriptions to channels
- manage subscriptions to playlists
- downloading videos
"""
import json
import os
import shutil
from datetime import datetime
from time import sleep
import requests
import yt_dlp
from home.src.config import AppConfig
from home.src.helper import (
DurationConverter,
RedisArchivist,
RedisQueue,
clean_string,
ignore_filelist,
)
from home.src.index import (
IndexPaginate,
YoutubeChannel,
YoutubePlaylist,
YoutubeVideo,
index_new_video,
)
class PendingList:
"""manage the pending videos list"""
CONFIG = AppConfig().config
ES_URL = CONFIG["application"]["es_url"]
ES_AUTH = CONFIG["application"]["es_auth"]
VIDEOS = CONFIG["application"]["videos"]
def __init__(self):
self.all_channel_ids = False
self.all_downloaded = False
self.missing_from_playlists = []
def parse_url_list(self, youtube_ids):
"""extract youtube ids from list"""
missing_videos = []
for entry in youtube_ids:
# notify
mess_dict = {
"status": "message:add",
"level": "info",
"title": "Adding to download queue.",
"message": "Extracting lists",
}
RedisArchivist().set_message("message:add", mess_dict)
# extract
url = entry["url"]
url_type = entry["type"]
if url_type == "video":
missing_videos.append(url)
elif url_type == "channel":
video_results = ChannelSubscription().get_last_youtube_videos(
url, limit=False
)
youtube_ids = [i[0] for i in video_results]
missing_videos = missing_videos + youtube_ids
elif url_type == "playlist":
self.missing_from_playlists.append(entry)
video_results = YoutubePlaylist(url).get_entries()
youtube_ids = [i["youtube_id"] for i in video_results]
missing_videos = missing_videos + youtube_ids
return missing_videos
def add_to_pending(self, missing_videos, ignore=False):
"""build the bulk json data from pending"""
# check if channel is indexed
channel_handler = ChannelSubscription()
all_indexed = channel_handler.get_channels(subscribed_only=False)
self.all_channel_ids = [i["channel_id"] for i in all_indexed]
# check if already there
self.all_downloaded = self.get_all_downloaded()
bulk_list, all_videos_added = self.build_bulk(missing_videos, ignore)
# add last newline
bulk_list.append("\n")
query_str = "\n".join(bulk_list)
headers = {"Content-type": "application/x-ndjson"}
url = self.ES_URL + "/_bulk"
request = requests.post(
url, data=query_str, headers=headers, auth=self.ES_AUTH
)
if not request.ok:
print(request)
raise ValueError("failed to add video to download queue")
return all_videos_added
def build_bulk(self, missing_videos, ignore=False):
"""build the bulk lists"""
bulk_list = []
all_videos_added = []
for idx, youtube_id in enumerate(missing_videos):
# check if already downloaded
if youtube_id in self.all_downloaded:
continue
video = self.get_youtube_details(youtube_id)
# skip on download error
if not video:
continue
channel_indexed = video["channel_id"] in self.all_channel_ids
video["channel_indexed"] = channel_indexed
if ignore:
video["status"] = "ignore"
else:
video["status"] = "pending"
action = {"create": {"_id": youtube_id, "_index": "ta_download"}}
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(video))
all_videos_added.append((youtube_id, video["vid_thumb_url"]))
# notify
progress = f"{idx + 1}/{len(missing_videos)}"
mess_dict = {
"status": "message:add",
"level": "info",
"title": "Adding new videos to download queue.",
"message": "Progress: " + progress,
}
if idx + 1 == len(missing_videos):
RedisArchivist().set_message(
"message:add", mess_dict, expire=4
)
else:
RedisArchivist().set_message("message:add", mess_dict)
if idx + 1 % 25 == 0:
print("adding to queue progress: " + progress)
return bulk_list, all_videos_added
@staticmethod
def get_youtube_details(youtube_id):
"""get details from youtubedl for single pending video"""
obs = {
"default_search": "ytsearch",
"quiet": True,
"check_formats": "selected",
"noplaylist": True,
"writethumbnail": True,
"simulate": True,
}
try:
vid = yt_dlp.YoutubeDL(obs).extract_info(youtube_id)
except yt_dlp.utils.DownloadError:
print("failed to extract info for: " + youtube_id)
return False
# stop if video is streaming live now
if vid["is_live"]:
return False
# parse response
seconds = vid["duration"]
duration_str = DurationConverter.get_str(seconds)
if duration_str == "NA":
print(f"skip extracting duration for: {youtube_id}")
upload_date = vid["upload_date"]
upload_dt = datetime.strptime(upload_date, "%Y%m%d")
published = upload_dt.strftime("%Y-%m-%d")
# build dict
youtube_details = {
"youtube_id": youtube_id,
"channel_name": vid["channel"],
"vid_thumb_url": vid["thumbnail"],
"title": vid["title"],
"channel_id": vid["channel_id"],
"duration": duration_str,
"published": published,
"timestamp": int(datetime.now().strftime("%s")),
}
return youtube_details
@staticmethod
def get_all_pending():
"""get a list of all pending videos in ta_download"""
data = {
"query": {"match_all": {}},
"sort": [{"timestamp": {"order": "asc"}}],
}
all_results = IndexPaginate("ta_download", data).get_results()
all_pending = []
all_ignore = []
for result in all_results:
if result["status"] == "pending":
all_pending.append(result)
elif result["status"] == "ignore":
all_ignore.append(result)
return all_pending, all_ignore
@staticmethod
def get_all_indexed():
"""get a list of all videos indexed"""
data = {
"query": {"match_all": {}},
"sort": [{"published": {"order": "desc"}}],
}
all_indexed = IndexPaginate("ta_video", data).get_results()
return all_indexed
def get_all_downloaded(self):
"""get a list of all videos in archive"""
channel_folders = os.listdir(self.VIDEOS)
all_channel_folders = ignore_filelist(channel_folders)
all_downloaded = []
for channel_folder in all_channel_folders:
channel_path = os.path.join(self.VIDEOS, channel_folder)
videos = os.listdir(channel_path)
all_videos = ignore_filelist(videos)
youtube_vids = [i[9:20] for i in all_videos]
for youtube_id in youtube_vids:
all_downloaded.append(youtube_id)
return all_downloaded
def delete_from_pending(self, youtube_id):
"""delete the youtube_id from ta_download"""
url = f"{self.ES_URL}/ta_download/_doc/{youtube_id}"
response = requests.delete(url, auth=self.ES_AUTH)
if not response.ok:
print(response.text)
def delete_pending(self, status):
"""delete download queue based on status value"""
data = {"query": {"term": {"status": {"value": status}}}}
payload = json.dumps(data)
url = self.ES_URL + "/ta_download/_delete_by_query"
headers = {"Content-type": "application/json"}
response = requests.post(
url, data=payload, headers=headers, auth=self.ES_AUTH
)
if not response.ok:
print(response.text)
def ignore_from_pending(self, ignore_list):
"""build the bulk query string"""
stamp = int(datetime.now().strftime("%s"))
bulk_list = []
for youtube_id in ignore_list:
action = {"update": {"_id": youtube_id, "_index": "ta_download"}}
source = {"doc": {"status": "ignore", "timestamp": stamp}}
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(source))
# add last newline
bulk_list.append("\n")
query_str = "\n".join(bulk_list)
headers = {"Content-type": "application/x-ndjson"}
url = self.ES_URL + "/_bulk"
request = requests.post(
url, data=query_str, headers=headers, auth=self.ES_AUTH
)
if not request.ok:
print(request)
raise ValueError("failed to set video to ignore")
class ChannelSubscription:
"""manage the list of channels subscribed"""
def __init__(self):
config = AppConfig().config
self.es_url = config["application"]["es_url"]
self.es_auth = config["application"]["es_auth"]
self.channel_size = config["subscriptions"]["channel_size"]
@staticmethod
def get_channels(subscribed_only=True):
"""get a list of all channels subscribed to"""
data = {
"sort": [{"channel_name.keyword": {"order": "asc"}}],
}
if subscribed_only:
data["query"] = {"term": {"channel_subscribed": {"value": True}}}
else:
data["query"] = {"match_all": {}}
all_channels = IndexPaginate("ta_channel", data).get_results()
return all_channels
def get_last_youtube_videos(self, channel_id, limit=True):
"""get a list of last videos from channel"""
url = f"https://www.youtube.com/channel/{channel_id}/videos"
obs = {
"default_search": "ytsearch",
"quiet": True,
"skip_download": True,
"extract_flat": True,
}
if limit:
obs["playlistend"] = self.channel_size
chan = yt_dlp.YoutubeDL(obs).extract_info(url, download=False)
last_videos = [(i["id"], i["title"]) for i in chan["entries"]]
return last_videos
def find_missing(self):
"""add missing videos from subscribed channels to pending"""
all_channels = self.get_channels()
pending_handler = PendingList()
all_pending, all_ignore = pending_handler.get_all_pending()
all_ids = [i["youtube_id"] for i in all_ignore + all_pending]
all_downloaded = pending_handler.get_all_downloaded()
to_ignore = all_ids + all_downloaded
missing_videos = []
for idx, channel in enumerate(all_channels):
channel_id = channel["channel_id"]
last_videos = self.get_last_youtube_videos(channel_id)
for video in last_videos:
if video[0] not in to_ignore:
missing_videos.append(video[0])
# notify
message = {
"status": "message:rescan",
"level": "info",
"title": "Scanning channels: Looking for new videos.",
"message": f"Progress: {idx + 1}/{len(all_channels)}",
}
if idx + 1 == len(all_channels):
RedisArchivist().set_message(
"message:rescan", message=message, expire=4
)
else:
RedisArchivist().set_message("message:rescan", message=message)
return missing_videos
def change_subscribe(self, channel_id, channel_subscribed):
"""subscribe or unsubscribe from channel and update"""
if not isinstance(channel_subscribed, bool):
print("invalid status, should be bool")
return
headers = {"Content-type": "application/json"}
channel_handler = YoutubeChannel(channel_id)
channel_dict = channel_handler.channel_dict
channel_dict["channel_subscribed"] = channel_subscribed
if channel_subscribed:
# handle subscribe
url = self.es_url + "/ta_channel/_doc/" + channel_id
payload = json.dumps(channel_dict)
print(channel_dict)
else:
url = self.es_url + "/ta_channel/_update/" + channel_id
payload = json.dumps({"doc": channel_dict})
# update channel
request = requests.post(
url, data=payload, headers=headers, auth=self.es_auth
)
if not request.ok:
print(request.text)
raise ValueError("failed change subscribe status")
# sync to videos
channel_handler.sync_to_videos()
if channel_handler.source == "scraped":
channel_handler.get_channel_art()
class PlaylistSubscription:
"""manage the playlist download functionality"""
def __init__(self):
self.config = AppConfig().config
@staticmethod
def get_playlists(subscribed_only=True):
"""get a list of all active playlists"""
data = {
"sort": [{"playlist_channel.keyword": {"order": "desc"}}],
}
data["query"] = {
"bool": {"must": [{"term": {"playlist_active": {"value": True}}}]}
}
if subscribed_only:
data["query"]["bool"]["must"].append(
{"term": {"playlist_subscribed": {"value": True}}}
)
all_playlists = IndexPaginate("ta_playlist", data).get_results()
return all_playlists
def process_url_str(self, new_playlists, subscribed=True):
"""process playlist subscribe form url_str"""
all_indexed = PendingList().get_all_indexed()
all_youtube_ids = [i["youtube_id"] for i in all_indexed]
new_thumbs = []
for idx, playlist in enumerate(new_playlists):
url_type = playlist["type"]
playlist_id = playlist["url"]
if not url_type == "playlist":
print(f"{playlist_id} not a playlist, skipping...")
continue
playlist_h = YoutubePlaylist(
playlist_id, all_youtube_ids=all_youtube_ids
)
if not playlist_h.get_es_playlist():
playlist_h.get_playlist_dict()
playlist_h.playlist_dict["playlist_subscribed"] = subscribed
playlist_h.upload_to_es()
playlist_h.add_vids_to_playlist()
thumb = playlist_h.playlist_dict["playlist_thumbnail"]
new_thumbs.append((playlist_id, thumb))
self.channel_validate(playlist_h)
else:
self.change_subscribe(playlist_id, subscribe_status=True)
# notify
message = {
"status": "message:subplaylist",
"level": "info",
"title": "Subscribing to Playlists",
"message": f"Processing {idx + 1} of {len(new_playlists)}",
}
RedisArchivist().set_message(
"message:subplaylist", message=message
)
return new_thumbs
@staticmethod
def channel_validate(playlist_handler):
"""make sure channel of playlist is there"""
channel_id = playlist_handler.playlist_dict["playlist_channel_id"]
channel_handler = YoutubeChannel(channel_id)
if channel_handler.source == "scraped":
channel_handler.channel_dict["channel_subscribed"] = False
channel_handler.upload_to_es()
channel_handler.get_channel_art()
def change_subscribe(self, playlist_id, subscribe_status):
"""change the subscribe status of a playlist"""
es_url = self.config["application"]["es_url"]
es_auth = self.config["application"]["es_auth"]
playlist_handler = YoutubePlaylist(playlist_id)
playlist_handler.get_playlist_dict()
subed_now = playlist_handler.playlist_dict["playlist_subscribed"]
if subed_now == subscribe_status:
# status already as expected, do nothing
return False
# update subscribed status
headers = {"Content-type": "application/json"}
url = f"{es_url}/ta_playlist/_update/{playlist_id}"
payload = json.dumps(
{"doc": {"playlist_subscribed": subscribe_status}}
)
response = requests.post(
url, data=payload, headers=headers, auth=es_auth
)
if not response.ok:
print(response.text)
raise ValueError("failed to change subscribe status")
return True
@staticmethod
def get_to_ignore():
"""get all youtube_ids already downloaded or ignored"""
pending_handler = PendingList()
all_pending, all_ignore = pending_handler.get_all_pending()
all_ids = [i["youtube_id"] for i in all_ignore + all_pending]
all_downloaded = pending_handler.get_all_downloaded()
to_ignore = all_ids + all_downloaded
return to_ignore
def find_missing(self):
"""find videos in subscribed playlists not downloaded yet"""
all_playlists = [i["playlist_id"] for i in self.get_playlists()]
to_ignore = self.get_to_ignore()
missing_videos = []
counter = 1
for playlist_id in all_playlists:
size_limit = self.config["subscriptions"]["channel_size"]
playlist_handler = YoutubePlaylist(playlist_id)
playlist = playlist_handler.update_playlist()
if not playlist:
playlist_handler.deactivate()
continue
if size_limit:
playlist_entries = playlist["playlist_entries"][:size_limit]
else:
playlist_entries = playlist["playlist_entries"]
all_missing = [i for i in playlist_entries if not i["downloaded"]]
message = {
"status": "message:rescan",
"level": "info",
"title": "Scanning playlists: Looking for new videos.",
"message": f"Progress: {counter}/{len(all_playlists)}",
}
RedisArchivist().set_message("message:rescan", message=message)
for video in all_missing:
youtube_id = video["youtube_id"]
if youtube_id not in to_ignore:
missing_videos.append(youtube_id)
counter = counter + 1
return missing_videos
class VideoDownloader:
"""
handle the video download functionality
if not initiated with list, take from queue
"""
def __init__(self, youtube_id_list=False):
self.youtube_id_list = youtube_id_list
self.config = AppConfig().config
self.channels = set()
def run_queue(self):
"""setup download queue in redis loop until no more items"""
queue = RedisQueue("dl_queue")
limit_queue = self.config["downloads"]["limit_count"]
if limit_queue:
queue.trim(limit_queue - 1)
while True:
youtube_id = queue.get_next()
if not youtube_id:
break
try:
self.dl_single_vid(youtube_id)
except yt_dlp.utils.DownloadError:
print("failed to download " + youtube_id)
continue
vid_dict = index_new_video(youtube_id)
self.channels.add(vid_dict["channel"]["channel_id"])
self.move_to_archive(vid_dict)
self.delete_from_pending(youtube_id)
autodelete_days = self.config["downloads"]["autodelete_days"]
if autodelete_days:
print(f"auto delete older than {autodelete_days} days")
self.auto_delete_watched(autodelete_days)
@staticmethod
def add_pending():
"""add pending videos to download queue"""
mess_dict = {
"status": "message:download",
"level": "info",
"title": "Looking for videos to download",
"message": "Scanning your download queue.",
}
RedisArchivist().set_message("message:download", mess_dict)
all_pending, _ = PendingList().get_all_pending()
to_add = [i["youtube_id"] for i in all_pending]
if not to_add:
# there is nothing pending
print("download queue is empty")
mess_dict = {
"status": "message:download",
"level": "error",
"title": "Download queue is empty",
"message": "Add some videos to the queue first.",
}
RedisArchivist().set_message("message:download", mess_dict)
return
queue = RedisQueue("dl_queue")
queue.add_list(to_add)
@staticmethod
def progress_hook(response):
"""process the progress_hooks from yt_dlp"""
# title
path = os.path.split(response["filename"])[-1][12:]
filename = os.path.splitext(os.path.splitext(path)[0])[0]
filename_clean = filename.replace("_", " ")
title = "Downloading: " + filename_clean
# message
try:
percent = response["_percent_str"]
size = response["_total_bytes_str"]
speed = response["_speed_str"]
eta = response["_eta_str"]
message = f"{percent} of {size} at {speed} - time left: {eta}"
except KeyError:
message = "processing"
mess_dict = {
"status": "message:download",
"level": "info",
"title": title,
"message": message,
}
RedisArchivist().set_message("message:download", mess_dict)
def build_obs(self):
"""build obs dictionary for yt-dlp"""
obs = {
"default_search": "ytsearch",
"merge_output_format": "mp4",
"restrictfilenames": True,
"outtmpl": (
self.config["application"]["cache_dir"]
+ "/download/"
+ self.config["application"]["file_template"]
),
"progress_hooks": [self.progress_hook],
"noprogress": True,
"quiet": True,
"continuedl": True,
"retries": 3,
"writethumbnail": False,
"noplaylist": True,
"check_formats": "selected",
}
if self.config["downloads"]["format"]:
obs["format"] = self.config["downloads"]["format"]
if self.config["downloads"]["limit_speed"]:
obs["ratelimit"] = self.config["downloads"]["limit_speed"] * 1024
throttle = self.config["downloads"]["throttledratelimit"]
if throttle:
obs["throttledratelimit"] = throttle * 1024
postprocessors = []
if self.config["downloads"]["add_metadata"]:
postprocessors.append(
{
"key": "FFmpegMetadata",
"add_chapters": True,
"add_metadata": True,
}
)
if self.config["downloads"]["add_thumbnail"]:
postprocessors.append(
{
"key": "EmbedThumbnail",
"already_have_thumbnail": True,
}
)
obs["writethumbnail"] = True
obs["postprocessors"] = postprocessors
return obs
def dl_single_vid(self, youtube_id):
"""download single video"""
dl_cache = self.config["application"]["cache_dir"] + "/download/"
obs = self.build_obs()
# check if already in cache to continue from there
all_cached = ignore_filelist(os.listdir(dl_cache))
for file_name in all_cached:
if youtube_id in file_name:
obs["outtmpl"] = os.path.join(dl_cache, file_name)
with yt_dlp.YoutubeDL(obs) as ydl:
try:
ydl.download([youtube_id])
except yt_dlp.utils.DownloadError:
print("retry failed download: " + youtube_id)
sleep(10)
ydl.download([youtube_id])
if obs["writethumbnail"]:
# webp files don't get cleaned up automatically
all_cached = ignore_filelist(os.listdir(dl_cache))
to_clean = [i for i in all_cached if not i.endswith(".mp4")]
for file_name in to_clean:
file_path = os.path.join(dl_cache, file_name)
os.remove(file_path)
def move_to_archive(self, vid_dict):
"""move downloaded video from cache to archive"""
videos = self.config["application"]["videos"]
host_uid = self.config["application"]["HOST_UID"]
host_gid = self.config["application"]["HOST_GID"]
channel_name = clean_string(vid_dict["channel"]["channel_name"])
# make archive folder with correct permissions
new_folder = os.path.join(videos, channel_name)
if not os.path.exists(new_folder):
os.makedirs(new_folder)
if host_uid and host_gid:
os.chown(new_folder, host_uid, host_gid)
# find real filename
cache_dir = self.config["application"]["cache_dir"]
all_cached = ignore_filelist(os.listdir(cache_dir + "/download/"))
for file_str in all_cached:
if vid_dict["youtube_id"] in file_str:
old_file = file_str
old_file_path = os.path.join(cache_dir, "download", old_file)
new_file_path = os.path.join(videos, vid_dict["media_url"])
# move media file and fix permission
shutil.move(old_file_path, new_file_path)
if host_uid and host_gid:
os.chown(new_file_path, host_uid, host_gid)
def delete_from_pending(self, youtube_id):
"""delete downloaded video from pending index if its there"""
es_url = self.config["application"]["es_url"]
es_auth = self.config["application"]["es_auth"]
url = f"{es_url}/ta_download/_doc/{youtube_id}"
response = requests.delete(url, auth=es_auth)
if not response.ok and not response.status_code == 404:
print(response.text)
def add_subscribed_channels(self):
"""add all channels subscribed to refresh"""
all_subscribed = PlaylistSubscription().get_playlists()
if not all_subscribed:
return
channel_ids = [i["playlist_channel_id"] for i in all_subscribed]
for channel_id in channel_ids:
self.channels.add(channel_id)
return
def validate_playlists(self):
"""look for playlist needing to update"""
print("sync playlists")
self.add_subscribed_channels()
all_indexed = PendingList().get_all_indexed()
all_youtube_ids = [i["youtube_id"] for i in all_indexed]
for id_c, channel_id in enumerate(self.channels):
playlists = YoutubeChannel(channel_id).get_indexed_playlists()
all_playlist_ids = [i["playlist_id"] for i in playlists]
for id_p, playlist_id in enumerate(all_playlist_ids):
playlist_handler = YoutubePlaylist(
playlist_id, all_youtube_ids=all_youtube_ids
)
playlist_dict = playlist_handler.update_playlist()
if not playlist_dict:
playlist_handler.deactivate()
continue
playlist_handler.add_vids_to_playlist()
# notify
title = (
"Processing playlists for channels: "
+ f"{id_c + 1}/{len(self.channels)}"
)
message = f"Progress: {id_p + 1}/{len(all_playlist_ids)}"
mess_dict = {
"status": "message:download",
"level": "info",
"title": title,
"message": message,
}
if id_p + 1 == len(all_playlist_ids):
RedisArchivist().set_message(
"message:download", mess_dict, expire=4
)
else:
RedisArchivist().set_message("message:download", mess_dict)
@staticmethod
def auto_delete_watched(autodelete_days):
"""delete watched videos after x days"""
now = int(datetime.now().strftime("%s"))
now_lte = now - autodelete_days * 24 * 60 * 60
data = {
"query": {"range": {"player.watched_date": {"lte": now_lte}}},
"sort": [{"player.watched_date": {"order": "asc"}}],
}
all_to_delete = IndexPaginate("ta_video", data).get_results()
all_youtube_ids = [i["youtube_id"] for i in all_to_delete]
if not all_youtube_ids:
return
for youtube_id in all_youtube_ids:
print(f"autodelete {youtube_id}")
YoutubeVideo(youtube_id).delete_media_file()
print("add deleted to ignore list")
pending_handler = PendingList()
pending_handler.add_to_pending(all_youtube_ids, ignore=True)

View File

@ -0,0 +1,263 @@
"""
Functionality:
- handle download queue
- linked with the ta_download index
"""
import json
import os
from datetime import datetime
import requests
import yt_dlp
from home.src.download.subscriptions import ChannelSubscription
from home.src.es.connect import IndexPaginate
from home.src.index.playlist import YoutubePlaylist
from home.src.ta.config import AppConfig
from home.src.ta.helper import DurationConverter, ignore_filelist
from home.src.ta.ta_redis import RedisArchivist
class PendingList:
"""manage the pending videos list"""
CONFIG = AppConfig().config
ES_URL = CONFIG["application"]["es_url"]
ES_AUTH = CONFIG["application"]["es_auth"]
VIDEOS = CONFIG["application"]["videos"]
def __init__(self):
self.all_channel_ids = False
self.all_downloaded = False
self.missing_from_playlists = []
def parse_url_list(self, youtube_ids):
"""extract youtube ids from list"""
missing_videos = []
for entry in youtube_ids:
# notify
mess_dict = {
"status": "message:add",
"level": "info",
"title": "Adding to download queue.",
"message": "Extracting lists",
}
RedisArchivist().set_message("message:add", mess_dict)
# extract
url = entry["url"]
url_type = entry["type"]
if url_type == "video":
missing_videos.append(url)
elif url_type == "channel":
video_results = ChannelSubscription().get_last_youtube_videos(
url, limit=False
)
youtube_ids = [i[0] for i in video_results]
missing_videos = missing_videos + youtube_ids
elif url_type == "playlist":
self.missing_from_playlists.append(entry)
playlist = YoutubePlaylist(url)
playlist.build_json()
video_results = playlist.json_data.get("playlist_entries")
youtube_ids = [i["youtube_id"] for i in video_results]
missing_videos = missing_videos + youtube_ids
return missing_videos
def add_to_pending(self, missing_videos, ignore=False):
"""build the bulk json data from pending"""
# check if channel is indexed
channel_handler = ChannelSubscription()
all_indexed = channel_handler.get_channels(subscribed_only=False)
self.all_channel_ids = [i["channel_id"] for i in all_indexed]
# check if already there
self.all_downloaded = self.get_all_downloaded()
bulk_list, all_videos_added = self.build_bulk(missing_videos, ignore)
# add last newline
bulk_list.append("\n")
query_str = "\n".join(bulk_list)
headers = {"Content-type": "application/x-ndjson"}
url = self.ES_URL + "/_bulk"
request = requests.post(
url, data=query_str, headers=headers, auth=self.ES_AUTH
)
if not request.ok:
print(request)
raise ValueError("failed to add video to download queue")
return all_videos_added
def build_bulk(self, missing_videos, ignore=False):
"""build the bulk lists"""
bulk_list = []
all_videos_added = []
for idx, youtube_id in enumerate(missing_videos):
# check if already downloaded
if youtube_id in self.all_downloaded:
continue
video = self.get_youtube_details(youtube_id)
# skip on download error
if not video:
continue
channel_indexed = video["channel_id"] in self.all_channel_ids
video["channel_indexed"] = channel_indexed
if ignore:
video["status"] = "ignore"
else:
video["status"] = "pending"
action = {"create": {"_id": youtube_id, "_index": "ta_download"}}
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(video))
all_videos_added.append((youtube_id, video["vid_thumb_url"]))
# notify
progress = f"{idx + 1}/{len(missing_videos)}"
mess_dict = {
"status": "message:add",
"level": "info",
"title": "Adding new videos to download queue.",
"message": "Progress: " + progress,
}
if idx + 1 == len(missing_videos):
RedisArchivist().set_message(
"message:add", mess_dict, expire=4
)
else:
RedisArchivist().set_message("message:add", mess_dict)
if idx + 1 % 25 == 0:
print("adding to queue progress: " + progress)
return bulk_list, all_videos_added
@staticmethod
def get_youtube_details(youtube_id):
"""get details from youtubedl for single pending video"""
obs = {
"default_search": "ytsearch",
"quiet": True,
"check_formats": "selected",
"noplaylist": True,
"writethumbnail": True,
"simulate": True,
}
try:
vid = yt_dlp.YoutubeDL(obs).extract_info(youtube_id)
except yt_dlp.utils.DownloadError:
print("failed to extract info for: " + youtube_id)
return False
# stop if video is streaming live now
if vid["is_live"]:
return False
# parse response
seconds = vid["duration"]
duration_str = DurationConverter.get_str(seconds)
if duration_str == "NA":
print(f"skip extracting duration for: {youtube_id}")
upload_date = vid["upload_date"]
upload_dt = datetime.strptime(upload_date, "%Y%m%d")
published = upload_dt.strftime("%Y-%m-%d")
# build dict
youtube_details = {
"youtube_id": youtube_id,
"channel_name": vid["channel"],
"vid_thumb_url": vid["thumbnail"],
"title": vid["title"],
"channel_id": vid["channel_id"],
"duration": duration_str,
"published": published,
"timestamp": int(datetime.now().strftime("%s")),
}
return youtube_details
@staticmethod
def get_all_pending():
"""get a list of all pending videos in ta_download"""
data = {
"query": {"match_all": {}},
"sort": [{"timestamp": {"order": "asc"}}],
}
all_results = IndexPaginate("ta_download", data).get_results()
all_pending = []
all_ignore = []
for result in all_results:
if result["status"] == "pending":
all_pending.append(result)
elif result["status"] == "ignore":
all_ignore.append(result)
return all_pending, all_ignore
@staticmethod
def get_all_indexed():
"""get a list of all videos indexed"""
data = {
"query": {"match_all": {}},
"sort": [{"published": {"order": "desc"}}],
}
all_indexed = IndexPaginate("ta_video", data).get_results()
return all_indexed
def get_all_downloaded(self):
"""get a list of all videos in archive"""
channel_folders = os.listdir(self.VIDEOS)
all_channel_folders = ignore_filelist(channel_folders)
all_downloaded = []
for channel_folder in all_channel_folders:
channel_path = os.path.join(self.VIDEOS, channel_folder)
videos = os.listdir(channel_path)
all_videos = ignore_filelist(videos)
youtube_vids = [i[9:20] for i in all_videos]
for youtube_id in youtube_vids:
all_downloaded.append(youtube_id)
return all_downloaded
def delete_from_pending(self, youtube_id):
"""delete the youtube_id from ta_download"""
url = f"{self.ES_URL}/ta_download/_doc/{youtube_id}"
response = requests.delete(url, auth=self.ES_AUTH)
if not response.ok:
print(response.text)
def delete_pending(self, status):
"""delete download queue based on status value"""
data = {"query": {"term": {"status": {"value": status}}}}
payload = json.dumps(data)
url = self.ES_URL + "/ta_download/_delete_by_query"
headers = {"Content-type": "application/json"}
response = requests.post(
url, data=payload, headers=headers, auth=self.ES_AUTH
)
if not response.ok:
print(response.text)
def ignore_from_pending(self, ignore_list):
"""build the bulk query string"""
stamp = int(datetime.now().strftime("%s"))
bulk_list = []
for youtube_id in ignore_list:
action = {"update": {"_id": youtube_id, "_index": "ta_download"}}
source = {"doc": {"status": "ignore", "timestamp": stamp}}
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(source))
# add last newline
bulk_list.append("\n")
query_str = "\n".join(bulk_list)
headers = {"Content-type": "application/x-ndjson"}
url = self.ES_URL + "/_bulk"
request = requests.post(
url, data=query_str, headers=headers, auth=self.ES_AUTH
)
if not request.ok:
print(request)
raise ValueError("failed to set video to ignore")

View File

@ -0,0 +1,214 @@
"""
Functionality:
- handle channel subscriptions
- handle playlist subscriptions
"""
import yt_dlp
from home.src.download import queue # partial import
from home.src.es.connect import IndexPaginate
from home.src.index.channel import YoutubeChannel
from home.src.index.playlist import YoutubePlaylist
from home.src.ta.config import AppConfig
from home.src.ta.ta_redis import RedisArchivist
class ChannelSubscription:
"""manage the list of channels subscribed"""
def __init__(self):
config = AppConfig().config
self.es_url = config["application"]["es_url"]
self.es_auth = config["application"]["es_auth"]
self.channel_size = config["subscriptions"]["channel_size"]
@staticmethod
def get_channels(subscribed_only=True):
"""get a list of all channels subscribed to"""
data = {
"sort": [{"channel_name.keyword": {"order": "asc"}}],
}
if subscribed_only:
data["query"] = {"term": {"channel_subscribed": {"value": True}}}
else:
data["query"] = {"match_all": {}}
all_channels = IndexPaginate("ta_channel", data).get_results()
return all_channels
def get_last_youtube_videos(self, channel_id, limit=True):
"""get a list of last videos from channel"""
url = f"https://www.youtube.com/channel/{channel_id}/videos"
obs = {
"default_search": "ytsearch",
"quiet": True,
"skip_download": True,
"extract_flat": True,
}
if limit:
obs["playlistend"] = self.channel_size
chan = yt_dlp.YoutubeDL(obs).extract_info(url, download=False)
last_videos = [(i["id"], i["title"]) for i in chan["entries"]]
return last_videos
def find_missing(self):
"""add missing videos from subscribed channels to pending"""
all_channels = self.get_channels()
pending_handler = queue.PendingList()
all_pending, all_ignore = pending_handler.get_all_pending()
all_ids = [i["youtube_id"] for i in all_ignore + all_pending]
all_downloaded = pending_handler.get_all_downloaded()
to_ignore = all_ids + all_downloaded
missing_videos = []
for idx, channel in enumerate(all_channels):
channel_id = channel["channel_id"]
last_videos = self.get_last_youtube_videos(channel_id)
for video in last_videos:
if video[0] not in to_ignore:
missing_videos.append(video[0])
# notify
message = {
"status": "message:rescan",
"level": "info",
"title": "Scanning channels: Looking for new videos.",
"message": f"Progress: {idx + 1}/{len(all_channels)}",
}
if idx + 1 == len(all_channels):
RedisArchivist().set_message(
"message:rescan", message=message, expire=4
)
else:
RedisArchivist().set_message("message:rescan", message=message)
return missing_videos
@staticmethod
def change_subscribe(channel_id, channel_subscribed):
"""subscribe or unsubscribe from channel and update"""
channel = YoutubeChannel(channel_id)
channel.build_json()
channel.json_data["channel_subscribed"] = channel_subscribed
channel.upload_to_es()
channel.sync_to_videos()
class PlaylistSubscription:
"""manage the playlist download functionality"""
def __init__(self):
self.config = AppConfig().config
@staticmethod
def get_playlists(subscribed_only=True):
"""get a list of all active playlists"""
data = {
"sort": [{"playlist_channel.keyword": {"order": "desc"}}],
}
data["query"] = {
"bool": {"must": [{"term": {"playlist_active": {"value": True}}}]}
}
if subscribed_only:
data["query"]["bool"]["must"].append(
{"term": {"playlist_subscribed": {"value": True}}}
)
all_playlists = IndexPaginate("ta_playlist", data).get_results()
return all_playlists
def process_url_str(self, new_playlists, subscribed=True):
"""process playlist subscribe form url_str"""
all_indexed = queue.PendingList().get_all_indexed()
all_youtube_ids = [i["youtube_id"] for i in all_indexed]
new_thumbs = []
for idx, playlist in enumerate(new_playlists):
url_type = playlist["type"]
playlist_id = playlist["url"]
if not url_type == "playlist":
print(f"{playlist_id} not a playlist, skipping...")
continue
playlist_h = YoutubePlaylist(playlist_id)
playlist_h.all_youtube_ids = all_youtube_ids
playlist_h.build_json()
playlist_h.json_data["playlist_subscribed"] = subscribed
playlist_h.upload_to_es()
playlist_h.add_vids_to_playlist()
self.channel_validate(playlist_h.json_data["playlist_channel_id"])
thumb = playlist_h.json_data["playlist_thumbnail"]
new_thumbs.append((playlist_id, thumb))
# notify
message = {
"status": "message:subplaylist",
"level": "info",
"title": "Subscribing to Playlists",
"message": f"Processing {idx + 1} of {len(new_playlists)}",
}
RedisArchivist().set_message(
"message:subplaylist", message=message
)
return new_thumbs
@staticmethod
def channel_validate(channel_id):
"""make sure channel of playlist is there"""
channel = YoutubeChannel(channel_id)
channel.build_json()
@staticmethod
def change_subscribe(playlist_id, subscribe_status):
"""change the subscribe status of a playlist"""
playlist = YoutubePlaylist(playlist_id)
playlist.build_json()
playlist.json_data["playlist_subscribed"] = subscribe_status
playlist.upload_to_es()
@staticmethod
def get_to_ignore():
"""get all youtube_ids already downloaded or ignored"""
pending_handler = queue.PendingList()
all_pending, all_ignore = pending_handler.get_all_pending()
all_ids = [i["youtube_id"] for i in all_ignore + all_pending]
all_downloaded = pending_handler.get_all_downloaded()
to_ignore = all_ids + all_downloaded
return to_ignore
def find_missing(self):
"""find videos in subscribed playlists not downloaded yet"""
all_playlists = [i["playlist_id"] for i in self.get_playlists()]
to_ignore = self.get_to_ignore()
missing_videos = []
for idx, playlist_id in enumerate(all_playlists):
size_limit = self.config["subscriptions"]["channel_size"]
playlist = YoutubePlaylist(playlist_id)
playlist.update_playlist()
if not playlist:
playlist.deactivate()
continue
playlist_entries = playlist.json_data["playlist_entries"]
if size_limit:
del playlist_entries[size_limit:]
all_missing = [i for i in playlist_entries if not i["downloaded"]]
message = {
"status": "message:rescan",
"level": "info",
"title": "Scanning playlists: Looking for new videos.",
"message": f"Progress: {idx + 1}/{len(all_playlists)}",
}
RedisArchivist().set_message("message:rescan", message=message)
for video in all_missing:
youtube_id = video["youtube_id"]
if youtube_id not in to_ignore:
missing_videos.append(youtube_id)
return missing_videos
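home/src/download/subscriptions.py above pulls in the queue module as a whole (marked "# partial import" in the source), and both find_missing() methods return plain lists of youtube_ids. A hypothetical sketch of how the pieces fit together, not part of the commit:

# hypothetical usage sketch, not part of the commit
missing = ChannelSubscription().find_missing()
missing += PlaylistSubscription().find_missing()
# hand the collected ids to the queue module for indexing into ta_download
queue.PendingList().add_to_pending(missing)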

View File

@@ -1,16 +1,19 @@
 """
 functionality:
 - handle download and caching for thumbnails
+- check for missing thumbnails
 """
 import os
 from collections import Counter
 from time import sleep
-import home.src.download as download
 import requests
-from home.src.config import AppConfig
-from home.src.helper import RedisArchivist, ignore_filelist
+from home.src.download import queue  # partial import
+from home.src.download import subscriptions  # partial import
+from home.src.ta.config import AppConfig
+from home.src.ta.helper import ignore_filelist
+from home.src.ta.ta_redis import RedisArchivist
 from mutagen.mp4 import MP4, MP4Cover
 from PIL import Image
@@ -55,8 +58,8 @@ class ThumbManager:
 def get_needed_thumbs(self, missing_only=False):
 """get a list of all missing thumbnails"""
 all_thumbs = self.get_all_thumbs()
-all_indexed = download.PendingList().get_all_indexed()
-all_in_queue, all_ignored = download.PendingList().get_all_pending()
+all_indexed = queue.PendingList().get_all_indexed()
+all_in_queue, all_ignored = queue.PendingList().get_all_pending()
 needed_thumbs = []
 for video in all_indexed:
@@ -84,9 +87,8 @@ class ThumbManager:
 all_channel_art = os.listdir(self.CHANNEL_DIR)
 files = [i[0:24] for i in all_channel_art]
 cached_channel_ids = [k for (k, v) in Counter(files).items() if v > 1]
-channels = download.ChannelSubscription().get_channels(
-subscribed_only=False
-)
+channel_sub = subscriptions.ChannelSubscription()
+channels = channel_sub.get_channels(subscribed_only=False)
 missing_channels = []
 for channel in channels:
@@ -104,10 +106,8 @@ class ThumbManager:
 """get all missing playlist artwork"""
 all_downloaded = ignore_filelist(os.listdir(self.PLAYLIST_DIR))
 all_ids_downloaded = [i.replace(".jpg", "") for i in all_downloaded]
-playlists = download.PlaylistSubscription().get_playlists(
-subscribed_only=False
-)
+playlist_sub = subscriptions.PlaylistSubscription()
+playlists = playlist_sub.get_playlists(subscribed_only=False)
 missing_playlists = []
 for playlist in playlists:
@@ -276,7 +276,7 @@ class ThumbManager:
 def get_thumb_list(self):
 """get list of mediafiles and matching thumbnails"""
-all_indexed = download.PendingList().get_all_indexed()
+all_indexed = queue.PendingList().get_all_indexed()
 video_list = []
 for video in all_indexed:
 youtube_id = video["youtube_id"]

View File

@ -0,0 +1,313 @@
"""
functionality:
- handle yt_dlp
- build options and post processor
- download video files
- move to archive
"""
import os
import shutil
from datetime import datetime
from time import sleep
import requests
import yt_dlp
from home.src.download.queue import PendingList
from home.src.download.subscriptions import PlaylistSubscription
from home.src.es.connect import IndexPaginate
from home.src.index.channel import YoutubeChannel
from home.src.index.playlist import YoutubePlaylist
from home.src.index.video import YoutubeVideo, index_new_video
from home.src.ta.config import AppConfig
from home.src.ta.helper import clean_string, ignore_filelist
from home.src.ta.ta_redis import RedisArchivist, RedisQueue
class VideoDownloader:
"""
handle the video download functionality
if not initiated with list, take from queue
"""
def __init__(self, youtube_id_list=False):
self.obs = False
self.youtube_id_list = youtube_id_list
self.config = AppConfig().config
self._build_obs()
self.channels = set()
def run_queue(self):
"""setup download queue in redis loop until no more items"""
queue = RedisQueue("dl_queue")
limit_queue = self.config["downloads"]["limit_count"]
if limit_queue:
queue.trim(limit_queue - 1)
while True:
youtube_id = queue.get_next()
if not youtube_id:
break
try:
self._dl_single_vid(youtube_id)
except yt_dlp.utils.DownloadError:
print("failed to download " + youtube_id)
continue
vid_dict = index_new_video(youtube_id)
self.channels.add(vid_dict["channel"]["channel_id"])
self.move_to_archive(vid_dict)
self._delete_from_pending(youtube_id)
autodelete_days = self.config["downloads"]["autodelete_days"]
if autodelete_days:
print(f"auto delete older than {autodelete_days} days")
self.auto_delete_watched(autodelete_days)
@staticmethod
def add_pending():
"""add pending videos to download queue"""
mess_dict = {
"status": "message:download",
"level": "info",
"title": "Looking for videos to download",
"message": "Scanning your download queue.",
}
RedisArchivist().set_message("message:download", mess_dict)
all_pending, _ = PendingList().get_all_pending()
to_add = [i["youtube_id"] for i in all_pending]
if not to_add:
# there is nothing pending
print("download queue is empty")
mess_dict = {
"status": "message:download",
"level": "error",
"title": "Download queue is empty",
"message": "Add some videos to the queue first.",
}
RedisArchivist().set_message("message:download", mess_dict)
return
queue = RedisQueue("dl_queue")
queue.add_list(to_add)
@staticmethod
def _progress_hook(response):
"""process the progress_hooks from yt_dlp"""
# title
path = os.path.split(response["filename"])[-1][12:]
filename = os.path.splitext(os.path.splitext(path)[0])[0]
filename_clean = filename.replace("_", " ")
title = "Downloading: " + filename_clean
# message
try:
percent = response["_percent_str"]
size = response["_total_bytes_str"]
speed = response["_speed_str"]
eta = response["_eta_str"]
message = f"{percent} of {size} at {speed} - time left: {eta}"
except KeyError:
message = "processing"
mess_dict = {
"status": "message:download",
"level": "info",
"title": title,
"message": message,
}
RedisArchivist().set_message("message:download", mess_dict)
def _build_obs(self):
"""collection to build all obs passed to yt-dlp"""
self._build_obs_basic()
self._build_obs_user()
self._build_obs_postprocessors()
def _build_obs_basic(self):
"""initial obs"""
self.obs = {
"default_search": "ytsearch",
"merge_output_format": "mp4",
"restrictfilenames": True,
"outtmpl": (
self.config["application"]["cache_dir"]
+ "/download/"
+ self.config["application"]["file_template"]
),
"progress_hooks": [self._progress_hook],
"noprogress": True,
"quiet": True,
"continuedl": True,
"retries": 3,
"writethumbnail": False,
"noplaylist": True,
"check_formats": "selected",
}
def _build_obs_user(self):
"""build user customized options"""
if self.config["downloads"]["format"]:
self.obs["format"] = self.config["downloads"]["format"]
if self.config["downloads"]["limit_speed"]:
self.obs["ratelimit"] = (
self.config["downloads"]["limit_speed"] * 1024
)
throttle = self.config["downloads"]["throttledratelimit"]
if throttle:
self.obs["throttledratelimit"] = throttle * 1024
def _build_obs_postprocessors(self):
"""add postprocessor to obs"""
postprocessors = []
if self.config["downloads"]["add_metadata"]:
postprocessors.append(
{
"key": "FFmpegMetadata",
"add_chapters": True,
"add_metadata": True,
}
)
if self.config["downloads"]["add_thumbnail"]:
postprocessors.append(
{
"key": "EmbedThumbnail",
"already_have_thumbnail": True,
}
)
self.obs["writethumbnail"] = True
self.obs["postprocessors"] = postprocessors
def _dl_single_vid(self, youtube_id):
"""download single video"""
dl_cache = self.config["application"]["cache_dir"] + "/download/"
# check if already in cache to continue from there
all_cached = ignore_filelist(os.listdir(dl_cache))
for file_name in all_cached:
if youtube_id in file_name:
self.obs["outtmpl"] = os.path.join(dl_cache, file_name)
with yt_dlp.YoutubeDL(self.obs) as ydl:
try:
ydl.download([youtube_id])
except yt_dlp.utils.DownloadError:
print("retry failed download: " + youtube_id)
sleep(10)
ydl.download([youtube_id])
if self.obs["writethumbnail"]:
# webp files don't get cleaned up automatically
all_cached = ignore_filelist(os.listdir(dl_cache))
to_clean = [i for i in all_cached if not i.endswith(".mp4")]
for file_name in to_clean:
file_path = os.path.join(dl_cache, file_name)
os.remove(file_path)
def move_to_archive(self, vid_dict):
"""move downloaded video from cache to archive"""
videos = self.config["application"]["videos"]
host_uid = self.config["application"]["HOST_UID"]
host_gid = self.config["application"]["HOST_GID"]
channel_name = clean_string(vid_dict["channel"]["channel_name"])
# make archive folder with correct permissions
new_folder = os.path.join(videos, channel_name)
if not os.path.exists(new_folder):
os.makedirs(new_folder)
if host_uid and host_gid:
os.chown(new_folder, host_uid, host_gid)
# find real filename
cache_dir = self.config["application"]["cache_dir"]
all_cached = ignore_filelist(os.listdir(cache_dir + "/download/"))
for file_str in all_cached:
if vid_dict["youtube_id"] in file_str:
old_file = file_str
old_file_path = os.path.join(cache_dir, "download", old_file)
new_file_path = os.path.join(videos, vid_dict["media_url"])
# move media file and fix permission
shutil.move(old_file_path, new_file_path)
if host_uid and host_gid:
os.chown(new_file_path, host_uid, host_gid)
def _delete_from_pending(self, youtube_id):
"""delete downloaded video from pending index if its there"""
es_url = self.config["application"]["es_url"]
es_auth = self.config["application"]["es_auth"]
url = f"{es_url}/ta_download/_doc/{youtube_id}"
response = requests.delete(url, auth=es_auth)
if not response.ok and not response.status_code == 404:
print(response.text)
def _add_subscribed_channels(self):
"""add all channels subscribed to refresh"""
all_subscribed = PlaylistSubscription().get_playlists()
if not all_subscribed:
return
channel_ids = [i["playlist_channel_id"] for i in all_subscribed]
for channel_id in channel_ids:
self.channels.add(channel_id)
return
def validate_playlists(self):
"""look for playlist needing to update"""
print("sync playlists")
self._add_subscribed_channels()
all_indexed = PendingList().get_all_indexed()
all_youtube_ids = [i["youtube_id"] for i in all_indexed]
for id_c, channel_id in enumerate(self.channels):
playlists = YoutubeChannel(channel_id).get_indexed_playlists()
all_playlist_ids = [i["playlist_id"] for i in playlists]
for id_p, playlist_id in enumerate(all_playlist_ids):
playlist = YoutubePlaylist(playlist_id)
playlist.all_youtube_ids = all_youtube_ids
playlist.build_json(scrape=True)
if not playlist.json_data:
playlist.deactivate()
playlist.add_vids_to_playlist()
playlist.upload_to_es()
# notify
title = (
"Processing playlists for channels: "
+ f"{id_c + 1}/{len(self.channels)}"
)
message = f"Progress: {id_p + 1}/{len(all_playlist_ids)}"
mess_dict = {
"status": "message:download",
"level": "info",
"title": title,
"message": message,
}
if id_p + 1 == len(all_playlist_ids):
RedisArchivist().set_message(
"message:download", mess_dict, expire=4
)
else:
RedisArchivist().set_message("message:download", mess_dict)
@staticmethod
def auto_delete_watched(autodelete_days):
"""delete watched videos after x days"""
now = int(datetime.now().strftime("%s"))
now_lte = now - autodelete_days * 24 * 60 * 60
data = {
"query": {"range": {"player.watched_date": {"lte": now_lte}}},
"sort": [{"player.watched_date": {"order": "asc"}}],
}
all_to_delete = IndexPaginate("ta_video", data).get_results()
all_youtube_ids = [i["youtube_id"] for i in all_to_delete]
if not all_youtube_ids:
return
for youtube_id in all_youtube_ids:
print(f"autodelete {youtube_id}")
YoutubeVideo(youtube_id).delete_media_file()
print("add deleted to ignore list")
pending_handler = PendingList()
pending_handler.add_to_pending(all_youtube_ids, ignore=True)
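The VideoDownloader in home/src/download/yt_dlp_handler.py above now builds its yt-dlp options once in __init__ via _build_obs() instead of per download, and its internal helpers carry an underscore prefix. A hypothetical sketch of the public entry points, not part of the commit:

# hypothetical usage sketch, not part of the commit
downloader = VideoDownloader()
downloader.add_pending()  # push all ids with status "pending" onto the redis dl_queue
downloader.run_queue()    # download, index and archive until the queue is empty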

View File

View File

@ -0,0 +1,148 @@
"""
functionality:
- wrapper around requests to call elastic search
- reusable search_after to extract total index
"""
import json
import requests
from home.src.ta.config import AppConfig
class ElasticWrap:
"""makes all calls to elastic search
returns response json and status code tuple
"""
def __init__(self, path, config=False):
self.url = False
self.auth = False
self.path = path
self.config = config
self._get_config()
def _get_config(self):
"""add config if not passed"""
if not self.config:
self.config = AppConfig().config
es_url = self.config["application"]["es_url"]
self.auth = self.config["application"]["es_auth"]
self.url = f"{es_url}/{self.path}"
def get(self, data=False):
"""get data from es"""
if data:
response = requests.get(self.url, json=data, auth=self.auth)
else:
response = requests.get(self.url, auth=self.auth)
if not response.ok:
print(response.text)
return response.json(), response.status_code
def post(self, data=False, ndjson=False):
"""post data to es"""
if ndjson:
headers = {"Content-type": "application/x-ndjson"}
payload = data
else:
headers = {"Content-type": "application/json"}
payload = json.dumps(data)
if data:
response = requests.post(
self.url, data=payload, headers=headers, auth=self.auth
)
else:
response = requests.post(self.url, headers=headers, auth=self.auth)
if not response.ok:
print(response.text)
return response.json(), response.status_code
def put(self, data, refresh=False):
"""put data to es"""
if refresh:
self.url = f"{self.url}/?refresh=true"
response = requests.put(f"{self.url}", json=data, auth=self.auth)
if not response.ok:
print(response.text)
print(data)
raise ValueError("failed to add item to index")
return response.json(), response.status_code
def delete(self, data=False):
"""delete document from es"""
if data:
response = requests.delete(self.url, json=data, auth=self.auth)
else:
response = requests.delete(self.url, auth=self.auth)
if not response.ok:
print(response.text)
return response.json(), response.status_code
class IndexPaginate:
"""use search_after to go through whole index"""
DEFAULT_SIZE = 500
def __init__(self, index_name, data, size=False):
self.index_name = index_name
self.data = data
self.pit_id = False
self.size = size
def get_results(self):
"""get all results"""
self.get_pit()
self.validate_data()
all_results = self.run_loop()
self.clean_pit()
return all_results
def get_pit(self):
"""get pit for index"""
path = f"{self.index_name}/_pit?keep_alive=10m"
response, _ = ElasticWrap(path).post()
self.pit_id = response["id"]
def validate_data(self):
"""add pit and size to data"""
if "sort" not in self.data.keys():
print(self.data)
raise ValueError("missing sort key in data")
size = self.size or self.DEFAULT_SIZE
self.data["size"] = size
self.data["pit"] = {"id": self.pit_id, "keep_alive": "10m"}
def run_loop(self):
"""loop through results until last hit"""
all_results = []
while True:
response, _ = ElasticWrap("_search").get(data=self.data)
all_hits = response["hits"]["hits"]
if all_hits:
for hit in all_hits:
source = hit["_source"]
search_after = hit["sort"]
all_results.append(source)
# update search_after with last hit data
self.data["search_after"] = search_after
else:
break
return all_results
def clean_pit(self):
"""delete pit from elastic search"""
data = {"id": self.pit_id}
ElasticWrap("_pit").delete(data=data)

View File

@ -0,0 +1,274 @@
{
"index_config": [{
"index_name": "channel",
"expected_map": {
"channel_id": {
"type": "keyword"
},
"channel_name": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower"
},
"search_as_you_type": {
"type": "search_as_you_type",
"doc_values": false,
"max_shingle_size": 3
}
}
},
"channel_banner_url": {
"type": "keyword",
"index": false
},
"channel_thumb_url": {
"type": "keyword",
"index": false
},
"channel_description": {
"type": "text"
},
"channel_last_refresh": {
"type": "date",
"format": "epoch_second"
}
},
"expected_set": {
"analysis": {
"normalizer": {
"to_lower": {
"type": "custom",
"filter": ["lowercase"]
}
}
},
"number_of_replicas": "0"
}
},
{
"index_name": "video",
"expected_map": {
"vid_thumb_url": {
"type": "text",
"index": false
},
"date_downloaded": {
"type": "date"
},
"channel": {
"properties": {
"channel_id": {
"type": "keyword"
},
"channel_name": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower"
},
"search_as_you_type": {
"type": "search_as_you_type",
"doc_values": false,
"max_shingle_size": 3
}
}
},
"channel_banner_url": {
"type": "keyword",
"index": false
},
"channel_thumb_url": {
"type": "keyword",
"index": false
},
"channel_description": {
"type": "text"
},
"channel_last_refresh": {
"type": "date",
"format": "epoch_second"
}
}
},
"description": {
"type": "text"
},
"media_url": {
"type": "keyword",
"index": false
},
"tags": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"title": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower"
},
"search_as_you_type": {
"type": "search_as_you_type",
"doc_values": false,
"max_shingle_size": 3
}
}
},
"vid_last_refresh": {
"type": "date"
},
"youtube_id": {
"type": "keyword"
},
"published": {
"type": "date"
},
"playlist": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower"
}
}
}
},
"expected_set": {
"analysis": {
"normalizer": {
"to_lower": {
"type": "custom",
"filter": ["lowercase"]
}
}
},
"number_of_replicas": "0"
}
},
{
"index_name": "download",
"expected_map": {
"timestamp": {
"type": "date"
},
"channel_id": {
"type": "keyword"
},
"channel_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower"
}
}
},
"status": {
"type": "keyword"
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower"
}
}
},
"vid_thumb_url": {
"type": "keyword"
},
"youtube_id": {
"type": "keyword"
}
},
"expected_set": {
"analysis": {
"normalizer": {
"to_lower": {
"type": "custom",
"filter": ["lowercase"]
}
}
},
"number_of_replicas": "0"
}
},
{
"index_name": "playlist",
"expected_map": {
"playlist_id": {
"type": "keyword"
},
"playlist_description": {
"type": "text"
},
"playlist_name": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower"
},
"search_as_you_type": {
"type": "search_as_you_type",
"doc_values": false,
"max_shingle_size": 3
}
}
},
"playlist_channel": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower"
}
}
},
"playlist_channel_id": {
"type": "keyword"
},
"playlist_thumbnail": {
"type": "keyword"
},
"playlist_last_refresh": {
"type": "date"
}
},
"expected_set": {
"analysis": {
"normalizer": {
"to_lower": {
"type": "custom",
"filter": ["lowercase"]
}
}
},
"number_of_replicas": "0"
}
}
]
}
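
A sketch of how one entry of this mapping file translates into an index creation request; the actual create call lives in ElasticIndex (not shown in this hunk), so the ElasticWrap usage here is an assumption:

import json
from home.src.es.connect import ElasticWrap

with open("home/src/es/index_mapping.json", "r", encoding="utf-8") as f:
    index_config = json.load(f)["index_config"]

for index in index_config:
    body = {
        "settings": index["expected_set"],
        "mappings": {"properties": index["expected_map"]},
    }
    # PUT ta_<index_name> creates the index with the expected settings/mapping
    ElasticWrap(f"ta_{index['index_name']}").put(body)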

View File

@ -1,9 +1,8 @@
 """
-Functionality:
-- initial elastic search setup
-- index configuration is represented in INDEX_CONFIG
-- index mapping and settings validation
-- backup and restore
+functionality:
+- setup elastic index at first start
+- verify and update index mapping and settings if needed
+- backup and restore metadata
 """

 import json
@ -12,213 +11,8 @@ import zipfile
 from datetime import datetime

 import requests
-from home.src.config import AppConfig
-from home.src.helper import ignore_filelist
+from home.src.ta.config import AppConfig
+from home.src.ta.helper import ignore_filelist
# expected mapping and settings
INDEX_CONFIG = [
{
"index_name": "channel",
"expected_map": {
"channel_id": {
"type": "keyword",
},
"channel_name": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower",
},
"search_as_you_type": {
"type": "search_as_you_type",
"doc_values": False,
"max_shingle_size": 3,
},
},
},
"channel_banner_url": {"type": "keyword", "index": False},
"channel_thumb_url": {"type": "keyword", "index": False},
"channel_description": {"type": "text"},
"channel_last_refresh": {"type": "date", "format": "epoch_second"},
},
"expected_set": {
"analysis": {
"normalizer": {
"to_lower": {"type": "custom", "filter": ["lowercase"]}
}
},
"number_of_replicas": "0",
},
},
{
"index_name": "video",
"expected_map": {
"vid_thumb_url": {"type": "text", "index": False},
"date_downloaded": {"type": "date"},
"channel": {
"properties": {
"channel_id": {
"type": "keyword",
},
"channel_name": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower",
},
"search_as_you_type": {
"type": "search_as_you_type",
"doc_values": False,
"max_shingle_size": 3,
},
},
},
"channel_banner_url": {"type": "keyword", "index": False},
"channel_thumb_url": {"type": "keyword", "index": False},
"channel_description": {"type": "text"},
"channel_last_refresh": {
"type": "date",
"format": "epoch_second",
},
}
},
"description": {"type": "text"},
"media_url": {"type": "keyword", "index": False},
"tags": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {"type": "keyword", "ignore_above": 256}
},
},
"title": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower",
},
"search_as_you_type": {
"type": "search_as_you_type",
"doc_values": False,
"max_shingle_size": 3,
},
},
},
"vid_last_refresh": {"type": "date"},
"youtube_id": {"type": "keyword"},
"published": {"type": "date"},
"playlist": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower",
}
},
},
},
"expected_set": {
"analysis": {
"normalizer": {
"to_lower": {"type": "custom", "filter": ["lowercase"]}
}
},
"number_of_replicas": "0",
},
},
{
"index_name": "download",
"expected_map": {
"timestamp": {"type": "date"},
"channel_id": {"type": "keyword"},
"channel_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower",
}
},
},
"status": {"type": "keyword"},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower",
}
},
},
"vid_thumb_url": {"type": "keyword"},
"youtube_id": {"type": "keyword"},
},
"expected_set": {
"analysis": {
"normalizer": {
"to_lower": {"type": "custom", "filter": ["lowercase"]}
}
},
"number_of_replicas": "0",
},
},
{
"index_name": "playlist",
"expected_map": {
"playlist_id": {"type": "keyword"},
"playlist_description": {"type": "text"},
"playlist_name": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower",
},
"search_as_you_type": {
"type": "search_as_you_type",
"doc_values": False,
"max_shingle_size": 3,
},
},
},
"playlist_channel": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower",
}
},
},
"playlist_channel_id": {"type": "keyword"},
"playlist_thumbnail": {"type": "keyword"},
"playlist_last_refresh": {"type": "date"},
},
"expected_set": {
"analysis": {
"normalizer": {
"to_lower": {"type": "custom", "filter": ["lowercase"]}
}
},
"number_of_replicas": "0",
},
},
]
 class ElasticIndex:
@ -602,48 +396,21 @@ class ElasticBackup:
         os.remove(file_path)

-def get_available_backups():
-    """return dict of available backups for settings view"""
-    backup_handler = ElasticBackup(INDEX_CONFIG, reason=False)
-    all_backup_files = backup_handler.get_all_backup_files()
-    return all_backup_files
-
-
-def backup_all_indexes(reason):
-    """backup all es indexes to disk"""
-    backup_handler = ElasticBackup(INDEX_CONFIG, reason)
-    for index in backup_handler.index_config:
-        index_name = index["index_name"]
-        if not backup_handler.index_exists(index_name):
-            continue
-        all_results = backup_handler.get_all_documents(index_name)
-        file_content = backup_handler.build_bulk(all_results)
-        backup_handler.write_es_json(file_content, index_name)
-        backup_handler.write_ta_json(all_results, index_name)
-    backup_handler.zip_it()
-    if reason == "auto":
-        backup_handler.rotate_backup()
-
-
-def restore_from_backup(filename):
-    """restore indexes from backup file"""
-    # delete
-    index_check(force_restore=True)
-    # recreate
-    backup_handler = ElasticBackup(INDEX_CONFIG, reason=False)
-    zip_content = backup_handler.unpack_zip_backup(filename)
-    backup_handler.restore_json_files(zip_content)
+def get_mapping():
+    """read index_mapping.json and get expected mapping and settings"""
+    with open("home/src/es/index_mapping.json", "r", encoding="utf-8") as f:
+        index_config = json.load(f).get("index_config")
+
+    return index_config


 def index_check(force_restore=False):
     """check if all indexes are created and have correct mapping"""
     backed_up = False
+    index_config = get_mapping()
-    for index in INDEX_CONFIG:
+    for index in index_config:
         index_name = index["index_name"]
         expected_map = index["expected_map"]
         expected_set = index["expected_set"]
@ -675,3 +442,42 @@ def index_check(force_restore=False):
         # else all good
         print(f"ta_{index_name} index is created and up to date...")

+
+def get_available_backups():
+    """return dict of available backups for settings view"""
+    index_config = get_mapping()
+    backup_handler = ElasticBackup(index_config, reason=False)
+    all_backup_files = backup_handler.get_all_backup_files()
+    return all_backup_files
+
+
+def backup_all_indexes(reason):
+    """backup all es indexes to disk"""
+    index_config = get_mapping()
+    backup_handler = ElasticBackup(index_config, reason)
+    for index in backup_handler.index_config:
+        index_name = index["index_name"]
+        if not backup_handler.index_exists(index_name):
+            continue
+        all_results = backup_handler.get_all_documents(index_name)
+        file_content = backup_handler.build_bulk(all_results)
+        backup_handler.write_es_json(file_content, index_name)
+        backup_handler.write_ta_json(all_results, index_name)
+    backup_handler.zip_it()
+    if reason == "auto":
+        backup_handler.rotate_backup()
+
+
+def restore_from_backup(filename):
+    """restore indexes from backup file"""
+    # delete
+    index_check(force_restore=True)
+    # recreate
+    index_config = get_mapping()
+    backup_handler = ElasticBackup(index_config, reason=False)
+    zip_content = backup_handler.unpack_zip_backup(filename)
+    backup_handler.restore_json_files(zip_content)
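
A minimal sketch of how these module-level helpers are driven; the module import name and the backup file name are assumptions:

from home.src.es import index_setup  # assumed module name

index_setup.index_check()                      # create or update all ta_* indexes
print(index_setup.get_available_backups())     # list backup zip files on disk
index_setup.backup_all_indexes(reason="auto")  # write a new backup and rotate old ones
index_setup.restore_from_backup("<backup file>.zip")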

View File

@ -4,19 +4,18 @@ Functionality:
 - called via user input
 """

-from home.src.download import (
-    ChannelSubscription,
-    PendingList,
-    PlaylistSubscription,
-)
-from home.src.helper import RedisArchivist, RedisQueue, UrlListParser
-from home.src.index import (
-    WatchState,
-    YoutubeChannel,
-    YoutubePlaylist,
-    YoutubeVideo,
-)
-from home.src.searching import SearchForm
+from home.src.download.queue import PendingList
+from home.src.download.subscriptions import (
+    ChannelSubscription,
+    PlaylistSubscription,
+)
+from home.src.frontend.searching import SearchForm
+from home.src.frontend.watched import WatchState
+from home.src.index.channel import YoutubeChannel
+from home.src.index.playlist import YoutubePlaylist
+from home.src.index.video import YoutubeVideo
+from home.src.ta.helper import UrlListParser
+from home.src.ta.ta_redis import RedisArchivist, RedisQueue
 from home.tasks import (
     download_pending,
     download_single,
@ -306,7 +305,7 @@ class PostData:
         playlist_dict = self.exec_val
         playlist_id = playlist_dict["playlist-id"]
         playlist_action = playlist_dict["playlist-action"]
-        print(f"delete {playlist_action} from playlist {playlist_id}")
+        print(f"{playlist_id}: delete playlist {playlist_action}")
         if playlist_action == "metadata":
             YoutubePlaylist(playlist_id).delete_metadata()
         elif playlist_action == "all":

View File

@ -6,36 +6,26 @@ Functionality:
 - calculate pagination values
 """

-import math
 import urllib.parse
 from datetime import datetime

-import requests
-from home.src.config import AppConfig
-from home.src.helper import RedisArchivist
-from home.src.thumbnails import ThumbManager
+from home.src.download.thumbnails import ThumbManager
+from home.src.es.connect import ElasticWrap
+from home.src.ta.config import AppConfig


 class SearchHandler:
     """search elastic search"""

-    CONFIG = AppConfig().config
-    CACHE_DIR = CONFIG["application"]["cache_dir"]
-    ES_AUTH = CONFIG["application"]["es_auth"]
-
-    def __init__(self, url, data):
+    def __init__(self, path, config, data=False):
         self.max_hits = None
-        self.url = url
+        self.path = path
+        self.config = config
         self.data = data

     def get_data(self):
         """get the data"""
-        if self.data:
-            response = requests.get(
-                self.url, json=self.data, auth=self.ES_AUTH
-            ).json()
-        else:
-            response = requests.get(self.url, auth=self.ES_AUTH).json()
+        response, _ = ElasticWrap(self.path, config=self.config).get(self.data)

         if "hits" in response.keys():
             self.max_hits = response["hits"]["total"]["value"]
@ -153,11 +143,10 @@ class SearchForm:
     """build query from search form data"""

     CONFIG = AppConfig().config
-    ES_URL = CONFIG["application"]["es_url"]

     def multi_search(self, search_query):
         """searching through index"""
-        url = self.ES_URL + "/ta_video,ta_channel,ta_playlist/_search"
+        path = "ta_video,ta_channel,ta_playlist/_search"
         data = {
             "size": 30,
             "query": {
@ -184,7 +173,7 @@ class SearchForm:
                 }
             },
         }
-        look_up = SearchHandler(url, data)
+        look_up = SearchHandler(path, config=self.CONFIG, data=data)
         search_results = look_up.get_data()
         all_results = self.build_results(search_results)
@ -212,62 +201,3 @@ class SearchForm:
         }
         return all_results
class Pagination:
"""
figure out the pagination based on page size and total_hits
"""
def __init__(self, page_get, user_id, search_get=False):
self.user_id = user_id
self.page_size = self.get_page_size()
self.page_get = page_get
self.search_get = search_get
self.pagination = self.first_guess()
def get_page_size(self):
"""get default or user modified page_size"""
key = f"{self.user_id}:page_size"
page_size = RedisArchivist().get_message(key)["status"]
if not page_size:
config = AppConfig().config
page_size = config["archive"]["page_size"]
return page_size
def first_guess(self):
"""build first guess before api call"""
page_get = self.page_get
if page_get in [0, 1]:
page_from = 0
prev_pages = False
elif page_get > 1:
page_from = (page_get - 1) * self.page_size
prev_pages = [
i for i in range(page_get - 1, page_get - 6, -1) if i > 1
]
prev_pages.reverse()
pagination = {
"page_size": self.page_size,
"page_from": page_from,
"prev_pages": prev_pages,
"current_page": page_get,
}
if self.search_get:
pagination.update({"search_get": self.search_get})
return pagination
def validate(self, total_hits):
"""validate pagination with total_hits after making api call"""
page_get = self.page_get
max_pages = math.ceil(total_hits / self.page_size)
if page_get < max_pages and max_pages > 1:
self.pagination["last_page"] = max_pages
else:
self.pagination["last_page"] = False
next_pages = [
i for i in range(page_get + 1, page_get + 6) if 1 < i < max_pages
]
self.pagination["next_pages"] = next_pages
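
A short sketch of the new SearchHandler call pattern introduced above, using a hypothetical query body:

from home.src.frontend.searching import SearchHandler
from home.src.ta.config import AppConfig

config = AppConfig().config
data = {"size": 30, "query": {"match": {"title": "linux"}}}
look_up = SearchHandler("ta_video/_search", config=config, data=data)
results = look_up.get_data()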

View File

@ -0,0 +1,128 @@
"""
functionality:
- handle watched state for videos, channels and playlists
"""
import json
from datetime import datetime
import requests
from home.src.ta.config import AppConfig
from home.src.ta.helper import UrlListParser
class WatchState:
"""handle watched checkbox for videos and channels"""
CONFIG = AppConfig().config
ES_URL = CONFIG["application"]["es_url"]
ES_AUTH = CONFIG["application"]["es_auth"]
HEADERS = {"Content-type": "application/json"}
def __init__(self, youtube_id):
self.youtube_id = youtube_id
self.stamp = int(datetime.now().strftime("%s"))
def mark_as_watched(self):
"""update es with new watched value"""
url_type = self.dedect_type()
if url_type == "video":
self.mark_vid_watched()
elif url_type == "channel":
self.mark_channel_watched()
elif url_type == "playlist":
self.mark_playlist_watched()
print(f"marked {self.youtube_id} as watched")
def mark_as_unwatched(self):
"""revert watched state to false"""
url_type = self.dedect_type()
if url_type == "video":
self.mark_vid_watched(revert=True)
print(f"revert {self.youtube_id} as unwatched")
def dedect_type(self):
"""find youtube id type"""
print(self.youtube_id)
url_process = UrlListParser(self.youtube_id).process_list()
url_type = url_process[0]["type"]
return url_type
def mark_vid_watched(self, revert=False):
"""change watched status of single video"""
url = self.ES_URL + "/ta_video/_update/" + self.youtube_id
data = {
"doc": {"player": {"watched": True, "watched_date": self.stamp}}
}
if revert:
data["doc"]["player"]["watched"] = False
payload = json.dumps(data)
request = requests.post(
url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH
)
if not request.ok:
print(request.text)
raise ValueError("failed to mark video as watched")
def mark_channel_watched(self):
"""change watched status of every video in channel"""
data = {
"query": {
"bool": {
"must": [
{
"term": {
"channel.channel_id": {
"value": self.youtube_id
}
}
},
{"term": {"player.watched": {"value": False}}},
]
}
},
"script": {
"source": "ctx._source.player['watched'] = true",
"lang": "painless",
},
}
payload = json.dumps(data)
url = f"{self.ES_URL}/ta_video/_update_by_query"
request = requests.post(
url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH
)
if not request.ok:
print(request.text)
raise ValueError("failed to mark channel as watched")
def mark_playlist_watched(self):
"""change watched state of all videos in playlist"""
data = {
"query": {
"bool": {
"must": [
{
"term": {
"playlist.keyword": {"value": self.youtube_id}
}
},
{"term": {"player.watched": {"value": False}}},
]
}
},
"script": {
"source": "ctx._source.player['watched'] = true",
"lang": "painless",
},
}
payload = json.dumps(data)
url = f"{self.ES_URL}/ta_video/_update_by_query"
request = requests.post(
url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH
)
if not request.ok:
print(request.text)
raise ValueError("failed to mark playlist as watched")
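
A minimal usage sketch for the class above; the ids are placeholders:

from home.src.frontend.watched import WatchState

WatchState("<video, channel or playlist id>").mark_as_watched()
WatchState("<video id>").mark_as_unwatched()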

View File

@ -1,970 +0,0 @@
"""
Functionality:
- index new videos into elastisearch
- extract video info with yt_dlp
- scrape youtube channel page if needed
"""
import json
import os
import re
from datetime import datetime
from time import sleep
import requests
import yt_dlp
from bs4 import BeautifulSoup
from home.src.config import AppConfig
from home.src.helper import DurationConverter, UrlListParser, clean_string
from home.src.thumbnails import ThumbManager
from ryd_client import ryd_client
class YoutubeChannel:
"""represents a single youtube channel"""
CONFIG = AppConfig().config
ES_URL = CONFIG["application"]["es_url"]
ES_AUTH = CONFIG["application"]["es_auth"]
CACHE_DIR = CONFIG["application"]["cache_dir"]
VIDEOS = CONFIG["application"]["videos"]
def __init__(self, channel_id):
self.channel_id = channel_id
self.json_data = None
self.source = None
self.channel_dict = self.build_channel_dict()
def build_channel_dict(self, scrape=False):
"""combine the dicts build from extracted json payload"""
if scrape:
channel_dict = False
else:
channel_dict = self.get_es_channel()
if not channel_dict:
print("scrape data from youtube")
self.scrape_channel()
channel_dict = self.parse_channel_main()
channel_dict.update(self.parse_channel_meta())
self.source = "scraped"
return channel_dict
def get_es_channel(self):
"""get from elastic search first if possible"""
channel_id = self.channel_id
url = f"{self.ES_URL}/ta_channel/_doc/{channel_id}"
response = requests.get(url, auth=self.ES_AUTH)
if response.ok:
channel_source = response.json()["_source"]
self.source = "elastic"
return channel_source
return False
def scrape_channel(self):
"""scrape channel page for additional infos"""
channel_id = self.channel_id
url = f"https://www.youtube.com/channel/{channel_id}/about?hl=en"
cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"}
response = requests.get(url, cookies=cookies, auth=self.ES_AUTH)
if response.ok:
channel_page = response.text
else:
print(f"failed to extract channel info for: {channel_id}")
raise ConnectionError
soup = BeautifulSoup(channel_page, "html.parser")
# load script into json
all_scripts = soup.find("body").find_all("script")
for script in all_scripts:
if "var ytInitialData = " in str(script):
script_content = str(script)
break
# extract payload
script_content = script_content.split("var ytInitialData = ")[1]
json_raw = script_content.rstrip(";</script>")
json_data = json.loads(json_raw)
# add to self
self.json_data = json_data
def parse_channel_main(self):
"""extract maintab values from scraped channel json data"""
main_tab = self.json_data["header"]["c4TabbedHeaderRenderer"]
channel_name = main_tab["title"]
last_refresh = int(datetime.now().strftime("%s"))
# channel_subs
try:
sub_text_simple = main_tab["subscriberCountText"]["simpleText"]
sub_text = sub_text_simple.split(" ")[0]
if sub_text[-1] == "K":
channel_subs = int(float(sub_text.replace("K", "")) * 1000)
elif sub_text[-1] == "M":
channel_subs = int(float(sub_text.replace("M", "")) * 1000000)
elif int(sub_text) >= 0:
channel_subs = int(sub_text)
else:
message = f"{sub_text} not dealt with"
print(message)
except KeyError:
channel_subs = 0
# banner
try:
all_banners = main_tab["banner"]["thumbnails"]
banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"]
except KeyError:
banner = False
# build and return dict
main_channel_dict = {
"channel_active": True,
"channel_last_refresh": last_refresh,
"channel_subs": channel_subs,
"channel_banner_url": banner,
"channel_name": channel_name,
"channel_id": self.channel_id,
}
return main_channel_dict
def parse_channel_meta(self):
"""extract meta tab values from channel payload"""
# meta tab
json_data = self.json_data
meta_tab = json_data["metadata"]["channelMetadataRenderer"]
description = meta_tab["description"]
all_thumbs = meta_tab["avatar"]["thumbnails"]
thumb_url = sorted(all_thumbs, key=lambda k: k["width"])[-1]["url"]
# stats tab
renderer = "twoColumnBrowseResultsRenderer"
all_tabs = json_data["contents"][renderer]["tabs"]
for tab in all_tabs:
if "tabRenderer" in tab.keys():
if tab["tabRenderer"]["title"] == "About":
about_tab = tab["tabRenderer"]["content"][
"sectionListRenderer"
]["contents"][0]["itemSectionRenderer"]["contents"][0][
"channelAboutFullMetadataRenderer"
]
break
try:
channel_views_text = about_tab["viewCountText"]["simpleText"]
channel_views = int(re.sub(r"\D", "", channel_views_text))
except KeyError:
channel_views = 0
meta_channel_dict = {
"channel_description": description,
"channel_thumb_url": thumb_url,
"channel_views": channel_views,
}
return meta_channel_dict
def get_channel_art(self):
"""download channel art for new channels"""
channel_id = self.channel_id
channel_thumb = self.channel_dict["channel_thumb_url"]
channel_banner = self.channel_dict["channel_banner_url"]
ThumbManager().download_chan(
[(channel_id, channel_thumb, channel_banner)]
)
def upload_to_es(self):
"""upload channel data to elastic search"""
url = f"{self.ES_URL}/ta_channel/_doc/{self.channel_id}"
response = requests.put(url, json=self.channel_dict, auth=self.ES_AUTH)
print(f"added {self.channel_id} to es")
if not response.ok:
print(response.text)
raise ValueError("failed to add channel to index")
def sync_to_videos(self):
"""sync new channel_dict to all videos of channel"""
headers = {"Content-type": "application/json"}
channel_id = self.channel_id
# add ingest pipeline
processors = []
for field, value in self.channel_dict.items():
line = {"set": {"field": "channel." + field, "value": value}}
processors.append(line)
data = {"description": channel_id, "processors": processors}
payload = json.dumps(data)
url = self.ES_URL + "/_ingest/pipeline/" + channel_id
request = requests.put(
url, data=payload, headers=headers, auth=self.ES_AUTH
)
if not request.ok:
print(request.text)
# apply pipeline
data = {"query": {"match": {"channel.channel_id": channel_id}}}
payload = json.dumps(data)
url = self.ES_URL + "/ta_video/_update_by_query?pipeline=" + channel_id
request = requests.post(
url, data=payload, headers=headers, auth=self.ES_AUTH
)
if not request.ok:
print(request.text)
def get_folder_path(self):
"""get folder where media files get stored"""
channel_name = self.channel_dict["channel_name"]
folder_name = clean_string(channel_name)
folder_path = os.path.join(self.VIDEOS, folder_name)
return folder_path
def delete_es_videos(self):
"""delete all channel documents from elasticsearch"""
headers = {"Content-type": "application/json"}
data = {
"query": {
"term": {"channel.channel_id": {"value": self.channel_id}}
}
}
payload = json.dumps(data)
url = self.ES_URL + "/ta_video/_delete_by_query"
response = requests.post(
url, data=payload, headers=headers, auth=self.ES_AUTH
)
if not response.ok:
print(response.text)
def delete_playlists(self):
"""delete all indexed playlist from es"""
all_playlists = self.get_indexed_playlists()
for playlist in all_playlists:
playlist_id = playlist["playlist_id"]
YoutubePlaylist(playlist_id).delete_metadata()
def delete_channel(self):
"""delete channel and all videos"""
print(f"deleting {self.channel_id} and all matching media files")
folder_path = self.get_folder_path()
print("delete all media files")
try:
all_videos = os.listdir(folder_path)
for video in all_videos:
video_path = os.path.join(folder_path, video)
os.remove(video_path)
os.rmdir(folder_path)
except FileNotFoundError:
print(f"no videos found for {folder_path}")
ThumbManager().delete_chan_thumb(self.channel_id)
print("delete indexed playlists")
self.delete_playlists()
print("delete indexed videos")
self.delete_es_videos()
url = self.ES_URL + "/ta_channel/_doc/" + self.channel_id
response = requests.delete(url, auth=self.ES_AUTH)
if not response.ok:
print(response.text)
def get_all_playlists(self):
"""get all playlists owned by this channel"""
url = (
f"https://www.youtube.com/channel/{self.channel_id}"
+ "/playlists?view=1&sort=dd&shelf_id=0"
)
obs = {
"quiet": True,
"skip_download": True,
"extract_flat": True,
}
playlists = yt_dlp.YoutubeDL(obs).extract_info(url)
all_entries = [(i["id"], i["title"]) for i in playlists["entries"]]
return all_entries
def get_indexed_playlists(self):
"""get all indexed playlists from channel"""
data = {
"query": {
"term": {"playlist_channel_id": {"value": self.channel_id}}
},
"sort": [{"playlist_channel.keyword": {"order": "desc"}}],
}
all_playlists = IndexPaginate("ta_playlist", data).get_results()
return all_playlists
class YoutubeVideo:
"""represents a single youtube video"""
CONFIG = AppConfig().config
ES_URL = CONFIG["application"]["es_url"]
ES_AUTH = CONFIG["application"]["es_auth"]
CACHE_DIR = CONFIG["application"]["cache_dir"]
VIDEOS = CONFIG["application"]["videos"]
def __init__(self, youtube_id):
self.youtube_id = youtube_id
self.channel_id = None
self.vid_dict = None
def get_vid_dict(self):
"""wrapper to loop around yt_dlp to retry on failure"""
print(f"get video data for {self.youtube_id}")
vid_dict = False
for i in range(3):
try:
vid_dict = self.get_youtubedl_vid_data()
except KeyError as e:
print(e)
sleep((i + 1) ** 2)
continue
else:
break
self.vid_dict = vid_dict
if self.CONFIG["downloads"]["integrate_ryd"]:
self.get_ryd_stats()
def get_youtubedl_vid_data(self):
"""parse youtubedl extract info"""
youtube_id = self.youtube_id
obs = {
"quiet": True,
"default_search": "ytsearch",
"skip_download": True,
"check_formats": "selected",
"noplaylist": True,
}
try:
vid = yt_dlp.YoutubeDL(obs).extract_info(youtube_id)
except (
yt_dlp.utils.ExtractorError,
yt_dlp.utils.DownloadError,
):
print("failed to get info for " + youtube_id)
return False
# extract
self.channel_id = vid["channel_id"]
upload_date = vid["upload_date"]
upload_date_time = datetime.strptime(upload_date, "%Y%m%d")
published = upload_date_time.strftime("%Y-%m-%d")
last_refresh = int(datetime.now().strftime("%s"))
# likes
try:
like_count = vid["like_count"]
except KeyError:
like_count = 0
try:
dislike_count = vid["dislike_count"]
except KeyError:
dislike_count = 0
# build dicts
stats = {
"view_count": vid["view_count"],
"like_count": like_count,
"dislike_count": dislike_count,
"average_rating": vid["average_rating"],
}
vid_basic = {
"title": vid["title"],
"description": vid["description"],
"category": vid["categories"],
"vid_thumb_url": vid["thumbnail"],
"tags": vid["tags"],
"published": published,
"stats": stats,
"vid_last_refresh": last_refresh,
"date_downloaded": last_refresh,
"youtube_id": youtube_id,
"active": True,
"channel": False,
}
return vid_basic
def add_player(self, missing_vid):
"""add player information for new videos"""
cache_path = self.CACHE_DIR + "/download/"
videos = self.VIDEOS
if missing_vid:
# coming from scan_filesystem
channel_name, file_name, _ = missing_vid
vid_path = os.path.join(videos, channel_name, file_name)
else:
# coming from VideoDownload
all_cached = os.listdir(cache_path)
for file_cached in all_cached:
if self.youtube_id in file_cached:
vid_path = os.path.join(cache_path, file_cached)
break
duration_handler = DurationConverter()
duration = duration_handler.get_sec(vid_path)
duration_str = duration_handler.get_str(duration)
player = {
"watched": False,
"duration": duration,
"duration_str": duration_str,
}
self.vid_dict["player"] = player
def build_file_path(self, channel_name):
"""build media_url from where file will be located"""
clean_channel_name = clean_string(channel_name)
timestamp = self.vid_dict["published"].replace("-", "")
youtube_id = self.vid_dict["youtube_id"]
title = self.vid_dict["title"]
clean_title = clean_string(title)
filename = f"{timestamp}_{youtube_id}_{clean_title}.mp4"
media_url = os.path.join(clean_channel_name, filename)
self.vid_dict["media_url"] = media_url
def get_es_data(self):
"""get current data from elastic search"""
url = self.ES_URL + "/ta_video/_doc/" + self.youtube_id
response = requests.get(url, auth=self.ES_AUTH)
if not response.ok:
print(response.text)
es_vid_dict = json.loads(response.text)
return es_vid_dict
def upload_to_es(self):
"""upload video data to elastic search"""
url = f"{self.ES_URL}/ta_video/_doc/{self.youtube_id}/?refresh=true"
response = requests.put(url, json=self.vid_dict, auth=self.ES_AUTH)
if not response.ok:
print(response.text)
raise ValueError("failed to add video to index")
def deactivate(self):
"""deactivate document on extractor error"""
youtube_id = self.youtube_id
headers = {"Content-type": "application/json"}
url = f"{self.ES_URL}/ta_video/_update/{youtube_id}"
data = {"script": "ctx._source.active = false"}
json_str = json.dumps(data)
response = requests.post(
url, data=json_str, headers=headers, auth=self.ES_AUTH
)
print(f"deactivated {youtube_id}")
if not response.ok:
print(response.text)
def delete_media_file(self):
"""delete video file, meta data, thumbnails"""
# delete media file
es_vid_dict = self.get_es_data()
media_url = es_vid_dict["_source"]["media_url"]
print(f"delete {media_url} from file system")
to_delete = os.path.join(self.VIDEOS, media_url)
os.remove(to_delete)
# delete from index
url = f"{self.ES_URL}/ta_video/_doc/{self.youtube_id}"
response = requests.delete(url, auth=self.ES_AUTH)
if not response.ok:
print(response.text)
# delete thumbs from cache
ThumbManager().delete_vid_thumb(self.youtube_id)
def get_ryd_stats(self):
"""get optional stats from returnyoutubedislikeapi.com"""
try:
print(f"get ryd stats for: {self.youtube_id}")
result = ryd_client.get(self.youtube_id)
except requests.exceptions.ConnectionError:
print(f"failed to query ryd api, skipping {self.youtube_id}")
return False
if result["status"] == 404:
return False
dislikes = {
"dislike_count": result["dislikes"],
"average_rating": result["rating"],
}
self.vid_dict["stats"].update(dislikes)
return True
class YoutubePlaylist:
"""represent a single playlist on YouTube"""
CONFIG = AppConfig().config
ES_URL = CONFIG["application"]["es_url"]
ES_AUTH = CONFIG["application"]["es_auth"]
def __init__(self, playlist_id, all_youtube_ids=False):
self.playlist_id = playlist_id
self.stamp = int(datetime.now().strftime("%s"))
self.all_youtube_ids = all_youtube_ids
self.playlist_dict = False
def get_playlist_dict(self, scrape=False):
"""get data from es or youtube"""
print(f"get playlist with id {self.playlist_id}")
if scrape:
playlist_dict = self.get_youtube_playlist()
if not playlist_dict:
return False
playlist_dict["playlist_entries"] = self.get_entries()
else:
playlist_dict = self.get_es_playlist()
if not playlist_dict:
playlist_dict = self.get_youtube_playlist()
playlist_dict["playlist_entries"] = self.get_entries()
self.playlist_dict = playlist_dict
return True
def get_youtube_playlist(self):
"""get meta data dict from youtube"""
url = "https://www.youtube.com/playlist?list=" + self.playlist_id
obs = {
"default_search": "ytsearch",
"quiet": True,
"skip_download": True,
"extract_flat": True,
"playlistend": 0,
}
try:
playlist = yt_dlp.YoutubeDL(obs).extract_info(url, download=False)
except (
yt_dlp.utils.ExtractorError,
yt_dlp.utils.DownloadError,
):
print("failed to get info for " + self.playlist_id)
return False
playlist_es = {
"playlist_id": self.playlist_id,
"playlist_active": True,
"playlist_subscribed": False,
"playlist_name": playlist["title"],
"playlist_channel": playlist["channel"],
"playlist_channel_id": playlist["channel_id"],
"playlist_thumbnail": playlist["thumbnails"][-1]["url"],
"playlist_description": playlist["description"] or False,
"playlist_last_refresh": self.stamp,
}
return playlist_es
def get_es_playlist(self):
"""get indexed data from es"""
url = f"{self.ES_URL}/ta_playlist/_doc/{self.playlist_id}"
response = requests.get(url, auth=self.ES_AUTH)
if response.ok:
return json.loads(response.text)["_source"]
return False
def get_entries(self, playlistend=False):
"""get all videos in playlist"""
url = "https://www.youtube.com/playlist?list=" + self.playlist_id
obs = {
"default_search": "ytsearch",
"quiet": True,
"skip_download": True,
"extract_flat": True,
}
if playlistend:
obs["playlistend"] = playlistend
try:
playlist = yt_dlp.YoutubeDL(obs).extract_info(url, download=False)
except (
yt_dlp.utils.ExtractorError,
yt_dlp.utils.DownloadError,
):
print("failed to get plealist entries for " + self.playlist_id)
return False
all_members = []
for idx, entry in enumerate(playlist["entries"]):
uploader = entry["uploader"]
youtube_id = entry["id"]
if self.all_youtube_ids:
downloaded = youtube_id in self.all_youtube_ids
else:
downloaded = False
if not uploader:
continue
to_append = {
"youtube_id": youtube_id,
"title": entry["title"],
"uploader": uploader,
"idx": idx,
"downloaded": downloaded,
}
all_members.append(to_append)
return all_members
def upload_to_es(self):
"""add playlist to es with its entries"""
playlist = self.playlist_dict
url = f"{self.ES_URL}/ta_playlist/_doc/{self.playlist_id}"
response = requests.put(url, json=playlist, auth=self.ES_AUTH)
if not response.ok:
print(response.text)
raise ValueError("failed to add playlist to index")
def add_vids_to_playlist(self):
"""sync the playlist id to videos"""
playlist_dict = self.playlist_dict
script = (
'if (!ctx._source.containsKey("playlist")) '
+ "{ctx._source.playlist = [params.playlist]} "
+ "else if (!ctx._source.playlist.contains(params.playlist)) "
+ "{ctx._source.playlist.add(params.playlist)} "
+ "else {ctx.op = 'none'}"
)
bulk_list = []
for entry in playlist_dict["playlist_entries"]:
youtube_id = entry["youtube_id"]
action = {"update": {"_id": youtube_id, "_index": "ta_video"}}
source = {
"script": {
"source": script,
"lang": "painless",
"params": {"playlist": self.playlist_id},
}
}
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(source))
# add last newline
bulk_list.append("\n")
query_str = "\n".join(bulk_list)
headers = {"Content-type": "application/x-ndjson"}
url = self.ES_URL + "/_bulk"
response = requests.post(
url, data=query_str, headers=headers, auth=self.ES_AUTH
)
if not response.ok:
print(response.text)
def update_playlist(self):
"""update metadata for playlist with data from YouTube"""
subscribed = self.get_es_playlist()["playlist_subscribed"]
self.get_playlist_dict(scrape=True)
if not self.playlist_dict:
# return false to deactivate
return False
self.playlist_dict["playlist_subscribed"] = subscribed
self.upload_to_es()
return self.playlist_dict
def build_nav(self, youtube_id):
"""find next and previous in playlist of a given youtube_id"""
all_entries_available = self.playlist_dict["playlist_entries"]
all_entries = [i for i in all_entries_available if i["downloaded"]]
current = [i for i in all_entries if i["youtube_id"] == youtube_id]
# stop if not found or playlist of 1
if not current or not len(all_entries) > 1:
return False
current_idx = all_entries.index(current[0])
if current_idx == 0:
previous_item = False
else:
previous_item = all_entries[current_idx - 1]
prev_thumb = ThumbManager().vid_thumb_path(
previous_item["youtube_id"]
)
previous_item["vid_thumb"] = prev_thumb
if current_idx == len(all_entries) - 1:
next_item = False
else:
next_item = all_entries[current_idx + 1]
next_thumb = ThumbManager().vid_thumb_path(next_item["youtube_id"])
next_item["vid_thumb"] = next_thumb
nav = {
"playlist_meta": {
"current_idx": current[0]["idx"],
"playlist_id": self.playlist_id,
"playlist_name": self.playlist_dict["playlist_name"],
"playlist_channel": self.playlist_dict["playlist_channel"],
},
"playlist_previous": previous_item,
"playlist_next": next_item,
}
return nav
def delete_metadata(self):
"""delete metadata for playlist"""
script = (
"ctx._source.playlist.removeAll("
+ "Collections.singleton(params.playlist)) "
)
data = {
"query": {
"term": {"playlist.keyword": {"value": self.playlist_id}}
},
"script": {
"source": script,
"lang": "painless",
"params": {"playlist": self.playlist_id},
},
}
payload = json.dumps(data)
url = f"{self.ES_URL}/ta_video/_update_by_query"
headers = {"Content-type": "application/json"}
response = requests.post(
url, data=payload, headers=headers, auth=self.ES_AUTH
)
if not response.ok:
print(response.text)
self.delete_playlist()
def delete_videos_playlist(self):
"""delete playlist with all videos"""
print(f"delete playlist {self.playlist_id} with all videos")
self.get_playlist_dict()
all_youtube_id = [
i["youtube_id"]
for i in self.playlist_dict["playlist_entries"]
if i["downloaded"]
]
for youtube_id in all_youtube_id:
YoutubeVideo(youtube_id).delete_media_file()
self.delete_playlist()
def delete_playlist(self):
"""delete only playlist document"""
url = f"{self.ES_URL}/ta_playlist/_doc/{self.playlist_id}"
response = requests.delete(url, auth=self.ES_AUTH)
if not response.ok:
print(response.text)
def deactivate(self):
"""deactivate document on extractor error"""
headers = {"Content-type": "application/json"}
url = f"{self.ES_URL}/ta_playlist/_update/{self.playlist_id}"
data = {"script": "ctx._source.playlist_active = false"}
json_str = json.dumps(data)
response = requests.post(
url, data=json_str, headers=headers, auth=self.ES_AUTH
)
print(f"deactivated {self.playlist_id}")
if not response.ok:
print(response.text)
class WatchState:
"""handle watched checkbox for videos and channels"""
CONFIG = AppConfig().config
ES_URL = CONFIG["application"]["es_url"]
ES_AUTH = CONFIG["application"]["es_auth"]
HEADERS = {"Content-type": "application/json"}
def __init__(self, youtube_id):
self.youtube_id = youtube_id
self.stamp = int(datetime.now().strftime("%s"))
def mark_as_watched(self):
"""update es with new watched value"""
url_type = self.dedect_type()
if url_type == "video":
self.mark_vid_watched()
elif url_type == "channel":
self.mark_channel_watched()
elif url_type == "playlist":
self.mark_playlist_watched()
print(f"marked {self.youtube_id} as watched")
def mark_as_unwatched(self):
"""revert watched state to false"""
url_type = self.dedect_type()
if url_type == "video":
self.mark_vid_watched(revert=True)
print(f"revert {self.youtube_id} as unwatched")
def dedect_type(self):
"""find youtube id type"""
print(self.youtube_id)
url_process = UrlListParser(self.youtube_id).process_list()
url_type = url_process[0]["type"]
return url_type
def mark_vid_watched(self, revert=False):
"""change watched status of single video"""
url = self.ES_URL + "/ta_video/_update/" + self.youtube_id
data = {
"doc": {"player": {"watched": True, "watched_date": self.stamp}}
}
if revert:
data["doc"]["player"]["watched"] = False
payload = json.dumps(data)
request = requests.post(
url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH
)
if not request.ok:
print(request.text)
raise ValueError("failed to mark video as watched")
def mark_channel_watched(self):
"""change watched status of every video in channel"""
data = {
"query": {
"bool": {
"must": [
{
"term": {
"channel.channel_id": {
"value": self.youtube_id
}
}
},
{"term": {"player.watched": {"value": False}}},
]
}
},
"script": {
"source": "ctx._source.player['watched'] = true",
"lang": "painless",
},
}
payload = json.dumps(data)
url = f"{self.ES_URL}/ta_video/_update_by_query"
request = requests.post(
url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH
)
if not request.ok:
print(request.text)
raise ValueError("failed mark channel as watched")
def mark_playlist_watched(self):
"""change watched state of all videos in playlist"""
data = {
"query": {
"bool": {
"must": [
{
"term": {
"playlist.keyword": {"value": self.youtube_id}
}
},
{"term": {"player.watched": {"value": False}}},
]
}
},
"script": {
"source": "ctx._source.player['watched'] = true",
"lang": "painless",
},
}
payload = json.dumps(data)
url = f"{self.ES_URL}/ta_video/_update_by_query"
request = requests.post(
url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH
)
if not request.ok:
print(request.text)
raise ValueError("failed mark playlist as watched")
class IndexPaginate:
"""use search_after to go through whole index"""
CONFIG = AppConfig().config
ES_URL = CONFIG["application"]["es_url"]
ES_AUTH = CONFIG["application"]["es_auth"]
HEADERS = {"Content-type": "application/json"}
DEFAULT_SIZE = 500
def __init__(self, index_name, data, size=False):
self.index_name = index_name
self.data = data
self.pit_id = False
self.size = size
def get_results(self):
"""get all results"""
self.get_pit()
self.validate_data()
all_results = self.run_loop()
self.clean_pit()
return all_results
def get_pit(self):
"""get pit for index"""
url = f"{self.ES_URL}/{self.index_name}/_pit?keep_alive=10m"
response = requests.post(url, auth=self.ES_AUTH)
json_data = json.loads(response.text)
self.pit_id = json_data["id"]
def validate_data(self):
"""add pit and size to data"""
if "sort" not in self.data.keys():
print(self.data)
raise ValueError("missing sort key in data")
size = self.size or self.DEFAULT_SIZE
self.data["size"] = size
self.data["pit"] = {"id": self.pit_id, "keep_alive": "10m"}
def run_loop(self):
"""loop through results until last hit"""
query_str = json.dumps(self.data)
url = self.ES_URL + "/_search"
all_results = []
while True:
response = requests.get(
url, data=query_str, headers=self.HEADERS, auth=self.ES_AUTH
)
json_data = json.loads(response.text)
all_hits = json_data["hits"]["hits"]
if all_hits:
for hit in all_hits:
source = hit["_source"]
search_after = hit["sort"]
all_results.append(source)
# update search_after with last hit data
self.data["search_after"] = search_after
query_str = json.dumps(self.data)
else:
break
return all_results
def clean_pit(self):
"""delete pit from elastic search"""
query_str = json.dumps({"id": self.pit_id})
requests.delete(
self.ES_URL + "/_pit",
data=query_str,
headers=self.HEADERS,
auth=self.ES_AUTH,
)
def index_new_video(youtube_id, missing_vid=False):
"""combine video and channel classes for new video index"""
vid_handler = YoutubeVideo(youtube_id)
vid_handler.get_vid_dict()
if not vid_handler.vid_dict:
raise ValueError("failed to get metadata for " + youtube_id)
channel_handler = YoutubeChannel(vid_handler.channel_id)
# add filepath to vid_dict
channel_name = channel_handler.channel_dict["channel_name"]
vid_handler.build_file_path(channel_name)
# add channel and player to video
vid_handler.add_player(missing_vid)
vid_handler.vid_dict["channel"] = channel_handler.channel_dict
# add new channel to es
if channel_handler.source == "scraped":
channel_handler.channel_dict["channel_subscribed"] = False
channel_handler.upload_to_es()
channel_handler.get_channel_art()
# upload video to es
vid_handler.upload_to_es()
# return vid_dict for further processing
return vid_handler.vid_dict

View File

View File

@ -0,0 +1,266 @@
"""
functionality:
- get metadata from youtube for a channel
- index and update in es
"""
import json
import os
import re
from datetime import datetime
import requests
import yt_dlp
from bs4 import BeautifulSoup
from home.src.download.thumbnails import ThumbManager
from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.index.generic import YouTubeItem
from home.src.index.playlist import YoutubePlaylist
from home.src.ta.helper import clean_string
class ChannelScraper:
"""custom scraper using bs4 to scrape channel about page
will be able to be integrated into yt-dlp
once #2237 and #2350 are merged upstream
"""
def __init__(self, channel_id):
self.channel_id = channel_id
self.soup = False
self.yt_json = False
self.json_data = False
def get_json(self):
"""main method to return channel dict"""
self.get_soup()
self._extract_yt_json()
self._parse_channel_main()
self._parse_channel_meta()
return self.json_data
def get_soup(self):
"""return soup from youtube"""
print(f"{self.channel_id}: scrape channel data from youtube")
url = f"https://www.youtube.com/channel/{self.channel_id}/about?hl=en"
cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"}
response = requests.get(url, cookies=cookies)
if response.ok:
channel_page = response.text
else:
print(f"{self.channel_id}: failed to extract channel info")
raise ConnectionError
self.soup = BeautifulSoup(channel_page, "html.parser")
def _extract_yt_json(self):
"""parse soup and get ytInitialData json"""
all_scripts = self.soup.find("body").find_all("script")
for script in all_scripts:
if "var ytInitialData = " in str(script):
script_content = str(script)
break
# extract payload
script_content = script_content.split("var ytInitialData = ")[1]
json_raw = script_content.rstrip(";</script>")
self.yt_json = json.loads(json_raw)
def _parse_channel_main(self):
"""extract maintab values from scraped channel json data"""
main_tab = self.yt_json["header"]["c4TabbedHeaderRenderer"]
# build and return dict
self.json_data = {
"channel_active": True,
"channel_last_refresh": int(datetime.now().strftime("%s")),
"channel_subs": self._get_channel_subs(main_tab),
"channel_name": main_tab["title"],
"channel_banner_url": self._get_thumbnails(main_tab, "banner"),
"channel_tvart_url": self._get_thumbnails(main_tab, "tvBanner"),
"channel_id": self.channel_id,
"channel_subscribed": False,
}
@staticmethod
def _get_thumbnails(main_tab, thumb_name):
"""extract banner url from main_tab"""
try:
all_banners = main_tab[thumb_name]["thumbnails"]
banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"]
except KeyError:
banner = False
return banner
@staticmethod
def _get_channel_subs(main_tab):
"""process main_tab to get channel subs as int"""
try:
sub_text_simple = main_tab["subscriberCountText"]["simpleText"]
sub_text = sub_text_simple.split(" ")[0]
if sub_text[-1] == "K":
channel_subs = int(float(sub_text.replace("K", "")) * 1000)
elif sub_text[-1] == "M":
channel_subs = int(float(sub_text.replace("M", "")) * 1000000)
elif int(sub_text) >= 0:
channel_subs = int(sub_text)
else:
message = f"{sub_text} not dealt with"
print(message)
except KeyError:
channel_subs = 0
return channel_subs
def _parse_channel_meta(self):
"""extract meta tab values from channel payload"""
# meta tab
meta_tab = self.yt_json["metadata"]["channelMetadataRenderer"]
all_thumbs = meta_tab["avatar"]["thumbnails"]
thumb_url = sorted(all_thumbs, key=lambda k: k["width"])[-1]["url"]
# stats tab
renderer = "twoColumnBrowseResultsRenderer"
all_tabs = self.yt_json["contents"][renderer]["tabs"]
for tab in all_tabs:
if "tabRenderer" in tab.keys():
if tab["tabRenderer"]["title"] == "About":
about_tab = tab["tabRenderer"]["content"][
"sectionListRenderer"
]["contents"][0]["itemSectionRenderer"]["contents"][0][
"channelAboutFullMetadataRenderer"
]
break
try:
channel_views_text = about_tab["viewCountText"]["simpleText"]
channel_views = int(re.sub(r"\D", "", channel_views_text))
except KeyError:
channel_views = 0
self.json_data.update(
{
"channel_description": meta_tab["description"],
"channel_thumb_url": thumb_url,
"channel_views": channel_views,
}
)
class YoutubeChannel(YouTubeItem):
"""represents a single youtube channel"""
es_path = False
index_name = "ta_channel"
yt_base = "https://www.youtube.com/channel/"
def __init__(self, youtube_id):
super().__init__(youtube_id)
self.es_path = f"{self.index_name}/_doc/{youtube_id}"
def build_json(self, upload=False):
"""get from es or from youtube"""
self.get_from_es()
if self.json_data:
return
self.get_from_youtube()
if upload:
self.upload_to_es()
return
def get_from_youtube(self):
"""use bs4 to scrape channel about page"""
self.json_data = ChannelScraper(self.youtube_id).get_json()
self.get_channel_art()
def get_channel_art(self):
"""download channel art for new channels"""
channel_id = self.youtube_id
channel_thumb = self.json_data["channel_thumb_url"]
channel_banner = self.json_data["channel_banner_url"]
ThumbManager().download_chan(
[(channel_id, channel_thumb, channel_banner)]
)
def sync_to_videos(self):
"""sync new channel_dict to all videos of channel"""
# add ingest pipeline
processors = []
for field, value in self.json_data.items():
line = {"set": {"field": "channel." + field, "value": value}}
processors.append(line)
data = {"description": self.youtube_id, "processors": processors}
ingest_path = f"_ingest/pipeline/{self.youtube_id}"
_, _ = ElasticWrap(ingest_path).put(data)
# apply pipeline
data = {"query": {"match": {"channel.channel_id": self.youtube_id}}}
update_path = f"ta_video/_update_by_query?pipeline={self.youtube_id}"
_, _ = ElasticWrap(update_path).post(data)
def get_folder_path(self):
"""get folder where media files get stored"""
channel_name = self.json_data["channel_name"]
folder_name = clean_string(channel_name)
folder_path = os.path.join(self.app_conf["videos"], folder_name)
return folder_path
def delete_es_videos(self):
"""delete all channel documents from elasticsearch"""
data = {
"query": {
"term": {"channel.channel_id": {"value": self.youtube_id}}
}
}
_, _ = ElasticWrap("ta_video/_delete_by_query").post(data)
def delete_playlists(self):
"""delete all indexed playlist from es"""
all_playlists = self.get_indexed_playlists()
for playlist in all_playlists:
playlist_id = playlist["playlist_id"]
YoutubePlaylist(playlist_id).delete_metadata()
def delete_channel(self):
"""delete channel and all videos"""
print(f"{self.youtube_id}: delete channel")
self.get_from_es()
folder_path = self.get_folder_path()
print(f"{self.youtube_id}: delete all media files")
try:
all_videos = os.listdir(folder_path)
for video in all_videos:
video_path = os.path.join(folder_path, video)
os.remove(video_path)
os.rmdir(folder_path)
except FileNotFoundError:
print(f"no videos found for {folder_path}")
print(f"{self.youtube_id}: delete indexed playlists")
self.delete_playlists()
print(f"{self.youtube_id}: delete indexed videos")
self.delete_es_videos()
self.del_in_es()
def get_all_playlists(self):
"""get all playlists owned by this channel"""
url = (
f"https://www.youtube.com/channel/{self.youtube_id}"
+ "/playlists?view=1&sort=dd&shelf_id=0"
)
obs = {
"quiet": True,
"skip_download": True,
"extract_flat": True,
}
playlists = yt_dlp.YoutubeDL(obs).extract_info(url)
all_entries = [(i["id"], i["title"]) for i in playlists["entries"]]
return all_entries
def get_indexed_playlists(self):
"""get all indexed playlists from channel"""
data = {
"query": {
"term": {"playlist_channel_id": {"value": self.youtube_id}}
},
"sort": [{"playlist_channel.keyword": {"order": "desc"}}],
}
all_playlists = IndexPaginate("ta_playlist", data).get_results()
return all_playlists
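
A minimal usage sketch for the refactored channel class; the channel id is a placeholder:

from home.src.index.channel import YoutubeChannel

channel = YoutubeChannel("<channel_id>")
channel.build_json(upload=True)  # es lookup first, falls back to scraping youtube
print(channel.json_data["channel_name"])
channel.sync_to_videos()         # push refreshed metadata to all indexed videos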

View File

@ -0,0 +1,325 @@
"""
Functionality:
- reindexing old documents
- syncing updated values between indexes
- scan the filesystem to delete or index
"""
import json
import os
import re
import shutil
import subprocess
from datetime import datetime
import requests
from home.src.download.queue import PendingList
from home.src.download.yt_dlp_handler import VideoDownloader
from home.src.index.reindex import Reindex
from home.src.index.video import index_new_video
from home.src.ta.config import AppConfig
from home.src.ta.helper import clean_string, ignore_filelist
from home.src.ta.ta_redis import RedisArchivist
class FilesystemScanner:
"""handle scanning and fixing from filesystem"""
CONFIG = AppConfig().config
ES_URL = CONFIG["application"]["es_url"]
ES_AUTH = CONFIG["application"]["es_auth"]
VIDEOS = CONFIG["application"]["videos"]
def __init__(self):
self.all_downloaded = self.get_all_downloaded()
self.all_indexed = self.get_all_indexed()
self.mismatch = None
self.to_rename = None
self.to_index = None
self.to_delete = None
def get_all_downloaded(self):
"""get a list of all video files downloaded"""
channels = os.listdir(self.VIDEOS)
all_channels = ignore_filelist(channels)
all_channels.sort()
all_downloaded = []
for channel_name in all_channels:
channel_path = os.path.join(self.VIDEOS, channel_name)
videos = os.listdir(channel_path)
all_videos = ignore_filelist(videos)
for video in all_videos:
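            # media filename convention: <YYYYMMDD>_<11 char youtube_id>_<title>.mp4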
youtube_id = video[9:20]
all_downloaded.append((channel_name, video, youtube_id))
return all_downloaded
@staticmethod
def get_all_indexed():
"""get a list of all indexed videos"""
index_handler = PendingList()
all_indexed_raw = index_handler.get_all_indexed()
all_indexed = []
for video in all_indexed_raw:
youtube_id = video["youtube_id"]
media_url = video["media_url"]
published = video["published"]
title = video["title"]
all_indexed.append((youtube_id, media_url, published, title))
return all_indexed
def list_comarison(self):
"""compare the lists to figure out what to do"""
self.find_unindexed()
self.find_missing()
self.find_bad_media_url()
def find_unindexed(self):
"""find video files without a matching document indexed"""
all_indexed_ids = [i[0] for i in self.all_indexed]
to_index = []
for downloaded in self.all_downloaded:
if downloaded[2] not in all_indexed_ids:
to_index.append(downloaded)
self.to_index = to_index
def find_missing(self):
"""find indexed videos without matching media file"""
all_downloaded_ids = [i[2] for i in self.all_downloaded]
to_delete = []
for video in self.all_indexed:
youtube_id = video[0]
if youtube_id not in all_downloaded_ids:
to_delete.append(video)
self.to_delete = to_delete
def find_bad_media_url(self):
"""rename media files not matching the indexed title"""
to_fix = []
to_rename = []
for downloaded in self.all_downloaded:
channel, filename, downloaded_id = downloaded
# find in indexed
for indexed in self.all_indexed:
indexed_id, media_url, published, title = indexed
if indexed_id == downloaded_id:
# found it
title_c = clean_string(title)
pub = published.replace("-", "")
expected_filename = f"{pub}_{indexed_id}_{title_c}.mp4"
new_url = os.path.join(channel, expected_filename)
if expected_filename != filename:
# file to rename
to_rename.append(
(channel, filename, expected_filename)
)
if media_url != new_url:
# media_url to update in es
to_fix.append((indexed_id, new_url))
break
self.mismatch = to_fix
self.to_rename = to_rename
def rename_files(self):
"""rename media files as identified by find_bad_media_url"""
for bad_filename in self.to_rename:
channel, filename, expected_filename = bad_filename
print(f"renaming [{filename}] to [{expected_filename}]")
old_path = os.path.join(self.VIDEOS, channel, filename)
new_path = os.path.join(self.VIDEOS, channel, expected_filename)
os.rename(old_path, new_path)
def send_mismatch_bulk(self):
"""build bulk update"""
bulk_list = []
for video_mismatch in self.mismatch:
youtube_id, media_url = video_mismatch
print(f"{youtube_id}: fixing media url {media_url}")
action = {"update": {"_id": youtube_id, "_index": "ta_video"}}
source = {"doc": {"media_url": media_url}}
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(source))
# add last newline
bulk_list.append("\n")
query_str = "\n".join(bulk_list)
# make the call
headers = {"Content-type": "application/x-ndjson"}
url = self.ES_URL + "/_bulk"
request = requests.post(
url, data=query_str, headers=headers, auth=self.ES_AUTH
)
if not request.ok:
print(request.text)
def delete_from_index(self):
"""find indexed but deleted mediafile"""
for indexed in self.to_delete:
youtube_id = indexed[0]
print(f"deleting {youtube_id} from index")
url = self.ES_URL + "/ta_video/_doc/" + youtube_id
request = requests.delete(url, auth=self.ES_AUTH)
if not request.ok:
print(request.text)
class ManualImport:
"""import and indexing existing video files"""
CONFIG = AppConfig().config
CACHE_DIR = CONFIG["application"]["cache_dir"]
IMPORT_DIR = os.path.join(CACHE_DIR, "import")
def __init__(self):
self.identified = self.import_folder_parser()
def import_folder_parser(self):
"""detect files in import folder"""
import_files = os.listdir(self.IMPORT_DIR)
to_import = ignore_filelist(import_files)
to_import.sort()
video_files = [i for i in to_import if not i.endswith(".json")]
identified = []
for file_path in video_files:
file_dict = {"video_file": file_path}
file_name, _ = os.path.splitext(file_path)
matching_json = [
i
for i in to_import
if i.startswith(file_name) and i.endswith(".json")
]
if matching_json:
json_file = matching_json[0]
youtube_id = self.extract_id_from_json(json_file)
file_dict.update({"json_file": json_file})
else:
youtube_id = self.extract_id_from_filename(file_name)
file_dict.update({"json_file": False})
file_dict.update({"youtube_id": youtube_id})
identified.append(file_dict)
return identified
@staticmethod
def extract_id_from_filename(file_name):
"""
look at the file name for the youtube id
expects filename ending in [<youtube_id>].<ext>
"""
id_search = re.search(r"\[([a-zA-Z0-9_-]{11})\]$", file_name)
if id_search:
youtube_id = id_search.group(1)
return youtube_id
print("failed to extract youtube id for: " + file_name)
raise Exception
def extract_id_from_json(self, json_file):
"""open json file and extract id"""
json_path = os.path.join(self.CACHE_DIR, "import", json_file)
with open(json_path, "r", encoding="utf-8") as f:
json_content = f.read()
youtube_id = json.loads(json_content)["id"]
return youtube_id
def process_import(self):
"""go through identified media files"""
all_videos_added = []
for media_file in self.identified:
json_file = media_file["json_file"]
video_file = media_file["video_file"]
youtube_id = media_file["youtube_id"]
video_path = os.path.join(self.CACHE_DIR, "import", video_file)
self.move_to_cache(video_path, youtube_id)
# identify and archive
vid_dict = index_new_video(youtube_id)
VideoDownloader([youtube_id]).move_to_archive(vid_dict)
youtube_id = vid_dict["youtube_id"]
thumb_url = vid_dict["vid_thumb_url"]
all_videos_added.append((youtube_id, thumb_url))
# cleanup
if os.path.exists(video_path):
os.remove(video_path)
if json_file:
json_path = os.path.join(self.CACHE_DIR, "import", json_file)
os.remove(json_path)
return all_videos_added
def move_to_cache(self, video_path, youtube_id):
"""move identified video file to cache, convert to mp4"""
file_name = os.path.split(video_path)[-1]
video_file, ext = os.path.splitext(file_name)
# make sure youtube_id is in filename
if youtube_id not in video_file:
video_file = f"{video_file}_{youtube_id}"
# move, convert if needed
if ext == ".mp4":
new_file = video_file + ext
dest_path = os.path.join(self.CACHE_DIR, "download", new_file)
shutil.move(video_path, dest_path)
else:
print(f"processing with ffmpeg: {video_file}")
new_file = video_file + ".mp4"
dest_path = os.path.join(self.CACHE_DIR, "download", new_file)
subprocess.run(
[
"ffmpeg",
"-i",
video_path,
dest_path,
"-loglevel",
"warning",
"-stats",
],
check=True,
)
def scan_filesystem():
"""grouped function to delete and update index"""
filesystem_handler = FilesystemScanner()
filesystem_handler.list_comarison()
if filesystem_handler.to_rename:
print("renaming files")
filesystem_handler.rename_files()
if filesystem_handler.mismatch:
print("fixing media urls in index")
filesystem_handler.send_mismatch_bulk()
if filesystem_handler.to_delete:
print("delete metadata from index")
filesystem_handler.delete_from_index()
if filesystem_handler.to_index:
print("index new videos")
for missing_vid in filesystem_handler.to_index:
youtube_id = missing_vid[2]
index_new_video(youtube_id)
def reindex_old_documents():
"""daily refresh of old documents"""
# continue if needed
reindex_handler = Reindex()
reindex_handler.check_outdated()
reindex_handler.reindex()
# set timestamp
now = int(datetime.now().strftime("%s"))
RedisArchivist().set_message("last_reindex", now, expire=False)

View File

@ -0,0 +1,142 @@
"""
functionality:
- generic base class to inherit from for video, channel and playlist
"""
import math
import yt_dlp
from home.src.es.connect import ElasticWrap
from home.src.ta.config import AppConfig
from home.src.ta.ta_redis import RedisArchivist
class YouTubeItem:
"""base class for youtube"""
es_path = False
index_name = False
yt_base = False
yt_obs = {
"quiet": True,
"default_search": "ytsearch",
"skip_download": True,
"check_formats": "selected",
"noplaylist": True,
}
def __init__(self, youtube_id):
self.youtube_id = youtube_id
self.config = False
self.app_conf = False
self.youtube_meta = False
self.json_data = False
self._get_conf()
def _get_conf(self):
"""read user conf"""
self.config = AppConfig().config
self.app_conf = self.config["application"]
def get_from_youtube(self):
"""use yt-dlp to get meta data from youtube"""
print(f"{self.youtube_id}: get metadata from youtube")
try:
yt_item = yt_dlp.YoutubeDL(self.yt_obs)
response = yt_item.extract_info(self.yt_base + self.youtube_id)
except (
yt_dlp.utils.ExtractorError,
yt_dlp.utils.DownloadError,
):
print(f"{self.youtube_id}: failed to get info from youtube")
            self.youtube_meta = False
            return
        self.youtube_meta = response
def get_from_es(self):
"""get indexed data from elastic search"""
print(f"{self.youtube_id}: get metadata from es")
response, _ = ElasticWrap(f"{self.es_path}").get()
source = response.get("_source")
self.json_data = source
def upload_to_es(self):
"""add json_data to elastic"""
_, _ = ElasticWrap(self.es_path).put(self.json_data, refresh=True)
def deactivate(self):
"""deactivate document in es"""
        key_match = {
            "ta_video": "active",
            "ta_channel": "channel_active",
            "ta_playlist": "playlist_active",
        }
update_path = f"{self.index_name}/_update/{self.youtube_id}"
data = {
"script": f"ctx._source.{key_match.get(self.index_name)} = false"
}
_, _ = ElasticWrap(update_path).post(data)
def del_in_es(self):
"""delete item from elastic search"""
print(f"{self.youtube_id}: delete from es")
_, _ = ElasticWrap(self.es_path).delete()
class Pagination:
"""
figure out the pagination based on page size and total_hits
"""
def __init__(self, page_get, user_id, search_get=False):
self.user_id = user_id
self.page_size = self.get_page_size()
self.page_get = page_get
self.search_get = search_get
self.pagination = self.first_guess()
def get_page_size(self):
"""get default or user modified page_size"""
key = f"{self.user_id}:page_size"
page_size = RedisArchivist().get_message(key)["status"]
if not page_size:
config = AppConfig().config
page_size = config["archive"]["page_size"]
return page_size
def first_guess(self):
"""build first guess before api call"""
page_get = self.page_get
if page_get in [0, 1]:
page_from = 0
prev_pages = False
elif page_get > 1:
page_from = (page_get - 1) * self.page_size
prev_pages = [
i for i in range(page_get - 1, page_get - 6, -1) if i > 1
]
prev_pages.reverse()
pagination = {
"page_size": self.page_size,
"page_from": page_from,
"prev_pages": prev_pages,
"current_page": page_get,
}
if self.search_get:
pagination.update({"search_get": self.search_get})
return pagination
def validate(self, total_hits):
"""validate pagination with total_hits after making api call"""
page_get = self.page_get
max_pages = math.ceil(total_hits / self.page_size)
if page_get < max_pages and max_pages > 1:
self.pagination["last_page"] = max_pages
else:
self.pagination["last_page"] = False
next_pages = [
i for i in range(page_get + 1, page_get + 6) if 1 < i < max_pages
]
self.pagination["next_pages"] = next_pages

View File

@ -0,0 +1,205 @@
"""
functionality:
- get metadata from youtube for a playlist
- index and update in es
"""
import json
from datetime import datetime
from home.src.download.thumbnails import ThumbManager
from home.src.es.connect import ElasticWrap
from home.src.index.generic import YouTubeItem
from home.src.index.video import YoutubeVideo
class YoutubePlaylist(YouTubeItem):
"""represents a single youtube playlist"""
es_path = False
index_name = "ta_playlist"
yt_obs = {
"default_search": "ytsearch",
"quiet": True,
"skip_download": True,
"extract_flat": True,
}
yt_base = "https://www.youtube.com/playlist?list="
def __init__(self, youtube_id):
super().__init__(youtube_id)
self.es_path = f"{self.index_name}/_doc/{youtube_id}"
self.all_members = False
self.nav = False
self.all_youtube_ids = []
def build_json(self, scrape=False):
"""collection to create json_data"""
if not scrape:
self.get_from_es()
if scrape or not self.json_data:
self.get_from_youtube()
self.process_youtube_meta()
self.get_entries()
self.json_data["playlist_entries"] = self.all_members
self.get_playlist_art()
def process_youtube_meta(self):
"""extract relevant fields from youtube"""
self.json_data = {
"playlist_id": self.youtube_id,
"playlist_active": True,
"playlist_subscribed": False,
"playlist_name": self.youtube_meta["title"],
"playlist_channel": self.youtube_meta["channel"],
"playlist_channel_id": self.youtube_meta["channel_id"],
"playlist_thumbnail": self.youtube_meta["thumbnails"][-1]["url"],
"playlist_description": self.youtube_meta["description"] or False,
"playlist_last_refresh": int(datetime.now().strftime("%s")),
}
def get_entries(self, playlistend=False):
"""get all videos in playlist"""
if playlistend:
# implement playlist end
print(playlistend)
all_members = []
for idx, entry in enumerate(self.youtube_meta["entries"]):
if self.all_youtube_ids:
downloaded = entry["id"] in self.all_youtube_ids
else:
downloaded = False
if not entry["uploader"]:
continue
to_append = {
"youtube_id": entry["id"],
"title": entry["title"],
"uploader": entry["uploader"],
"idx": idx,
"downloaded": downloaded,
}
all_members.append(to_append)
self.all_members = all_members
@staticmethod
def get_playlist_art():
"""download artwork of playlist"""
thumbnails = ThumbManager()
missing_playlists = thumbnails.get_missing_playlists()
thumbnails.download_playlist(missing_playlists)
def add_vids_to_playlist(self):
"""sync the playlist id to videos"""
script = (
'if (!ctx._source.containsKey("playlist")) '
+ "{ctx._source.playlist = [params.playlist]} "
+ "else if (!ctx._source.playlist.contains(params.playlist)) "
+ "{ctx._source.playlist.add(params.playlist)} "
+ "else {ctx.op = 'none'}"
)
bulk_list = []
for entry in self.json_data["playlist_entries"]:
video_id = entry["youtube_id"]
action = {"update": {"_id": video_id, "_index": "ta_video"}}
source = {
"script": {
"source": script,
"lang": "painless",
"params": {"playlist": self.youtube_id},
}
}
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(source))
# add last newline
bulk_list.append("\n")
query_str = "\n".join(bulk_list)
ElasticWrap("_bulk").post(query_str, ndjson=True)
def update_playlist(self):
"""update metadata for playlist with data from YouTube"""
self.get_from_es()
subscribed = self.json_data["playlist_subscribed"]
self.get_from_youtube()
if not self.json_data:
# return false to deactivate
return False
self.json_data["playlist_subscribed"] = subscribed
self.upload_to_es()
return True
def build_nav(self, youtube_id):
"""find next and previous in playlist of a given youtube_id"""
all_entries_available = self.json_data["playlist_entries"]
all_entries = [i for i in all_entries_available if i["downloaded"]]
current = [i for i in all_entries if i["youtube_id"] == youtube_id]
# stop if not found or playlist of 1
if not current or not len(all_entries) > 1:
return
current_idx = all_entries.index(current[0])
if current_idx == 0:
previous_item = False
else:
previous_item = all_entries[current_idx - 1]
prev_thumb = ThumbManager().vid_thumb_path(
previous_item["youtube_id"]
)
previous_item["vid_thumb"] = prev_thumb
if current_idx == len(all_entries) - 1:
next_item = False
else:
next_item = all_entries[current_idx + 1]
next_thumb = ThumbManager().vid_thumb_path(next_item["youtube_id"])
next_item["vid_thumb"] = next_thumb
self.nav = {
"playlist_meta": {
"current_idx": current[0]["idx"],
"playlist_id": self.youtube_id,
"playlist_name": self.json_data["playlist_name"],
"playlist_channel": self.json_data["playlist_channel"],
},
"playlist_previous": previous_item,
"playlist_next": next_item,
}
return
def delete_metadata(self):
"""delete metadata for playlist"""
script = (
"ctx._source.playlist.removeAll("
+ "Collections.singleton(params.playlist)) "
)
data = {
"query": {
"term": {"playlist.keyword": {"value": self.youtube_id}}
},
"script": {
"source": script,
"lang": "painless",
"params": {"playlist": self.youtube_id},
},
}
_, _ = ElasticWrap("ta_video/_update_by_query").post(data)
self.del_in_es()
def delete_videos_playlist(self):
"""delete playlist with all videos"""
print(f"{self.youtube_id}: delete playlist")
self.get_from_es()
all_youtube_id = [
i["youtube_id"]
for i in self.json_data["playlist_entries"]
if i["downloaded"]
]
for youtube_id in all_youtube_id:
YoutubeVideo(youtube_id).delete_media_file()
self.delete_metadata()
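A minimal usage sketch for the class above, roughly mirroring how the playlist indexing task later in this diff drives it; the playlist id and the list of already indexed video ids are placeholders:

from home.src.index.playlist import YoutubePlaylist

all_indexed_ids = []  # placeholder: youtube ids of already downloaded videos
playlist = YoutubePlaylist("PL0000000000000000000000000000000000")
playlist.all_youtube_ids = all_indexed_ids
playlist.build_json(scrape=True)       # scrape from youtube and build json_data
if playlist.json_data:
    playlist.upload_to_es()            # write the ta_playlist document
    playlist.add_vids_to_playlist()    # painless bulk update on ta_video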

View File

@ -0,0 +1,271 @@
"""
functionality:
- periodically refresh documents
- index and update in es
"""
import json
from datetime import datetime
from math import ceil
from time import sleep
import requests
from home.src.download.queue import PendingList
from home.src.download.subscriptions import ChannelSubscription
from home.src.download.thumbnails import ThumbManager
from home.src.index.channel import YoutubeChannel
from home.src.index.playlist import YoutubePlaylist
from home.src.index.video import YoutubeVideo
from home.src.ta.config import AppConfig
from home.src.ta.helper import get_total_hits
class Reindex:
"""check for outdated documents and refresh data from youtube"""
def __init__(self):
# config
config = AppConfig().config
self.sleep_interval = config["downloads"]["sleep_interval"]
self.es_url = config["application"]["es_url"]
self.es_auth = config["application"]["es_auth"]
self.refresh_interval = config["scheduler"]["check_reindex_days"]
self.integrate_ryd = config["downloads"]["integrate_ryd"]
# scan
self.all_youtube_ids = False
self.all_channel_ids = False
self.all_playlist_ids = False
def get_daily(self):
"""get daily refresh values"""
total_videos = get_total_hits(
"ta_video", self.es_url, self.es_auth, "active"
)
video_daily = ceil(total_videos / self.refresh_interval * 1.2)
total_channels = get_total_hits(
"ta_channel", self.es_url, self.es_auth, "channel_active"
)
channel_daily = ceil(total_channels / self.refresh_interval * 1.2)
total_playlists = get_total_hits(
"ta_playlist", self.es_url, self.es_auth, "playlist_active"
)
playlist_daily = ceil(total_playlists / self.refresh_interval * 1.2)
return (video_daily, channel_daily, playlist_daily)
def get_outdated_vids(self, size):
"""get daily videos to refresh"""
headers = {"Content-type": "application/json"}
now = int(datetime.now().strftime("%s"))
now_lte = now - self.refresh_interval * 24 * 60 * 60
data = {
"size": size,
"query": {
"bool": {
"must": [
{"match": {"active": True}},
{"range": {"vid_last_refresh": {"lte": now_lte}}},
]
}
},
"sort": [{"vid_last_refresh": {"order": "asc"}}],
"_source": False,
}
query_str = json.dumps(data)
url = self.es_url + "/ta_video/_search"
response = requests.get(
url, data=query_str, headers=headers, auth=self.es_auth
)
if not response.ok:
print(response.text)
response_dict = json.loads(response.text)
all_youtube_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
return all_youtube_ids
def get_unrated_vids(self):
"""get all videos without rating if ryd integration is enabled"""
headers = {"Content-type": "application/json"}
data = {
"size": 200,
"query": {
"bool": {
"must_not": [{"exists": {"field": "stats.average_rating"}}]
}
},
}
query_str = json.dumps(data)
url = self.es_url + "/ta_video/_search"
response = requests.get(
url, data=query_str, headers=headers, auth=self.es_auth
)
if not response.ok:
print(response.text)
response_dict = json.loads(response.text)
missing_rating = [i["_id"] for i in response_dict["hits"]["hits"]]
self.all_youtube_ids = self.all_youtube_ids + missing_rating
def get_outdated_channels(self, size):
"""get daily channels to refresh"""
headers = {"Content-type": "application/json"}
now = int(datetime.now().strftime("%s"))
now_lte = now - self.refresh_interval * 24 * 60 * 60
data = {
"size": size,
"query": {
"bool": {
"must": [
{"match": {"channel_active": True}},
{"range": {"channel_last_refresh": {"lte": now_lte}}},
]
}
},
"sort": [{"channel_last_refresh": {"order": "asc"}}],
"_source": False,
}
query_str = json.dumps(data)
url = self.es_url + "/ta_channel/_search"
response = requests.get(
url, data=query_str, headers=headers, auth=self.es_auth
)
if not response.ok:
print(response.text)
response_dict = json.loads(response.text)
all_channel_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
return all_channel_ids
def get_outdated_playlists(self, size):
"""get daily outdated playlists to refresh"""
headers = {"Content-type": "application/json"}
now = int(datetime.now().strftime("%s"))
now_lte = now - self.refresh_interval * 24 * 60 * 60
data = {
"size": size,
"query": {
"bool": {
"must": [
{"match": {"playlist_active": True}},
{"range": {"playlist_last_refresh": {"lte": now_lte}}},
]
}
},
"sort": [{"playlist_last_refresh": {"order": "asc"}}],
"_source": False,
}
query_str = json.dumps(data)
url = self.es_url + "/ta_playlist/_search"
response = requests.get(
url, data=query_str, headers=headers, auth=self.es_auth
)
if not response.ok:
print(response.text)
response_dict = json.loads(response.text)
all_playlist_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
return all_playlist_ids
def check_outdated(self):
"""add missing vids and channels"""
video_daily, channel_daily, playlist_daily = self.get_daily()
self.all_youtube_ids = self.get_outdated_vids(video_daily)
self.all_channel_ids = self.get_outdated_channels(channel_daily)
self.all_playlist_ids = self.get_outdated_playlists(playlist_daily)
if self.integrate_ryd:
self.get_unrated_vids()
def rescrape_all_channels(self):
"""sync new data from channel to all matching videos"""
sleep_interval = self.sleep_interval
channel_sub_handler = ChannelSubscription()
all_channels = channel_sub_handler.get_channels(subscribed_only=False)
all_channel_ids = [i["channel_id"] for i in all_channels]
for channel_id in all_channel_ids:
channel = YoutubeChannel(channel_id)
subscribed = channel.json_data["channel_subscribed"]
channel.get_from_youtube()
channel.json_data["channel_subscribed"] = subscribed
channel.upload_to_es()
channel.sync_to_videos()
if sleep_interval:
sleep(sleep_interval)
@staticmethod
def reindex_single_video(youtube_id):
"""refresh data for single video"""
video = YoutubeVideo(youtube_id)
# read current state
video.get_from_es()
player = video.json_data["player"]
date_downloaded = video.json_data["date_downloaded"]
channel_dict = video.json_data["channel"]
playlist = video.json_data.get("playlist")
# get new
video.build_json()
if not video.json_data:
            video.deactivate()
            return
        # add back
video.json_data["player"] = player
video.json_data["date_downloaded"] = date_downloaded
video.json_data["channel"] = channel_dict
if playlist:
video.json_data["playlist"] = playlist
video.upload_to_es()
thumb_handler = ThumbManager()
thumb_handler.delete_vid_thumb(youtube_id)
to_download = (youtube_id, video.json_data["vid_thumb_url"])
thumb_handler.download_vid([to_download], notify=False)
@staticmethod
def reindex_single_channel(channel_id):
"""refresh channel data and sync to videos"""
channel = YoutubeChannel(channel_id)
channel.get_from_es()
subscribed = channel.json_data["channel_subscribed"]
channel.get_from_youtube()
channel.json_data["channel_subscribed"] = subscribed
channel.upload_to_es()
channel.sync_to_videos()
@staticmethod
def reindex_single_playlist(playlist_id, all_indexed_ids):
"""refresh playlist data"""
playlist = YoutubePlaylist(playlist_id)
playlist.get_from_es()
subscribed = playlist.json_data["playlist_subscribed"]
playlist.all_youtube_ids = all_indexed_ids
playlist.build_json(scrape=True)
if not playlist.json_data:
playlist.deactivate()
return
playlist.json_data["playlist_subscribed"] = subscribed
playlist.upload_to_es()
return
def reindex(self):
"""reindex what's needed"""
# videos
print(f"reindexing {len(self.all_youtube_ids)} videos")
for youtube_id in self.all_youtube_ids:
self.reindex_single_video(youtube_id)
if self.sleep_interval:
sleep(self.sleep_interval)
# channels
print(f"reindexing {len(self.all_channel_ids)} channels")
for channel_id in self.all_channel_ids:
self.reindex_single_channel(channel_id)
if self.sleep_interval:
sleep(self.sleep_interval)
# playlist
print(f"reindexing {len(self.all_playlist_ids)} playlists")
if self.all_playlist_ids:
all_indexed = PendingList().get_all_indexed()
all_indexed_ids = [i["youtube_id"] for i in all_indexed]
for playlist_id in self.all_playlist_ids:
self.reindex_single_playlist(playlist_id, all_indexed_ids)
if self.sleep_interval:
sleep(self.sleep_interval)
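The get_daily values above follow a simple formula: total active documents divided by check_reindex_days, padded by 20 percent, presumably so the whole index still cycles through within the refresh interval. A quick numeric sketch with made-up totals:

from math import ceil

refresh_interval = 90   # check_reindex_days, illustrative value
total_videos = 2000     # active ta_video documents, illustrative value
video_daily = ceil(total_videos / refresh_interval * 1.2)
print(video_daily)      # 27 videos re-scraped per daily run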

View File

@ -0,0 +1,175 @@
"""
functionality:
- get metadata from youtube for a video
- index and update in es
"""
import os
from datetime import datetime
import requests
from home.src.index import channel as ta_channel
from home.src.index.generic import YouTubeItem
from home.src.ta.helper import DurationConverter, clean_string
from ryd_client import ryd_client
class YoutubeVideo(YouTubeItem):
"""represents a single youtube video"""
es_path = False
index_name = "ta_video"
yt_base = "https://www.youtube.com/watch?v="
def __init__(self, youtube_id):
super().__init__(youtube_id)
self.channel_id = False
self.es_path = f"{self.index_name}/_doc/{youtube_id}"
def build_json(self):
"""build json dict of video"""
self.get_from_youtube()
if not self.youtube_meta:
return
self._process_youtube_meta()
self._add_channel()
self._add_stats()
self.add_file_path()
self.add_player()
if self.config["downloads"]["integrate_ryd"]:
self._get_ryd_stats()
return
def _process_youtube_meta(self):
"""extract relevant fields from youtube"""
# extract
self.channel_id = self.youtube_meta["channel_id"]
upload_date = self.youtube_meta["upload_date"]
upload_date_time = datetime.strptime(upload_date, "%Y%m%d")
published = upload_date_time.strftime("%Y-%m-%d")
last_refresh = int(datetime.now().strftime("%s"))
# build json_data basics
self.json_data = {
"title": self.youtube_meta["title"],
"description": self.youtube_meta["description"],
"category": self.youtube_meta["categories"],
"vid_thumb_url": self.youtube_meta["thumbnail"],
"tags": self.youtube_meta["tags"],
"published": published,
"vid_last_refresh": last_refresh,
"date_downloaded": last_refresh,
"youtube_id": self.youtube_id,
"active": True,
}
def _add_channel(self):
"""add channel dict to video json_data"""
channel = ta_channel.YoutubeChannel(self.channel_id)
channel.build_json(upload=True)
self.json_data.update({"channel": channel.json_data})
def _add_stats(self):
"""add stats dicst to json_data"""
# likes
like_count = self.youtube_meta.get("like_count", 0)
dislike_count = self.youtube_meta.get("dislike_count", 0)
self.json_data.update(
{
"stats": {
"view_count": self.youtube_meta["view_count"],
"like_count": like_count,
"dislike_count": dislike_count,
"average_rating": self.youtube_meta["average_rating"],
}
}
)
def build_dl_cache_path(self):
"""find video path in dl cache"""
cache_dir = self.app_conf["cache_dir"]
cache_path = f"{cache_dir}/download/"
all_cached = os.listdir(cache_path)
for file_cached in all_cached:
if self.youtube_id in file_cached:
vid_path = os.path.join(cache_path, file_cached)
return vid_path
return False
def add_player(self):
"""add player information for new videos"""
try:
# when indexing from download task
vid_path = self.build_dl_cache_path()
except FileNotFoundError:
# when reindexing
base = self.app_conf["videos"]
vid_path = os.path.join(base, self.json_data["media_url"])
duration_handler = DurationConverter()
duration = duration_handler.get_sec(vid_path)
duration_str = duration_handler.get_str(duration)
self.json_data.update(
{
"player": {
"watched": False,
"duration": duration,
"duration_str": duration_str,
}
}
)
def add_file_path(self):
"""build media_url for where file will be located"""
channel_name = self.json_data["channel"]["channel_name"]
clean_channel_name = clean_string(channel_name)
timestamp = self.json_data["published"].replace("-", "")
youtube_id = self.json_data["youtube_id"]
title = self.json_data["title"]
clean_title = clean_string(title)
filename = f"{timestamp}_{youtube_id}_{clean_title}.mp4"
media_url = os.path.join(clean_channel_name, filename)
self.json_data["media_url"] = media_url
def delete_media_file(self):
"""delete video file, meta data"""
self.get_from_es()
video_base = self.app_conf["videos"]
media_url = self.json_data["media_url"]
print(f"{self.youtube_id}: delete {media_url} from file system")
to_delete = os.path.join(video_base, media_url)
os.remove(to_delete)
self.del_in_es()
def _get_ryd_stats(self):
"""get optional stats from returnyoutubedislikeapi.com"""
try:
print(f"{self.youtube_id}: get ryd stats")
result = ryd_client.get(self.youtube_id)
except requests.exceptions.ConnectionError:
print(f"{self.youtube_id}: failed to query ryd api, skipping")
return False
if result["status"] == 404:
return False
dislikes = {
"dislike_count": result["dislikes"],
"average_rating": result["rating"],
}
self.json_data["stats"].update(dislikes)
return True
def index_new_video(youtube_id):
"""combined classes to create new video in index"""
video = YoutubeVideo(youtube_id)
video.build_json()
if not video.json_data:
raise ValueError("failed to get metadata for " + youtube_id)
video.upload_to_es()
return video.json_data
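A short usage sketch for index_new_video as defined above; the video id is a placeholder and the ValueError branch covers videos whose metadata can no longer be fetched:

from home.src.index.video import index_new_video

try:
    json_data = index_new_video("dQw4w9WgXcQ")  # placeholder video id
except ValueError:
    json_data = None  # metadata lookup failed, nothing was indexed
if json_data:
    # media_url follows <channel>/<published>_<youtube_id>_<title>.mp4
    print(json_data["media_url"])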

View File

@ -1,600 +0,0 @@
"""
Functionality:
- reindexing old documents
- syncing updated values between indexes
- scan the filesystem to delete or index
"""
import json
import os
import re
import shutil
import subprocess
from datetime import datetime
from math import ceil
from time import sleep
import requests
from home.src.config import AppConfig
from home.src.download import ChannelSubscription, PendingList, VideoDownloader
from home.src.helper import (
RedisArchivist,
clean_string,
get_total_hits,
ignore_filelist,
)
from home.src.index import (
YoutubeChannel,
YoutubePlaylist,
YoutubeVideo,
index_new_video,
)
from home.src.thumbnails import ThumbManager
class Reindex:
"""check for outdated documents and refresh data from youtube"""
def __init__(self):
# config
config = AppConfig().config
self.sleep_interval = config["downloads"]["sleep_interval"]
self.es_url = config["application"]["es_url"]
self.es_auth = config["application"]["es_auth"]
self.refresh_interval = config["scheduler"]["check_reindex_days"]
self.integrate_ryd = config["downloads"]["integrate_ryd"]
# scan
self.all_youtube_ids = False
self.all_channel_ids = False
self.all_playlist_ids = False
def get_daily(self):
"""get daily refresh values"""
total_videos = get_total_hits(
"ta_video", self.es_url, self.es_auth, "active"
)
video_daily = ceil(total_videos / self.refresh_interval * 1.2)
total_channels = get_total_hits(
"ta_channel", self.es_url, self.es_auth, "channel_active"
)
channel_daily = ceil(total_channels / self.refresh_interval * 1.2)
total_playlists = get_total_hits(
"ta_playlist", self.es_url, self.es_auth, "playlist_active"
)
playlist_daily = ceil(total_playlists / self.refresh_interval * 1.2)
return (video_daily, channel_daily, playlist_daily)
def get_outdated_vids(self, size):
"""get daily videos to refresh"""
headers = {"Content-type": "application/json"}
now = int(datetime.now().strftime("%s"))
now_lte = now - self.refresh_interval * 24 * 60 * 60
data = {
"size": size,
"query": {
"bool": {
"must": [
{"match": {"active": True}},
{"range": {"vid_last_refresh": {"lte": now_lte}}},
]
}
},
"sort": [{"vid_last_refresh": {"order": "asc"}}],
"_source": False,
}
query_str = json.dumps(data)
url = self.es_url + "/ta_video/_search"
response = requests.get(
url, data=query_str, headers=headers, auth=self.es_auth
)
if not response.ok:
print(response.text)
response_dict = json.loads(response.text)
all_youtube_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
return all_youtube_ids
def get_unrated_vids(self):
"""get all videos without rating if ryd integration is enabled"""
headers = {"Content-type": "application/json"}
data = {
"size": 200,
"query": {
"bool": {
"must_not": [{"exists": {"field": "stats.average_rating"}}]
}
},
}
query_str = json.dumps(data)
url = self.es_url + "/ta_video/_search"
response = requests.get(
url, data=query_str, headers=headers, auth=self.es_auth
)
if not response.ok:
print(response.text)
response_dict = json.loads(response.text)
missing_rating = [i["_id"] for i in response_dict["hits"]["hits"]]
self.all_youtube_ids = self.all_youtube_ids + missing_rating
def get_outdated_channels(self, size):
"""get daily channels to refresh"""
headers = {"Content-type": "application/json"}
now = int(datetime.now().strftime("%s"))
now_lte = now - self.refresh_interval * 24 * 60 * 60
data = {
"size": size,
"query": {
"bool": {
"must": [
{"match": {"channel_active": True}},
{"range": {"channel_last_refresh": {"lte": now_lte}}},
]
}
},
"sort": [{"channel_last_refresh": {"order": "asc"}}],
"_source": False,
}
query_str = json.dumps(data)
url = self.es_url + "/ta_channel/_search"
response = requests.get(
url, data=query_str, headers=headers, auth=self.es_auth
)
if not response.ok:
print(response.text)
response_dict = json.loads(response.text)
all_channel_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
return all_channel_ids
def get_outdated_playlists(self, size):
"""get daily outdated playlists to refresh"""
headers = {"Content-type": "application/json"}
now = int(datetime.now().strftime("%s"))
now_lte = now - self.refresh_interval * 24 * 60 * 60
data = {
"size": size,
"query": {
"bool": {
"must": [
{"match": {"playlist_active": True}},
{"range": {"playlist_last_refresh": {"lte": now_lte}}},
]
}
},
"sort": [{"playlist_last_refresh": {"order": "asc"}}],
"_source": False,
}
query_str = json.dumps(data)
url = self.es_url + "/ta_playlist/_search"
response = requests.get(
url, data=query_str, headers=headers, auth=self.es_auth
)
if not response.ok:
print(response.text)
response_dict = json.loads(response.text)
all_playlist_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
return all_playlist_ids
def check_outdated(self):
"""add missing vids and channels"""
video_daily, channel_daily, playlist_daily = self.get_daily()
self.all_youtube_ids = self.get_outdated_vids(video_daily)
self.all_channel_ids = self.get_outdated_channels(channel_daily)
self.all_playlist_ids = self.get_outdated_playlists(playlist_daily)
if self.integrate_ryd:
self.get_unrated_vids()
def rescrape_all_channels(self):
"""sync new data from channel to all matching videos"""
sleep_interval = self.sleep_interval
channel_sub_handler = ChannelSubscription()
all_channels = channel_sub_handler.get_channels(subscribed_only=False)
all_channel_ids = [i["channel_id"] for i in all_channels]
counter = 1
for channel_id in all_channel_ids:
channel_index = YoutubeChannel(channel_id)
subscribed = channel_index.channel_dict["channel_subscribed"]
channel_index.channel_dict = channel_index.build_channel_dict(
scrape=True
)
channel_index.channel_dict["channel_subscribed"] = subscribed
channel_index.upload_to_es()
channel_index.sync_to_videos()
counter = counter + 1
if sleep_interval:
sleep(sleep_interval)
@staticmethod
def reindex_single_video(youtube_id):
"""refresh data for single video"""
vid_handler = YoutubeVideo(youtube_id)
vid_handler.get_vid_dict()
if not vid_handler.vid_dict:
# stop if deactivated
vid_handler.deactivate()
return
es_vid_dict = vid_handler.get_es_data()
player = es_vid_dict["_source"]["player"]
date_downloaded = es_vid_dict["_source"]["date_downloaded"]
channel_dict = es_vid_dict["_source"]["channel"]
channel_name = channel_dict["channel_name"]
try:
playlist = es_vid_dict["_source"]["playlist"]
except KeyError:
playlist = False
vid_handler.build_file_path(channel_name)
# add to vid_dict
vid_handler.vid_dict["player"] = player
vid_handler.vid_dict["date_downloaded"] = date_downloaded
vid_handler.vid_dict["channel"] = channel_dict
if playlist:
vid_handler.vid_dict["playlist"] = playlist
# update
vid_handler.upload_to_es()
thumb_handler = ThumbManager()
thumb_handler.delete_vid_thumb(youtube_id)
to_download = (youtube_id, vid_handler.vid_dict["vid_thumb_url"])
thumb_handler.download_vid([to_download], notify=False)
@staticmethod
def reindex_single_channel(channel_id):
"""refresh channel data and sync to videos"""
channel_handler = YoutubeChannel(channel_id)
subscribed = channel_handler.channel_dict["channel_subscribed"]
channel_handler.channel_dict = channel_handler.build_channel_dict(
scrape=True
)
channel_handler.channel_dict["channel_subscribed"] = subscribed
# update
channel_handler.upload_to_es()
channel_handler.sync_to_videos()
thumb_handler = ThumbManager()
thumb_handler.delete_chan_thumb(channel_id)
channel_thumb = channel_handler.channel_dict["channel_thumb_url"]
channel_banner = channel_handler.channel_dict["channel_banner_url"]
to_download = (channel_id, channel_thumb, channel_banner)
thumb_handler.download_chan([to_download])
@staticmethod
def reindex_single_playlist(playlist_id, all_indexed_ids):
"""refresh playlist data"""
playlist_handler = YoutubePlaylist(
playlist_id, all_youtube_ids=all_indexed_ids
)
playlist = playlist_handler.update_playlist()
if not playlist:
playlist_handler.deactivate()
return
playlist_thumbnail = (playlist_id, playlist["playlist_thumbnail"])
thumb_handler = ThumbManager()
thumb_handler.download_playlist([playlist_thumbnail])
return
def reindex(self):
"""reindex what's needed"""
# videos
print(f"reindexing {len(self.all_youtube_ids)} videos")
for youtube_id in self.all_youtube_ids:
self.reindex_single_video(youtube_id)
if self.sleep_interval:
sleep(self.sleep_interval)
# channels
print(f"reindexing {len(self.all_channel_ids)} channels")
for channel_id in self.all_channel_ids:
self.reindex_single_channel(channel_id)
if self.sleep_interval:
sleep(self.sleep_interval)
# playlist
print(f"reindexing {len(self.all_playlist_ids)} playlists")
if self.all_playlist_ids:
all_indexed = PendingList().get_all_indexed()
all_indexed_ids = [i["youtube_id"] for i in all_indexed]
for playlist_id in self.all_playlist_ids:
self.reindex_single_playlist(playlist_id, all_indexed_ids)
if self.sleep_interval:
sleep(self.sleep_interval)
class FilesystemScanner:
"""handle scanning and fixing from filesystem"""
CONFIG = AppConfig().config
ES_URL = CONFIG["application"]["es_url"]
ES_AUTH = CONFIG["application"]["es_auth"]
VIDEOS = CONFIG["application"]["videos"]
def __init__(self):
self.all_downloaded = self.get_all_downloaded()
self.all_indexed = self.get_all_indexed()
self.mismatch = None
self.to_rename = None
self.to_index = None
self.to_delete = None
def get_all_downloaded(self):
"""get a list of all video files downloaded"""
channels = os.listdir(self.VIDEOS)
all_channels = ignore_filelist(channels)
all_channels.sort()
all_downloaded = []
for channel_name in all_channels:
channel_path = os.path.join(self.VIDEOS, channel_name)
videos = os.listdir(channel_path)
all_videos = ignore_filelist(videos)
for video in all_videos:
youtube_id = video[9:20]
all_downloaded.append((channel_name, video, youtube_id))
return all_downloaded
@staticmethod
def get_all_indexed():
"""get a list of all indexed videos"""
index_handler = PendingList()
all_indexed_raw = index_handler.get_all_indexed()
all_indexed = []
for video in all_indexed_raw:
youtube_id = video["youtube_id"]
media_url = video["media_url"]
published = video["published"]
title = video["title"]
all_indexed.append((youtube_id, media_url, published, title))
return all_indexed
def list_comarison(self):
"""compare the lists to figure out what to do"""
self.find_unindexed()
self.find_missing()
self.find_bad_media_url()
def find_unindexed(self):
"""find video files without a matching document indexed"""
all_indexed_ids = [i[0] for i in self.all_indexed]
to_index = []
for downloaded in self.all_downloaded:
if downloaded[2] not in all_indexed_ids:
to_index.append(downloaded)
self.to_index = to_index
def find_missing(self):
"""find indexed videos without matching media file"""
all_downloaded_ids = [i[2] for i in self.all_downloaded]
to_delete = []
for video in self.all_indexed:
youtube_id = video[0]
if youtube_id not in all_downloaded_ids:
to_delete.append(video)
self.to_delete = to_delete
def find_bad_media_url(self):
"""rename media files not matching the indexed title"""
to_fix = []
to_rename = []
for downloaded in self.all_downloaded:
channel, filename, downloaded_id = downloaded
# find in indexed
for indexed in self.all_indexed:
indexed_id, media_url, published, title = indexed
if indexed_id == downloaded_id:
# found it
title_c = clean_string(title)
pub = published.replace("-", "")
expected_filename = f"{pub}_{indexed_id}_{title_c}.mp4"
new_url = os.path.join(channel, expected_filename)
if expected_filename != filename:
# file to rename
to_rename.append(
(channel, filename, expected_filename)
)
if media_url != new_url:
# media_url to update in es
to_fix.append((indexed_id, new_url))
break
self.mismatch = to_fix
self.to_rename = to_rename
def rename_files(self):
"""rename media files as identified by find_bad_media_url"""
for bad_filename in self.to_rename:
channel, filename, expected_filename = bad_filename
print(f"renaming [{filename}] to [{expected_filename}]")
old_path = os.path.join(self.VIDEOS, channel, filename)
new_path = os.path.join(self.VIDEOS, channel, expected_filename)
os.rename(old_path, new_path)
def send_mismatch_bulk(self):
"""build bulk update"""
bulk_list = []
for video_mismatch in self.mismatch:
youtube_id, media_url = video_mismatch
print(f"{youtube_id}: fixing media url {media_url}")
action = {"update": {"_id": youtube_id, "_index": "ta_video"}}
source = {"doc": {"media_url": media_url}}
bulk_list.append(json.dumps(action))
bulk_list.append(json.dumps(source))
# add last newline
bulk_list.append("\n")
query_str = "\n".join(bulk_list)
# make the call
headers = {"Content-type": "application/x-ndjson"}
url = self.ES_URL + "/_bulk"
request = requests.post(
url, data=query_str, headers=headers, auth=self.ES_AUTH
)
if not request.ok:
print(request.text)
def delete_from_index(self):
"""find indexed but deleted mediafile"""
for indexed in self.to_delete:
youtube_id = indexed[0]
print(f"deleting {youtube_id} from index")
url = self.ES_URL + "/ta_video/_doc/" + youtube_id
request = requests.delete(url, auth=self.ES_AUTH)
if not request.ok:
print(request.text)
class ManualImport:
"""import and indexing existing video files"""
CONFIG = AppConfig().config
CACHE_DIR = CONFIG["application"]["cache_dir"]
IMPORT_DIR = os.path.join(CACHE_DIR, "import")
def __init__(self):
self.identified = self.import_folder_parser()
def import_folder_parser(self):
"""detect files in import folder"""
import_files = os.listdir(self.IMPORT_DIR)
to_import = ignore_filelist(import_files)
to_import.sort()
video_files = [i for i in to_import if not i.endswith(".json")]
identified = []
for file_path in video_files:
file_dict = {"video_file": file_path}
file_name, _ = os.path.splitext(file_path)
matching_json = [
i
for i in to_import
if i.startswith(file_name) and i.endswith(".json")
]
if matching_json:
json_file = matching_json[0]
youtube_id = self.extract_id_from_json(json_file)
file_dict.update({"json_file": json_file})
else:
youtube_id = self.extract_id_from_filename(file_name)
file_dict.update({"json_file": False})
file_dict.update({"youtube_id": youtube_id})
identified.append(file_dict)
return identified
@staticmethod
def extract_id_from_filename(file_name):
"""
look at the file name for the youtube id
expects filename ending in [<youtube_id>].<ext>
"""
id_search = re.search(r"\[([a-zA-Z0-9_-]{11})\]$", file_name)
if id_search:
youtube_id = id_search.group(1)
return youtube_id
print("failed to extract youtube id for: " + file_name)
raise Exception
def extract_id_from_json(self, json_file):
"""open json file and extract id"""
json_path = os.path.join(self.CACHE_DIR, "import", json_file)
with open(json_path, "r", encoding="utf-8") as f:
json_content = f.read()
youtube_id = json.loads(json_content)["id"]
return youtube_id
def process_import(self):
"""go through identified media files"""
all_videos_added = []
for media_file in self.identified:
json_file = media_file["json_file"]
video_file = media_file["video_file"]
youtube_id = media_file["youtube_id"]
video_path = os.path.join(self.CACHE_DIR, "import", video_file)
self.move_to_cache(video_path, youtube_id)
# identify and archive
vid_dict = index_new_video(youtube_id)
VideoDownloader([youtube_id]).move_to_archive(vid_dict)
youtube_id = vid_dict["youtube_id"]
thumb_url = vid_dict["vid_thumb_url"]
all_videos_added.append((youtube_id, thumb_url))
# cleanup
if os.path.exists(video_path):
os.remove(video_path)
if json_file:
json_path = os.path.join(self.CACHE_DIR, "import", json_file)
os.remove(json_path)
return all_videos_added
def move_to_cache(self, video_path, youtube_id):
"""move identified video file to cache, convert to mp4"""
file_name = os.path.split(video_path)[-1]
video_file, ext = os.path.splitext(file_name)
# make sure youtube_id is in filename
if youtube_id not in video_file:
video_file = f"{video_file}_{youtube_id}"
# move, convert if needed
if ext == ".mp4":
new_file = video_file + ext
dest_path = os.path.join(self.CACHE_DIR, "download", new_file)
shutil.move(video_path, dest_path)
else:
print(f"processing with ffmpeg: {video_file}")
new_file = video_file + ".mp4"
dest_path = os.path.join(self.CACHE_DIR, "download", new_file)
subprocess.run(
[
"ffmpeg",
"-i",
video_path,
dest_path,
"-loglevel",
"warning",
"-stats",
],
check=True,
)
def scan_filesystem():
"""grouped function to delete and update index"""
filesystem_handler = FilesystemScanner()
filesystem_handler.list_comarison()
if filesystem_handler.to_rename:
print("renaming files")
filesystem_handler.rename_files()
if filesystem_handler.mismatch:
print("fixing media urls in index")
filesystem_handler.send_mismatch_bulk()
if filesystem_handler.to_delete:
print("delete metadata from index")
filesystem_handler.delete_from_index()
if filesystem_handler.to_index:
print("index new videos")
for missing_vid in filesystem_handler.to_index:
youtube_id = missing_vid[2]
index_new_video(youtube_id, missing_vid=missing_vid)
def reindex_old_documents():
"""daily refresh of old documents"""
# continue if needed
reindex_handler = Reindex()
reindex_handler.check_outdated()
reindex_handler.reindex()
# set timestamp
now = int(datetime.now().strftime("%s"))
RedisArchivist().set_message("last_reindex", now, expire=False)

View File

View File

@ -2,7 +2,6 @@
Functionality: Functionality:
- read and write config - read and write config
- load config variables into redis - load config variables into redis
- needs to be a separate module to avoid circular import
""" """
import json import json
@ -10,7 +9,7 @@ import os
import re import re
from celery.schedules import crontab from celery.schedules import crontab
from home.src.helper import RedisArchivist from home.src.ta.ta_redis import RedisArchivist
class AppConfig: class AppConfig:
@ -39,8 +38,7 @@ class AppConfig:
def get_config_file(self): def get_config_file(self):
"""read the defaults from config.json""" """read the defaults from config.json"""
with open("home/config.json", "r", encoding="utf-8") as f: with open("home/config.json", "r", encoding="utf-8") as f:
config_str = f.read() config_file = json.load(f)
config_file = json.loads(config_str)
config_file["application"].update(self.get_config_env()) config_file["application"].update(self.get_config_env())

View File

@ -4,14 +4,12 @@ Loose collection of helper functions
""" """
import json import json
import os
import re import re
import string import string
import subprocess import subprocess
import unicodedata import unicodedata
from urllib.parse import parse_qs, urlparse from urllib.parse import parse_qs, urlparse
import redis
import requests import requests
import yt_dlp import yt_dlp
@ -149,153 +147,6 @@ class UrlListParser:
return channel_id return channel_id
class RedisArchivist:
"""collection of methods to interact with redis"""
REDIS_HOST = os.environ.get("REDIS_HOST")
REDIS_PORT = os.environ.get("REDIS_PORT") or 6379
NAME_SPACE = "ta:"
CHANNELS = [
"download",
"add",
"rescan",
"subchannel",
"subplaylist",
"playlistscan",
"setting",
]
def __init__(self):
self.redis_connection = redis.Redis(
host=self.REDIS_HOST, port=self.REDIS_PORT
)
def set_message(self, key, message, expire=True):
"""write new message to redis"""
self.redis_connection.execute_command(
"JSON.SET", self.NAME_SPACE + key, ".", json.dumps(message)
)
if expire:
if isinstance(expire, bool):
secs = 20
else:
secs = expire
self.redis_connection.execute_command(
"EXPIRE", self.NAME_SPACE + key, secs
)
def get_message(self, key):
"""get message dict from redis"""
reply = self.redis_connection.execute_command(
"JSON.GET", self.NAME_SPACE + key
)
if reply:
json_str = json.loads(reply)
else:
json_str = {"status": False}
return json_str
def del_message(self, key):
"""delete key from redis"""
response = self.redis_connection.execute_command(
"DEL", self.NAME_SPACE + key
)
return response
def get_lock(self, lock_key):
"""handle lock for task management"""
redis_lock = self.redis_connection.lock(self.NAME_SPACE + lock_key)
return redis_lock
def get_progress(self):
"""get a list of all progress messages"""
all_messages = []
for channel in self.CHANNELS:
key = "message:" + channel
reply = self.redis_connection.execute_command(
"JSON.GET", self.NAME_SPACE + key
)
if reply:
json_str = json.loads(reply)
all_messages.append(json_str)
return all_messages
@staticmethod
def monitor_cache_dir(cache_dir):
"""
look at download cache dir directly as alternative progress info
"""
dl_cache = os.path.join(cache_dir, "download")
all_cache_file = os.listdir(dl_cache)
cache_file = ignore_filelist(all_cache_file)
if cache_file:
filename = cache_file[0][12:].replace("_", " ").split(".")[0]
mess_dict = {
"status": "message:download",
"level": "info",
"title": "Downloading: " + filename,
"message": "",
}
else:
return False
return mess_dict
class RedisQueue:
"""dynamically interact with the download queue in redis"""
REDIS_HOST = os.environ.get("REDIS_HOST")
REDIS_PORT = os.environ.get("REDIS_PORT")
NAME_SPACE = "ta:"
if not REDIS_PORT:
REDIS_PORT = 6379
def __init__(self, key):
self.key = self.NAME_SPACE + key
self.conn = redis.Redis(host=self.REDIS_HOST, port=self.REDIS_PORT)
def get_all(self):
"""return all elements in list"""
result = self.conn.execute_command("LRANGE", self.key, 0, -1)
all_elements = [i.decode() for i in result]
return all_elements
def add_list(self, to_add):
"""add list to queue"""
self.conn.execute_command("RPUSH", self.key, *to_add)
def add_priority(self, to_add):
"""add single video to front of queue"""
self.clear_item(to_add)
self.conn.execute_command("LPUSH", self.key, to_add)
def get_next(self):
"""return next element in the queue, False if none"""
result = self.conn.execute_command("LPOP", self.key)
if not result:
return False
next_element = result.decode()
return next_element
def clear(self):
"""delete list from redis"""
self.conn.execute_command("DEL", self.key)
def clear_item(self, to_clear):
"""remove single item from list if it's there"""
self.conn.execute_command("LREM", self.key, 0, to_clear)
def trim(self, size):
"""trim the queue based on settings amount"""
self.conn.execute_command("LTRIM", self.key, 0, size)
class DurationConverter: class DurationConverter:
""" """
using ffmpeg to get and parse duration from filepath using ffmpeg to get and parse duration from filepath

View File

@ -0,0 +1,158 @@
"""
functionality:
- interact with redis
- hold temporary download queue in redis
"""
import json
import os
import redis
from home.src.ta.helper import ignore_filelist
class RedisArchivist:
"""collection of methods to interact with redis"""
REDIS_HOST = os.environ.get("REDIS_HOST")
REDIS_PORT = os.environ.get("REDIS_PORT") or 6379
NAME_SPACE = "ta:"
CHANNELS = [
"download",
"add",
"rescan",
"subchannel",
"subplaylist",
"playlistscan",
"setting",
]
def __init__(self):
self.redis_connection = redis.Redis(
host=self.REDIS_HOST, port=self.REDIS_PORT
)
def set_message(self, key, message, expire=True):
"""write new message to redis"""
self.redis_connection.execute_command(
"JSON.SET", self.NAME_SPACE + key, ".", json.dumps(message)
)
if expire:
if isinstance(expire, bool):
secs = 20
else:
secs = expire
self.redis_connection.execute_command(
"EXPIRE", self.NAME_SPACE + key, secs
)
def get_message(self, key):
"""get message dict from redis"""
reply = self.redis_connection.execute_command(
"JSON.GET", self.NAME_SPACE + key
)
if reply:
json_str = json.loads(reply)
else:
json_str = {"status": False}
return json_str
def del_message(self, key):
"""delete key from redis"""
response = self.redis_connection.execute_command(
"DEL", self.NAME_SPACE + key
)
return response
def get_lock(self, lock_key):
"""handle lock for task management"""
redis_lock = self.redis_connection.lock(self.NAME_SPACE + lock_key)
return redis_lock
def get_progress(self):
"""get a list of all progress messages"""
all_messages = []
for channel in self.CHANNELS:
key = "message:" + channel
reply = self.redis_connection.execute_command(
"JSON.GET", self.NAME_SPACE + key
)
if reply:
json_str = json.loads(reply)
all_messages.append(json_str)
return all_messages
@staticmethod
def monitor_cache_dir(cache_dir):
"""
look at download cache dir directly as alternative progress info
"""
dl_cache = os.path.join(cache_dir, "download")
all_cache_file = os.listdir(dl_cache)
cache_file = ignore_filelist(all_cache_file)
if cache_file:
filename = cache_file[0][12:].replace("_", " ").split(".")[0]
mess_dict = {
"status": "message:download",
"level": "info",
"title": "Downloading: " + filename,
"message": "",
}
else:
return False
return mess_dict
class RedisQueue:
"""dynamically interact with the download queue in redis"""
REDIS_HOST = os.environ.get("REDIS_HOST")
REDIS_PORT = os.environ.get("REDIS_PORT")
NAME_SPACE = "ta:"
if not REDIS_PORT:
REDIS_PORT = 6379
def __init__(self, key):
self.key = self.NAME_SPACE + key
self.conn = redis.Redis(host=self.REDIS_HOST, port=self.REDIS_PORT)
def get_all(self):
"""return all elements in list"""
result = self.conn.execute_command("LRANGE", self.key, 0, -1)
all_elements = [i.decode() for i in result]
return all_elements
def add_list(self, to_add):
"""add list to queue"""
self.conn.execute_command("RPUSH", self.key, *to_add)
def add_priority(self, to_add):
"""add single video to front of queue"""
self.clear_item(to_add)
self.conn.execute_command("LPUSH", self.key, to_add)
def get_next(self):
"""return next element in the queue, False if none"""
result = self.conn.execute_command("LPOP", self.key)
if not result:
return False
next_element = result.decode()
return next_element
def clear(self):
"""delete list from redis"""
self.conn.execute_command("DEL", self.key)
def clear_item(self, to_clear):
"""remove single item from list if it's there"""
self.conn.execute_command("LREM", self.key, 0, to_clear)
def trim(self, size):
"""trim the queue based on settings amount"""
self.conn.execute_command("LTRIM", self.key, 0, size)

View File

@ -10,22 +10,24 @@ import os
import home.apps as startup_apps import home.apps as startup_apps
from celery import Celery, shared_task from celery import Celery, shared_task
from home.src.config import AppConfig, ScheduleBuilder from home.src.download.queue import PendingList
from home.src.download import ( from home.src.download.subscriptions import (
ChannelSubscription, ChannelSubscription,
PendingList,
PlaylistSubscription, PlaylistSubscription,
VideoDownloader,
) )
from home.src.helper import RedisArchivist, RedisQueue, UrlListParser from home.src.download.thumbnails import ThumbManager, validate_thumbnails
from home.src.index import YoutubeChannel, YoutubePlaylist from home.src.download.yt_dlp_handler import VideoDownloader
from home.src.index_management import backup_all_indexes, restore_from_backup from home.src.es.index_setup import backup_all_indexes, restore_from_backup
from home.src.reindex import ( from home.src.index.channel import YoutubeChannel
from home.src.index.filesystem import (
ManualImport, ManualImport,
reindex_old_documents, reindex_old_documents,
scan_filesystem, scan_filesystem,
) )
from home.src.thumbnails import ThumbManager, validate_thumbnails from home.src.index.playlist import YoutubePlaylist
from home.src.ta.config import AppConfig, ScheduleBuilder
from home.src.ta.helper import UrlListParser
from home.src.ta.ta_redis import RedisArchivist, RedisQueue
CONFIG = AppConfig().config CONFIG = AppConfig().config
REDIS_HOST = os.environ.get("REDIS_HOST") REDIS_HOST = os.environ.get("REDIS_HOST")
@ -266,17 +268,16 @@ def subscribe_to(url_str):
@shared_task @shared_task
def index_channel_playlists(channel_id): def index_channel_playlists(channel_id):
"""add all playlists of channel to index""" """add all playlists of channel to index"""
channel_handler = YoutubeChannel(channel_id) channel = YoutubeChannel(channel_id)
channel_name = channel_handler.channel_dict["channel_name"]
# notify # notify
mess_dict = { mess_dict = {
"status": "message:playlistscan", "status": "message:playlistscan",
"level": "info", "level": "info",
"title": "Looking for playlists", "title": "Looking for playlists",
"message": f'Scanning channel "{channel_name}" in progress', "message": f'Scanning channel "{channel.youtube_id}" in progress',
} }
RedisArchivist().set_message("message:playlistscan", mess_dict) RedisArchivist().set_message("message:playlistscan", mess_dict)
all_playlists = channel_handler.get_all_playlists() all_playlists = channel.get_all_playlists()
if not all_playlists: if not all_playlists:
print(f"no playlists found for channel {channel_id}") print(f"no playlists found for channel {channel_id}")
@ -295,28 +296,29 @@ def index_channel_playlists(channel_id):
} }
RedisArchivist().set_message("message:playlistscan", mess_dict) RedisArchivist().set_message("message:playlistscan", mess_dict)
print("add playlist: " + playlist_title) print("add playlist: " + playlist_title)
playlist_handler = YoutubePlaylist(
playlist_id, all_youtube_ids=all_youtube_ids playlist = YoutubePlaylist(playlist_id)
) playlist.all_youtube_ids = all_youtube_ids
playlist_handler.get_playlist_dict() playlist.build_json()
if not playlist_handler.playlist_dict:
if not playlist.json_data:
# skip if not available # skip if not available
continue continue
# don't add if no videos downloaded # don't add if no videos downloaded
downloaded = [ downloaded = [
i i
for i in playlist_handler.playlist_dict["playlist_entries"] for i in playlist.json_data["playlist_entries"]
if i["downloaded"] if i["downloaded"]
] ]
if not downloaded: if not downloaded:
continue continue
playlist_handler.upload_to_es()
playlist_handler.add_vids_to_playlist() playlist.upload_to_es()
playlist.add_vids_to_playlist()
if all_playlists: if all_playlists:
handler = ThumbManager() playlist.get_playlist_art()
missing_playlists = handler.get_missing_playlists()
handler.download_playlist(missing_playlists)
return return

View File

@ -69,7 +69,7 @@
<img src="{% static 'img/icon-gear.svg' %}" alt="gear-icon" title="Settings"> <img src="{% static 'img/icon-gear.svg' %}" alt="gear-icon" title="Settings">
</a> </a>
<a href="{% url 'logout' %}"> <a href="{% url 'logout' %}">
<img src="{% static 'img/icon-exit.svg' %}" alt="exit-icon" title="Logout"> <img class="alert-hover" src="{% static 'img/icon-exit.svg' %}" alt="exit-icon" title="Logout">
</a> </a>
</div> </div>
</div> </div>

View File

@ -1,7 +1,7 @@
""" """
Functionality: Functionality:
- all views for home app - all views for home app
- process post data received from frontend via ajax - holds base classes to inherit from
""" """
import json import json
@ -14,7 +14,9 @@ from django.contrib.auth.forms import AuthenticationForm
from django.http import JsonResponse from django.http import JsonResponse
from django.shortcuts import redirect, render from django.shortcuts import redirect, render
from django.views import View from django.views import View
from home.forms import ( from home.src.es.index_setup import get_available_backups
from home.src.frontend.api_calls import PostData
from home.src.frontend.forms import (
AddToQueueForm, AddToQueueForm,
ApplicationSettingsForm, ApplicationSettingsForm,
CustomAuthForm, CustomAuthForm,
@ -24,12 +26,12 @@ from home.forms import (
SubscribeToPlaylistForm, SubscribeToPlaylistForm,
UserSettingsForm, UserSettingsForm,
) )
from home.src.config import AppConfig, ScheduleBuilder from home.src.frontend.searching import SearchHandler
from home.src.frontend import PostData from home.src.index.generic import Pagination
from home.src.helper import RedisArchivist, UrlListParser from home.src.index.playlist import YoutubePlaylist
from home.src.index import YoutubePlaylist from home.src.ta.config import AppConfig, ScheduleBuilder
from home.src.index_management import get_available_backups from home.src.ta.helper import UrlListParser
from home.src.searching import Pagination, SearchHandler from home.src.ta.ta_redis import RedisArchivist
from home.tasks import extrac_dl, subscribe_to from home.tasks import extrac_dl, subscribe_to
from rest_framework.authtoken.models import Token from rest_framework.authtoken.models import Token
@ -169,8 +171,7 @@ class ArchivistResultsView(ArchivistViewConfig):
def single_lookup(self, es_path): def single_lookup(self, es_path):
"""retrieve a single item from url""" """retrieve a single item from url"""
es_url = self.default_conf["application"]["es_url"] search = SearchHandler(es_path, config=self.default_conf)
search = SearchHandler(f"{es_url}/{es_path}", data=False)
result = search.get_data()[0]["source"] result = search.get_data()[0]["source"]
return result return result
@ -189,8 +190,9 @@ class ArchivistResultsView(ArchivistViewConfig):
def find_results(self): def find_results(self):
"""add results and pagination to context""" """add results and pagination to context"""
url = self.default_conf["application"]["es_url"] + self.es_search search = SearchHandler(
search = SearchHandler(url, self.data) self.es_search, config=self.default_conf, data=self.data
)
self.context["results"] = search.get_data() self.context["results"] = search.get_data()
self.pagination_handler.validate(search.max_hits) self.pagination_handler.validate(search.max_hits)
self.context["max_hits"] = search.max_hits self.context["max_hits"] = search.max_hits
@ -203,7 +205,7 @@ class HomeView(ArchivistResultsView):
""" """
view_origin = "home" view_origin = "home"
es_search = "/ta_video/_search" es_search = "ta_video/_search"
def get(self, request): def get(self, request):
"""handle get requests""" """handle get requests"""
@ -284,7 +286,7 @@ class DownloadView(ArchivistResultsView):
""" """
view_origin = "downloads" view_origin = "downloads"
es_search = "/ta_download/_search" es_search = "ta_download/_search"
def get(self, request): def get(self, request):
"""handle get request""" """handle get request"""
@ -346,7 +348,7 @@ class ChannelIdView(ArchivistResultsView):
""" """
view_origin = "home" view_origin = "home"
es_search = "/ta_video/_search" es_search = "ta_video/_search"
def get(self, request, channel_id): def get(self, request, channel_id):
"""get request""" """get request"""
@ -395,7 +397,7 @@ class ChannelView(ArchivistResultsView):
""" """
view_origin = "channel" view_origin = "channel"
es_search = "/ta_channel/_search" es_search = "ta_channel/_search"
def get(self, request): def get(self, request):
"""handle get request""" """handle get request"""
@ -445,7 +447,7 @@ class PlaylistIdView(ArchivistResultsView):
""" """
view_origin = "home" view_origin = "home"
es_search = "/ta_video/_search" es_search = "ta_video/_search"
def get(self, request, playlist_id): def get(self, request, playlist_id):
"""handle get request""" """handle get request"""
@ -521,7 +523,7 @@ class PlaylistView(ArchivistResultsView):
""" """
view_origin = "playlist" view_origin = "playlist"
es_search = "/ta_playlist/_search" es_search = "ta_playlist/_search"
def get(self, request): def get(self, request):
"""handle get request""" """handle get request"""
@ -592,9 +594,9 @@ class VideoView(View):
def get(self, request, video_id): def get(self, request, video_id):
"""get single video""" """get single video"""
es_url, colors, cast = self.read_config(user_id=request.user.id) colors, cast = self.read_config(user_id=request.user.id)
url = f"{es_url}/ta_video/_doc/{video_id}" path = f"ta_video/_doc/{video_id}"
look_up = SearchHandler(url, None) look_up = SearchHandler(path, config=False)
video_hit = look_up.get_data() video_hit = look_up.get_data()
video_data = video_hit[0]["source"] video_data = video_hit[0]["source"]
try: try:
@ -624,11 +626,11 @@ class VideoView(View):
"""build playlist nav if available""" """build playlist nav if available"""
all_navs = [] all_navs = []
for playlist_id in playlists: for playlist_id in playlists:
handler = YoutubePlaylist(playlist_id) playlist = YoutubePlaylist(playlist_id)
handler.get_playlist_dict() playlist.get_from_es()
nav = handler.build_nav(video_id) playlist.build_nav(video_id)
if nav: if playlist.nav:
all_navs.append(nav) all_navs.append(playlist.nav)
return all_navs return all_navs
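Where the indexing task rebuilds playlist data with build_json(), this view hunk only reads what is already in the index and then works with attributes on the object. A small sketch of that read path, assuming get_from_es(), build_nav() and the nav attribute work as used here; the helper name is mine.

# Sketch only: the read path used for playlist navigation, as opposed to the
# build_json() path used during indexing; the IDs passed in are placeholders.
from home.src.index.playlist import YoutubePlaylist

def build_playlist_navs(playlist_ids, video_id):
    """collect the nav block of every playlist containing video_id"""
    all_navs = []
    for playlist_id in playlist_ids:
        playlist = YoutubePlaylist(playlist_id)
        playlist.get_from_es()        # load the already indexed document
        playlist.build_nav(video_id)  # populates playlist.nav, no return value
        if playlist.nav:
            all_navs.append(playlist.nav)

    return all_navs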
@ -636,10 +638,9 @@ class VideoView(View):
def read_config(user_id): def read_config(user_id):
"""read config file""" """read config file"""
config_handler = AppConfig(user_id) config_handler = AppConfig(user_id)
es_url = config_handler.config["application"]["es_url"]
cast = config_handler.config["application"]["enable_cast"] cast = config_handler.config["application"]["enable_cast"]
colors = config_handler.colors colors = config_handler.colors
return es_url, colors, cast return colors, cast
@staticmethod @staticmethod
def star_creator(rating): def star_creator(rating):

View File

@ -1,12 +1,12 @@
beautifulsoup4==4.10.0 beautifulsoup4==4.10.0
celery==5.2.3 celery==5.2.3
django-cors-headers==3.11.0
Django==4.0.1 Django==4.0.1
django-cors-headers==3.11.0
djangorestframework==3.13.1 djangorestframework==3.13.1
Pillow==9.0.0 Pillow==9.0.0
redis==4.1.0 redis==4.1.1
requests==2.27.1 requests==2.27.1
ryd-client==0.0.3 ryd-client==0.0.3
uWSGI==2.0.20 uWSGI==2.0.20
whitenoise==5.3.0 whitenoise==5.3.0
yt_dlp==2021.12.27 yt_dlp==2022.1.21

View File

@ -286,6 +286,10 @@ button:hover {
--connected-color: var(--accent-font-light); --connected-color: var(--accent-font-light);
} }
.alert-hover:hover {
filter: var(--img-filter-error);
}
/* top of page */ /* top of page */
.title-bar { .title-bar {
padding-top: 30px; padding-top: 30px;

View File

@ -9,15 +9,15 @@
xmlns="http://www.w3.org/2000/svg" xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="500" width="210mm"
height="500" height="210mm"
viewBox="0 0 132.29197 132.29167" viewBox="0 0 210 210"
version="1.1" version="1.1"
id="svg1303" id="svg1566"
inkscape:version="0.92.4 (5da689c313, 2019-01-14)" inkscape:version="0.92.4 (5da689c313, 2019-01-14)"
sodipodi:docname="Icons_exit.svg"> sodipodi:docname="Icons_exit 05.svg">
<defs <defs
id="defs1297" /> id="defs1560" />
<sodipodi:namedview <sodipodi:namedview
id="base" id="base"
pagecolor="#ffffff" pagecolor="#ffffff"
@ -25,20 +25,19 @@
borderopacity="1.0" borderopacity="1.0"
inkscape:pageopacity="0.0" inkscape:pageopacity="0.0"
inkscape:pageshadow="2" inkscape:pageshadow="2"
inkscape:zoom="0.66442107" inkscape:zoom="0.35355339"
inkscape:cx="161.45413" inkscape:cx="963.7258"
inkscape:cy="207.61753" inkscape:cy="291.01609"
inkscape:document-units="mm" inkscape:document-units="mm"
inkscape:current-layer="layer1" inkscape:current-layer="layer1"
showgrid="false" showgrid="false"
units="px" inkscape:window-width="1920"
inkscape:window-width="1169" inkscape:window-height="1009"
inkscape:window-height="893" inkscape:window-x="-8"
inkscape:window-x="729" inkscape:window-y="-8"
inkscape:window-y="13" inkscape:window-maximized="1" />
inkscape:window-maximized="0" />
<metadata <metadata
id="metadata1300"> id="metadata1563">
<rdf:RDF> <rdf:RDF>
<cc:Work <cc:Work
rdf:about=""> rdf:about="">
@ -53,15 +52,24 @@
inkscape:label="Ebene 1" inkscape:label="Ebene 1"
inkscape:groupmode="layer" inkscape:groupmode="layer"
id="layer1" id="layer1"
transform="translate(0,-164.70764)"> transform="translate(0,-87)">
<g <path
id="g855" style="opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:2.35654187;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke"
transform="matrix(1.9016362,0,0,1.9016362,-197.93838,-58.9418)"> d="M 106.49932,87.901069 C 49.504302,87.900974 3.3006913,134.10459 3.3007713,191.0996 c 0,0.30098 0.003,0.60131 0.005,0.90167 v 0 c -0.003,0.29952 -0.006,0.59901 -0.006,0.89912 -8e-5,56.99502 46.2035307,103.19865 103.1985287,103.19854 23.01714,-0.0773 45.34783,-7.84709 63.44155,-22.07425 0,0 9.01874,-8.71006 2.40579,-16.41737 -6.61297,-7.70731 -19.11222,0.3185 -19.11222,0.3185 -13.60985,9.81394 -29.95596,15.11012 -46.73512,15.14236 -44.275428,0 -80.167758,-35.89234 -80.167758,-80.16778 0,-0.30097 0.003,-0.60148 0.006,-0.90166 h -5.2e-4 c -0.003,-0.29934 -0.006,-0.59901 -0.006,-0.89913 0,-44.27545 35.89234,-80.16777 80.167778,-80.16777 16.77916,0.0322 33.12527,5.32843 46.73512,15.14236 0,0 12.49925,8.02581 19.11222,0.3185 6.61295,-7.70732 -2.4058,-16.41739 -2.4058,-16.41739 C 151.84561,95.74815 129.51494,87.97828 106.4978,87.901069 Z m 54.30959,56.450221 -12.13663,11.69622 20.15864,20.93332 -93.932488,-1.4899 c -9.22763,-0.17349 -16.77655,6.07423 -16.92587,14.00904 l 0.002,0.002 c -0.0149,1.82673 -0.0235,3.40102 0,4.99598 l -0.002,0.002 c 0.14932,7.93483 7.69824,14.18254 16.92587,14.00905 l 93.932488,-1.48991 -20.15864,20.93333 12.13663,11.69622 34.0585,-35.35536 11.82982,-12.29208 h 0.003 l -9.9e-4,-0.002 9.9e-4,-9.9e-4 h -0.003 l -11.82982,-12.29208 z"
<path id="path1405"
inkscape:connector-curvature="0" inkscape:connector-curvature="0"
id="rect1208" sodipodi:nodetypes="cccccccsccsccsccscccccccccccccccccccccc" />
style="opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0;stroke-linecap:round;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke" <path
d="m 124.57603,151.92962 c -0.0433,2.30016 2.0751,4.19245 4.75007,4.24278 l 30.26401,0.43007 -6.00195,5.78023 3.43246,3.56154 10.2778,-9.9006 0.002,0.002 3.5183,-3.3908 -3.42991,-3.564 -9.8737,-10.24989 -3.51834,3.39083 5.84388,6.06803 -30.35875,-0.43185 c -2.67494,-0.0503 -4.86301,1.76094 -4.90629,4.06112 z m -17.65039,-32.01644 v 64.95883 h 7.44347 v -58.27707 h 26.3896 v 18.5229 h 7.44296 v -25.20466 z m 33.83307,39.75416 v 25.20467 h 7.44296 v -25.20467 z" /> style="opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:2.39729571;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke"
</g> d="m 506.57967,92.503023 c -57.98068,-1e-4 -104.98336,47.002567 -104.98326,104.983257 1.9e-4,57.98049 47.00276,104.98284 104.98326,104.98273 23.42489,-0.0758 46.15146,-7.98387 57.83458,-18.08923 11.68313,-10.10537 12.15613,-18.62993 7.38675,-23.04107 v -0.002 c -4.7711,-4.41269 -12.38099,-1.9587 -17.69245,2.25103 -13.83538,9.99805 -30.45915,15.40285 -47.52888,15.4528 -45.04116,0 -81.55421,-36.51305 -81.5542,-81.55419 0,-45.04114 36.51307,-81.5542 81.5542,-81.5542 17.06933,0.0328 33.21884,5.19482 43.16812,12.86758 9.94929,7.67275 17.33418,9.17607 22.1053,4.76338 v -0.002 c 4.77116,-4.41278 5.55882,-12.9887 -0.73482,-18.60197 -18.40654,-14.47308 -41.1234,-22.377337 -64.5386,-22.455877 z m 55.24881,57.426467 -12.34652,11.8985 20.50728,21.29534 -95.55697,-1.51567 c -9.38721,-0.17649 -17.06669,6.17929 -17.21858,14.25133 l 0.003,0.002 c -0.15192,8.07203 7.28245,14.71295 16.66978,14.88953 l 95.22519,1.50947 -21.06332,20.28455 12.04579,12.49846 36.06808,-34.74464 0.005,0.005 12.34654,-11.89954 -12.03701,-12.50724 z m 35.17874,98.71801 0.69918,0.67386 c 0.13539,-0.22412 0.26991,-0.44874 0.4036,-0.67386 z"
id="path1405-6"
inkscape:connector-curvature="0"
sodipodi:nodetypes="ccccccccsczccccccccccccccccccccccc" />
<path
style="opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:2.39729571;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke"
d="m 740.89945,94.730897 c -57.98068,-9.6e-5 -104.98334,47.002563 -104.98325,104.983253 1.9e-4,57.98049 47.00276,104.98284 104.98325,104.98274 23.42488,-0.0758 46.15145,-7.98387 64.5635,-22.46581 l -17.03461,-16.41553 c -13.83537,9.99805 -30.45916,15.40285 -47.52889,15.4528 -45.04113,0 -81.55419,-36.51306 -81.55419,-81.5542 0,-45.04114 36.51306,-81.55419 81.55419,-81.55419 17.06934,0.0328 33.69814,5.42058 47.54336,15.40423 l 16.99534,-16.3773 c -18.40663,-14.4732 -41.12349,-22.377447 -64.5387,-22.455993 z m 55.24882,57.426473 -12.34653,11.8985 20.50728,21.29534 -95.55696,-1.51567 c -9.38721,-0.17649 -17.06668,6.17928 -17.21858,14.25132 l 0.002,0.002 c -0.1519,8.07203 7.28245,14.71295 16.66978,14.88953 l 95.22519,1.50947 -21.06332,20.28455 12.04578,12.49846 36.06808,-34.74465 0.005,0.005 12.34653,-11.89953 -12.03699,-12.50725 z m 35.17873,98.718 0.69919,0.67386 c 0.13538,-0.22412 0.26991,-0.44874 0.40359,-0.67386 z"
id="path1405-9"
inkscape:connector-curvature="0"
sodipodi:nodetypes="ccccccsccccccccccccccccccccccc" />
</g> </g>
</svg> </svg>

Before: 2.5 KiB  |  After: 6.0 KiB