mirror of
https://github.com/tubearchivist/tubearchivist-frontend.git
synced 2024-11-22 20:00:15 +00:00
major refactor, #build
Changes: - merges new restructured and split up modules - merges refactor channel, video, playlist index classes - merges code clean up and readability improvements
This commit is contained in:
commit
8591c44ef2
1
.github/workflows/lint_python.yml
vendored
1
.github/workflows/lint_python.yml
vendored
@ -9,7 +9,6 @@ jobs:
|
|||||||
- run: pip install --upgrade pip wheel
|
- run: pip install --upgrade pip wheel
|
||||||
- run: pip install bandit black codespell flake8 flake8-bugbear
|
- run: pip install bandit black codespell flake8 flake8-bugbear
|
||||||
flake8-comprehensions isort
|
flake8-comprehensions isort
|
||||||
- run: bandit --recursive --skip B105,B108,B404,B603,B607 .
|
|
||||||
- run: black --check --diff --line-length 79 .
|
- run: black --check --diff --line-length 79 .
|
||||||
- run: codespell
|
- run: codespell
|
||||||
- run: flake8 . --count --max-complexity=12 --max-line-length=79
|
- run: flake8 . --count --max-complexity=12 --max-line-length=79
|
||||||
|
3
.gitignore
vendored
3
.gitignore
vendored
@ -3,3 +3,6 @@ __pycache__
|
|||||||
|
|
||||||
# django testing db
|
# django testing db
|
||||||
db.sqlite3
|
db.sqlite3
|
||||||
|
|
||||||
|
# vscode custom conf
|
||||||
|
.vscode
|
5
.vscode/settings.json
vendored
5
.vscode/settings.json
vendored
@ -1,5 +0,0 @@
|
|||||||
{
|
|
||||||
"python.linting.pylintEnabled": true,
|
|
||||||
"python.linting.pycodestyleEnabled": false,
|
|
||||||
"python.linting.enabled": true
|
|
||||||
}
|
|
@ -1,6 +1,6 @@
|
|||||||
# build the tube archivist image from default python slim image
|
# build the tube archivist image from default python slim image
|
||||||
|
|
||||||
FROM python:3.10.1-slim-bullseye
|
FROM python:3.10.2-slim-bullseye
|
||||||
ARG TARGETPLATFORM
|
ARG TARGETPLATFORM
|
||||||
|
|
||||||
ENV PYTHONUNBUFFERED 1
|
ENV PYTHONUNBUFFERED 1
|
||||||
@ -35,12 +35,12 @@ COPY ./tubearchivist/requirements.txt /requirements.txt
|
|||||||
RUN pip install --no-cache-dir -r requirements.txt --src /usr/local/src
|
RUN pip install --no-cache-dir -r requirements.txt --src /usr/local/src
|
||||||
|
|
||||||
# copy config files
|
# copy config files
|
||||||
COPY nginx.conf /etc/nginx/conf.d/
|
COPY docker_assets/nginx.conf /etc/nginx/conf.d/
|
||||||
|
|
||||||
# copy application into container
|
# copy application into container
|
||||||
COPY ./tubearchivist /app
|
COPY ./tubearchivist /app
|
||||||
COPY ./run.sh /app
|
COPY ./docker_assets/run.sh /app
|
||||||
COPY ./uwsgi.ini /app
|
COPY ./docker_assets/uwsgi.ini /app
|
||||||
|
|
||||||
# volumes
|
# volumes
|
||||||
VOLUME /cache
|
VOLUME /cache
|
||||||
|
@ -86,8 +86,6 @@ function validate {
|
|||||||
|
|
||||||
echo "run validate on $check_path"
|
echo "run validate on $check_path"
|
||||||
|
|
||||||
echo "running bandit"
|
|
||||||
bandit --recursive --skip B105,B108,B404,B603,B607 "$check_path"
|
|
||||||
echo "running black"
|
echo "running black"
|
||||||
black --diff --color --check -l 79 "$check_path"
|
black --diff --color --check -l 79 "$check_path"
|
||||||
echo "running codespell"
|
echo "running codespell"
|
||||||
|
@ -1,11 +1,6 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# startup script inside the container for tubearchivist
|
# startup script inside the container for tubearchivist
|
||||||
|
|
||||||
# check environment
|
|
||||||
if [[ -z "$DJANGO_DEBUG" ]]; then
|
|
||||||
export DJANGO_DEBUG=False
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -z "$ELASTIC_USER" ]]; then
|
if [[ -z "$ELASTIC_USER" ]]; then
|
||||||
export ELASTIC_USER=elastic
|
export ELASTIC_USER=elastic
|
||||||
fi
|
fi
|
@ -1,9 +1,9 @@
|
|||||||
"""all API views"""
|
"""all API views"""
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from home.src.config import AppConfig
|
from home.src.download.thumbnails import ThumbManager
|
||||||
from home.src.helper import UrlListParser
|
from home.src.ta.config import AppConfig
|
||||||
from home.src.thumbnails import ThumbManager
|
from home.src.ta.helper import UrlListParser
|
||||||
from home.tasks import extrac_dl, subscribe_to
|
from home.tasks import extrac_dl, subscribe_to
|
||||||
from rest_framework.authentication import (
|
from rest_framework.authentication import (
|
||||||
SessionAuthentication,
|
SessionAuthentication,
|
||||||
|
@ -15,7 +15,7 @@ from os import environ, path
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from corsheaders.defaults import default_headers
|
from corsheaders.defaults import default_headers
|
||||||
from home.src.config import AppConfig
|
from home.src.ta.config import AppConfig
|
||||||
|
|
||||||
# Build paths inside the project like this: BASE_DIR / 'subdir'.
|
# Build paths inside the project like this: BASE_DIR / 'subdir'.
|
||||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||||
|
@ -3,9 +3,9 @@
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
from django.apps import AppConfig
|
from django.apps import AppConfig
|
||||||
from home.src.config import AppConfig as ArchivistConfig
|
from home.src.es.index_setup import index_check
|
||||||
from home.src.helper import RedisArchivist
|
from home.src.ta.config import AppConfig as ArchivistConfig
|
||||||
from home.src.index_management import index_check
|
from home.src.ta.ta_redis import RedisArchivist
|
||||||
|
|
||||||
|
|
||||||
def sync_redis_state():
|
def sync_redis_state():
|
||||||
|
@ -1,802 +0,0 @@
|
|||||||
"""
|
|
||||||
Functionality:
|
|
||||||
- handele the download queue
|
|
||||||
- manage subscriptions to channels
|
|
||||||
- manage subscriptions to playlists
|
|
||||||
- downloading videos
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
from datetime import datetime
|
|
||||||
from time import sleep
|
|
||||||
|
|
||||||
import requests
|
|
||||||
import yt_dlp
|
|
||||||
from home.src.config import AppConfig
|
|
||||||
from home.src.helper import (
|
|
||||||
DurationConverter,
|
|
||||||
RedisArchivist,
|
|
||||||
RedisQueue,
|
|
||||||
clean_string,
|
|
||||||
ignore_filelist,
|
|
||||||
)
|
|
||||||
from home.src.index import (
|
|
||||||
IndexPaginate,
|
|
||||||
YoutubeChannel,
|
|
||||||
YoutubePlaylist,
|
|
||||||
YoutubeVideo,
|
|
||||||
index_new_video,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class PendingList:
|
|
||||||
"""manage the pending videos list"""
|
|
||||||
|
|
||||||
CONFIG = AppConfig().config
|
|
||||||
ES_URL = CONFIG["application"]["es_url"]
|
|
||||||
ES_AUTH = CONFIG["application"]["es_auth"]
|
|
||||||
VIDEOS = CONFIG["application"]["videos"]
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.all_channel_ids = False
|
|
||||||
self.all_downloaded = False
|
|
||||||
self.missing_from_playlists = []
|
|
||||||
|
|
||||||
def parse_url_list(self, youtube_ids):
|
|
||||||
"""extract youtube ids from list"""
|
|
||||||
missing_videos = []
|
|
||||||
for entry in youtube_ids:
|
|
||||||
# notify
|
|
||||||
mess_dict = {
|
|
||||||
"status": "message:add",
|
|
||||||
"level": "info",
|
|
||||||
"title": "Adding to download queue.",
|
|
||||||
"message": "Extracting lists",
|
|
||||||
}
|
|
||||||
RedisArchivist().set_message("message:add", mess_dict)
|
|
||||||
# extract
|
|
||||||
url = entry["url"]
|
|
||||||
url_type = entry["type"]
|
|
||||||
if url_type == "video":
|
|
||||||
missing_videos.append(url)
|
|
||||||
elif url_type == "channel":
|
|
||||||
video_results = ChannelSubscription().get_last_youtube_videos(
|
|
||||||
url, limit=False
|
|
||||||
)
|
|
||||||
youtube_ids = [i[0] for i in video_results]
|
|
||||||
missing_videos = missing_videos + youtube_ids
|
|
||||||
elif url_type == "playlist":
|
|
||||||
self.missing_from_playlists.append(entry)
|
|
||||||
video_results = YoutubePlaylist(url).get_entries()
|
|
||||||
youtube_ids = [i["youtube_id"] for i in video_results]
|
|
||||||
missing_videos = missing_videos + youtube_ids
|
|
||||||
|
|
||||||
return missing_videos
|
|
||||||
|
|
||||||
def add_to_pending(self, missing_videos, ignore=False):
|
|
||||||
"""build the bulk json data from pending"""
|
|
||||||
# check if channel is indexed
|
|
||||||
channel_handler = ChannelSubscription()
|
|
||||||
all_indexed = channel_handler.get_channels(subscribed_only=False)
|
|
||||||
self.all_channel_ids = [i["channel_id"] for i in all_indexed]
|
|
||||||
# check if already there
|
|
||||||
self.all_downloaded = self.get_all_downloaded()
|
|
||||||
|
|
||||||
bulk_list, all_videos_added = self.build_bulk(missing_videos, ignore)
|
|
||||||
# add last newline
|
|
||||||
bulk_list.append("\n")
|
|
||||||
query_str = "\n".join(bulk_list)
|
|
||||||
headers = {"Content-type": "application/x-ndjson"}
|
|
||||||
url = self.ES_URL + "/_bulk"
|
|
||||||
request = requests.post(
|
|
||||||
url, data=query_str, headers=headers, auth=self.ES_AUTH
|
|
||||||
)
|
|
||||||
if not request.ok:
|
|
||||||
print(request)
|
|
||||||
raise ValueError("failed to add video to download queue")
|
|
||||||
|
|
||||||
return all_videos_added
|
|
||||||
|
|
||||||
def build_bulk(self, missing_videos, ignore=False):
|
|
||||||
"""build the bulk lists"""
|
|
||||||
bulk_list = []
|
|
||||||
all_videos_added = []
|
|
||||||
|
|
||||||
for idx, youtube_id in enumerate(missing_videos):
|
|
||||||
# check if already downloaded
|
|
||||||
if youtube_id in self.all_downloaded:
|
|
||||||
continue
|
|
||||||
|
|
||||||
video = self.get_youtube_details(youtube_id)
|
|
||||||
# skip on download error
|
|
||||||
if not video:
|
|
||||||
continue
|
|
||||||
|
|
||||||
channel_indexed = video["channel_id"] in self.all_channel_ids
|
|
||||||
video["channel_indexed"] = channel_indexed
|
|
||||||
if ignore:
|
|
||||||
video["status"] = "ignore"
|
|
||||||
else:
|
|
||||||
video["status"] = "pending"
|
|
||||||
action = {"create": {"_id": youtube_id, "_index": "ta_download"}}
|
|
||||||
bulk_list.append(json.dumps(action))
|
|
||||||
bulk_list.append(json.dumps(video))
|
|
||||||
all_videos_added.append((youtube_id, video["vid_thumb_url"]))
|
|
||||||
# notify
|
|
||||||
progress = f"{idx + 1}/{len(missing_videos)}"
|
|
||||||
mess_dict = {
|
|
||||||
"status": "message:add",
|
|
||||||
"level": "info",
|
|
||||||
"title": "Adding new videos to download queue.",
|
|
||||||
"message": "Progress: " + progress,
|
|
||||||
}
|
|
||||||
if idx + 1 == len(missing_videos):
|
|
||||||
RedisArchivist().set_message(
|
|
||||||
"message:add", mess_dict, expire=4
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
RedisArchivist().set_message("message:add", mess_dict)
|
|
||||||
if idx + 1 % 25 == 0:
|
|
||||||
print("adding to queue progress: " + progress)
|
|
||||||
|
|
||||||
return bulk_list, all_videos_added
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_youtube_details(youtube_id):
|
|
||||||
"""get details from youtubedl for single pending video"""
|
|
||||||
obs = {
|
|
||||||
"default_search": "ytsearch",
|
|
||||||
"quiet": True,
|
|
||||||
"check_formats": "selected",
|
|
||||||
"noplaylist": True,
|
|
||||||
"writethumbnail": True,
|
|
||||||
"simulate": True,
|
|
||||||
}
|
|
||||||
try:
|
|
||||||
vid = yt_dlp.YoutubeDL(obs).extract_info(youtube_id)
|
|
||||||
except yt_dlp.utils.DownloadError:
|
|
||||||
print("failed to extract info for: " + youtube_id)
|
|
||||||
return False
|
|
||||||
# stop if video is streaming live now
|
|
||||||
if vid["is_live"]:
|
|
||||||
return False
|
|
||||||
# parse response
|
|
||||||
seconds = vid["duration"]
|
|
||||||
duration_str = DurationConverter.get_str(seconds)
|
|
||||||
if duration_str == "NA":
|
|
||||||
print(f"skip extracting duration for: {youtube_id}")
|
|
||||||
upload_date = vid["upload_date"]
|
|
||||||
upload_dt = datetime.strptime(upload_date, "%Y%m%d")
|
|
||||||
published = upload_dt.strftime("%Y-%m-%d")
|
|
||||||
# build dict
|
|
||||||
youtube_details = {
|
|
||||||
"youtube_id": youtube_id,
|
|
||||||
"channel_name": vid["channel"],
|
|
||||||
"vid_thumb_url": vid["thumbnail"],
|
|
||||||
"title": vid["title"],
|
|
||||||
"channel_id": vid["channel_id"],
|
|
||||||
"duration": duration_str,
|
|
||||||
"published": published,
|
|
||||||
"timestamp": int(datetime.now().strftime("%s")),
|
|
||||||
}
|
|
||||||
return youtube_details
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_all_pending():
|
|
||||||
"""get a list of all pending videos in ta_download"""
|
|
||||||
data = {
|
|
||||||
"query": {"match_all": {}},
|
|
||||||
"sort": [{"timestamp": {"order": "asc"}}],
|
|
||||||
}
|
|
||||||
all_results = IndexPaginate("ta_download", data).get_results()
|
|
||||||
|
|
||||||
all_pending = []
|
|
||||||
all_ignore = []
|
|
||||||
|
|
||||||
for result in all_results:
|
|
||||||
if result["status"] == "pending":
|
|
||||||
all_pending.append(result)
|
|
||||||
elif result["status"] == "ignore":
|
|
||||||
all_ignore.append(result)
|
|
||||||
|
|
||||||
return all_pending, all_ignore
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_all_indexed():
|
|
||||||
"""get a list of all videos indexed"""
|
|
||||||
|
|
||||||
data = {
|
|
||||||
"query": {"match_all": {}},
|
|
||||||
"sort": [{"published": {"order": "desc"}}],
|
|
||||||
}
|
|
||||||
all_indexed = IndexPaginate("ta_video", data).get_results()
|
|
||||||
|
|
||||||
return all_indexed
|
|
||||||
|
|
||||||
def get_all_downloaded(self):
|
|
||||||
"""get a list of all videos in archive"""
|
|
||||||
channel_folders = os.listdir(self.VIDEOS)
|
|
||||||
all_channel_folders = ignore_filelist(channel_folders)
|
|
||||||
all_downloaded = []
|
|
||||||
for channel_folder in all_channel_folders:
|
|
||||||
channel_path = os.path.join(self.VIDEOS, channel_folder)
|
|
||||||
videos = os.listdir(channel_path)
|
|
||||||
all_videos = ignore_filelist(videos)
|
|
||||||
youtube_vids = [i[9:20] for i in all_videos]
|
|
||||||
for youtube_id in youtube_vids:
|
|
||||||
all_downloaded.append(youtube_id)
|
|
||||||
return all_downloaded
|
|
||||||
|
|
||||||
def delete_from_pending(self, youtube_id):
|
|
||||||
"""delete the youtube_id from ta_download"""
|
|
||||||
url = f"{self.ES_URL}/ta_download/_doc/{youtube_id}"
|
|
||||||
response = requests.delete(url, auth=self.ES_AUTH)
|
|
||||||
if not response.ok:
|
|
||||||
print(response.text)
|
|
||||||
|
|
||||||
def delete_pending(self, status):
|
|
||||||
"""delete download queue based on status value"""
|
|
||||||
data = {"query": {"term": {"status": {"value": status}}}}
|
|
||||||
payload = json.dumps(data)
|
|
||||||
url = self.ES_URL + "/ta_download/_delete_by_query"
|
|
||||||
headers = {"Content-type": "application/json"}
|
|
||||||
response = requests.post(
|
|
||||||
url, data=payload, headers=headers, auth=self.ES_AUTH
|
|
||||||
)
|
|
||||||
if not response.ok:
|
|
||||||
print(response.text)
|
|
||||||
|
|
||||||
def ignore_from_pending(self, ignore_list):
|
|
||||||
"""build the bulk query string"""
|
|
||||||
|
|
||||||
stamp = int(datetime.now().strftime("%s"))
|
|
||||||
bulk_list = []
|
|
||||||
|
|
||||||
for youtube_id in ignore_list:
|
|
||||||
action = {"update": {"_id": youtube_id, "_index": "ta_download"}}
|
|
||||||
source = {"doc": {"status": "ignore", "timestamp": stamp}}
|
|
||||||
bulk_list.append(json.dumps(action))
|
|
||||||
bulk_list.append(json.dumps(source))
|
|
||||||
|
|
||||||
# add last newline
|
|
||||||
bulk_list.append("\n")
|
|
||||||
query_str = "\n".join(bulk_list)
|
|
||||||
|
|
||||||
headers = {"Content-type": "application/x-ndjson"}
|
|
||||||
url = self.ES_URL + "/_bulk"
|
|
||||||
request = requests.post(
|
|
||||||
url, data=query_str, headers=headers, auth=self.ES_AUTH
|
|
||||||
)
|
|
||||||
if not request.ok:
|
|
||||||
print(request)
|
|
||||||
raise ValueError("failed to set video to ignore")
|
|
||||||
|
|
||||||
|
|
||||||
class ChannelSubscription:
|
|
||||||
"""manage the list of channels subscribed"""
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
config = AppConfig().config
|
|
||||||
self.es_url = config["application"]["es_url"]
|
|
||||||
self.es_auth = config["application"]["es_auth"]
|
|
||||||
self.channel_size = config["subscriptions"]["channel_size"]
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_channels(subscribed_only=True):
|
|
||||||
"""get a list of all channels subscribed to"""
|
|
||||||
data = {
|
|
||||||
"sort": [{"channel_name.keyword": {"order": "asc"}}],
|
|
||||||
}
|
|
||||||
if subscribed_only:
|
|
||||||
data["query"] = {"term": {"channel_subscribed": {"value": True}}}
|
|
||||||
else:
|
|
||||||
data["query"] = {"match_all": {}}
|
|
||||||
|
|
||||||
all_channels = IndexPaginate("ta_channel", data).get_results()
|
|
||||||
|
|
||||||
return all_channels
|
|
||||||
|
|
||||||
def get_last_youtube_videos(self, channel_id, limit=True):
|
|
||||||
"""get a list of last videos from channel"""
|
|
||||||
url = f"https://www.youtube.com/channel/{channel_id}/videos"
|
|
||||||
obs = {
|
|
||||||
"default_search": "ytsearch",
|
|
||||||
"quiet": True,
|
|
||||||
"skip_download": True,
|
|
||||||
"extract_flat": True,
|
|
||||||
}
|
|
||||||
if limit:
|
|
||||||
obs["playlistend"] = self.channel_size
|
|
||||||
chan = yt_dlp.YoutubeDL(obs).extract_info(url, download=False)
|
|
||||||
last_videos = [(i["id"], i["title"]) for i in chan["entries"]]
|
|
||||||
return last_videos
|
|
||||||
|
|
||||||
def find_missing(self):
|
|
||||||
"""add missing videos from subscribed channels to pending"""
|
|
||||||
all_channels = self.get_channels()
|
|
||||||
pending_handler = PendingList()
|
|
||||||
all_pending, all_ignore = pending_handler.get_all_pending()
|
|
||||||
all_ids = [i["youtube_id"] for i in all_ignore + all_pending]
|
|
||||||
all_downloaded = pending_handler.get_all_downloaded()
|
|
||||||
to_ignore = all_ids + all_downloaded
|
|
||||||
|
|
||||||
missing_videos = []
|
|
||||||
|
|
||||||
for idx, channel in enumerate(all_channels):
|
|
||||||
channel_id = channel["channel_id"]
|
|
||||||
last_videos = self.get_last_youtube_videos(channel_id)
|
|
||||||
for video in last_videos:
|
|
||||||
if video[0] not in to_ignore:
|
|
||||||
missing_videos.append(video[0])
|
|
||||||
# notify
|
|
||||||
message = {
|
|
||||||
"status": "message:rescan",
|
|
||||||
"level": "info",
|
|
||||||
"title": "Scanning channels: Looking for new videos.",
|
|
||||||
"message": f"Progress: {idx + 1}/{len(all_channels)}",
|
|
||||||
}
|
|
||||||
if idx + 1 == len(all_channels):
|
|
||||||
RedisArchivist().set_message(
|
|
||||||
"message:rescan", message=message, expire=4
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
RedisArchivist().set_message("message:rescan", message=message)
|
|
||||||
|
|
||||||
return missing_videos
|
|
||||||
|
|
||||||
def change_subscribe(self, channel_id, channel_subscribed):
|
|
||||||
"""subscribe or unsubscribe from channel and update"""
|
|
||||||
if not isinstance(channel_subscribed, bool):
|
|
||||||
print("invalid status, should be bool")
|
|
||||||
return
|
|
||||||
headers = {"Content-type": "application/json"}
|
|
||||||
channel_handler = YoutubeChannel(channel_id)
|
|
||||||
channel_dict = channel_handler.channel_dict
|
|
||||||
channel_dict["channel_subscribed"] = channel_subscribed
|
|
||||||
if channel_subscribed:
|
|
||||||
# handle subscribe
|
|
||||||
url = self.es_url + "/ta_channel/_doc/" + channel_id
|
|
||||||
payload = json.dumps(channel_dict)
|
|
||||||
print(channel_dict)
|
|
||||||
else:
|
|
||||||
url = self.es_url + "/ta_channel/_update/" + channel_id
|
|
||||||
payload = json.dumps({"doc": channel_dict})
|
|
||||||
# update channel
|
|
||||||
request = requests.post(
|
|
||||||
url, data=payload, headers=headers, auth=self.es_auth
|
|
||||||
)
|
|
||||||
if not request.ok:
|
|
||||||
print(request.text)
|
|
||||||
raise ValueError("failed change subscribe status")
|
|
||||||
# sync to videos
|
|
||||||
channel_handler.sync_to_videos()
|
|
||||||
if channel_handler.source == "scraped":
|
|
||||||
channel_handler.get_channel_art()
|
|
||||||
|
|
||||||
|
|
||||||
class PlaylistSubscription:
|
|
||||||
"""manage the playlist download functionality"""
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.config = AppConfig().config
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_playlists(subscribed_only=True):
|
|
||||||
"""get a list of all active playlists"""
|
|
||||||
data = {
|
|
||||||
"sort": [{"playlist_channel.keyword": {"order": "desc"}}],
|
|
||||||
}
|
|
||||||
data["query"] = {
|
|
||||||
"bool": {"must": [{"term": {"playlist_active": {"value": True}}}]}
|
|
||||||
}
|
|
||||||
if subscribed_only:
|
|
||||||
data["query"]["bool"]["must"].append(
|
|
||||||
{"term": {"playlist_subscribed": {"value": True}}}
|
|
||||||
)
|
|
||||||
|
|
||||||
all_playlists = IndexPaginate("ta_playlist", data).get_results()
|
|
||||||
|
|
||||||
return all_playlists
|
|
||||||
|
|
||||||
def process_url_str(self, new_playlists, subscribed=True):
|
|
||||||
"""process playlist subscribe form url_str"""
|
|
||||||
all_indexed = PendingList().get_all_indexed()
|
|
||||||
all_youtube_ids = [i["youtube_id"] for i in all_indexed]
|
|
||||||
|
|
||||||
new_thumbs = []
|
|
||||||
for idx, playlist in enumerate(new_playlists):
|
|
||||||
url_type = playlist["type"]
|
|
||||||
playlist_id = playlist["url"]
|
|
||||||
if not url_type == "playlist":
|
|
||||||
print(f"{playlist_id} not a playlist, skipping...")
|
|
||||||
continue
|
|
||||||
|
|
||||||
playlist_h = YoutubePlaylist(
|
|
||||||
playlist_id, all_youtube_ids=all_youtube_ids
|
|
||||||
)
|
|
||||||
if not playlist_h.get_es_playlist():
|
|
||||||
playlist_h.get_playlist_dict()
|
|
||||||
playlist_h.playlist_dict["playlist_subscribed"] = subscribed
|
|
||||||
playlist_h.upload_to_es()
|
|
||||||
playlist_h.add_vids_to_playlist()
|
|
||||||
thumb = playlist_h.playlist_dict["playlist_thumbnail"]
|
|
||||||
new_thumbs.append((playlist_id, thumb))
|
|
||||||
self.channel_validate(playlist_h)
|
|
||||||
else:
|
|
||||||
self.change_subscribe(playlist_id, subscribe_status=True)
|
|
||||||
|
|
||||||
# notify
|
|
||||||
message = {
|
|
||||||
"status": "message:subplaylist",
|
|
||||||
"level": "info",
|
|
||||||
"title": "Subscribing to Playlists",
|
|
||||||
"message": f"Processing {idx + 1} of {len(new_playlists)}",
|
|
||||||
}
|
|
||||||
RedisArchivist().set_message(
|
|
||||||
"message:subplaylist", message=message
|
|
||||||
)
|
|
||||||
|
|
||||||
return new_thumbs
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def channel_validate(playlist_handler):
|
|
||||||
"""make sure channel of playlist is there"""
|
|
||||||
channel_id = playlist_handler.playlist_dict["playlist_channel_id"]
|
|
||||||
channel_handler = YoutubeChannel(channel_id)
|
|
||||||
if channel_handler.source == "scraped":
|
|
||||||
channel_handler.channel_dict["channel_subscribed"] = False
|
|
||||||
channel_handler.upload_to_es()
|
|
||||||
channel_handler.get_channel_art()
|
|
||||||
|
|
||||||
def change_subscribe(self, playlist_id, subscribe_status):
|
|
||||||
"""change the subscribe status of a playlist"""
|
|
||||||
es_url = self.config["application"]["es_url"]
|
|
||||||
es_auth = self.config["application"]["es_auth"]
|
|
||||||
playlist_handler = YoutubePlaylist(playlist_id)
|
|
||||||
playlist_handler.get_playlist_dict()
|
|
||||||
subed_now = playlist_handler.playlist_dict["playlist_subscribed"]
|
|
||||||
|
|
||||||
if subed_now == subscribe_status:
|
|
||||||
# status already as expected, do nothing
|
|
||||||
return False
|
|
||||||
|
|
||||||
# update subscribed status
|
|
||||||
headers = {"Content-type": "application/json"}
|
|
||||||
url = f"{es_url}/ta_playlist/_update/{playlist_id}"
|
|
||||||
payload = json.dumps(
|
|
||||||
{"doc": {"playlist_subscribed": subscribe_status}}
|
|
||||||
)
|
|
||||||
response = requests.post(
|
|
||||||
url, data=payload, headers=headers, auth=es_auth
|
|
||||||
)
|
|
||||||
if not response.ok:
|
|
||||||
print(response.text)
|
|
||||||
raise ValueError("failed to change subscribe status")
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_to_ignore():
|
|
||||||
"""get all youtube_ids already downloaded or ignored"""
|
|
||||||
pending_handler = PendingList()
|
|
||||||
all_pending, all_ignore = pending_handler.get_all_pending()
|
|
||||||
all_ids = [i["youtube_id"] for i in all_ignore + all_pending]
|
|
||||||
all_downloaded = pending_handler.get_all_downloaded()
|
|
||||||
to_ignore = all_ids + all_downloaded
|
|
||||||
return to_ignore
|
|
||||||
|
|
||||||
def find_missing(self):
|
|
||||||
"""find videos in subscribed playlists not downloaded yet"""
|
|
||||||
all_playlists = [i["playlist_id"] for i in self.get_playlists()]
|
|
||||||
to_ignore = self.get_to_ignore()
|
|
||||||
|
|
||||||
missing_videos = []
|
|
||||||
counter = 1
|
|
||||||
for playlist_id in all_playlists:
|
|
||||||
size_limit = self.config["subscriptions"]["channel_size"]
|
|
||||||
playlist_handler = YoutubePlaylist(playlist_id)
|
|
||||||
playlist = playlist_handler.update_playlist()
|
|
||||||
if not playlist:
|
|
||||||
playlist_handler.deactivate()
|
|
||||||
continue
|
|
||||||
|
|
||||||
if size_limit:
|
|
||||||
playlist_entries = playlist["playlist_entries"][:size_limit]
|
|
||||||
else:
|
|
||||||
playlist_entries = playlist["playlist_entries"]
|
|
||||||
all_missing = [i for i in playlist_entries if not i["downloaded"]]
|
|
||||||
|
|
||||||
message = {
|
|
||||||
"status": "message:rescan",
|
|
||||||
"level": "info",
|
|
||||||
"title": "Scanning playlists: Looking for new videos.",
|
|
||||||
"message": f"Progress: {counter}/{len(all_playlists)}",
|
|
||||||
}
|
|
||||||
RedisArchivist().set_message("message:rescan", message=message)
|
|
||||||
|
|
||||||
for video in all_missing:
|
|
||||||
youtube_id = video["youtube_id"]
|
|
||||||
if youtube_id not in to_ignore:
|
|
||||||
missing_videos.append(youtube_id)
|
|
||||||
counter = counter + 1
|
|
||||||
|
|
||||||
return missing_videos
|
|
||||||
|
|
||||||
|
|
||||||
class VideoDownloader:
|
|
||||||
"""
|
|
||||||
handle the video download functionality
|
|
||||||
if not initiated with list, take from queue
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, youtube_id_list=False):
|
|
||||||
self.youtube_id_list = youtube_id_list
|
|
||||||
self.config = AppConfig().config
|
|
||||||
self.channels = set()
|
|
||||||
|
|
||||||
def run_queue(self):
|
|
||||||
"""setup download queue in redis loop until no more items"""
|
|
||||||
queue = RedisQueue("dl_queue")
|
|
||||||
|
|
||||||
limit_queue = self.config["downloads"]["limit_count"]
|
|
||||||
if limit_queue:
|
|
||||||
queue.trim(limit_queue - 1)
|
|
||||||
|
|
||||||
while True:
|
|
||||||
youtube_id = queue.get_next()
|
|
||||||
if not youtube_id:
|
|
||||||
break
|
|
||||||
|
|
||||||
try:
|
|
||||||
self.dl_single_vid(youtube_id)
|
|
||||||
except yt_dlp.utils.DownloadError:
|
|
||||||
print("failed to download " + youtube_id)
|
|
||||||
continue
|
|
||||||
vid_dict = index_new_video(youtube_id)
|
|
||||||
self.channels.add(vid_dict["channel"]["channel_id"])
|
|
||||||
self.move_to_archive(vid_dict)
|
|
||||||
self.delete_from_pending(youtube_id)
|
|
||||||
|
|
||||||
autodelete_days = self.config["downloads"]["autodelete_days"]
|
|
||||||
if autodelete_days:
|
|
||||||
print(f"auto delete older than {autodelete_days} days")
|
|
||||||
self.auto_delete_watched(autodelete_days)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def add_pending():
|
|
||||||
"""add pending videos to download queue"""
|
|
||||||
mess_dict = {
|
|
||||||
"status": "message:download",
|
|
||||||
"level": "info",
|
|
||||||
"title": "Looking for videos to download",
|
|
||||||
"message": "Scanning your download queue.",
|
|
||||||
}
|
|
||||||
RedisArchivist().set_message("message:download", mess_dict)
|
|
||||||
all_pending, _ = PendingList().get_all_pending()
|
|
||||||
to_add = [i["youtube_id"] for i in all_pending]
|
|
||||||
if not to_add:
|
|
||||||
# there is nothing pending
|
|
||||||
print("download queue is empty")
|
|
||||||
mess_dict = {
|
|
||||||
"status": "message:download",
|
|
||||||
"level": "error",
|
|
||||||
"title": "Download queue is empty",
|
|
||||||
"message": "Add some videos to the queue first.",
|
|
||||||
}
|
|
||||||
RedisArchivist().set_message("message:download", mess_dict)
|
|
||||||
return
|
|
||||||
|
|
||||||
queue = RedisQueue("dl_queue")
|
|
||||||
queue.add_list(to_add)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def progress_hook(response):
|
|
||||||
"""process the progress_hooks from yt_dlp"""
|
|
||||||
# title
|
|
||||||
path = os.path.split(response["filename"])[-1][12:]
|
|
||||||
filename = os.path.splitext(os.path.splitext(path)[0])[0]
|
|
||||||
filename_clean = filename.replace("_", " ")
|
|
||||||
title = "Downloading: " + filename_clean
|
|
||||||
# message
|
|
||||||
try:
|
|
||||||
percent = response["_percent_str"]
|
|
||||||
size = response["_total_bytes_str"]
|
|
||||||
speed = response["_speed_str"]
|
|
||||||
eta = response["_eta_str"]
|
|
||||||
message = f"{percent} of {size} at {speed} - time left: {eta}"
|
|
||||||
except KeyError:
|
|
||||||
message = "processing"
|
|
||||||
mess_dict = {
|
|
||||||
"status": "message:download",
|
|
||||||
"level": "info",
|
|
||||||
"title": title,
|
|
||||||
"message": message,
|
|
||||||
}
|
|
||||||
RedisArchivist().set_message("message:download", mess_dict)
|
|
||||||
|
|
||||||
def build_obs(self):
|
|
||||||
"""build obs dictionary for yt-dlp"""
|
|
||||||
obs = {
|
|
||||||
"default_search": "ytsearch",
|
|
||||||
"merge_output_format": "mp4",
|
|
||||||
"restrictfilenames": True,
|
|
||||||
"outtmpl": (
|
|
||||||
self.config["application"]["cache_dir"]
|
|
||||||
+ "/download/"
|
|
||||||
+ self.config["application"]["file_template"]
|
|
||||||
),
|
|
||||||
"progress_hooks": [self.progress_hook],
|
|
||||||
"noprogress": True,
|
|
||||||
"quiet": True,
|
|
||||||
"continuedl": True,
|
|
||||||
"retries": 3,
|
|
||||||
"writethumbnail": False,
|
|
||||||
"noplaylist": True,
|
|
||||||
"check_formats": "selected",
|
|
||||||
}
|
|
||||||
if self.config["downloads"]["format"]:
|
|
||||||
obs["format"] = self.config["downloads"]["format"]
|
|
||||||
if self.config["downloads"]["limit_speed"]:
|
|
||||||
obs["ratelimit"] = self.config["downloads"]["limit_speed"] * 1024
|
|
||||||
|
|
||||||
throttle = self.config["downloads"]["throttledratelimit"]
|
|
||||||
if throttle:
|
|
||||||
obs["throttledratelimit"] = throttle * 1024
|
|
||||||
|
|
||||||
postprocessors = []
|
|
||||||
|
|
||||||
if self.config["downloads"]["add_metadata"]:
|
|
||||||
postprocessors.append(
|
|
||||||
{
|
|
||||||
"key": "FFmpegMetadata",
|
|
||||||
"add_chapters": True,
|
|
||||||
"add_metadata": True,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.config["downloads"]["add_thumbnail"]:
|
|
||||||
postprocessors.append(
|
|
||||||
{
|
|
||||||
"key": "EmbedThumbnail",
|
|
||||||
"already_have_thumbnail": True,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
obs["writethumbnail"] = True
|
|
||||||
|
|
||||||
obs["postprocessors"] = postprocessors
|
|
||||||
|
|
||||||
return obs
|
|
||||||
|
|
||||||
def dl_single_vid(self, youtube_id):
|
|
||||||
"""download single video"""
|
|
||||||
dl_cache = self.config["application"]["cache_dir"] + "/download/"
|
|
||||||
obs = self.build_obs()
|
|
||||||
|
|
||||||
# check if already in cache to continue from there
|
|
||||||
all_cached = ignore_filelist(os.listdir(dl_cache))
|
|
||||||
for file_name in all_cached:
|
|
||||||
if youtube_id in file_name:
|
|
||||||
obs["outtmpl"] = os.path.join(dl_cache, file_name)
|
|
||||||
with yt_dlp.YoutubeDL(obs) as ydl:
|
|
||||||
try:
|
|
||||||
ydl.download([youtube_id])
|
|
||||||
except yt_dlp.utils.DownloadError:
|
|
||||||
print("retry failed download: " + youtube_id)
|
|
||||||
sleep(10)
|
|
||||||
ydl.download([youtube_id])
|
|
||||||
|
|
||||||
if obs["writethumbnail"]:
|
|
||||||
# webp files don't get cleaned up automatically
|
|
||||||
all_cached = ignore_filelist(os.listdir(dl_cache))
|
|
||||||
to_clean = [i for i in all_cached if not i.endswith(".mp4")]
|
|
||||||
for file_name in to_clean:
|
|
||||||
file_path = os.path.join(dl_cache, file_name)
|
|
||||||
os.remove(file_path)
|
|
||||||
|
|
||||||
def move_to_archive(self, vid_dict):
|
|
||||||
"""move downloaded video from cache to archive"""
|
|
||||||
videos = self.config["application"]["videos"]
|
|
||||||
host_uid = self.config["application"]["HOST_UID"]
|
|
||||||
host_gid = self.config["application"]["HOST_GID"]
|
|
||||||
channel_name = clean_string(vid_dict["channel"]["channel_name"])
|
|
||||||
# make archive folder with correct permissions
|
|
||||||
new_folder = os.path.join(videos, channel_name)
|
|
||||||
if not os.path.exists(new_folder):
|
|
||||||
os.makedirs(new_folder)
|
|
||||||
if host_uid and host_gid:
|
|
||||||
os.chown(new_folder, host_uid, host_gid)
|
|
||||||
# find real filename
|
|
||||||
cache_dir = self.config["application"]["cache_dir"]
|
|
||||||
all_cached = ignore_filelist(os.listdir(cache_dir + "/download/"))
|
|
||||||
for file_str in all_cached:
|
|
||||||
if vid_dict["youtube_id"] in file_str:
|
|
||||||
old_file = file_str
|
|
||||||
old_file_path = os.path.join(cache_dir, "download", old_file)
|
|
||||||
new_file_path = os.path.join(videos, vid_dict["media_url"])
|
|
||||||
# move media file and fix permission
|
|
||||||
shutil.move(old_file_path, new_file_path)
|
|
||||||
if host_uid and host_gid:
|
|
||||||
os.chown(new_file_path, host_uid, host_gid)
|
|
||||||
|
|
||||||
def delete_from_pending(self, youtube_id):
|
|
||||||
"""delete downloaded video from pending index if its there"""
|
|
||||||
es_url = self.config["application"]["es_url"]
|
|
||||||
es_auth = self.config["application"]["es_auth"]
|
|
||||||
url = f"{es_url}/ta_download/_doc/{youtube_id}"
|
|
||||||
response = requests.delete(url, auth=es_auth)
|
|
||||||
if not response.ok and not response.status_code == 404:
|
|
||||||
print(response.text)
|
|
||||||
|
|
||||||
def add_subscribed_channels(self):
|
|
||||||
"""add all channels subscribed to refresh"""
|
|
||||||
all_subscribed = PlaylistSubscription().get_playlists()
|
|
||||||
if not all_subscribed:
|
|
||||||
return
|
|
||||||
|
|
||||||
channel_ids = [i["playlist_channel_id"] for i in all_subscribed]
|
|
||||||
for channel_id in channel_ids:
|
|
||||||
self.channels.add(channel_id)
|
|
||||||
|
|
||||||
return
|
|
||||||
|
|
||||||
def validate_playlists(self):
|
|
||||||
"""look for playlist needing to update"""
|
|
||||||
print("sync playlists")
|
|
||||||
self.add_subscribed_channels()
|
|
||||||
all_indexed = PendingList().get_all_indexed()
|
|
||||||
all_youtube_ids = [i["youtube_id"] for i in all_indexed]
|
|
||||||
for id_c, channel_id in enumerate(self.channels):
|
|
||||||
playlists = YoutubeChannel(channel_id).get_indexed_playlists()
|
|
||||||
all_playlist_ids = [i["playlist_id"] for i in playlists]
|
|
||||||
for id_p, playlist_id in enumerate(all_playlist_ids):
|
|
||||||
playlist_handler = YoutubePlaylist(
|
|
||||||
playlist_id, all_youtube_ids=all_youtube_ids
|
|
||||||
)
|
|
||||||
playlist_dict = playlist_handler.update_playlist()
|
|
||||||
if not playlist_dict:
|
|
||||||
playlist_handler.deactivate()
|
|
||||||
continue
|
|
||||||
|
|
||||||
playlist_handler.add_vids_to_playlist()
|
|
||||||
# notify
|
|
||||||
title = (
|
|
||||||
"Processing playlists for channels: "
|
|
||||||
+ f"{id_c + 1}/{len(self.channels)}"
|
|
||||||
)
|
|
||||||
message = f"Progress: {id_p + 1}/{len(all_playlist_ids)}"
|
|
||||||
mess_dict = {
|
|
||||||
"status": "message:download",
|
|
||||||
"level": "info",
|
|
||||||
"title": title,
|
|
||||||
"message": message,
|
|
||||||
}
|
|
||||||
if id_p + 1 == len(all_playlist_ids):
|
|
||||||
RedisArchivist().set_message(
|
|
||||||
"message:download", mess_dict, expire=4
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
RedisArchivist().set_message("message:download", mess_dict)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def auto_delete_watched(autodelete_days):
|
|
||||||
"""delete watched videos after x days"""
|
|
||||||
now = int(datetime.now().strftime("%s"))
|
|
||||||
now_lte = now - autodelete_days * 24 * 60 * 60
|
|
||||||
data = {
|
|
||||||
"query": {"range": {"player.watched_date": {"lte": now_lte}}},
|
|
||||||
"sort": [{"player.watched_date": {"order": "asc"}}],
|
|
||||||
}
|
|
||||||
all_to_delete = IndexPaginate("ta_video", data).get_results()
|
|
||||||
all_youtube_ids = [i["youtube_id"] for i in all_to_delete]
|
|
||||||
if not all_youtube_ids:
|
|
||||||
return
|
|
||||||
|
|
||||||
for youtube_id in all_youtube_ids:
|
|
||||||
print(f"autodelete {youtube_id}")
|
|
||||||
YoutubeVideo(youtube_id).delete_media_file()
|
|
||||||
|
|
||||||
print("add deleted to ignore list")
|
|
||||||
pending_handler = PendingList()
|
|
||||||
pending_handler.add_to_pending(all_youtube_ids, ignore=True)
|
|
0
tubearchivist/home/src/download/__init__.py
Normal file
0
tubearchivist/home/src/download/__init__.py
Normal file
263
tubearchivist/home/src/download/queue.py
Normal file
263
tubearchivist/home/src/download/queue.py
Normal file
@ -0,0 +1,263 @@
|
|||||||
|
"""
|
||||||
|
Functionality:
|
||||||
|
- handle download queue
|
||||||
|
- linked with ta_dowload index
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import yt_dlp
|
||||||
|
from home.src.download.subscriptions import ChannelSubscription
|
||||||
|
from home.src.es.connect import IndexPaginate
|
||||||
|
from home.src.index.playlist import YoutubePlaylist
|
||||||
|
from home.src.ta.config import AppConfig
|
||||||
|
from home.src.ta.helper import DurationConverter, ignore_filelist
|
||||||
|
from home.src.ta.ta_redis import RedisArchivist
|
||||||
|
|
||||||
|
|
||||||
|
class PendingList:
|
||||||
|
"""manage the pending videos list"""
|
||||||
|
|
||||||
|
CONFIG = AppConfig().config
|
||||||
|
ES_URL = CONFIG["application"]["es_url"]
|
||||||
|
ES_AUTH = CONFIG["application"]["es_auth"]
|
||||||
|
VIDEOS = CONFIG["application"]["videos"]
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.all_channel_ids = False
|
||||||
|
self.all_downloaded = False
|
||||||
|
self.missing_from_playlists = []
|
||||||
|
|
||||||
|
def parse_url_list(self, youtube_ids):
|
||||||
|
"""extract youtube ids from list"""
|
||||||
|
missing_videos = []
|
||||||
|
for entry in youtube_ids:
|
||||||
|
# notify
|
||||||
|
mess_dict = {
|
||||||
|
"status": "message:add",
|
||||||
|
"level": "info",
|
||||||
|
"title": "Adding to download queue.",
|
||||||
|
"message": "Extracting lists",
|
||||||
|
}
|
||||||
|
RedisArchivist().set_message("message:add", mess_dict)
|
||||||
|
# extract
|
||||||
|
url = entry["url"]
|
||||||
|
url_type = entry["type"]
|
||||||
|
if url_type == "video":
|
||||||
|
missing_videos.append(url)
|
||||||
|
elif url_type == "channel":
|
||||||
|
video_results = ChannelSubscription().get_last_youtube_videos(
|
||||||
|
url, limit=False
|
||||||
|
)
|
||||||
|
youtube_ids = [i[0] for i in video_results]
|
||||||
|
missing_videos = missing_videos + youtube_ids
|
||||||
|
elif url_type == "playlist":
|
||||||
|
self.missing_from_playlists.append(entry)
|
||||||
|
playlist = YoutubePlaylist(url)
|
||||||
|
playlist.build_json()
|
||||||
|
video_results = playlist.json_data.get("playlist_entries")
|
||||||
|
youtube_ids = [i["youtube_id"] for i in video_results]
|
||||||
|
missing_videos = missing_videos + youtube_ids
|
||||||
|
|
||||||
|
return missing_videos
|
||||||
|
|
||||||
|
def add_to_pending(self, missing_videos, ignore=False):
|
||||||
|
"""build the bulk json data from pending"""
|
||||||
|
# check if channel is indexed
|
||||||
|
channel_handler = ChannelSubscription()
|
||||||
|
all_indexed = channel_handler.get_channels(subscribed_only=False)
|
||||||
|
self.all_channel_ids = [i["channel_id"] for i in all_indexed]
|
||||||
|
# check if already there
|
||||||
|
self.all_downloaded = self.get_all_downloaded()
|
||||||
|
|
||||||
|
bulk_list, all_videos_added = self.build_bulk(missing_videos, ignore)
|
||||||
|
# add last newline
|
||||||
|
bulk_list.append("\n")
|
||||||
|
query_str = "\n".join(bulk_list)
|
||||||
|
headers = {"Content-type": "application/x-ndjson"}
|
||||||
|
url = self.ES_URL + "/_bulk"
|
||||||
|
request = requests.post(
|
||||||
|
url, data=query_str, headers=headers, auth=self.ES_AUTH
|
||||||
|
)
|
||||||
|
if not request.ok:
|
||||||
|
print(request)
|
||||||
|
raise ValueError("failed to add video to download queue")
|
||||||
|
|
||||||
|
return all_videos_added
|
||||||
|
|
||||||
|
def build_bulk(self, missing_videos, ignore=False):
|
||||||
|
"""build the bulk lists"""
|
||||||
|
bulk_list = []
|
||||||
|
all_videos_added = []
|
||||||
|
|
||||||
|
for idx, youtube_id in enumerate(missing_videos):
|
||||||
|
# check if already downloaded
|
||||||
|
if youtube_id in self.all_downloaded:
|
||||||
|
continue
|
||||||
|
|
||||||
|
video = self.get_youtube_details(youtube_id)
|
||||||
|
# skip on download error
|
||||||
|
if not video:
|
||||||
|
continue
|
||||||
|
|
||||||
|
channel_indexed = video["channel_id"] in self.all_channel_ids
|
||||||
|
video["channel_indexed"] = channel_indexed
|
||||||
|
if ignore:
|
||||||
|
video["status"] = "ignore"
|
||||||
|
else:
|
||||||
|
video["status"] = "pending"
|
||||||
|
action = {"create": {"_id": youtube_id, "_index": "ta_download"}}
|
||||||
|
bulk_list.append(json.dumps(action))
|
||||||
|
bulk_list.append(json.dumps(video))
|
||||||
|
all_videos_added.append((youtube_id, video["vid_thumb_url"]))
|
||||||
|
# notify
|
||||||
|
progress = f"{idx + 1}/{len(missing_videos)}"
|
||||||
|
mess_dict = {
|
||||||
|
"status": "message:add",
|
||||||
|
"level": "info",
|
||||||
|
"title": "Adding new videos to download queue.",
|
||||||
|
"message": "Progress: " + progress,
|
||||||
|
}
|
||||||
|
if idx + 1 == len(missing_videos):
|
||||||
|
RedisArchivist().set_message(
|
||||||
|
"message:add", mess_dict, expire=4
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
RedisArchivist().set_message("message:add", mess_dict)
|
||||||
|
if idx + 1 % 25 == 0:
|
||||||
|
print("adding to queue progress: " + progress)
|
||||||
|
|
||||||
|
return bulk_list, all_videos_added
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_youtube_details(youtube_id):
|
||||||
|
"""get details from youtubedl for single pending video"""
|
||||||
|
obs = {
|
||||||
|
"default_search": "ytsearch",
|
||||||
|
"quiet": True,
|
||||||
|
"check_formats": "selected",
|
||||||
|
"noplaylist": True,
|
||||||
|
"writethumbnail": True,
|
||||||
|
"simulate": True,
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
vid = yt_dlp.YoutubeDL(obs).extract_info(youtube_id)
|
||||||
|
except yt_dlp.utils.DownloadError:
|
||||||
|
print("failed to extract info for: " + youtube_id)
|
||||||
|
return False
|
||||||
|
# stop if video is streaming live now
|
||||||
|
if vid["is_live"]:
|
||||||
|
return False
|
||||||
|
# parse response
|
||||||
|
seconds = vid["duration"]
|
||||||
|
duration_str = DurationConverter.get_str(seconds)
|
||||||
|
if duration_str == "NA":
|
||||||
|
print(f"skip extracting duration for: {youtube_id}")
|
||||||
|
upload_date = vid["upload_date"]
|
||||||
|
upload_dt = datetime.strptime(upload_date, "%Y%m%d")
|
||||||
|
published = upload_dt.strftime("%Y-%m-%d")
|
||||||
|
# build dict
|
||||||
|
youtube_details = {
|
||||||
|
"youtube_id": youtube_id,
|
||||||
|
"channel_name": vid["channel"],
|
||||||
|
"vid_thumb_url": vid["thumbnail"],
|
||||||
|
"title": vid["title"],
|
||||||
|
"channel_id": vid["channel_id"],
|
||||||
|
"duration": duration_str,
|
||||||
|
"published": published,
|
||||||
|
"timestamp": int(datetime.now().strftime("%s")),
|
||||||
|
}
|
||||||
|
return youtube_details
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_all_pending():
|
||||||
|
"""get a list of all pending videos in ta_download"""
|
||||||
|
data = {
|
||||||
|
"query": {"match_all": {}},
|
||||||
|
"sort": [{"timestamp": {"order": "asc"}}],
|
||||||
|
}
|
||||||
|
all_results = IndexPaginate("ta_download", data).get_results()
|
||||||
|
|
||||||
|
all_pending = []
|
||||||
|
all_ignore = []
|
||||||
|
|
||||||
|
for result in all_results:
|
||||||
|
if result["status"] == "pending":
|
||||||
|
all_pending.append(result)
|
||||||
|
elif result["status"] == "ignore":
|
||||||
|
all_ignore.append(result)
|
||||||
|
|
||||||
|
return all_pending, all_ignore
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_all_indexed():
|
||||||
|
"""get a list of all videos indexed"""
|
||||||
|
|
||||||
|
data = {
|
||||||
|
"query": {"match_all": {}},
|
||||||
|
"sort": [{"published": {"order": "desc"}}],
|
||||||
|
}
|
||||||
|
all_indexed = IndexPaginate("ta_video", data).get_results()
|
||||||
|
|
||||||
|
return all_indexed
|
||||||
|
|
||||||
|
def get_all_downloaded(self):
|
||||||
|
"""get a list of all videos in archive"""
|
||||||
|
channel_folders = os.listdir(self.VIDEOS)
|
||||||
|
all_channel_folders = ignore_filelist(channel_folders)
|
||||||
|
all_downloaded = []
|
||||||
|
for channel_folder in all_channel_folders:
|
||||||
|
channel_path = os.path.join(self.VIDEOS, channel_folder)
|
||||||
|
videos = os.listdir(channel_path)
|
||||||
|
all_videos = ignore_filelist(videos)
|
||||||
|
youtube_vids = [i[9:20] for i in all_videos]
|
||||||
|
for youtube_id in youtube_vids:
|
||||||
|
all_downloaded.append(youtube_id)
|
||||||
|
return all_downloaded
|
||||||
|
|
||||||
|
def delete_from_pending(self, youtube_id):
|
||||||
|
"""delete the youtube_id from ta_download"""
|
||||||
|
url = f"{self.ES_URL}/ta_download/_doc/{youtube_id}"
|
||||||
|
response = requests.delete(url, auth=self.ES_AUTH)
|
||||||
|
if not response.ok:
|
||||||
|
print(response.text)
|
||||||
|
|
||||||
|
def delete_pending(self, status):
|
||||||
|
"""delete download queue based on status value"""
|
||||||
|
data = {"query": {"term": {"status": {"value": status}}}}
|
||||||
|
payload = json.dumps(data)
|
||||||
|
url = self.ES_URL + "/ta_download/_delete_by_query"
|
||||||
|
headers = {"Content-type": "application/json"}
|
||||||
|
response = requests.post(
|
||||||
|
url, data=payload, headers=headers, auth=self.ES_AUTH
|
||||||
|
)
|
||||||
|
if not response.ok:
|
||||||
|
print(response.text)
|
||||||
|
|
||||||
|
def ignore_from_pending(self, ignore_list):
|
||||||
|
"""build the bulk query string"""
|
||||||
|
|
||||||
|
stamp = int(datetime.now().strftime("%s"))
|
||||||
|
bulk_list = []
|
||||||
|
|
||||||
|
for youtube_id in ignore_list:
|
||||||
|
action = {"update": {"_id": youtube_id, "_index": "ta_download"}}
|
||||||
|
source = {"doc": {"status": "ignore", "timestamp": stamp}}
|
||||||
|
bulk_list.append(json.dumps(action))
|
||||||
|
bulk_list.append(json.dumps(source))
|
||||||
|
|
||||||
|
# add last newline
|
||||||
|
bulk_list.append("\n")
|
||||||
|
query_str = "\n".join(bulk_list)
|
||||||
|
|
||||||
|
headers = {"Content-type": "application/x-ndjson"}
|
||||||
|
url = self.ES_URL + "/_bulk"
|
||||||
|
request = requests.post(
|
||||||
|
url, data=query_str, headers=headers, auth=self.ES_AUTH
|
||||||
|
)
|
||||||
|
if not request.ok:
|
||||||
|
print(request)
|
||||||
|
raise ValueError("failed to set video to ignore")
|
214
tubearchivist/home/src/download/subscriptions.py
Normal file
214
tubearchivist/home/src/download/subscriptions.py
Normal file
@ -0,0 +1,214 @@
|
|||||||
|
"""
|
||||||
|
Functionality:
|
||||||
|
- handle channel subscriptions
|
||||||
|
- handle playlist subscriptions
|
||||||
|
"""
|
||||||
|
|
||||||
|
import yt_dlp
|
||||||
|
from home.src.download import queue # partial import
|
||||||
|
from home.src.es.connect import IndexPaginate
|
||||||
|
from home.src.index.channel import YoutubeChannel
|
||||||
|
from home.src.index.playlist import YoutubePlaylist
|
||||||
|
from home.src.ta.config import AppConfig
|
||||||
|
from home.src.ta.ta_redis import RedisArchivist
|
||||||
|
|
||||||
|
|
||||||
|
class ChannelSubscription:
|
||||||
|
"""manage the list of channels subscribed"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
config = AppConfig().config
|
||||||
|
self.es_url = config["application"]["es_url"]
|
||||||
|
self.es_auth = config["application"]["es_auth"]
|
||||||
|
self.channel_size = config["subscriptions"]["channel_size"]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_channels(subscribed_only=True):
|
||||||
|
"""get a list of all channels subscribed to"""
|
||||||
|
data = {
|
||||||
|
"sort": [{"channel_name.keyword": {"order": "asc"}}],
|
||||||
|
}
|
||||||
|
if subscribed_only:
|
||||||
|
data["query"] = {"term": {"channel_subscribed": {"value": True}}}
|
||||||
|
else:
|
||||||
|
data["query"] = {"match_all": {}}
|
||||||
|
|
||||||
|
all_channels = IndexPaginate("ta_channel", data).get_results()
|
||||||
|
|
||||||
|
return all_channels
|
||||||
|
|
||||||
|
def get_last_youtube_videos(self, channel_id, limit=True):
|
||||||
|
"""get a list of last videos from channel"""
|
||||||
|
url = f"https://www.youtube.com/channel/{channel_id}/videos"
|
||||||
|
obs = {
|
||||||
|
"default_search": "ytsearch",
|
||||||
|
"quiet": True,
|
||||||
|
"skip_download": True,
|
||||||
|
"extract_flat": True,
|
||||||
|
}
|
||||||
|
if limit:
|
||||||
|
obs["playlistend"] = self.channel_size
|
||||||
|
chan = yt_dlp.YoutubeDL(obs).extract_info(url, download=False)
|
||||||
|
last_videos = [(i["id"], i["title"]) for i in chan["entries"]]
|
||||||
|
return last_videos
|
||||||
|
|
||||||
|
def find_missing(self):
|
||||||
|
"""add missing videos from subscribed channels to pending"""
|
||||||
|
all_channels = self.get_channels()
|
||||||
|
pending_handler = queue.PendingList()
|
||||||
|
all_pending, all_ignore = pending_handler.get_all_pending()
|
||||||
|
all_ids = [i["youtube_id"] for i in all_ignore + all_pending]
|
||||||
|
all_downloaded = pending_handler.get_all_downloaded()
|
||||||
|
to_ignore = all_ids + all_downloaded
|
||||||
|
|
||||||
|
missing_videos = []
|
||||||
|
|
||||||
|
for idx, channel in enumerate(all_channels):
|
||||||
|
channel_id = channel["channel_id"]
|
||||||
|
last_videos = self.get_last_youtube_videos(channel_id)
|
||||||
|
for video in last_videos:
|
||||||
|
if video[0] not in to_ignore:
|
||||||
|
missing_videos.append(video[0])
|
||||||
|
# notify
|
||||||
|
message = {
|
||||||
|
"status": "message:rescan",
|
||||||
|
"level": "info",
|
||||||
|
"title": "Scanning channels: Looking for new videos.",
|
||||||
|
"message": f"Progress: {idx + 1}/{len(all_channels)}",
|
||||||
|
}
|
||||||
|
if idx + 1 == len(all_channels):
|
||||||
|
RedisArchivist().set_message(
|
||||||
|
"message:rescan", message=message, expire=4
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
RedisArchivist().set_message("message:rescan", message=message)
|
||||||
|
|
||||||
|
return missing_videos
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def change_subscribe(channel_id, channel_subscribed):
|
||||||
|
"""subscribe or unsubscribe from channel and update"""
|
||||||
|
channel = YoutubeChannel(channel_id)
|
||||||
|
channel.build_json()
|
||||||
|
channel.json_data["channel_subscribed"] = channel_subscribed
|
||||||
|
channel.upload_to_es()
|
||||||
|
channel.sync_to_videos()
|
||||||
|
|
||||||
|
|
||||||
|
class PlaylistSubscription:
|
||||||
|
"""manage the playlist download functionality"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.config = AppConfig().config
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_playlists(subscribed_only=True):
|
||||||
|
"""get a list of all active playlists"""
|
||||||
|
data = {
|
||||||
|
"sort": [{"playlist_channel.keyword": {"order": "desc"}}],
|
||||||
|
}
|
||||||
|
data["query"] = {
|
||||||
|
"bool": {"must": [{"term": {"playlist_active": {"value": True}}}]}
|
||||||
|
}
|
||||||
|
if subscribed_only:
|
||||||
|
data["query"]["bool"]["must"].append(
|
||||||
|
{"term": {"playlist_subscribed": {"value": True}}}
|
||||||
|
)
|
||||||
|
|
||||||
|
all_playlists = IndexPaginate("ta_playlist", data).get_results()
|
||||||
|
|
||||||
|
return all_playlists
|
||||||
|
|
||||||
|
def process_url_str(self, new_playlists, subscribed=True):
|
||||||
|
"""process playlist subscribe form url_str"""
|
||||||
|
all_indexed = queue.PendingList().get_all_indexed()
|
||||||
|
all_youtube_ids = [i["youtube_id"] for i in all_indexed]
|
||||||
|
|
||||||
|
new_thumbs = []
|
||||||
|
for idx, playlist in enumerate(new_playlists):
|
||||||
|
url_type = playlist["type"]
|
||||||
|
playlist_id = playlist["url"]
|
||||||
|
if not url_type == "playlist":
|
||||||
|
print(f"{playlist_id} not a playlist, skipping...")
|
||||||
|
continue
|
||||||
|
|
||||||
|
playlist_h = YoutubePlaylist(playlist_id)
|
||||||
|
playlist_h.all_youtube_ids = all_youtube_ids
|
||||||
|
playlist_h.build_json()
|
||||||
|
playlist_h.json_data["playlist_subscribed"] = subscribed
|
||||||
|
playlist_h.upload_to_es()
|
||||||
|
playlist_h.add_vids_to_playlist()
|
||||||
|
self.channel_validate(playlist_h.json_data["playlist_channel_id"])
|
||||||
|
thumb = playlist_h.json_data["playlist_thumbnail"]
|
||||||
|
new_thumbs.append((playlist_id, thumb))
|
||||||
|
# notify
|
||||||
|
message = {
|
||||||
|
"status": "message:subplaylist",
|
||||||
|
"level": "info",
|
||||||
|
"title": "Subscribing to Playlists",
|
||||||
|
"message": f"Processing {idx + 1} of {len(new_playlists)}",
|
||||||
|
}
|
||||||
|
RedisArchivist().set_message(
|
||||||
|
"message:subplaylist", message=message
|
||||||
|
)
|
||||||
|
|
||||||
|
return new_thumbs
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def channel_validate(channel_id):
|
||||||
|
"""make sure channel of playlist is there"""
|
||||||
|
channel = YoutubeChannel(channel_id)
|
||||||
|
channel.build_json()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def change_subscribe(playlist_id, subscribe_status):
|
||||||
|
"""change the subscribe status of a playlist"""
|
||||||
|
playlist = YoutubePlaylist(playlist_id)
|
||||||
|
playlist.build_json()
|
||||||
|
playlist.json_data["playlist_subscribed"] = subscribe_status
|
||||||
|
playlist.upload_to_es()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_to_ignore():
|
||||||
|
"""get all youtube_ids already downloaded or ignored"""
|
||||||
|
pending_handler = queue.PendingList()
|
||||||
|
all_pending, all_ignore = pending_handler.get_all_pending()
|
||||||
|
all_ids = [i["youtube_id"] for i in all_ignore + all_pending]
|
||||||
|
all_downloaded = pending_handler.get_all_downloaded()
|
||||||
|
to_ignore = all_ids + all_downloaded
|
||||||
|
return to_ignore
|
||||||
|
|
||||||
|
def find_missing(self):
|
||||||
|
"""find videos in subscribed playlists not downloaded yet"""
|
||||||
|
all_playlists = [i["playlist_id"] for i in self.get_playlists()]
|
||||||
|
to_ignore = self.get_to_ignore()
|
||||||
|
|
||||||
|
missing_videos = []
|
||||||
|
for idx, playlist_id in enumerate(all_playlists):
|
||||||
|
size_limit = self.config["subscriptions"]["channel_size"]
|
||||||
|
playlist = YoutubePlaylist(playlist_id)
|
||||||
|
playlist.update_playlist()
|
||||||
|
if not playlist:
|
||||||
|
playlist.deactivate()
|
||||||
|
continue
|
||||||
|
|
||||||
|
playlist_entries = playlist.json_data["playlist_entries"]
|
||||||
|
if size_limit:
|
||||||
|
del playlist_entries[size_limit:]
|
||||||
|
|
||||||
|
all_missing = [i for i in playlist_entries if not i["downloaded"]]
|
||||||
|
|
||||||
|
message = {
|
||||||
|
"status": "message:rescan",
|
||||||
|
"level": "info",
|
||||||
|
"title": "Scanning playlists: Looking for new videos.",
|
||||||
|
"message": f"Progress: {idx + 1}/{len(all_playlists)}",
|
||||||
|
}
|
||||||
|
RedisArchivist().set_message("message:rescan", message=message)
|
||||||
|
|
||||||
|
for video in all_missing:
|
||||||
|
youtube_id = video["youtube_id"]
|
||||||
|
if youtube_id not in to_ignore:
|
||||||
|
missing_videos.append(youtube_id)
|
||||||
|
|
||||||
|
return missing_videos
|
@ -1,16 +1,19 @@
 """
 functionality:
 - handle download and caching for thumbnails
+- check for missing thumbnails
 """

 import os
 from collections import Counter
 from time import sleep

-import home.src.download as download
 import requests
-from home.src.config import AppConfig
-from home.src.helper import RedisArchivist, ignore_filelist
+from home.src.download import queue  # partial import
+from home.src.download import subscriptions  # partial import
+from home.src.ta.config import AppConfig
+from home.src.ta.helper import ignore_filelist
+from home.src.ta.ta_redis import RedisArchivist
 from mutagen.mp4 import MP4, MP4Cover
 from PIL import Image
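The "# partial import" comments mark a circular-import workaround: the module object is imported instead of its classes, so attribute lookup happens at call time. An illustrative sketch (the helper function name is made up):

from home.src.download import queue  # partial import, module object only


def all_indexed_sketch():
    # queue.PendingList is resolved only when this runs, after both
    # modules have finished importing, which avoids the circular import
    return queue.PendingList().get_all_indexed()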
@ -55,8 +58,8 @@ class ThumbManager:
     def get_needed_thumbs(self, missing_only=False):
         """get a list of all missing thumbnails"""
         all_thumbs = self.get_all_thumbs()
-        all_indexed = download.PendingList().get_all_indexed()
-        all_in_queue, all_ignored = download.PendingList().get_all_pending()
+        all_indexed = queue.PendingList().get_all_indexed()
+        all_in_queue, all_ignored = queue.PendingList().get_all_pending()

         needed_thumbs = []
         for video in all_indexed:
@ -84,9 +87,8 @@ class ThumbManager:
         all_channel_art = os.listdir(self.CHANNEL_DIR)
         files = [i[0:24] for i in all_channel_art]
         cached_channel_ids = [k for (k, v) in Counter(files).items() if v > 1]
-        channels = download.ChannelSubscription().get_channels(
-            subscribed_only=False
-        )
+        channel_sub = subscriptions.ChannelSubscription()
+        channels = channel_sub.get_channels(subscribed_only=False)

         missing_channels = []
         for channel in channels:
@ -104,10 +106,8 @@ class ThumbManager:
         """get all missing playlist artwork"""
         all_downloaded = ignore_filelist(os.listdir(self.PLAYLIST_DIR))
         all_ids_downloaded = [i.replace(".jpg", "") for i in all_downloaded]
-        playlists = download.PlaylistSubscription().get_playlists(
-            subscribed_only=False
-        )
+        playlist_sub = subscriptions.PlaylistSubscription()
+        playlists = playlist_sub.get_playlists(subscribed_only=False)

         missing_playlists = []
         for playlist in playlists:
@ -276,7 +276,7 @@ class ThumbManager:

     def get_thumb_list(self):
         """get list of mediafiles and matching thumbnails"""
-        all_indexed = download.PendingList().get_all_indexed()
+        all_indexed = queue.PendingList().get_all_indexed()
         video_list = []
         for video in all_indexed:
             youtube_id = video["youtube_id"]
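A short usage sketch of the thumbnail checks touched above (hypothetical call, not taken from the diff):

from home.src.download.thumbnails import ThumbManager

handler = ThumbManager()
# videos whose thumbnails are not cached on disk yet
missing_thumbs = handler.get_needed_thumbs(missing_only=True)
print(f"{len(missing_thumbs)} video thumbnails to fetch")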
313
tubearchivist/home/src/download/yt_dlp_handler.py
Normal file
@ -0,0 +1,313 @@
"""
|
||||||
|
functionality:
|
||||||
|
- handle yt_dlp
|
||||||
|
- build options and post processor
|
||||||
|
- download video files
|
||||||
|
- move to archive
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
from datetime import datetime
|
||||||
|
from time import sleep
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import yt_dlp
|
||||||
|
from home.src.download.queue import PendingList
|
||||||
|
from home.src.download.subscriptions import PlaylistSubscription
|
||||||
|
from home.src.es.connect import IndexPaginate
|
||||||
|
from home.src.index.channel import YoutubeChannel
|
||||||
|
from home.src.index.playlist import YoutubePlaylist
|
||||||
|
from home.src.index.video import YoutubeVideo, index_new_video
|
||||||
|
from home.src.ta.config import AppConfig
|
||||||
|
from home.src.ta.helper import clean_string, ignore_filelist
|
||||||
|
from home.src.ta.ta_redis import RedisArchivist, RedisQueue
|
||||||
|
|
||||||
|
|
||||||
|
class VideoDownloader:
|
||||||
|
"""
|
||||||
|
handle the video download functionality
|
||||||
|
if not initiated with list, take from queue
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, youtube_id_list=False):
|
||||||
|
self.obs = False
|
||||||
|
self.youtube_id_list = youtube_id_list
|
||||||
|
self.config = AppConfig().config
|
||||||
|
self._build_obs()
|
||||||
|
self.channels = set()
|
||||||
|
|
||||||
|
def run_queue(self):
|
||||||
|
"""setup download queue in redis loop until no more items"""
|
||||||
|
queue = RedisQueue("dl_queue")
|
||||||
|
|
||||||
|
limit_queue = self.config["downloads"]["limit_count"]
|
||||||
|
if limit_queue:
|
||||||
|
queue.trim(limit_queue - 1)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
youtube_id = queue.get_next()
|
||||||
|
if not youtube_id:
|
||||||
|
break
|
||||||
|
|
||||||
|
try:
|
||||||
|
self._dl_single_vid(youtube_id)
|
||||||
|
except yt_dlp.utils.DownloadError:
|
||||||
|
print("failed to download " + youtube_id)
|
||||||
|
continue
|
||||||
|
vid_dict = index_new_video(youtube_id)
|
||||||
|
self.channels.add(vid_dict["channel"]["channel_id"])
|
||||||
|
self.move_to_archive(vid_dict)
|
||||||
|
self._delete_from_pending(youtube_id)
|
||||||
|
|
||||||
|
autodelete_days = self.config["downloads"]["autodelete_days"]
|
||||||
|
if autodelete_days:
|
||||||
|
print(f"auto delete older than {autodelete_days} days")
|
||||||
|
self.auto_delete_watched(autodelete_days)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def add_pending():
|
||||||
|
"""add pending videos to download queue"""
|
||||||
|
mess_dict = {
|
||||||
|
"status": "message:download",
|
||||||
|
"level": "info",
|
||||||
|
"title": "Looking for videos to download",
|
||||||
|
"message": "Scanning your download queue.",
|
||||||
|
}
|
||||||
|
RedisArchivist().set_message("message:download", mess_dict)
|
||||||
|
all_pending, _ = PendingList().get_all_pending()
|
||||||
|
to_add = [i["youtube_id"] for i in all_pending]
|
||||||
|
if not to_add:
|
||||||
|
# there is nothing pending
|
||||||
|
print("download queue is empty")
|
||||||
|
mess_dict = {
|
||||||
|
"status": "message:download",
|
||||||
|
"level": "error",
|
||||||
|
"title": "Download queue is empty",
|
||||||
|
"message": "Add some videos to the queue first.",
|
||||||
|
}
|
||||||
|
RedisArchivist().set_message("message:download", mess_dict)
|
||||||
|
return
|
||||||
|
|
||||||
|
queue = RedisQueue("dl_queue")
|
||||||
|
queue.add_list(to_add)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _progress_hook(response):
|
||||||
|
"""process the progress_hooks from yt_dlp"""
|
||||||
|
# title
|
||||||
|
path = os.path.split(response["filename"])[-1][12:]
|
||||||
|
filename = os.path.splitext(os.path.splitext(path)[0])[0]
|
||||||
|
filename_clean = filename.replace("_", " ")
|
||||||
|
title = "Downloading: " + filename_clean
|
||||||
|
# message
|
||||||
|
try:
|
||||||
|
percent = response["_percent_str"]
|
||||||
|
size = response["_total_bytes_str"]
|
||||||
|
speed = response["_speed_str"]
|
||||||
|
eta = response["_eta_str"]
|
||||||
|
message = f"{percent} of {size} at {speed} - time left: {eta}"
|
||||||
|
except KeyError:
|
||||||
|
message = "processing"
|
||||||
|
mess_dict = {
|
||||||
|
"status": "message:download",
|
||||||
|
"level": "info",
|
||||||
|
"title": title,
|
||||||
|
"message": message,
|
||||||
|
}
|
||||||
|
RedisArchivist().set_message("message:download", mess_dict)
|
||||||
|
|
||||||
|
def _build_obs(self):
|
||||||
|
"""collection to build all obs passed to yt-dlp"""
|
||||||
|
self._build_obs_basic()
|
||||||
|
self._build_obs_user()
|
||||||
|
self._build_obs_postprocessors()
|
||||||
|
|
||||||
|
def _build_obs_basic(self):
|
||||||
|
"""initial obs"""
|
||||||
|
self.obs = {
|
||||||
|
"default_search": "ytsearch",
|
||||||
|
"merge_output_format": "mp4",
|
||||||
|
"restrictfilenames": True,
|
||||||
|
"outtmpl": (
|
||||||
|
self.config["application"]["cache_dir"]
|
||||||
|
+ "/download/"
|
||||||
|
+ self.config["application"]["file_template"]
|
||||||
|
),
|
||||||
|
"progress_hooks": [self._progress_hook],
|
||||||
|
"noprogress": True,
|
||||||
|
"quiet": True,
|
||||||
|
"continuedl": True,
|
||||||
|
"retries": 3,
|
||||||
|
"writethumbnail": False,
|
||||||
|
"noplaylist": True,
|
||||||
|
"check_formats": "selected",
|
||||||
|
}
|
||||||
|
|
||||||
|
def _build_obs_user(self):
|
||||||
|
"""build user customized options"""
|
||||||
|
if self.config["downloads"]["format"]:
|
||||||
|
self.obs["format"] = self.config["downloads"]["format"]
|
||||||
|
if self.config["downloads"]["limit_speed"]:
|
||||||
|
self.obs["ratelimit"] = (
|
||||||
|
self.config["downloads"]["limit_speed"] * 1024
|
||||||
|
)
|
||||||
|
|
||||||
|
throttle = self.config["downloads"]["throttledratelimit"]
|
||||||
|
if throttle:
|
||||||
|
self.obs["throttledratelimit"] = throttle * 1024
|
||||||
|
|
||||||
|
def _build_obs_postprocessors(self):
|
||||||
|
"""add postprocessor to obs"""
|
||||||
|
postprocessors = []
|
||||||
|
|
||||||
|
if self.config["downloads"]["add_metadata"]:
|
||||||
|
postprocessors.append(
|
||||||
|
{
|
||||||
|
"key": "FFmpegMetadata",
|
||||||
|
"add_chapters": True,
|
||||||
|
"add_metadata": True,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.config["downloads"]["add_thumbnail"]:
|
||||||
|
postprocessors.append(
|
||||||
|
{
|
||||||
|
"key": "EmbedThumbnail",
|
||||||
|
"already_have_thumbnail": True,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
self.obs["writethumbnail"] = True
|
||||||
|
|
||||||
|
self.obs["postprocessors"] = postprocessors
|
||||||
|
|
||||||
|
def _dl_single_vid(self, youtube_id):
|
||||||
|
"""download single video"""
|
||||||
|
dl_cache = self.config["application"]["cache_dir"] + "/download/"
|
||||||
|
|
||||||
|
# check if already in cache to continue from there
|
||||||
|
all_cached = ignore_filelist(os.listdir(dl_cache))
|
||||||
|
for file_name in all_cached:
|
||||||
|
if youtube_id in file_name:
|
||||||
|
self.obs["outtmpl"] = os.path.join(dl_cache, file_name)
|
||||||
|
with yt_dlp.YoutubeDL(self.obs) as ydl:
|
||||||
|
try:
|
||||||
|
ydl.download([youtube_id])
|
||||||
|
except yt_dlp.utils.DownloadError:
|
||||||
|
print("retry failed download: " + youtube_id)
|
||||||
|
sleep(10)
|
||||||
|
ydl.download([youtube_id])
|
||||||
|
|
||||||
|
if self.obs["writethumbnail"]:
|
||||||
|
# webp files don't get cleaned up automatically
|
||||||
|
all_cached = ignore_filelist(os.listdir(dl_cache))
|
||||||
|
to_clean = [i for i in all_cached if not i.endswith(".mp4")]
|
||||||
|
for file_name in to_clean:
|
||||||
|
file_path = os.path.join(dl_cache, file_name)
|
||||||
|
os.remove(file_path)
|
||||||
|
|
||||||
|
def move_to_archive(self, vid_dict):
|
||||||
|
"""move downloaded video from cache to archive"""
|
||||||
|
videos = self.config["application"]["videos"]
|
||||||
|
host_uid = self.config["application"]["HOST_UID"]
|
||||||
|
host_gid = self.config["application"]["HOST_GID"]
|
||||||
|
channel_name = clean_string(vid_dict["channel"]["channel_name"])
|
||||||
|
# make archive folder with correct permissions
|
||||||
|
new_folder = os.path.join(videos, channel_name)
|
||||||
|
if not os.path.exists(new_folder):
|
||||||
|
os.makedirs(new_folder)
|
||||||
|
if host_uid and host_gid:
|
||||||
|
os.chown(new_folder, host_uid, host_gid)
|
||||||
|
# find real filename
|
||||||
|
cache_dir = self.config["application"]["cache_dir"]
|
||||||
|
all_cached = ignore_filelist(os.listdir(cache_dir + "/download/"))
|
||||||
|
for file_str in all_cached:
|
||||||
|
if vid_dict["youtube_id"] in file_str:
|
||||||
|
old_file = file_str
|
||||||
|
old_file_path = os.path.join(cache_dir, "download", old_file)
|
||||||
|
new_file_path = os.path.join(videos, vid_dict["media_url"])
|
||||||
|
# move media file and fix permission
|
||||||
|
shutil.move(old_file_path, new_file_path)
|
||||||
|
if host_uid and host_gid:
|
||||||
|
os.chown(new_file_path, host_uid, host_gid)
|
||||||
|
|
||||||
|
def _delete_from_pending(self, youtube_id):
|
||||||
|
"""delete downloaded video from pending index if its there"""
|
||||||
|
es_url = self.config["application"]["es_url"]
|
||||||
|
es_auth = self.config["application"]["es_auth"]
|
||||||
|
url = f"{es_url}/ta_download/_doc/{youtube_id}"
|
||||||
|
response = requests.delete(url, auth=es_auth)
|
||||||
|
if not response.ok and not response.status_code == 404:
|
||||||
|
print(response.text)
|
||||||
|
|
||||||
|
def _add_subscribed_channels(self):
|
||||||
|
"""add all channels subscribed to refresh"""
|
||||||
|
all_subscribed = PlaylistSubscription().get_playlists()
|
||||||
|
if not all_subscribed:
|
||||||
|
return
|
||||||
|
|
||||||
|
channel_ids = [i["playlist_channel_id"] for i in all_subscribed]
|
||||||
|
for channel_id in channel_ids:
|
||||||
|
self.channels.add(channel_id)
|
||||||
|
|
||||||
|
return
|
||||||
|
|
||||||
|
def validate_playlists(self):
|
||||||
|
"""look for playlist needing to update"""
|
||||||
|
print("sync playlists")
|
||||||
|
self._add_subscribed_channels()
|
||||||
|
all_indexed = PendingList().get_all_indexed()
|
||||||
|
all_youtube_ids = [i["youtube_id"] for i in all_indexed]
|
||||||
|
for id_c, channel_id in enumerate(self.channels):
|
||||||
|
playlists = YoutubeChannel(channel_id).get_indexed_playlists()
|
||||||
|
all_playlist_ids = [i["playlist_id"] for i in playlists]
|
||||||
|
for id_p, playlist_id in enumerate(all_playlist_ids):
|
||||||
|
playlist = YoutubePlaylist(playlist_id)
|
||||||
|
playlist.all_youtube_ids = all_youtube_ids
|
||||||
|
playlist.build_json(scrape=True)
|
||||||
|
if not playlist.json_data:
|
||||||
|
playlist.deactivate()
|
||||||
|
|
||||||
|
playlist.add_vids_to_playlist()
|
||||||
|
playlist.upload_to_es()
|
||||||
|
|
||||||
|
# notify
|
||||||
|
title = (
|
||||||
|
"Processing playlists for channels: "
|
||||||
|
+ f"{id_c + 1}/{len(self.channels)}"
|
||||||
|
)
|
||||||
|
message = f"Progress: {id_p + 1}/{len(all_playlist_ids)}"
|
||||||
|
mess_dict = {
|
||||||
|
"status": "message:download",
|
||||||
|
"level": "info",
|
||||||
|
"title": title,
|
||||||
|
"message": message,
|
||||||
|
}
|
||||||
|
if id_p + 1 == len(all_playlist_ids):
|
||||||
|
RedisArchivist().set_message(
|
||||||
|
"message:download", mess_dict, expire=4
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
RedisArchivist().set_message("message:download", mess_dict)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def auto_delete_watched(autodelete_days):
|
||||||
|
"""delete watched videos after x days"""
|
||||||
|
now = int(datetime.now().strftime("%s"))
|
||||||
|
now_lte = now - autodelete_days * 24 * 60 * 60
|
||||||
|
data = {
|
||||||
|
"query": {"range": {"player.watched_date": {"lte": now_lte}}},
|
||||||
|
"sort": [{"player.watched_date": {"order": "asc"}}],
|
||||||
|
}
|
||||||
|
all_to_delete = IndexPaginate("ta_video", data).get_results()
|
||||||
|
all_youtube_ids = [i["youtube_id"] for i in all_to_delete]
|
||||||
|
if not all_youtube_ids:
|
||||||
|
return
|
||||||
|
|
||||||
|
for youtube_id in all_youtube_ids:
|
||||||
|
print(f"autodelete {youtube_id}")
|
||||||
|
YoutubeVideo(youtube_id).delete_media_file()
|
||||||
|
|
||||||
|
print("add deleted to ignore list")
|
||||||
|
pending_handler = PendingList()
|
||||||
|
pending_handler.add_to_pending(all_youtube_ids, ignore=True)
|
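A compact sketch of the expected call sequence for the downloader above (hypothetical driver; the real entry point is the Celery task layer, which is not part of this hunk):

from home.src.download.yt_dlp_handler import VideoDownloader

downloader = VideoDownloader()
downloader.add_pending()         # move pending videos into the "dl_queue" redis queue
downloader.run_queue()           # download, index, and archive until the queue is empty
downloader.validate_playlists()  # refresh playlists for the channels that were touched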
0
tubearchivist/home/src/es/__init__.py
Normal file
148
tubearchivist/home/src/es/connect.py
Normal file
@ -0,0 +1,148 @@
"""
|
||||||
|
functionality:
|
||||||
|
- wrapper around requests to call elastic search
|
||||||
|
- reusable search_after to extract total index
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from home.src.ta.config import AppConfig
|
||||||
|
|
||||||
|
|
||||||
|
class ElasticWrap:
|
||||||
|
"""makes all calls to elastic search
|
||||||
|
returns response json and status code tuple
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, path, config=False):
|
||||||
|
self.url = False
|
||||||
|
self.auth = False
|
||||||
|
self.path = path
|
||||||
|
self.config = config
|
||||||
|
self._get_config()
|
||||||
|
|
||||||
|
def _get_config(self):
|
||||||
|
"""add config if not passed"""
|
||||||
|
if not self.config:
|
||||||
|
self.config = AppConfig().config
|
||||||
|
|
||||||
|
es_url = self.config["application"]["es_url"]
|
||||||
|
self.auth = self.config["application"]["es_auth"]
|
||||||
|
self.url = f"{es_url}/{self.path}"
|
||||||
|
|
||||||
|
def get(self, data=False):
|
||||||
|
"""get data from es"""
|
||||||
|
if data:
|
||||||
|
response = requests.get(self.url, json=data, auth=self.auth)
|
||||||
|
else:
|
||||||
|
response = requests.get(self.url, auth=self.auth)
|
||||||
|
if not response.ok:
|
||||||
|
print(response.text)
|
||||||
|
|
||||||
|
return response.json(), response.status_code
|
||||||
|
|
||||||
|
def post(self, data=False, ndjson=False):
|
||||||
|
"""post data to es"""
|
||||||
|
if ndjson:
|
||||||
|
headers = {"Content-type": "application/x-ndjson"}
|
||||||
|
payload = data
|
||||||
|
else:
|
||||||
|
headers = {"Content-type": "application/json"}
|
||||||
|
payload = json.dumps(data)
|
||||||
|
|
||||||
|
if data:
|
||||||
|
response = requests.post(
|
||||||
|
self.url, data=payload, headers=headers, auth=self.auth
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
response = requests.post(self.url, headers=headers, auth=self.auth)
|
||||||
|
|
||||||
|
if not response.ok:
|
||||||
|
print(response.text)
|
||||||
|
|
||||||
|
return response.json(), response.status_code
|
||||||
|
|
||||||
|
def put(self, data, refresh=False):
|
||||||
|
"""put data to es"""
|
||||||
|
if refresh:
|
||||||
|
self.url = f"{self.url}/?refresh=true"
|
||||||
|
response = requests.put(f"{self.url}", json=data, auth=self.auth)
|
||||||
|
if not response.ok:
|
||||||
|
print(response.text)
|
||||||
|
print(data)
|
||||||
|
raise ValueError("failed to add item to index")
|
||||||
|
|
||||||
|
return response.json(), response.status_code
|
||||||
|
|
||||||
|
def delete(self, data=False):
|
||||||
|
"""delete document from es"""
|
||||||
|
if data:
|
||||||
|
response = requests.delete(self.url, json=data, auth=self.auth)
|
||||||
|
else:
|
||||||
|
response = requests.delete(self.url, auth=self.auth)
|
||||||
|
|
||||||
|
if not response.ok:
|
||||||
|
print(response.text)
|
||||||
|
|
||||||
|
return response.json(), response.status_code
|
||||||
|
|
||||||
|
|
||||||
|
class IndexPaginate:
|
||||||
|
"""use search_after to go through whole index"""
|
||||||
|
|
||||||
|
DEFAULT_SIZE = 500
|
||||||
|
|
||||||
|
def __init__(self, index_name, data, size=False):
|
||||||
|
self.index_name = index_name
|
||||||
|
self.data = data
|
||||||
|
self.pit_id = False
|
||||||
|
self.size = size
|
||||||
|
|
||||||
|
def get_results(self):
|
||||||
|
"""get all results"""
|
||||||
|
self.get_pit()
|
||||||
|
self.validate_data()
|
||||||
|
all_results = self.run_loop()
|
||||||
|
self.clean_pit()
|
||||||
|
return all_results
|
||||||
|
|
||||||
|
def get_pit(self):
|
||||||
|
"""get pit for index"""
|
||||||
|
path = f"{self.index_name}/_pit?keep_alive=10m"
|
||||||
|
response, _ = ElasticWrap(path).post()
|
||||||
|
self.pit_id = response["id"]
|
||||||
|
|
||||||
|
def validate_data(self):
|
||||||
|
"""add pit and size to data"""
|
||||||
|
if "sort" not in self.data.keys():
|
||||||
|
print(self.data)
|
||||||
|
raise ValueError("missing sort key in data")
|
||||||
|
|
||||||
|
size = self.size or self.DEFAULT_SIZE
|
||||||
|
|
||||||
|
self.data["size"] = size
|
||||||
|
self.data["pit"] = {"id": self.pit_id, "keep_alive": "10m"}
|
||||||
|
|
||||||
|
def run_loop(self):
|
||||||
|
"""loop through results until last hit"""
|
||||||
|
all_results = []
|
||||||
|
while True:
|
||||||
|
response, _ = ElasticWrap("_search").get(data=self.data)
|
||||||
|
all_hits = response["hits"]["hits"]
|
||||||
|
if all_hits:
|
||||||
|
for hit in all_hits:
|
||||||
|
source = hit["_source"]
|
||||||
|
search_after = hit["sort"]
|
||||||
|
all_results.append(source)
|
||||||
|
# update search_after with last hit data
|
||||||
|
self.data["search_after"] = search_after
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
|
return all_results
|
||||||
|
|
||||||
|
def clean_pit(self):
|
||||||
|
"""delete pit from elastic search"""
|
||||||
|
data = {"id": self.pit_id}
|
||||||
|
ElasticWrap("_pit").delete(data=data)
|
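A minimal sketch of how the new wrapper is meant to be called (the query body and document id are illustrative, not taken from the diff; IndexPaginate requires a sort key):

from home.src.es.connect import ElasticWrap, IndexPaginate

# page through every video document, oldest first
data = {
    "query": {"match_all": {}},
    "sort": [{"published": {"order": "asc"}}],
}
all_videos = IndexPaginate("ta_video", data).get_results()

# single document lookup returns a (json, status_code) tuple
response, status_code = ElasticWrap("ta_video/_doc/<youtube_id>").get()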
274
tubearchivist/home/src/es/index_mapping.json
Normal file
@ -0,0 +1,274 @@
{
|
||||||
|
"index_config": [{
|
||||||
|
"index_name": "channel",
|
||||||
|
"expected_map": {
|
||||||
|
"channel_id": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"channel_name": {
|
||||||
|
"type": "text",
|
||||||
|
"analyzer": "english",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256,
|
||||||
|
"normalizer": "to_lower"
|
||||||
|
},
|
||||||
|
"search_as_you_type": {
|
||||||
|
"type": "search_as_you_type",
|
||||||
|
"doc_values": false,
|
||||||
|
"max_shingle_size": 3
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"channel_banner_url": {
|
||||||
|
"type": "keyword",
|
||||||
|
"index": false
|
||||||
|
},
|
||||||
|
"channel_thumb_url": {
|
||||||
|
"type": "keyword",
|
||||||
|
"index": false
|
||||||
|
},
|
||||||
|
"channel_description": {
|
||||||
|
"type": "text"
|
||||||
|
},
|
||||||
|
"channel_last_refresh": {
|
||||||
|
"type": "date",
|
||||||
|
"format": "epoch_second"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"expected_set": {
|
||||||
|
"analysis": {
|
||||||
|
"normalizer": {
|
||||||
|
"to_lower": {
|
||||||
|
"type": "custom",
|
||||||
|
"filter": ["lowercase"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"number_of_replicas": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"index_name": "video",
|
||||||
|
"expected_map": {
|
||||||
|
"vid_thumb_url": {
|
||||||
|
"type": "text",
|
||||||
|
"index": false
|
||||||
|
},
|
||||||
|
"date_downloaded": {
|
||||||
|
"type": "date"
|
||||||
|
},
|
||||||
|
"channel": {
|
||||||
|
"properties": {
|
||||||
|
"channel_id": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"channel_name": {
|
||||||
|
"type": "text",
|
||||||
|
"analyzer": "english",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256,
|
||||||
|
"normalizer": "to_lower"
|
||||||
|
},
|
||||||
|
"search_as_you_type": {
|
||||||
|
"type": "search_as_you_type",
|
||||||
|
"doc_values": false,
|
||||||
|
"max_shingle_size": 3
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"channel_banner_url": {
|
||||||
|
"type": "keyword",
|
||||||
|
"index": false
|
||||||
|
},
|
||||||
|
"channel_thumb_url": {
|
||||||
|
"type": "keyword",
|
||||||
|
"index": false
|
||||||
|
},
|
||||||
|
"channel_description": {
|
||||||
|
"type": "text"
|
||||||
|
},
|
||||||
|
"channel_last_refresh": {
|
||||||
|
"type": "date",
|
||||||
|
"format": "epoch_second"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"description": {
|
||||||
|
"type": "text"
|
||||||
|
},
|
||||||
|
"media_url": {
|
||||||
|
"type": "keyword",
|
||||||
|
"index": false
|
||||||
|
},
|
||||||
|
"tags": {
|
||||||
|
"type": "text",
|
||||||
|
"analyzer": "english",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"title": {
|
||||||
|
"type": "text",
|
||||||
|
"analyzer": "english",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256,
|
||||||
|
"normalizer": "to_lower"
|
||||||
|
},
|
||||||
|
"search_as_you_type": {
|
||||||
|
"type": "search_as_you_type",
|
||||||
|
"doc_values": false,
|
||||||
|
"max_shingle_size": 3
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"vid_last_refresh": {
|
||||||
|
"type": "date"
|
||||||
|
},
|
||||||
|
"youtube_id": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"published": {
|
||||||
|
"type": "date"
|
||||||
|
},
|
||||||
|
"playlist": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256,
|
||||||
|
"normalizer": "to_lower"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"expected_set": {
|
||||||
|
"analysis": {
|
||||||
|
"normalizer": {
|
||||||
|
"to_lower": {
|
||||||
|
"type": "custom",
|
||||||
|
"filter": ["lowercase"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"number_of_replicas": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"index_name": "download",
|
||||||
|
"expected_map": {
|
||||||
|
"timestamp": {
|
||||||
|
"type": "date"
|
||||||
|
},
|
||||||
|
"channel_id": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"channel_name": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256,
|
||||||
|
"normalizer": "to_lower"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"status": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"title": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256,
|
||||||
|
"normalizer": "to_lower"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"vid_thumb_url": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"youtube_id": {
|
||||||
|
"type": "keyword"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"expected_set": {
|
||||||
|
"analysis": {
|
||||||
|
"normalizer": {
|
||||||
|
"to_lower": {
|
||||||
|
"type": "custom",
|
||||||
|
"filter": ["lowercase"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"number_of_replicas": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"index_name": "playlist",
|
||||||
|
"expected_map": {
|
||||||
|
"playlist_id": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"playlist_description": {
|
||||||
|
"type": "text"
|
||||||
|
},
|
||||||
|
"playlist_name": {
|
||||||
|
"type": "text",
|
||||||
|
"analyzer": "english",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256,
|
||||||
|
"normalizer": "to_lower"
|
||||||
|
},
|
||||||
|
"search_as_you_type": {
|
||||||
|
"type": "search_as_you_type",
|
||||||
|
"doc_values": false,
|
||||||
|
"max_shingle_size": 3
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"playlist_channel": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256,
|
||||||
|
"normalizer": "to_lower"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"playlist_channel_id": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"playlist_thumbnail": {
|
||||||
|
"type": "keyword"
|
||||||
|
},
|
||||||
|
"playlist_last_refresh": {
|
||||||
|
"type": "date"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"expected_set": {
|
||||||
|
"analysis": {
|
||||||
|
"normalizer": {
|
||||||
|
"to_lower": {
|
||||||
|
"type": "custom",
|
||||||
|
"filter": ["lowercase"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"number_of_replicas": "0"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
@ -1,9 +1,8 @@
 """
-Functionality:
-- initial elastic search setup
-- index configuration is represented in INDEX_CONFIG
-- index mapping and settings validation
-- backup and restore
+functionality:
+- setup elastic index at first start
+- verify and update index mapping and settings if needed
+- backup and restore metadata
 """

 import json
@ -12,213 +11,8 @@ import zipfile
 from datetime import datetime

 import requests
-from home.src.config import AppConfig
-from home.src.helper import ignore_filelist
+from home.src.ta.config import AppConfig
+from home.src.ta.helper import ignore_filelist

-# expected mapping and settings
INDEX_CONFIG = [
|
|
||||||
{
|
|
||||||
"index_name": "channel",
|
|
||||||
"expected_map": {
|
|
||||||
"channel_id": {
|
|
||||||
"type": "keyword",
|
|
||||||
},
|
|
||||||
"channel_name": {
|
|
||||||
"type": "text",
|
|
||||||
"analyzer": "english",
|
|
||||||
"fields": {
|
|
||||||
"keyword": {
|
|
||||||
"type": "keyword",
|
|
||||||
"ignore_above": 256,
|
|
||||||
"normalizer": "to_lower",
|
|
||||||
},
|
|
||||||
"search_as_you_type": {
|
|
||||||
"type": "search_as_you_type",
|
|
||||||
"doc_values": False,
|
|
||||||
"max_shingle_size": 3,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"channel_banner_url": {"type": "keyword", "index": False},
|
|
||||||
"channel_thumb_url": {"type": "keyword", "index": False},
|
|
||||||
"channel_description": {"type": "text"},
|
|
||||||
"channel_last_refresh": {"type": "date", "format": "epoch_second"},
|
|
||||||
},
|
|
||||||
"expected_set": {
|
|
||||||
"analysis": {
|
|
||||||
"normalizer": {
|
|
||||||
"to_lower": {"type": "custom", "filter": ["lowercase"]}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"number_of_replicas": "0",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"index_name": "video",
|
|
||||||
"expected_map": {
|
|
||||||
"vid_thumb_url": {"type": "text", "index": False},
|
|
||||||
"date_downloaded": {"type": "date"},
|
|
||||||
"channel": {
|
|
||||||
"properties": {
|
|
||||||
"channel_id": {
|
|
||||||
"type": "keyword",
|
|
||||||
},
|
|
||||||
"channel_name": {
|
|
||||||
"type": "text",
|
|
||||||
"analyzer": "english",
|
|
||||||
"fields": {
|
|
||||||
"keyword": {
|
|
||||||
"type": "keyword",
|
|
||||||
"ignore_above": 256,
|
|
||||||
"normalizer": "to_lower",
|
|
||||||
},
|
|
||||||
"search_as_you_type": {
|
|
||||||
"type": "search_as_you_type",
|
|
||||||
"doc_values": False,
|
|
||||||
"max_shingle_size": 3,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"channel_banner_url": {"type": "keyword", "index": False},
|
|
||||||
"channel_thumb_url": {"type": "keyword", "index": False},
|
|
||||||
"channel_description": {"type": "text"},
|
|
||||||
"channel_last_refresh": {
|
|
||||||
"type": "date",
|
|
||||||
"format": "epoch_second",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"description": {"type": "text"},
|
|
||||||
"media_url": {"type": "keyword", "index": False},
|
|
||||||
"tags": {
|
|
||||||
"type": "text",
|
|
||||||
"analyzer": "english",
|
|
||||||
"fields": {
|
|
||||||
"keyword": {"type": "keyword", "ignore_above": 256}
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"title": {
|
|
||||||
"type": "text",
|
|
||||||
"analyzer": "english",
|
|
||||||
"fields": {
|
|
||||||
"keyword": {
|
|
||||||
"type": "keyword",
|
|
||||||
"ignore_above": 256,
|
|
||||||
"normalizer": "to_lower",
|
|
||||||
},
|
|
||||||
"search_as_you_type": {
|
|
||||||
"type": "search_as_you_type",
|
|
||||||
"doc_values": False,
|
|
||||||
"max_shingle_size": 3,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"vid_last_refresh": {"type": "date"},
|
|
||||||
"youtube_id": {"type": "keyword"},
|
|
||||||
"published": {"type": "date"},
|
|
||||||
"playlist": {
|
|
||||||
"type": "text",
|
|
||||||
"fields": {
|
|
||||||
"keyword": {
|
|
||||||
"type": "keyword",
|
|
||||||
"ignore_above": 256,
|
|
||||||
"normalizer": "to_lower",
|
|
||||||
}
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"expected_set": {
|
|
||||||
"analysis": {
|
|
||||||
"normalizer": {
|
|
||||||
"to_lower": {"type": "custom", "filter": ["lowercase"]}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"number_of_replicas": "0",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"index_name": "download",
|
|
||||||
"expected_map": {
|
|
||||||
"timestamp": {"type": "date"},
|
|
||||||
"channel_id": {"type": "keyword"},
|
|
||||||
"channel_name": {
|
|
||||||
"type": "text",
|
|
||||||
"fields": {
|
|
||||||
"keyword": {
|
|
||||||
"type": "keyword",
|
|
||||||
"ignore_above": 256,
|
|
||||||
"normalizer": "to_lower",
|
|
||||||
}
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"status": {"type": "keyword"},
|
|
||||||
"title": {
|
|
||||||
"type": "text",
|
|
||||||
"fields": {
|
|
||||||
"keyword": {
|
|
||||||
"type": "keyword",
|
|
||||||
"ignore_above": 256,
|
|
||||||
"normalizer": "to_lower",
|
|
||||||
}
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"vid_thumb_url": {"type": "keyword"},
|
|
||||||
"youtube_id": {"type": "keyword"},
|
|
||||||
},
|
|
||||||
"expected_set": {
|
|
||||||
"analysis": {
|
|
||||||
"normalizer": {
|
|
||||||
"to_lower": {"type": "custom", "filter": ["lowercase"]}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"number_of_replicas": "0",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"index_name": "playlist",
|
|
||||||
"expected_map": {
|
|
||||||
"playlist_id": {"type": "keyword"},
|
|
||||||
"playlist_description": {"type": "text"},
|
|
||||||
"playlist_name": {
|
|
||||||
"type": "text",
|
|
||||||
"analyzer": "english",
|
|
||||||
"fields": {
|
|
||||||
"keyword": {
|
|
||||||
"type": "keyword",
|
|
||||||
"ignore_above": 256,
|
|
||||||
"normalizer": "to_lower",
|
|
||||||
},
|
|
||||||
"search_as_you_type": {
|
|
||||||
"type": "search_as_you_type",
|
|
||||||
"doc_values": False,
|
|
||||||
"max_shingle_size": 3,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"playlist_channel": {
|
|
||||||
"type": "text",
|
|
||||||
"fields": {
|
|
||||||
"keyword": {
|
|
||||||
"type": "keyword",
|
|
||||||
"ignore_above": 256,
|
|
||||||
"normalizer": "to_lower",
|
|
||||||
}
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"playlist_channel_id": {"type": "keyword"},
|
|
||||||
"playlist_thumbnail": {"type": "keyword"},
|
|
||||||
"playlist_last_refresh": {"type": "date"},
|
|
||||||
},
|
|
||||||
"expected_set": {
|
|
||||||
"analysis": {
|
|
||||||
"normalizer": {
|
|
||||||
"to_lower": {"type": "custom", "filter": ["lowercase"]}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"number_of_replicas": "0",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class ElasticIndex:
|
class ElasticIndex:
|
||||||
@ -602,48 +396,21 @@ class ElasticBackup:
|
|||||||
os.remove(file_path)
|
os.remove(file_path)
|
||||||
|
|
||||||
|
|
||||||
def get_available_backups():
|
def get_mapping():
|
||||||
"""return dict of available backups for settings view"""
|
"""read index_mapping.json and get expected mapping and settings"""
|
||||||
backup_handler = ElasticBackup(INDEX_CONFIG, reason=False)
|
with open("home/src/es/index_mapping.json", "r", encoding="utf-8") as f:
|
||||||
all_backup_files = backup_handler.get_all_backup_files()
|
index_config = json.load(f).get("index_config")
|
||||||
return all_backup_files
|
|
||||||
|
|
||||||
|
return index_config
|
||||||
def backup_all_indexes(reason):
|
|
||||||
"""backup all es indexes to disk"""
|
|
||||||
backup_handler = ElasticBackup(INDEX_CONFIG, reason)
|
|
||||||
|
|
||||||
for index in backup_handler.index_config:
|
|
||||||
index_name = index["index_name"]
|
|
||||||
if not backup_handler.index_exists(index_name):
|
|
||||||
continue
|
|
||||||
all_results = backup_handler.get_all_documents(index_name)
|
|
||||||
file_content = backup_handler.build_bulk(all_results)
|
|
||||||
backup_handler.write_es_json(file_content, index_name)
|
|
||||||
backup_handler.write_ta_json(all_results, index_name)
|
|
||||||
|
|
||||||
backup_handler.zip_it()
|
|
||||||
|
|
||||||
if reason == "auto":
|
|
||||||
backup_handler.rotate_backup()
|
|
||||||
|
|
||||||
|
|
||||||
def restore_from_backup(filename):
|
|
||||||
"""restore indexes from backup file"""
|
|
||||||
# delete
|
|
||||||
index_check(force_restore=True)
|
|
||||||
# recreate
|
|
||||||
backup_handler = ElasticBackup(INDEX_CONFIG, reason=False)
|
|
||||||
zip_content = backup_handler.unpack_zip_backup(filename)
|
|
||||||
backup_handler.restore_json_files(zip_content)
|
|
||||||
|
|
||||||
|
|
||||||
def index_check(force_restore=False):
|
def index_check(force_restore=False):
|
||||||
"""check if all indexes are created and have correct mapping"""
|
"""check if all indexes are created and have correct mapping"""
|
||||||
|
|
||||||
backed_up = False
|
backed_up = False
|
||||||
|
index_config = get_mapping()
|
||||||
|
|
||||||
for index in INDEX_CONFIG:
|
for index in index_config:
|
||||||
index_name = index["index_name"]
|
index_name = index["index_name"]
|
||||||
expected_map = index["expected_map"]
|
expected_map = index["expected_map"]
|
||||||
expected_set = index["expected_set"]
|
expected_set = index["expected_set"]
|
||||||
@ -675,3 +442,42 @@ def index_check(force_restore=False):
|
|||||||
|
|
||||||
# else all good
|
# else all good
|
||||||
print(f"ta_{index_name} index is created and up to date...")
|
print(f"ta_{index_name} index is created and up to date...")
|
||||||
|
|
||||||
|
|
||||||
|
def get_available_backups():
|
||||||
|
"""return dict of available backups for settings view"""
|
||||||
|
index_config = get_mapping()
|
||||||
|
backup_handler = ElasticBackup(index_config, reason=False)
|
||||||
|
all_backup_files = backup_handler.get_all_backup_files()
|
||||||
|
return all_backup_files
|
||||||
|
|
||||||
|
|
||||||
|
def backup_all_indexes(reason):
|
||||||
|
"""backup all es indexes to disk"""
|
||||||
|
index_config = get_mapping()
|
||||||
|
backup_handler = ElasticBackup(index_config, reason)
|
||||||
|
|
||||||
|
for index in backup_handler.index_config:
|
||||||
|
index_name = index["index_name"]
|
||||||
|
if not backup_handler.index_exists(index_name):
|
||||||
|
continue
|
||||||
|
all_results = backup_handler.get_all_documents(index_name)
|
||||||
|
file_content = backup_handler.build_bulk(all_results)
|
||||||
|
backup_handler.write_es_json(file_content, index_name)
|
||||||
|
backup_handler.write_ta_json(all_results, index_name)
|
||||||
|
|
||||||
|
backup_handler.zip_it()
|
||||||
|
|
||||||
|
if reason == "auto":
|
||||||
|
backup_handler.rotate_backup()
|
||||||
|
|
||||||
|
|
||||||
|
def restore_from_backup(filename):
|
||||||
|
"""restore indexes from backup file"""
|
||||||
|
# delete
|
||||||
|
index_check(force_restore=True)
|
||||||
|
# recreate
|
||||||
|
index_config = get_mapping()
|
||||||
|
backup_handler = ElasticBackup(index_config, reason=False)
|
||||||
|
zip_content = backup_handler.unpack_zip_backup(filename)
|
||||||
|
backup_handler.restore_json_files(zip_content)
|
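The net effect of this file's changes: the hard-coded INDEX_CONFIG is gone and get_mapping() reads the expected mapping from index_mapping.json instead. A rough usage sketch (the import path home.src.es.index_setup is an assumption, the diff does not show the file name):

from home.src.es.index_setup import get_mapping, index_check  # assumed path

index_config = get_mapping()  # parsed from home/src/es/index_mapping.json
print([i["index_name"] for i in index_config])  # channel, video, download, playlist

index_check()  # create or update the ta_* indexes on application start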
0
tubearchivist/home/src/frontend/__init__.py
Normal file
@ -4,19 +4,18 @@ Functionality:
 - called via user input
 """

-from home.src.download import (
+from home.src.download.queue import PendingList
+from home.src.download.subscriptions import (
     ChannelSubscription,
-    PendingList,
     PlaylistSubscription,
 )
-from home.src.helper import RedisArchivist, RedisQueue, UrlListParser
-from home.src.index import (
-    WatchState,
-    YoutubeChannel,
-    YoutubePlaylist,
-    YoutubeVideo,
-)
-from home.src.searching import SearchForm
+from home.src.frontend.searching import SearchForm
+from home.src.frontend.watched import WatchState
+from home.src.index.channel import YoutubeChannel
+from home.src.index.playlist import YoutubePlaylist
+from home.src.index.video import YoutubeVideo
+from home.src.ta.helper import UrlListParser
+from home.src.ta.ta_redis import RedisArchivist, RedisQueue
 from home.tasks import (
     download_pending,
     download_single,
@ -306,7 +305,7 @@ class PostData:
         playlist_dict = self.exec_val
         playlist_id = playlist_dict["playlist-id"]
         playlist_action = playlist_dict["playlist-action"]
-        print(f"delete {playlist_action} from playlist {playlist_id}")
+        print(f"{playlist_id}: delete playlist {playlist_action}")
         if playlist_action == "metadata":
             YoutubePlaylist(playlist_id).delete_metadata()
         elif playlist_action == "all":
@ -6,36 +6,26 @@ Functionality:
 - calculate pagination values
 """

-import math
 import urllib.parse
 from datetime import datetime

-import requests
-from home.src.config import AppConfig
-from home.src.helper import RedisArchivist
-from home.src.thumbnails import ThumbManager
+from home.src.download.thumbnails import ThumbManager
+from home.src.es.connect import ElasticWrap
+from home.src.ta.config import AppConfig


 class SearchHandler:
     """search elastic search"""

-    CONFIG = AppConfig().config
-    CACHE_DIR = CONFIG["application"]["cache_dir"]
-    ES_AUTH = CONFIG["application"]["es_auth"]
-
-    def __init__(self, url, data):
+    def __init__(self, path, config, data=False):
         self.max_hits = None
-        self.url = url
+        self.path = path
+        self.config = config
         self.data = data

     def get_data(self):
         """get the data"""
-        if self.data:
-            response = requests.get(
-                self.url, json=self.data, auth=self.ES_AUTH
-            ).json()
-        else:
-            response = requests.get(self.url, auth=self.ES_AUTH).json()
-
+        response, _ = ElasticWrap(self.path, config=self.config).get(self.data)
         if "hits" in response.keys():
             self.max_hits = response["hits"]["total"]["value"]
@ -153,11 +143,10 @@ class SearchForm:
     """build query from search form data"""

     CONFIG = AppConfig().config
-    ES_URL = CONFIG["application"]["es_url"]

     def multi_search(self, search_query):
         """searching through index"""
-        url = self.ES_URL + "/ta_video,ta_channel,ta_playlist/_search"
+        path = "ta_video,ta_channel,ta_playlist/_search"
         data = {
             "size": 30,
             "query": {
@ -184,7 +173,7 @@ class SearchForm:
                 }
             },
         }
-        look_up = SearchHandler(url, data)
+        look_up = SearchHandler(path, config=self.CONFIG, data=data)
         search_results = look_up.get_data()
         all_results = self.build_results(search_results)

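For reference, a small sketch of the refactored search call path shown above (the query itself is illustrative only):

from home.src.frontend.searching import SearchHandler
from home.src.ta.config import AppConfig

config = AppConfig().config
data = {"size": 30, "query": {"match": {"title": "example"}}}
look_up = SearchHandler(
    "ta_video,ta_channel,ta_playlist/_search", config=config, data=data
)
results = look_up.get_data()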
@ -212,62 +201,3 @@ class SearchForm:
|
|||||||
}
|
}
|
||||||
|
|
||||||
return all_results
|
return all_results
|
||||||
|
|
||||||
|
|
||||||
class Pagination:
|
|
||||||
"""
|
|
||||||
figure out the pagination based on page size and total_hits
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, page_get, user_id, search_get=False):
|
|
||||||
self.user_id = user_id
|
|
||||||
self.page_size = self.get_page_size()
|
|
||||||
self.page_get = page_get
|
|
||||||
self.search_get = search_get
|
|
||||||
self.pagination = self.first_guess()
|
|
||||||
|
|
||||||
def get_page_size(self):
|
|
||||||
"""get default or user modified page_size"""
|
|
||||||
key = f"{self.user_id}:page_size"
|
|
||||||
page_size = RedisArchivist().get_message(key)["status"]
|
|
||||||
if not page_size:
|
|
||||||
config = AppConfig().config
|
|
||||||
page_size = config["archive"]["page_size"]
|
|
||||||
|
|
||||||
return page_size
|
|
||||||
|
|
||||||
def first_guess(self):
|
|
||||||
"""build first guess before api call"""
|
|
||||||
page_get = self.page_get
|
|
||||||
if page_get in [0, 1]:
|
|
||||||
page_from = 0
|
|
||||||
prev_pages = False
|
|
||||||
elif page_get > 1:
|
|
||||||
page_from = (page_get - 1) * self.page_size
|
|
||||||
prev_pages = [
|
|
||||||
i for i in range(page_get - 1, page_get - 6, -1) if i > 1
|
|
||||||
]
|
|
||||||
prev_pages.reverse()
|
|
||||||
pagination = {
|
|
||||||
"page_size": self.page_size,
|
|
||||||
"page_from": page_from,
|
|
||||||
"prev_pages": prev_pages,
|
|
||||||
"current_page": page_get,
|
|
||||||
}
|
|
||||||
if self.search_get:
|
|
||||||
pagination.update({"search_get": self.search_get})
|
|
||||||
return pagination
|
|
||||||
|
|
||||||
def validate(self, total_hits):
|
|
||||||
"""validate pagination with total_hits after making api call"""
|
|
||||||
page_get = self.page_get
|
|
||||||
max_pages = math.ceil(total_hits / self.page_size)
|
|
||||||
if page_get < max_pages and max_pages > 1:
|
|
||||||
self.pagination["last_page"] = max_pages
|
|
||||||
else:
|
|
||||||
self.pagination["last_page"] = False
|
|
||||||
next_pages = [
|
|
||||||
i for i in range(page_get + 1, page_get + 6) if 1 < i < max_pages
|
|
||||||
]
|
|
||||||
|
|
||||||
self.pagination["next_pages"] = next_pages
|
|
128
tubearchivist/home/src/frontend/watched.py
Normal file
@ -0,0 +1,128 @@
|
|||||||
|
"""
|
||||||
|
functionality:
|
||||||
|
- handle watched state for videos, channels and playlists
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from home.src.ta.config import AppConfig
|
||||||
|
from home.src.ta.helper import UrlListParser
|
||||||
|
|
||||||
|
|
||||||
|
class WatchState:
|
||||||
|
"""handle watched checkbox for videos and channels"""
|
||||||
|
|
||||||
|
CONFIG = AppConfig().config
|
||||||
|
ES_URL = CONFIG["application"]["es_url"]
|
||||||
|
ES_AUTH = CONFIG["application"]["es_auth"]
|
||||||
|
HEADERS = {"Content-type": "application/json"}
|
||||||
|
|
||||||
|
def __init__(self, youtube_id):
|
||||||
|
self.youtube_id = youtube_id
|
||||||
|
self.stamp = int(datetime.now().strftime("%s"))
|
||||||
|
|
||||||
|
def mark_as_watched(self):
|
||||||
|
"""update es with new watched value"""
|
||||||
|
url_type = self.dedect_type()
|
||||||
|
if url_type == "video":
|
||||||
|
self.mark_vid_watched()
|
||||||
|
elif url_type == "channel":
|
||||||
|
self.mark_channel_watched()
|
||||||
|
elif url_type == "playlist":
|
||||||
|
self.mark_playlist_watched()
|
||||||
|
|
||||||
|
print(f"marked {self.youtube_id} as watched")
|
||||||
|
|
||||||
|
def mark_as_unwatched(self):
|
||||||
|
"""revert watched state to false"""
|
||||||
|
url_type = self.dedect_type()
|
||||||
|
if url_type == "video":
|
||||||
|
self.mark_vid_watched(revert=True)
|
||||||
|
|
||||||
|
print(f"revert {self.youtube_id} as unwatched")
|
||||||
|
|
||||||
|
def dedect_type(self):
|
||||||
|
"""find youtube id type"""
|
||||||
|
print(self.youtube_id)
|
||||||
|
url_process = UrlListParser(self.youtube_id).process_list()
|
||||||
|
url_type = url_process[0]["type"]
|
||||||
|
return url_type
|
||||||
|
|
||||||
|
def mark_vid_watched(self, revert=False):
|
||||||
|
"""change watched status of single video"""
|
||||||
|
url = self.ES_URL + "/ta_video/_update/" + self.youtube_id
|
||||||
|
data = {
|
||||||
|
"doc": {"player": {"watched": True, "watched_date": self.stamp}}
|
||||||
|
}
|
||||||
|
if revert:
|
||||||
|
data["doc"]["player"]["watched"] = False
|
||||||
|
|
||||||
|
payload = json.dumps(data)
|
||||||
|
request = requests.post(
|
||||||
|
url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH
|
||||||
|
)
|
||||||
|
if not request.ok:
|
||||||
|
print(request.text)
|
||||||
|
raise ValueError("failed to mark video as watched")
|
||||||
|
|
||||||
|
def mark_channel_watched(self):
|
||||||
|
"""change watched status of every video in channel"""
|
||||||
|
data = {
|
||||||
|
"query": {
|
||||||
|
"bool": {
|
||||||
|
"must": [
|
||||||
|
{
|
||||||
|
"term": {
|
||||||
|
"channel.channel_id": {
|
||||||
|
"value": self.youtube_id
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{"term": {"player.watched": {"value": False}}},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"script": {
|
||||||
|
"source": "ctx._source.player['watched'] = true",
|
||||||
|
"lang": "painless",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
payload = json.dumps(data)
|
||||||
|
url = f"{self.ES_URL}/ta_video/_update_by_query"
|
||||||
|
request = requests.post(
|
||||||
|
url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH
|
||||||
|
)
|
||||||
|
if not request.ok:
|
||||||
|
print(request.text)
|
||||||
|
raise ValueError("failed mark channel as watched")
|
||||||
|
|
||||||
|
def mark_playlist_watched(self):
|
||||||
|
"""change watched state of all videos in playlist"""
|
||||||
|
data = {
|
||||||
|
"query": {
|
||||||
|
"bool": {
|
||||||
|
"must": [
|
||||||
|
{
|
||||||
|
"term": {
|
||||||
|
"playlist.keyword": {"value": self.youtube_id}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{"term": {"player.watched": {"value": False}}},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"script": {
|
||||||
|
"source": "ctx._source.player['watched'] = true",
|
||||||
|
"lang": "painless",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
payload = json.dumps(data)
|
||||||
|
url = f"{self.ES_URL}/ta_video/_update_by_query"
|
||||||
|
request = requests.post(
|
||||||
|
url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH
|
||||||
|
)
|
||||||
|
if not request.ok:
|
||||||
|
print(request.text)
|
||||||
|
raise ValueError("failed mark playlist as watched")
|
@ -1,970 +0,0 @@
|
|||||||
"""
|
|
||||||
Functionality:
|
|
||||||
- index new videos into elastisearch
|
|
||||||
- extract video info with yt_dlp
|
|
||||||
- scrape youtube channel page if needed
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
from datetime import datetime
|
|
||||||
from time import sleep
|
|
||||||
|
|
||||||
import requests
|
|
||||||
import yt_dlp
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from home.src.config import AppConfig
|
|
||||||
from home.src.helper import DurationConverter, UrlListParser, clean_string
|
|
||||||
from home.src.thumbnails import ThumbManager
|
|
||||||
from ryd_client import ryd_client
|
|
||||||
|
|
||||||
|
|
||||||
class YoutubeChannel:
|
|
||||||
"""represents a single youtube channel"""
|
|
||||||
|
|
||||||
CONFIG = AppConfig().config
|
|
||||||
ES_URL = CONFIG["application"]["es_url"]
|
|
||||||
ES_AUTH = CONFIG["application"]["es_auth"]
|
|
||||||
CACHE_DIR = CONFIG["application"]["cache_dir"]
|
|
||||||
VIDEOS = CONFIG["application"]["videos"]
|
|
||||||
|
|
||||||
def __init__(self, channel_id):
|
|
||||||
self.channel_id = channel_id
|
|
||||||
self.json_data = None
|
|
||||||
self.source = None
|
|
||||||
self.channel_dict = self.build_channel_dict()
|
|
||||||
|
|
||||||
def build_channel_dict(self, scrape=False):
|
|
||||||
"""combine the dicts build from extracted json payload"""
|
|
||||||
if scrape:
|
|
||||||
channel_dict = False
|
|
||||||
else:
|
|
||||||
channel_dict = self.get_es_channel()
|
|
||||||
if not channel_dict:
|
|
||||||
print("scrape data from youtube")
|
|
||||||
self.scrape_channel()
|
|
||||||
channel_dict = self.parse_channel_main()
|
|
||||||
channel_dict.update(self.parse_channel_meta())
|
|
||||||
self.source = "scraped"
|
|
||||||
return channel_dict
|
|
||||||
|
|
||||||
def get_es_channel(self):
|
|
||||||
"""get from elastic search first if possible"""
|
|
||||||
channel_id = self.channel_id
|
|
||||||
url = f"{self.ES_URL}/ta_channel/_doc/{channel_id}"
|
|
||||||
response = requests.get(url, auth=self.ES_AUTH)
|
|
||||||
if response.ok:
|
|
||||||
channel_source = response.json()["_source"]
|
|
||||||
self.source = "elastic"
|
|
||||||
return channel_source
|
|
||||||
return False
|
|
||||||
|
|
||||||
def scrape_channel(self):
|
|
||||||
"""scrape channel page for additional infos"""
|
|
||||||
channel_id = self.channel_id
|
|
||||||
url = f"https://www.youtube.com/channel/{channel_id}/about?hl=en"
|
|
||||||
cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"}
|
|
||||||
response = requests.get(url, cookies=cookies, auth=self.ES_AUTH)
|
|
||||||
if response.ok:
|
|
||||||
channel_page = response.text
|
|
||||||
else:
|
|
||||||
print(f"failed to extract channel info for: {channel_id}")
|
|
||||||
raise ConnectionError
|
|
||||||
soup = BeautifulSoup(channel_page, "html.parser")
|
|
||||||
# load script into json
|
|
||||||
all_scripts = soup.find("body").find_all("script")
|
|
||||||
for script in all_scripts:
|
|
||||||
if "var ytInitialData = " in str(script):
|
|
||||||
script_content = str(script)
|
|
||||||
break
|
|
||||||
# extract payload
|
|
||||||
script_content = script_content.split("var ytInitialData = ")[1]
|
|
||||||
json_raw = script_content.rstrip(";</script>")
|
|
||||||
json_data = json.loads(json_raw)
|
|
||||||
# add to self
|
|
||||||
self.json_data = json_data
|
|
||||||
|
|
||||||
    def parse_channel_main(self):
        """extract maintab values from scraped channel json data"""
        main_tab = self.json_data["header"]["c4TabbedHeaderRenderer"]
        channel_name = main_tab["title"]
        last_refresh = int(datetime.now().strftime("%s"))
        # channel_subs
        try:
            sub_text_simple = main_tab["subscriberCountText"]["simpleText"]
            sub_text = sub_text_simple.split(" ")[0]
            if sub_text[-1] == "K":
                channel_subs = int(float(sub_text.replace("K", "")) * 1000)
            elif sub_text[-1] == "M":
                channel_subs = int(float(sub_text.replace("M", "")) * 1000000)
            elif int(sub_text) >= 0:
                channel_subs = int(sub_text)
            else:
                message = f"{sub_text} not dealt with"
                print(message)
        except KeyError:
            channel_subs = 0
        # banner
        try:
            all_banners = main_tab["banner"]["thumbnails"]
            banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"]
        except KeyError:
            banner = False
        # build and return dict
        main_channel_dict = {
            "channel_active": True,
            "channel_last_refresh": last_refresh,
            "channel_subs": channel_subs,
            "channel_banner_url": banner,
            "channel_name": channel_name,
            "channel_id": self.channel_id,
        }
        return main_channel_dict

    def parse_channel_meta(self):
        """extract meta tab values from channel payload"""
        # meta tab
        json_data = self.json_data
        meta_tab = json_data["metadata"]["channelMetadataRenderer"]
        description = meta_tab["description"]
        all_thumbs = meta_tab["avatar"]["thumbnails"]
        thumb_url = sorted(all_thumbs, key=lambda k: k["width"])[-1]["url"]
        # stats tab
        renderer = "twoColumnBrowseResultsRenderer"
        all_tabs = json_data["contents"][renderer]["tabs"]
        for tab in all_tabs:
            if "tabRenderer" in tab.keys():
                if tab["tabRenderer"]["title"] == "About":
                    about_tab = tab["tabRenderer"]["content"][
                        "sectionListRenderer"
                    ]["contents"][0]["itemSectionRenderer"]["contents"][0][
                        "channelAboutFullMetadataRenderer"
                    ]
                    break
        try:
            channel_views_text = about_tab["viewCountText"]["simpleText"]
            channel_views = int(re.sub(r"\D", "", channel_views_text))
        except KeyError:
            channel_views = 0

        meta_channel_dict = {
            "channel_description": description,
            "channel_thumb_url": thumb_url,
            "channel_views": channel_views,
        }

        return meta_channel_dict

    def get_channel_art(self):
        """download channel art for new channels"""
        channel_id = self.channel_id
        channel_thumb = self.channel_dict["channel_thumb_url"]
        channel_banner = self.channel_dict["channel_banner_url"]
        ThumbManager().download_chan(
            [(channel_id, channel_thumb, channel_banner)]
        )

    def upload_to_es(self):
        """upload channel data to elastic search"""
        url = f"{self.ES_URL}/ta_channel/_doc/{self.channel_id}"
        response = requests.put(url, json=self.channel_dict, auth=self.ES_AUTH)
        print(f"added {self.channel_id} to es")
        if not response.ok:
            print(response.text)
            raise ValueError("failed to add channel to index")

    def sync_to_videos(self):
        """sync new channel_dict to all videos of channel"""
        headers = {"Content-type": "application/json"}
        channel_id = self.channel_id
        # add ingest pipeline
        processors = []
        for field, value in self.channel_dict.items():
            line = {"set": {"field": "channel." + field, "value": value}}
            processors.append(line)
        data = {"description": channel_id, "processors": processors}
        payload = json.dumps(data)
        url = self.ES_URL + "/_ingest/pipeline/" + channel_id
        request = requests.put(
            url, data=payload, headers=headers, auth=self.ES_AUTH
        )
        if not request.ok:
            print(request.text)
        # apply pipeline
        data = {"query": {"match": {"channel.channel_id": channel_id}}}
        payload = json.dumps(data)
        url = self.ES_URL + "/ta_video/_update_by_query?pipeline=" + channel_id
        request = requests.post(
            url, data=payload, headers=headers, auth=self.ES_AUTH
        )
        if not request.ok:
            print(request.text)

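    # Illustrative sketch, hypothetical values: for a channel_dict with only
    # two fields, the ingest pipeline body assembled in sync_to_videos would
    # look roughly like this before json.dumps:
    # {
    #     "description": "UC_hypothetical_channel_id",
    #     "processors": [
    #         {"set": {"field": "channel.channel_name", "value": "Example"}},
    #         {"set": {"field": "channel.channel_subs", "value": 1000}},
    #     ],
    # }
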
    def get_folder_path(self):
        """get folder where media files get stored"""
        channel_name = self.channel_dict["channel_name"]
        folder_name = clean_string(channel_name)
        folder_path = os.path.join(self.VIDEOS, folder_name)
        return folder_path

    def delete_es_videos(self):
        """delete all channel documents from elasticsearch"""
        headers = {"Content-type": "application/json"}
        data = {
            "query": {
                "term": {"channel.channel_id": {"value": self.channel_id}}
            }
        }
        payload = json.dumps(data)
        url = self.ES_URL + "/ta_video/_delete_by_query"
        response = requests.post(
            url, data=payload, headers=headers, auth=self.ES_AUTH
        )
        if not response.ok:
            print(response.text)

    def delete_playlists(self):
        """delete all indexed playlist from es"""
        all_playlists = self.get_indexed_playlists()
        for playlist in all_playlists:
            playlist_id = playlist["playlist_id"]
            YoutubePlaylist(playlist_id).delete_metadata()

    def delete_channel(self):
        """delete channel and all videos"""
        print(f"deleting {self.channel_id} and all matching media files")
        folder_path = self.get_folder_path()
        print("delete all media files")
        try:
            all_videos = os.listdir(folder_path)
            for video in all_videos:
                video_path = os.path.join(folder_path, video)
                os.remove(video_path)
            os.rmdir(folder_path)
        except FileNotFoundError:
            print(f"no videos found for {folder_path}")

        ThumbManager().delete_chan_thumb(self.channel_id)
        print("delete indexed playlists")
        self.delete_playlists()
        print("delete indexed videos")
        self.delete_es_videos()
        url = self.ES_URL + "/ta_channel/_doc/" + self.channel_id
        response = requests.delete(url, auth=self.ES_AUTH)
        if not response.ok:
            print(response.text)

    def get_all_playlists(self):
        """get all playlists owned by this channel"""
        url = (
            f"https://www.youtube.com/channel/{self.channel_id}"
            + "/playlists?view=1&sort=dd&shelf_id=0"
        )
        obs = {
            "quiet": True,
            "skip_download": True,
            "extract_flat": True,
        }
        playlists = yt_dlp.YoutubeDL(obs).extract_info(url)
        all_entries = [(i["id"], i["title"]) for i in playlists["entries"]]

        return all_entries

    def get_indexed_playlists(self):
        """get all indexed playlists from channel"""
        data = {
            "query": {
                "term": {"playlist_channel_id": {"value": self.channel_id}}
            },
            "sort": [{"playlist_channel.keyword": {"order": "desc"}}],
        }
        all_playlists = IndexPaginate("ta_playlist", data).get_results()
        return all_playlists


class YoutubeVideo:
    """represents a single youtube video"""

    CONFIG = AppConfig().config
    ES_URL = CONFIG["application"]["es_url"]
    ES_AUTH = CONFIG["application"]["es_auth"]
    CACHE_DIR = CONFIG["application"]["cache_dir"]
    VIDEOS = CONFIG["application"]["videos"]

    def __init__(self, youtube_id):
        self.youtube_id = youtube_id
        self.channel_id = None
        self.vid_dict = None

    def get_vid_dict(self):
        """wrapper to loop around yt_dlp to retry on failure"""
        print(f"get video data for {self.youtube_id}")
        vid_dict = False
        for i in range(3):
            try:
                vid_dict = self.get_youtubedl_vid_data()
            except KeyError as e:
                print(e)
                sleep((i + 1) ** 2)
                continue
            else:
                break

        self.vid_dict = vid_dict
        if self.CONFIG["downloads"]["integrate_ryd"]:
            self.get_ryd_stats()

    def get_youtubedl_vid_data(self):
        """parse youtubedl extract info"""
        youtube_id = self.youtube_id
        obs = {
            "quiet": True,
            "default_search": "ytsearch",
            "skip_download": True,
            "check_formats": "selected",
            "noplaylist": True,
        }
        try:
            vid = yt_dlp.YoutubeDL(obs).extract_info(youtube_id)
        except (
            yt_dlp.utils.ExtractorError,
            yt_dlp.utils.DownloadError,
        ):
            print("failed to get info for " + youtube_id)
            return False
        # extract
        self.channel_id = vid["channel_id"]
        upload_date = vid["upload_date"]
        upload_date_time = datetime.strptime(upload_date, "%Y%m%d")
        published = upload_date_time.strftime("%Y-%m-%d")
        last_refresh = int(datetime.now().strftime("%s"))
        # likes
        try:
            like_count = vid["like_count"]
        except KeyError:
            like_count = 0
        try:
            dislike_count = vid["dislike_count"]
        except KeyError:
            dislike_count = 0
        # build dicts
        stats = {
            "view_count": vid["view_count"],
            "like_count": like_count,
            "dislike_count": dislike_count,
            "average_rating": vid["average_rating"],
        }
        vid_basic = {
            "title": vid["title"],
            "description": vid["description"],
            "category": vid["categories"],
            "vid_thumb_url": vid["thumbnail"],
            "tags": vid["tags"],
            "published": published,
            "stats": stats,
            "vid_last_refresh": last_refresh,
            "date_downloaded": last_refresh,
            "youtube_id": youtube_id,
            "active": True,
            "channel": False,
        }

        return vid_basic

    def add_player(self, missing_vid):
        """add player information for new videos"""
        cache_path = self.CACHE_DIR + "/download/"
        videos = self.VIDEOS

        if missing_vid:
            # coming from scan_filesystem
            channel_name, file_name, _ = missing_vid
            vid_path = os.path.join(videos, channel_name, file_name)
        else:
            # coming from VideoDownloader
            all_cached = os.listdir(cache_path)
            for file_cached in all_cached:
                if self.youtube_id in file_cached:
                    vid_path = os.path.join(cache_path, file_cached)
                    break

        duration_handler = DurationConverter()
        duration = duration_handler.get_sec(vid_path)
        duration_str = duration_handler.get_str(duration)
        player = {
            "watched": False,
            "duration": duration,
            "duration_str": duration_str,
        }
        self.vid_dict["player"] = player

    def build_file_path(self, channel_name):
        """build media_url from where file will be located"""
        clean_channel_name = clean_string(channel_name)
        timestamp = self.vid_dict["published"].replace("-", "")
        youtube_id = self.vid_dict["youtube_id"]
        title = self.vid_dict["title"]
        clean_title = clean_string(title)
        filename = f"{timestamp}_{youtube_id}_{clean_title}.mp4"
        media_url = os.path.join(clean_channel_name, filename)
        self.vid_dict["media_url"] = media_url

    def get_es_data(self):
        """get current data from elastic search"""
        url = self.ES_URL + "/ta_video/_doc/" + self.youtube_id
        response = requests.get(url, auth=self.ES_AUTH)
        if not response.ok:
            print(response.text)
        es_vid_dict = json.loads(response.text)
        return es_vid_dict

    def upload_to_es(self):
        """upload video data to elastic search"""
        url = f"{self.ES_URL}/ta_video/_doc/{self.youtube_id}/?refresh=true"
        response = requests.put(url, json=self.vid_dict, auth=self.ES_AUTH)
        if not response.ok:
            print(response.text)
            raise ValueError("failed to add video to index")

    def deactivate(self):
        """deactivate document on extractor error"""
        youtube_id = self.youtube_id
        headers = {"Content-type": "application/json"}
        url = f"{self.ES_URL}/ta_video/_update/{youtube_id}"
        data = {"script": "ctx._source.active = false"}
        json_str = json.dumps(data)
        response = requests.post(
            url, data=json_str, headers=headers, auth=self.ES_AUTH
        )
        print(f"deactivated {youtube_id}")
        if not response.ok:
            print(response.text)

    def delete_media_file(self):
        """delete video file, meta data, thumbnails"""
        # delete media file
        es_vid_dict = self.get_es_data()
        media_url = es_vid_dict["_source"]["media_url"]
        print(f"delete {media_url} from file system")
        to_delete = os.path.join(self.VIDEOS, media_url)
        os.remove(to_delete)
        # delete from index
        url = f"{self.ES_URL}/ta_video/_doc/{self.youtube_id}"
        response = requests.delete(url, auth=self.ES_AUTH)
        if not response.ok:
            print(response.text)
        # delete thumbs from cache
        ThumbManager().delete_vid_thumb(self.youtube_id)

    def get_ryd_stats(self):
        """get optional stats from returnyoutubedislikeapi.com"""
        try:
            print(f"get ryd stats for: {self.youtube_id}")
            result = ryd_client.get(self.youtube_id)
        except requests.exceptions.ConnectionError:
            print(f"failed to query ryd api, skipping {self.youtube_id}")
            return False

        if result["status"] == 404:
            return False

        dislikes = {
            "dislike_count": result["dislikes"],
            "average_rating": result["rating"],
        }
        self.vid_dict["stats"].update(dislikes)

        return True


class YoutubePlaylist:
    """represent a single playlist on YouTube"""

    CONFIG = AppConfig().config
    ES_URL = CONFIG["application"]["es_url"]
    ES_AUTH = CONFIG["application"]["es_auth"]

    def __init__(self, playlist_id, all_youtube_ids=False):
        self.playlist_id = playlist_id
        self.stamp = int(datetime.now().strftime("%s"))
        self.all_youtube_ids = all_youtube_ids
        self.playlist_dict = False

    def get_playlist_dict(self, scrape=False):
        """get data from es or youtube"""
        print(f"get playlist with id {self.playlist_id}")

        if scrape:
            playlist_dict = self.get_youtube_playlist()
            if not playlist_dict:
                return False
            playlist_dict["playlist_entries"] = self.get_entries()
        else:
            playlist_dict = self.get_es_playlist()
            if not playlist_dict:
                playlist_dict = self.get_youtube_playlist()
                playlist_dict["playlist_entries"] = self.get_entries()

        self.playlist_dict = playlist_dict
        return True

    def get_youtube_playlist(self):
        """get meta data dict from youtube"""
        url = "https://www.youtube.com/playlist?list=" + self.playlist_id
        obs = {
            "default_search": "ytsearch",
            "quiet": True,
            "skip_download": True,
            "extract_flat": True,
            "playlistend": 0,
        }
        try:
            playlist = yt_dlp.YoutubeDL(obs).extract_info(url, download=False)
        except (
            yt_dlp.utils.ExtractorError,
            yt_dlp.utils.DownloadError,
        ):
            print("failed to get info for " + self.playlist_id)
            return False

        playlist_es = {
            "playlist_id": self.playlist_id,
            "playlist_active": True,
            "playlist_subscribed": False,
            "playlist_name": playlist["title"],
            "playlist_channel": playlist["channel"],
            "playlist_channel_id": playlist["channel_id"],
            "playlist_thumbnail": playlist["thumbnails"][-1]["url"],
            "playlist_description": playlist["description"] or False,
            "playlist_last_refresh": self.stamp,
        }

        return playlist_es

    def get_es_playlist(self):
        """get indexed data from es"""
        url = f"{self.ES_URL}/ta_playlist/_doc/{self.playlist_id}"
        response = requests.get(url, auth=self.ES_AUTH)
        if response.ok:
            return json.loads(response.text)["_source"]

        return False

    def get_entries(self, playlistend=False):
        """get all videos in playlist"""
        url = "https://www.youtube.com/playlist?list=" + self.playlist_id
        obs = {
            "default_search": "ytsearch",
            "quiet": True,
            "skip_download": True,
            "extract_flat": True,
        }
        if playlistend:
            obs["playlistend"] = playlistend

        try:
            playlist = yt_dlp.YoutubeDL(obs).extract_info(url, download=False)
        except (
            yt_dlp.utils.ExtractorError,
            yt_dlp.utils.DownloadError,
        ):
            print("failed to get playlist entries for " + self.playlist_id)
            return False

        all_members = []
        for idx, entry in enumerate(playlist["entries"]):
            uploader = entry["uploader"]
            youtube_id = entry["id"]
            if self.all_youtube_ids:
                downloaded = youtube_id in self.all_youtube_ids
            else:
                downloaded = False
            if not uploader:
                continue
            to_append = {
                "youtube_id": youtube_id,
                "title": entry["title"],
                "uploader": uploader,
                "idx": idx,
                "downloaded": downloaded,
            }
            all_members.append(to_append)

        return all_members

    def upload_to_es(self):
        """add playlist to es with its entries"""
        playlist = self.playlist_dict
        url = f"{self.ES_URL}/ta_playlist/_doc/{self.playlist_id}"
        response = requests.put(url, json=playlist, auth=self.ES_AUTH)
        if not response.ok:
            print(response.text)
            raise ValueError("failed to add playlist to index")

    def add_vids_to_playlist(self):
        """sync the playlist id to videos"""
        playlist_dict = self.playlist_dict
        script = (
            'if (!ctx._source.containsKey("playlist")) '
            + "{ctx._source.playlist = [params.playlist]} "
            + "else if (!ctx._source.playlist.contains(params.playlist)) "
            + "{ctx._source.playlist.add(params.playlist)} "
            + "else {ctx.op = 'none'}"
        )

        bulk_list = []
        for entry in playlist_dict["playlist_entries"]:
            youtube_id = entry["youtube_id"]
            action = {"update": {"_id": youtube_id, "_index": "ta_video"}}
            source = {
                "script": {
                    "source": script,
                    "lang": "painless",
                    "params": {"playlist": self.playlist_id},
                }
            }
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(source))

        # add last newline
        bulk_list.append("\n")
        query_str = "\n".join(bulk_list)
        headers = {"Content-type": "application/x-ndjson"}
        url = self.ES_URL + "/_bulk"
        response = requests.post(
            url, data=query_str, headers=headers, auth=self.ES_AUTH
        )
        if not response.ok:
            print(response.text)

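    # Illustrative sketch, hypothetical ids: each playlist entry contributes
    # an action/source line pair to the ndjson _bulk body built above, e.g.:
    # {"update": {"_id": "dQw4w9WgXcQ", "_index": "ta_video"}}
    # {"script": {"source": "<painless script above>", "lang": "painless",
    #             "params": {"playlist": "PL_hypothetical_id"}}}
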
    def update_playlist(self):
        """update metadata for playlist with data from YouTube"""
        subscribed = self.get_es_playlist()["playlist_subscribed"]
        self.get_playlist_dict(scrape=True)
        if not self.playlist_dict:
            # return false to deactivate
            return False

        self.playlist_dict["playlist_subscribed"] = subscribed
        self.upload_to_es()
        return self.playlist_dict

    def build_nav(self, youtube_id):
        """find next and previous in playlist of a given youtube_id"""
        all_entries_available = self.playlist_dict["playlist_entries"]
        all_entries = [i for i in all_entries_available if i["downloaded"]]
        current = [i for i in all_entries if i["youtube_id"] == youtube_id]
        # stop if not found or playlist of 1
        if not current or not len(all_entries) > 1:
            return False

        current_idx = all_entries.index(current[0])
        if current_idx == 0:
            previous_item = False
        else:
            previous_item = all_entries[current_idx - 1]
            prev_thumb = ThumbManager().vid_thumb_path(
                previous_item["youtube_id"]
            )
            previous_item["vid_thumb"] = prev_thumb

        if current_idx == len(all_entries) - 1:
            next_item = False
        else:
            next_item = all_entries[current_idx + 1]
            next_thumb = ThumbManager().vid_thumb_path(next_item["youtube_id"])
            next_item["vid_thumb"] = next_thumb

        nav = {
            "playlist_meta": {
                "current_idx": current[0]["idx"],
                "playlist_id": self.playlist_id,
                "playlist_name": self.playlist_dict["playlist_name"],
                "playlist_channel": self.playlist_dict["playlist_channel"],
            },
            "playlist_previous": previous_item,
            "playlist_next": next_item,
        }
        return nav

    def delete_metadata(self):
        """delete metadata for playlist"""
        script = (
            "ctx._source.playlist.removeAll("
            + "Collections.singleton(params.playlist)) "
        )
        data = {
            "query": {
                "term": {"playlist.keyword": {"value": self.playlist_id}}
            },
            "script": {
                "source": script,
                "lang": "painless",
                "params": {"playlist": self.playlist_id},
            },
        }
        payload = json.dumps(data)
        url = f"{self.ES_URL}/ta_video/_update_by_query"
        headers = {"Content-type": "application/json"}
        response = requests.post(
            url, data=payload, headers=headers, auth=self.ES_AUTH
        )
        if not response.ok:
            print(response.text)

        self.delete_playlist()

    def delete_videos_playlist(self):
        """delete playlist with all videos"""
        print(f"delete playlist {self.playlist_id} with all videos")
        self.get_playlist_dict()
        all_youtube_id = [
            i["youtube_id"]
            for i in self.playlist_dict["playlist_entries"]
            if i["downloaded"]
        ]
        for youtube_id in all_youtube_id:
            YoutubeVideo(youtube_id).delete_media_file()

        self.delete_playlist()

    def delete_playlist(self):
        """delete only playlist document"""
        url = f"{self.ES_URL}/ta_playlist/_doc/{self.playlist_id}"
        response = requests.delete(url, auth=self.ES_AUTH)
        if not response.ok:
            print(response.text)

    def deactivate(self):
        """deactivate document on extractor error"""
        headers = {"Content-type": "application/json"}
        url = f"{self.ES_URL}/ta_playlist/_update/{self.playlist_id}"
        data = {"script": "ctx._source.playlist_active = false"}
        json_str = json.dumps(data)
        response = requests.post(
            url, data=json_str, headers=headers, auth=self.ES_AUTH
        )
        print(f"deactivated {self.playlist_id}")
        if not response.ok:
            print(response.text)


class WatchState:
    """handle watched checkbox for videos and channels"""

    CONFIG = AppConfig().config
    ES_URL = CONFIG["application"]["es_url"]
    ES_AUTH = CONFIG["application"]["es_auth"]
    HEADERS = {"Content-type": "application/json"}

    def __init__(self, youtube_id):
        self.youtube_id = youtube_id
        self.stamp = int(datetime.now().strftime("%s"))

    def mark_as_watched(self):
        """update es with new watched value"""
        url_type = self.dedect_type()
        if url_type == "video":
            self.mark_vid_watched()
        elif url_type == "channel":
            self.mark_channel_watched()
        elif url_type == "playlist":
            self.mark_playlist_watched()

        print(f"marked {self.youtube_id} as watched")

    def mark_as_unwatched(self):
        """revert watched state to false"""
        url_type = self.dedect_type()
        if url_type == "video":
            self.mark_vid_watched(revert=True)

        print(f"revert {self.youtube_id} as unwatched")

    def dedect_type(self):
        """find youtube id type"""
        print(self.youtube_id)
        url_process = UrlListParser(self.youtube_id).process_list()
        url_type = url_process[0]["type"]
        return url_type

    def mark_vid_watched(self, revert=False):
        """change watched status of single video"""
        url = self.ES_URL + "/ta_video/_update/" + self.youtube_id
        data = {
            "doc": {"player": {"watched": True, "watched_date": self.stamp}}
        }
        if revert:
            data["doc"]["player"]["watched"] = False

        payload = json.dumps(data)
        request = requests.post(
            url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH
        )
        if not request.ok:
            print(request.text)
            raise ValueError("failed to mark video as watched")

    def mark_channel_watched(self):
        """change watched status of every video in channel"""
        data = {
            "query": {
                "bool": {
                    "must": [
                        {
                            "term": {
                                "channel.channel_id": {
                                    "value": self.youtube_id
                                }
                            }
                        },
                        {"term": {"player.watched": {"value": False}}},
                    ]
                }
            },
            "script": {
                "source": "ctx._source.player['watched'] = true",
                "lang": "painless",
            },
        }
        payload = json.dumps(data)
        url = f"{self.ES_URL}/ta_video/_update_by_query"
        request = requests.post(
            url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH
        )
        if not request.ok:
            print(request.text)
            raise ValueError("failed to mark channel as watched")

    def mark_playlist_watched(self):
        """change watched state of all videos in playlist"""
        data = {
            "query": {
                "bool": {
                    "must": [
                        {
                            "term": {
                                "playlist.keyword": {"value": self.youtube_id}
                            }
                        },
                        {"term": {"player.watched": {"value": False}}},
                    ]
                }
            },
            "script": {
                "source": "ctx._source.player['watched'] = true",
                "lang": "painless",
            },
        }
        payload = json.dumps(data)
        url = f"{self.ES_URL}/ta_video/_update_by_query"
        request = requests.post(
            url, data=payload, headers=self.HEADERS, auth=self.ES_AUTH
        )
        if not request.ok:
            print(request.text)
            raise ValueError("failed to mark playlist as watched")


class IndexPaginate:
    """use search_after to go through whole index"""

    CONFIG = AppConfig().config
    ES_URL = CONFIG["application"]["es_url"]
    ES_AUTH = CONFIG["application"]["es_auth"]
    HEADERS = {"Content-type": "application/json"}
    DEFAULT_SIZE = 500

    def __init__(self, index_name, data, size=False):
        self.index_name = index_name
        self.data = data
        self.pit_id = False
        self.size = size

    def get_results(self):
        """get all results"""
        self.get_pit()
        self.validate_data()
        all_results = self.run_loop()
        self.clean_pit()
        return all_results

    def get_pit(self):
        """get pit for index"""
        url = f"{self.ES_URL}/{self.index_name}/_pit?keep_alive=10m"
        response = requests.post(url, auth=self.ES_AUTH)
        json_data = json.loads(response.text)

        self.pit_id = json_data["id"]

    def validate_data(self):
        """add pit and size to data"""
        if "sort" not in self.data.keys():
            print(self.data)
            raise ValueError("missing sort key in data")

        size = self.size or self.DEFAULT_SIZE

        self.data["size"] = size
        self.data["pit"] = {"id": self.pit_id, "keep_alive": "10m"}

    def run_loop(self):
        """loop through results until last hit"""
        query_str = json.dumps(self.data)
        url = self.ES_URL + "/_search"

        all_results = []
        while True:
            response = requests.get(
                url, data=query_str, headers=self.HEADERS, auth=self.ES_AUTH
            )
            json_data = json.loads(response.text)
            all_hits = json_data["hits"]["hits"]
            if all_hits:
                for hit in all_hits:
                    source = hit["_source"]
                    search_after = hit["sort"]
                    all_results.append(source)
                # update search_after with last hit data
                self.data["search_after"] = search_after
                query_str = json.dumps(self.data)
            else:
                break

        return all_results

    def clean_pit(self):
        """delete pit from elastic search"""
        query_str = json.dumps({"id": self.pit_id})
        requests.delete(
            self.ES_URL + "/_pit",
            data=query_str,
            headers=self.HEADERS,
            auth=self.ES_AUTH,
        )


def index_new_video(youtube_id, missing_vid=False):
    """combine video and channel classes for new video index"""
    vid_handler = YoutubeVideo(youtube_id)
    vid_handler.get_vid_dict()
    if not vid_handler.vid_dict:
        raise ValueError("failed to get metadata for " + youtube_id)

    channel_handler = YoutubeChannel(vid_handler.channel_id)
    # add filepath to vid_dict
    channel_name = channel_handler.channel_dict["channel_name"]
    vid_handler.build_file_path(channel_name)
    # add channel and player to video
    vid_handler.add_player(missing_vid)
    vid_handler.vid_dict["channel"] = channel_handler.channel_dict
    # add new channel to es
    if channel_handler.source == "scraped":
        channel_handler.channel_dict["channel_subscribed"] = False
        channel_handler.upload_to_es()
        channel_handler.get_channel_art()
    # upload video to es
    vid_handler.upload_to_es()
    # return vid_dict for further processing
    return vid_handler.vid_dict
0 tubearchivist/home/src/index/__init__.py Normal file

266 tubearchivist/home/src/index/channel.py Normal file
@ -0,0 +1,266 @@
"""
functionality:
- get metadata from youtube for a channel
- index and update in es
"""

import json
import os
import re
from datetime import datetime

import requests
import yt_dlp
from bs4 import BeautifulSoup
from home.src.download.thumbnails import ThumbManager
from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.index.generic import YouTubeItem
from home.src.index.playlist import YoutubePlaylist
from home.src.ta.helper import clean_string


class ChannelScraper:
    """custom scraper using bs4 to scrape channel about page
    will be able to be integrated into yt-dlp
    once #2237 and #2350 are merged upstream
    """

    def __init__(self, channel_id):
        self.channel_id = channel_id
        self.soup = False
        self.yt_json = False
        self.json_data = False

    def get_json(self):
        """main method to return channel dict"""
        self.get_soup()
        self._extract_yt_json()
        self._parse_channel_main()
        self._parse_channel_meta()
        return self.json_data

    def get_soup(self):
        """return soup from youtube"""
        print(f"{self.channel_id}: scrape channel data from youtube")
        url = f"https://www.youtube.com/channel/{self.channel_id}/about?hl=en"
        cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"}
        response = requests.get(url, cookies=cookies)
        if response.ok:
            channel_page = response.text
        else:
            print(f"{self.channel_id}: failed to extract channel info")
            raise ConnectionError
        self.soup = BeautifulSoup(channel_page, "html.parser")

    def _extract_yt_json(self):
        """parse soup and get ytInitialData json"""
        all_scripts = self.soup.find("body").find_all("script")
        for script in all_scripts:
            if "var ytInitialData = " in str(script):
                script_content = str(script)
                break
        # extract payload
        script_content = script_content.split("var ytInitialData = ")[1]
        json_raw = script_content.rstrip(";</script>")
        self.yt_json = json.loads(json_raw)

    def _parse_channel_main(self):
        """extract maintab values from scraped channel json data"""
        main_tab = self.yt_json["header"]["c4TabbedHeaderRenderer"]
        # build and return dict
        self.json_data = {
            "channel_active": True,
            "channel_last_refresh": int(datetime.now().strftime("%s")),
            "channel_subs": self._get_channel_subs(main_tab),
            "channel_name": main_tab["title"],
            "channel_banner_url": self._get_thumbnails(main_tab, "banner"),
            "channel_tvart_url": self._get_thumbnails(main_tab, "tvBanner"),
            "channel_id": self.channel_id,
            "channel_subscribed": False,
        }

    @staticmethod
    def _get_thumbnails(main_tab, thumb_name):
        """extract banner url from main_tab"""
        try:
            all_banners = main_tab[thumb_name]["thumbnails"]
            banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"]
        except KeyError:
            banner = False

        return banner

    @staticmethod
    def _get_channel_subs(main_tab):
        """process main_tab to get channel subs as int"""
        try:
            sub_text_simple = main_tab["subscriberCountText"]["simpleText"]
            sub_text = sub_text_simple.split(" ")[0]
            if sub_text[-1] == "K":
                channel_subs = int(float(sub_text.replace("K", "")) * 1000)
            elif sub_text[-1] == "M":
                channel_subs = int(float(sub_text.replace("M", "")) * 1000000)
            elif int(sub_text) >= 0:
                channel_subs = int(sub_text)
            else:
                message = f"{sub_text} not dealt with"
                print(message)
        except KeyError:
            channel_subs = 0

        return channel_subs

    def _parse_channel_meta(self):
        """extract meta tab values from channel payload"""
        # meta tab
        meta_tab = self.yt_json["metadata"]["channelMetadataRenderer"]
        all_thumbs = meta_tab["avatar"]["thumbnails"]
        thumb_url = sorted(all_thumbs, key=lambda k: k["width"])[-1]["url"]
        # stats tab
        renderer = "twoColumnBrowseResultsRenderer"
        all_tabs = self.yt_json["contents"][renderer]["tabs"]
        for tab in all_tabs:
            if "tabRenderer" in tab.keys():
                if tab["tabRenderer"]["title"] == "About":
                    about_tab = tab["tabRenderer"]["content"][
                        "sectionListRenderer"
                    ]["contents"][0]["itemSectionRenderer"]["contents"][0][
                        "channelAboutFullMetadataRenderer"
                    ]
                    break
        try:
            channel_views_text = about_tab["viewCountText"]["simpleText"]
            channel_views = int(re.sub(r"\D", "", channel_views_text))
        except KeyError:
            channel_views = 0

        self.json_data.update(
            {
                "channel_description": meta_tab["description"],
                "channel_thumb_url": thumb_url,
                "channel_views": channel_views,
            }
        )

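# Illustrative usage sketch, hypothetical channel id:
# json_data = ChannelScraper("UC_hypothetical_id").get_json()
# json_data then holds channel_name, channel_subs, channel_thumb_url, etc.
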
class YoutubeChannel(YouTubeItem):
    """represents a single youtube channel"""

    es_path = False
    index_name = "ta_channel"
    yt_base = "https://www.youtube.com/channel/"

    def __init__(self, youtube_id):
        super().__init__(youtube_id)
        self.es_path = f"{self.index_name}/_doc/{youtube_id}"

    def build_json(self, upload=False):
        """get from es or from youtube"""
        self.get_from_es()
        if self.json_data:
            return

        self.get_from_youtube()
        if upload:
            self.upload_to_es()
        return

    def get_from_youtube(self):
        """use bs4 to scrape channel about page"""
        self.json_data = ChannelScraper(self.youtube_id).get_json()
        self.get_channel_art()

    def get_channel_art(self):
        """download channel art for new channels"""
        channel_id = self.youtube_id
        channel_thumb = self.json_data["channel_thumb_url"]
        channel_banner = self.json_data["channel_banner_url"]
        ThumbManager().download_chan(
            [(channel_id, channel_thumb, channel_banner)]
        )

    def sync_to_videos(self):
        """sync new channel_dict to all videos of channel"""
        # add ingest pipeline
        processors = []
        for field, value in self.json_data.items():
            line = {"set": {"field": "channel." + field, "value": value}}
            processors.append(line)
        data = {"description": self.youtube_id, "processors": processors}
        ingest_path = f"_ingest/pipeline/{self.youtube_id}"
        _, _ = ElasticWrap(ingest_path).put(data)
        # apply pipeline
        data = {"query": {"match": {"channel.channel_id": self.youtube_id}}}
        update_path = f"ta_video/_update_by_query?pipeline={self.youtube_id}"
        _, _ = ElasticWrap(update_path).post(data)

    def get_folder_path(self):
        """get folder where media files get stored"""
        channel_name = self.json_data["channel_name"]
        folder_name = clean_string(channel_name)
        folder_path = os.path.join(self.app_conf["videos"], folder_name)
        return folder_path

    def delete_es_videos(self):
        """delete all channel documents from elasticsearch"""
        data = {
            "query": {
                "term": {"channel.channel_id": {"value": self.youtube_id}}
            }
        }
        _, _ = ElasticWrap("ta_video/_delete_by_query").post(data)

    def delete_playlists(self):
        """delete all indexed playlist from es"""
        all_playlists = self.get_indexed_playlists()
        for playlist in all_playlists:
            playlist_id = playlist["playlist_id"]
            YoutubePlaylist(playlist_id).delete_metadata()

    def delete_channel(self):
        """delete channel and all videos"""
        print(f"{self.youtube_id}: delete channel")
        self.get_from_es()
        folder_path = self.get_folder_path()
        print(f"{self.youtube_id}: delete all media files")
        try:
            all_videos = os.listdir(folder_path)
            for video in all_videos:
                video_path = os.path.join(folder_path, video)
                os.remove(video_path)
            os.rmdir(folder_path)
        except FileNotFoundError:
            print(f"no videos found for {folder_path}")

        print(f"{self.youtube_id}: delete indexed playlists")
        self.delete_playlists()
        print(f"{self.youtube_id}: delete indexed videos")
        self.delete_es_videos()
        self.del_in_es()

    def get_all_playlists(self):
        """get all playlists owned by this channel"""
        url = (
            f"https://www.youtube.com/channel/{self.youtube_id}"
            + "/playlists?view=1&sort=dd&shelf_id=0"
        )
        obs = {
            "quiet": True,
            "skip_download": True,
            "extract_flat": True,
        }
        playlists = yt_dlp.YoutubeDL(obs).extract_info(url)
        all_entries = [(i["id"], i["title"]) for i in playlists["entries"]]

        return all_entries

    def get_indexed_playlists(self):
        """get all indexed playlists from channel"""
        data = {
            "query": {
                "term": {"playlist_channel_id": {"value": self.youtube_id}}
            },
            "sort": [{"playlist_channel.keyword": {"order": "desc"}}],
        }
        all_playlists = IndexPaginate("ta_playlist", data).get_results()
        return all_playlists
325 tubearchivist/home/src/index/filesystem.py Normal file
@ -0,0 +1,325 @@
"""
Functionality:
- reindexing old documents
- syncing updated values between indexes
- scan the filesystem to delete or index
"""

import json
import os
import re
import shutil
import subprocess
from datetime import datetime

import requests
from home.src.download.queue import PendingList
from home.src.download.yt_dlp_handler import VideoDownloader
from home.src.index.reindex import Reindex
from home.src.index.video import index_new_video
from home.src.ta.config import AppConfig
from home.src.ta.helper import clean_string, ignore_filelist
from home.src.ta.ta_redis import RedisArchivist


class FilesystemScanner:
    """handle scanning and fixing from filesystem"""

    CONFIG = AppConfig().config
    ES_URL = CONFIG["application"]["es_url"]
    ES_AUTH = CONFIG["application"]["es_auth"]
    VIDEOS = CONFIG["application"]["videos"]

    def __init__(self):
        self.all_downloaded = self.get_all_downloaded()
        self.all_indexed = self.get_all_indexed()
        self.mismatch = None
        self.to_rename = None
        self.to_index = None
        self.to_delete = None

    def get_all_downloaded(self):
        """get a list of all video files downloaded"""
        channels = os.listdir(self.VIDEOS)
        all_channels = ignore_filelist(channels)
        all_channels.sort()
        all_downloaded = []
        for channel_name in all_channels:
            channel_path = os.path.join(self.VIDEOS, channel_name)
            videos = os.listdir(channel_path)
            all_videos = ignore_filelist(videos)
            for video in all_videos:
                youtube_id = video[9:20]
                all_downloaded.append((channel_name, video, youtube_id))

        return all_downloaded

    @staticmethod
    def get_all_indexed():
        """get a list of all indexed videos"""
        index_handler = PendingList()
        all_indexed_raw = index_handler.get_all_indexed()
        all_indexed = []
        for video in all_indexed_raw:
            youtube_id = video["youtube_id"]
            media_url = video["media_url"]
            published = video["published"]
            title = video["title"]
            all_indexed.append((youtube_id, media_url, published, title))
        return all_indexed

    def list_comarison(self):
        """compare the lists to figure out what to do"""
        self.find_unindexed()
        self.find_missing()
        self.find_bad_media_url()

    def find_unindexed(self):
        """find video files without a matching document indexed"""
        all_indexed_ids = [i[0] for i in self.all_indexed]
        to_index = []
        for downloaded in self.all_downloaded:
            if downloaded[2] not in all_indexed_ids:
                to_index.append(downloaded)

        self.to_index = to_index

    def find_missing(self):
        """find indexed videos without matching media file"""
        all_downloaded_ids = [i[2] for i in self.all_downloaded]
        to_delete = []
        for video in self.all_indexed:
            youtube_id = video[0]
            if youtube_id not in all_downloaded_ids:
                to_delete.append(video)

        self.to_delete = to_delete

    def find_bad_media_url(self):
        """rename media files not matching the indexed title"""
        to_fix = []
        to_rename = []
        for downloaded in self.all_downloaded:
            channel, filename, downloaded_id = downloaded
            # find in indexed
            for indexed in self.all_indexed:
                indexed_id, media_url, published, title = indexed
                if indexed_id == downloaded_id:
                    # found it
                    title_c = clean_string(title)
                    pub = published.replace("-", "")
                    expected_filename = f"{pub}_{indexed_id}_{title_c}.mp4"
                    new_url = os.path.join(channel, expected_filename)
                    if expected_filename != filename:
                        # file to rename
                        to_rename.append(
                            (channel, filename, expected_filename)
                        )
                    if media_url != new_url:
                        # media_url to update in es
                        to_fix.append((indexed_id, new_url))

                    break

        self.mismatch = to_fix
        self.to_rename = to_rename

    def rename_files(self):
        """rename media files as identified by find_bad_media_url"""
        for bad_filename in self.to_rename:
            channel, filename, expected_filename = bad_filename
            print(f"renaming [{filename}] to [{expected_filename}]")
            old_path = os.path.join(self.VIDEOS, channel, filename)
            new_path = os.path.join(self.VIDEOS, channel, expected_filename)
            os.rename(old_path, new_path)

    def send_mismatch_bulk(self):
        """build bulk update"""
        bulk_list = []
        for video_mismatch in self.mismatch:
            youtube_id, media_url = video_mismatch
            print(f"{youtube_id}: fixing media url {media_url}")
            action = {"update": {"_id": youtube_id, "_index": "ta_video"}}
            source = {"doc": {"media_url": media_url}}
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(source))
        # add last newline
        bulk_list.append("\n")
        query_str = "\n".join(bulk_list)
        # make the call
        headers = {"Content-type": "application/x-ndjson"}
        url = self.ES_URL + "/_bulk"
        request = requests.post(
            url, data=query_str, headers=headers, auth=self.ES_AUTH
        )
        if not request.ok:
            print(request.text)

    def delete_from_index(self):
        """find indexed but deleted mediafile"""
        for indexed in self.to_delete:
            youtube_id = indexed[0]
            print(f"deleting {youtube_id} from index")
            url = self.ES_URL + "/ta_video/_doc/" + youtube_id
            request = requests.delete(url, auth=self.ES_AUTH)
            if not request.ok:
                print(request.text)


class ManualImport:
    """import and indexing existing video files"""

    CONFIG = AppConfig().config
    CACHE_DIR = CONFIG["application"]["cache_dir"]
    IMPORT_DIR = os.path.join(CACHE_DIR, "import")

    def __init__(self):
        self.identified = self.import_folder_parser()

    def import_folder_parser(self):
        """detect files in import folder"""
        import_files = os.listdir(self.IMPORT_DIR)
        to_import = ignore_filelist(import_files)
        to_import.sort()
        video_files = [i for i in to_import if not i.endswith(".json")]

        identified = []

        for file_path in video_files:

            file_dict = {"video_file": file_path}
            file_name, _ = os.path.splitext(file_path)

            matching_json = [
                i
                for i in to_import
                if i.startswith(file_name) and i.endswith(".json")
            ]
            if matching_json:
                json_file = matching_json[0]
                youtube_id = self.extract_id_from_json(json_file)
                file_dict.update({"json_file": json_file})
            else:
                youtube_id = self.extract_id_from_filename(file_name)
                file_dict.update({"json_file": False})

            file_dict.update({"youtube_id": youtube_id})
            identified.append(file_dict)

        return identified

    @staticmethod
    def extract_id_from_filename(file_name):
        """
        look at the file name for the youtube id
        expects filename ending in [<youtube_id>].<ext>
        """
        id_search = re.search(r"\[([a-zA-Z0-9_-]{11})\]$", file_name)
        if id_search:
            youtube_id = id_search.group(1)
            return youtube_id

        print("failed to extract youtube id for: " + file_name)
        raise Exception

    def extract_id_from_json(self, json_file):
        """open json file and extract id"""
        json_path = os.path.join(self.CACHE_DIR, "import", json_file)
        with open(json_path, "r", encoding="utf-8") as f:
            json_content = f.read()

        youtube_id = json.loads(json_content)["id"]

        return youtube_id

    def process_import(self):
        """go through identified media files"""

        all_videos_added = []

        for media_file in self.identified:
            json_file = media_file["json_file"]
            video_file = media_file["video_file"]
            youtube_id = media_file["youtube_id"]

            video_path = os.path.join(self.CACHE_DIR, "import", video_file)

            self.move_to_cache(video_path, youtube_id)

            # identify and archive
            vid_dict = index_new_video(youtube_id)
            VideoDownloader([youtube_id]).move_to_archive(vid_dict)
            youtube_id = vid_dict["youtube_id"]
            thumb_url = vid_dict["vid_thumb_url"]
            all_videos_added.append((youtube_id, thumb_url))

            # cleanup
            if os.path.exists(video_path):
                os.remove(video_path)
            if json_file:
                json_path = os.path.join(self.CACHE_DIR, "import", json_file)
                os.remove(json_path)

        return all_videos_added

    def move_to_cache(self, video_path, youtube_id):
        """move identified video file to cache, convert to mp4"""
        file_name = os.path.split(video_path)[-1]
        video_file, ext = os.path.splitext(file_name)

        # make sure youtube_id is in filename
        if youtube_id not in video_file:
            video_file = f"{video_file}_{youtube_id}"

        # move, convert if needed
        if ext == ".mp4":
            new_file = video_file + ext
            dest_path = os.path.join(self.CACHE_DIR, "download", new_file)
            shutil.move(video_path, dest_path)
        else:
            print(f"processing with ffmpeg: {video_file}")
            new_file = video_file + ".mp4"
            dest_path = os.path.join(self.CACHE_DIR, "download", new_file)
            subprocess.run(
                [
                    "ffmpeg",
                    "-i",
                    video_path,
                    dest_path,
                    "-loglevel",
                    "warning",
                    "-stats",
                ],
                check=True,
            )


def scan_filesystem():
    """grouped function to delete and update index"""
    filesystem_handler = FilesystemScanner()
    filesystem_handler.list_comarison()
    if filesystem_handler.to_rename:
        print("renaming files")
        filesystem_handler.rename_files()
    if filesystem_handler.mismatch:
        print("fixing media urls in index")
        filesystem_handler.send_mismatch_bulk()
    if filesystem_handler.to_delete:
        print("delete metadata from index")
        filesystem_handler.delete_from_index()
    if filesystem_handler.to_index:
        print("index new videos")
        for missing_vid in filesystem_handler.to_index:
            youtube_id = missing_vid[2]
            index_new_video(youtube_id)


def reindex_old_documents():
    """daily refresh of old documents"""
    # continue if needed
    reindex_handler = Reindex()
    reindex_handler.check_outdated()
    reindex_handler.reindex()
    # set timestamp
    now = int(datetime.now().strftime("%s"))
    RedisArchivist().set_message("last_reindex", now, expire=False)
142 tubearchivist/home/src/index/generic.py Normal file
@ -0,0 +1,142 @@
"""
functionality:
- generic base class to inherit from for video, channel and playlist
"""

import math

import yt_dlp
from home.src.es.connect import ElasticWrap
from home.src.ta.config import AppConfig
from home.src.ta.ta_redis import RedisArchivist


class YouTubeItem:
    """base class for youtube"""

    es_path = False
    index_name = False
    yt_base = False
    yt_obs = {
        "quiet": True,
        "default_search": "ytsearch",
        "skip_download": True,
        "check_formats": "selected",
        "noplaylist": True,
    }

    def __init__(self, youtube_id):
        self.youtube_id = youtube_id
        self.config = False
        self.app_conf = False
        self.youtube_meta = False
        self.json_data = False
        self._get_conf()

    def _get_conf(self):
        """read user conf"""
        self.config = AppConfig().config
        self.app_conf = self.config["application"]

    def get_from_youtube(self):
        """use yt-dlp to get meta data from youtube"""
        print(f"{self.youtube_id}: get metadata from youtube")
        try:
            yt_item = yt_dlp.YoutubeDL(self.yt_obs)
            response = yt_item.extract_info(self.yt_base + self.youtube_id)
        except (
            yt_dlp.utils.ExtractorError,
            yt_dlp.utils.DownloadError,
        ):
            print(f"{self.youtube_id}: failed to get info from youtube")
            self.youtube_meta = False

        self.youtube_meta = response

    def get_from_es(self):
        """get indexed data from elastic search"""
        print(f"{self.youtube_id}: get metadata from es")
        response, _ = ElasticWrap(f"{self.es_path}").get()
        source = response.get("_source")
        self.json_data = source

    def upload_to_es(self):
        """add json_data to elastic"""
        _, _ = ElasticWrap(self.es_path).put(self.json_data, refresh=True)

    def deactivate(self):
        """deactivate document in es"""
        key_match = {
            "video": "active",
            "channel": "channel_active",
            "playlist": "playlist_active",
        }
        update_path = f"{self.index_name}/_update/{self.youtube_id}"
        data = {
            "script": f"ctx._source.{key_match.get(self.index_name)} = false"
        }
        _, _ = ElasticWrap(update_path).post(data)

    def del_in_es(self):
        """delete item from elastic search"""
        print(f"{self.youtube_id}: delete from es")
        _, _ = ElasticWrap(self.es_path).delete()


class Pagination:
    """
    figure out the pagination based on page size and total_hits
    """

    def __init__(self, page_get, user_id, search_get=False):
        self.user_id = user_id
        self.page_size = self.get_page_size()
        self.page_get = page_get
        self.search_get = search_get
        self.pagination = self.first_guess()

    def get_page_size(self):
        """get default or user modified page_size"""
        key = f"{self.user_id}:page_size"
        page_size = RedisArchivist().get_message(key)["status"]
        if not page_size:
            config = AppConfig().config
            page_size = config["archive"]["page_size"]

        return page_size

    def first_guess(self):
        """build first guess before api call"""
        page_get = self.page_get
        if page_get in [0, 1]:
            page_from = 0
            prev_pages = False
        elif page_get > 1:
            page_from = (page_get - 1) * self.page_size
            prev_pages = [
                i for i in range(page_get - 1, page_get - 6, -1) if i > 1
            ]
            prev_pages.reverse()
        pagination = {
            "page_size": self.page_size,
            "page_from": page_from,
            "prev_pages": prev_pages,
            "current_page": page_get,
        }
        if self.search_get:
            pagination.update({"search_get": self.search_get})
        return pagination

    def validate(self, total_hits):
        """validate pagination with total_hits after making api call"""
        page_get = self.page_get
        max_pages = math.ceil(total_hits / self.page_size)
        if page_get < max_pages and max_pages > 1:
            self.pagination["last_page"] = max_pages
        else:
            self.pagination["last_page"] = False
        next_pages = [
            i for i in range(page_get + 1, page_get + 6) if 1 < i < max_pages
        ]

        self.pagination["next_pages"] = next_pages
205
tubearchivist/home/src/index/playlist.py
Normal file
205
tubearchivist/home/src/index/playlist.py
Normal file
@ -0,0 +1,205 @@
|
|||||||
|
"""
|
||||||
|
functionality:
|
||||||
|
- get metadata from youtube for a playlist
|
||||||
|
- index and update in es
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from home.src.download.thumbnails import ThumbManager
|
||||||
|
from home.src.es.connect import ElasticWrap
|
||||||
|
from home.src.index.generic import YouTubeItem
|
||||||
|
from home.src.index.video import YoutubeVideo
|
||||||
|
|
||||||
|
|
||||||
|
class YoutubePlaylist(YouTubeItem):
|
||||||
|
"""represents a single youtube playlist"""
|
||||||
|
|
||||||
|
es_path = False
|
||||||
|
index_name = "ta_playlist"
|
||||||
|
yt_obs = {
|
||||||
|
"default_search": "ytsearch",
|
||||||
|
"quiet": True,
|
||||||
|
"skip_download": True,
|
||||||
|
"extract_flat": True,
|
||||||
|
}
|
||||||
|
yt_base = "https://www.youtube.com/playlist?list="
|
||||||
|
|
||||||
|
def __init__(self, youtube_id):
|
||||||
|
super().__init__(youtube_id)
|
||||||
|
self.es_path = f"{self.index_name}/_doc/{youtube_id}"
|
||||||
|
self.all_members = False
|
||||||
|
self.nav = False
|
||||||
|
self.all_youtube_ids = []
|
||||||
|
|
||||||
|
def build_json(self, scrape=False):
|
||||||
|
"""collection to create json_data"""
|
||||||
|
if not scrape:
|
||||||
|
self.get_from_es()
|
||||||
|
|
||||||
|
if scrape or not self.json_data:
|
||||||
|
self.get_from_youtube()
|
||||||
|
self.process_youtube_meta()
|
||||||
|
self.get_entries()
|
||||||
|
self.json_data["playlist_entries"] = self.all_members
|
||||||
|
self.get_playlist_art()
|
||||||
|
|
||||||
|
def process_youtube_meta(self):
|
||||||
|
"""extract relevant fields from youtube"""
|
||||||
|
self.json_data = {
|
||||||
|
"playlist_id": self.youtube_id,
|
||||||
|
"playlist_active": True,
|
||||||
|
"playlist_subscribed": False,
|
||||||
|
"playlist_name": self.youtube_meta["title"],
|
||||||
|
"playlist_channel": self.youtube_meta["channel"],
|
||||||
|
"playlist_channel_id": self.youtube_meta["channel_id"],
|
||||||
|
"playlist_thumbnail": self.youtube_meta["thumbnails"][-1]["url"],
|
||||||
|
"playlist_description": self.youtube_meta["description"] or False,
|
||||||
|
"playlist_last_refresh": int(datetime.now().strftime("%s")),
|
||||||
|
}
|
||||||
|
|
||||||
|
def get_entries(self, playlistend=False):
|
||||||
|
"""get all videos in playlist"""
|
||||||
|
if playlistend:
|
||||||
|
# implement playlist end
|
||||||
|
print(playlistend)
|
||||||
|
all_members = []
|
||||||
|
for idx, entry in enumerate(self.youtube_meta["entries"]):
|
||||||
|
if self.all_youtube_ids:
|
||||||
|
downloaded = entry["id"] in self.all_youtube_ids
|
||||||
|
else:
|
||||||
|
downloaded = False
|
||||||
|
if not entry["uploader"]:
|
||||||
|
continue
|
||||||
|
to_append = {
|
||||||
|
"youtube_id": entry["id"],
|
||||||
|
"title": entry["title"],
|
||||||
|
"uploader": entry["uploader"],
|
||||||
|
"idx": idx,
|
||||||
|
"downloaded": downloaded,
|
||||||
|
}
|
||||||
|
all_members.append(to_append)
|
||||||
|
|
||||||
|
self.all_members = all_members
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_playlist_art():
|
||||||
|
"""download artwork of playlist"""
|
||||||
|
thumbnails = ThumbManager()
|
||||||
|
missing_playlists = thumbnails.get_missing_playlists()
|
||||||
|
thumbnails.download_playlist(missing_playlists)
|
||||||
|
|
||||||
|
def add_vids_to_playlist(self):
|
||||||
|
"""sync the playlist id to videos"""
|
||||||
|
script = (
|
||||||
|
'if (!ctx._source.containsKey("playlist")) '
|
||||||
|
+ "{ctx._source.playlist = [params.playlist]} "
|
||||||
|
+ "else if (!ctx._source.playlist.contains(params.playlist)) "
|
||||||
|
+ "{ctx._source.playlist.add(params.playlist)} "
|
||||||
|
+ "else {ctx.op = 'none'}"
|
||||||
|
)
|
||||||
|
|
||||||
|
bulk_list = []
|
||||||
|
for entry in self.json_data["playlist_entries"]:
|
||||||
|
video_id = entry["youtube_id"]
|
||||||
|
action = {"update": {"_id": video_id, "_index": "ta_video"}}
|
||||||
|
source = {
|
||||||
|
"script": {
|
||||||
|
"source": script,
|
||||||
|
"lang": "painless",
|
||||||
|
"params": {"playlist": self.youtube_id},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bulk_list.append(json.dumps(action))
|
||||||
|
bulk_list.append(json.dumps(source))
|
||||||
|
|
||||||
|
# add last newline
|
||||||
|
bulk_list.append("\n")
|
||||||
|
query_str = "\n".join(bulk_list)
|
||||||
|
|
||||||
|
ElasticWrap("_bulk").post(query_str, ndjson=True)
|
||||||
|
|
||||||
|
def update_playlist(self):
|
||||||
|
"""update metadata for playlist with data from YouTube"""
|
||||||
|
self.get_from_es()
|
||||||
|
subscribed = self.json_data["playlist_subscribed"]
|
||||||
|
self.get_from_youtube()
|
||||||
|
if not self.json_data:
|
||||||
|
# return false to deactivate
|
||||||
|
return False
|
||||||
|
|
||||||
|
self.json_data["playlist_subscribed"] = subscribed
|
||||||
|
self.upload_to_es()
|
||||||
|
return True
|
||||||
|
|
||||||
|
def build_nav(self, youtube_id):
|
||||||
|
"""find next and previous in playlist of a given youtube_id"""
|
||||||
|
all_entries_available = self.json_data["playlist_entries"]
|
||||||
|
all_entries = [i for i in all_entries_available if i["downloaded"]]
|
||||||
|
current = [i for i in all_entries if i["youtube_id"] == youtube_id]
|
||||||
|
# stop if not found or playlist of 1
|
||||||
|
if not current or not len(all_entries) > 1:
|
||||||
|
return
|
||||||
|
|
||||||
|
current_idx = all_entries.index(current[0])
|
||||||
|
if current_idx == 0:
|
||||||
|
previous_item = False
|
||||||
|
else:
|
||||||
|
previous_item = all_entries[current_idx - 1]
|
||||||
|
prev_thumb = ThumbManager().vid_thumb_path(
|
||||||
|
previous_item["youtube_id"]
|
||||||
|
)
|
||||||
|
previous_item["vid_thumb"] = prev_thumb
|
||||||
|
|
||||||
|
if current_idx == len(all_entries) - 1:
|
||||||
|
next_item = False
|
||||||
|
else:
|
||||||
|
next_item = all_entries[current_idx + 1]
|
||||||
|
next_thumb = ThumbManager().vid_thumb_path(next_item["youtube_id"])
|
||||||
|
next_item["vid_thumb"] = next_thumb
|
||||||
|
|
||||||
|
self.nav = {
|
||||||
|
"playlist_meta": {
|
||||||
|
"current_idx": current[0]["idx"],
|
||||||
|
"playlist_id": self.youtube_id,
|
||||||
|
"playlist_name": self.json_data["playlist_name"],
|
||||||
|
"playlist_channel": self.json_data["playlist_channel"],
|
||||||
|
},
|
||||||
|
"playlist_previous": previous_item,
|
||||||
|
"playlist_next": next_item,
|
||||||
|
}
|
||||||
|
return
|
||||||
|
|
||||||
|
def delete_metadata(self):
|
||||||
|
"""delete metadata for playlist"""
|
||||||
|
script = (
|
||||||
|
"ctx._source.playlist.removeAll("
|
||||||
|
+ "Collections.singleton(params.playlist)) "
|
||||||
|
)
|
||||||
|
data = {
|
||||||
|
"query": {
|
||||||
|
"term": {"playlist.keyword": {"value": self.youtube_id}}
|
||||||
|
},
|
||||||
|
"script": {
|
||||||
|
"source": script,
|
||||||
|
"lang": "painless",
|
||||||
|
"params": {"playlist": self.youtube_id},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
_, _ = ElasticWrap("ta_video/_update_by_query").post(data)
|
||||||
|
self.del_in_es()
|
||||||
|
|
||||||
|
def delete_videos_playlist(self):
|
||||||
|
"""delete playlist with all videos"""
|
||||||
|
print(f"{self.youtube_id}: delete playlist")
|
||||||
|
self.get_from_es()
|
||||||
|
all_youtube_id = [
|
||||||
|
i["youtube_id"]
|
||||||
|
for i in self.json_data["playlist_entries"]
|
||||||
|
if i["downloaded"]
|
||||||
|
]
|
||||||
|
for youtube_id in all_youtube_id:
|
||||||
|
YoutubeVideo(youtube_id).delete_media_file()
|
||||||
|
|
||||||
|
self.delete_metadata()
|
271
tubearchivist/home/src/index/reindex.py
Normal file
271
tubearchivist/home/src/index/reindex.py
Normal file
@ -0,0 +1,271 @@
|
|||||||
|
"""
|
||||||
|
functionality:
|
||||||
|
- periodically refresh documents
|
||||||
|
- index and update in es
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
from math import ceil
|
||||||
|
from time import sleep
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from home.src.download.queue import PendingList
|
||||||
|
from home.src.download.subscriptions import ChannelSubscription
|
||||||
|
from home.src.download.thumbnails import ThumbManager
|
||||||
|
from home.src.index.channel import YoutubeChannel
|
||||||
|
from home.src.index.playlist import YoutubePlaylist
|
||||||
|
from home.src.index.video import YoutubeVideo
|
||||||
|
from home.src.ta.config import AppConfig
|
||||||
|
from home.src.ta.helper import get_total_hits
|
||||||
|
|
||||||
|
|
||||||
|
class Reindex:
|
||||||
|
"""check for outdated documents and refresh data from youtube"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
# config
|
||||||
|
config = AppConfig().config
|
||||||
|
self.sleep_interval = config["downloads"]["sleep_interval"]
|
||||||
|
self.es_url = config["application"]["es_url"]
|
||||||
|
self.es_auth = config["application"]["es_auth"]
|
||||||
|
self.refresh_interval = config["scheduler"]["check_reindex_days"]
|
||||||
|
self.integrate_ryd = config["downloads"]["integrate_ryd"]
|
||||||
|
# scan
|
||||||
|
self.all_youtube_ids = False
|
||||||
|
self.all_channel_ids = False
|
||||||
|
self.all_playlist_ids = False
|
||||||
|
|
||||||
|
def get_daily(self):
|
||||||
|
"""get daily refresh values"""
|
||||||
|
total_videos = get_total_hits(
|
||||||
|
"ta_video", self.es_url, self.es_auth, "active"
|
||||||
|
)
|
||||||
|
video_daily = ceil(total_videos / self.refresh_interval * 1.2)
|
||||||
|
total_channels = get_total_hits(
|
||||||
|
"ta_channel", self.es_url, self.es_auth, "channel_active"
|
||||||
|
)
|
||||||
|
channel_daily = ceil(total_channels / self.refresh_interval * 1.2)
|
||||||
|
total_playlists = get_total_hits(
|
||||||
|
"ta_playlist", self.es_url, self.es_auth, "playlist_active"
|
||||||
|
)
|
||||||
|
playlist_daily = ceil(total_playlists / self.refresh_interval * 1.2)
|
||||||
|
return (video_daily, channel_daily, playlist_daily)
|
||||||
|
|
||||||
|
def get_outdated_vids(self, size):
|
||||||
|
"""get daily videos to refresh"""
|
||||||
|
headers = {"Content-type": "application/json"}
|
||||||
|
now = int(datetime.now().strftime("%s"))
|
||||||
|
now_lte = now - self.refresh_interval * 24 * 60 * 60
|
||||||
|
data = {
|
||||||
|
"size": size,
|
||||||
|
"query": {
|
||||||
|
"bool": {
|
||||||
|
"must": [
|
||||||
|
{"match": {"active": True}},
|
||||||
|
{"range": {"vid_last_refresh": {"lte": now_lte}}},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"sort": [{"vid_last_refresh": {"order": "asc"}}],
|
||||||
|
"_source": False,
|
||||||
|
}
|
||||||
|
query_str = json.dumps(data)
|
||||||
|
url = self.es_url + "/ta_video/_search"
|
||||||
|
response = requests.get(
|
||||||
|
url, data=query_str, headers=headers, auth=self.es_auth
|
||||||
|
)
|
||||||
|
if not response.ok:
|
||||||
|
print(response.text)
|
||||||
|
response_dict = json.loads(response.text)
|
||||||
|
all_youtube_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
|
||||||
|
return all_youtube_ids
|
||||||
|
|
||||||
|
def get_unrated_vids(self):
|
||||||
|
"""get all videos without rating if ryd integration is enabled"""
|
||||||
|
headers = {"Content-type": "application/json"}
|
||||||
|
data = {
|
||||||
|
"size": 200,
|
||||||
|
"query": {
|
||||||
|
"bool": {
|
||||||
|
"must_not": [{"exists": {"field": "stats.average_rating"}}]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
query_str = json.dumps(data)
|
||||||
|
url = self.es_url + "/ta_video/_search"
|
||||||
|
response = requests.get(
|
||||||
|
url, data=query_str, headers=headers, auth=self.es_auth
|
||||||
|
)
|
||||||
|
if not response.ok:
|
||||||
|
print(response.text)
|
||||||
|
response_dict = json.loads(response.text)
|
||||||
|
missing_rating = [i["_id"] for i in response_dict["hits"]["hits"]]
|
||||||
|
self.all_youtube_ids = self.all_youtube_ids + missing_rating
|
||||||
|
|
||||||
|
def get_outdated_channels(self, size):
|
||||||
|
"""get daily channels to refresh"""
|
||||||
|
headers = {"Content-type": "application/json"}
|
||||||
|
now = int(datetime.now().strftime("%s"))
|
||||||
|
now_lte = now - self.refresh_interval * 24 * 60 * 60
|
||||||
|
data = {
|
||||||
|
"size": size,
|
||||||
|
"query": {
|
||||||
|
"bool": {
|
||||||
|
"must": [
|
||||||
|
{"match": {"channel_active": True}},
|
||||||
|
{"range": {"channel_last_refresh": {"lte": now_lte}}},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"sort": [{"channel_last_refresh": {"order": "asc"}}],
|
||||||
|
"_source": False,
|
||||||
|
}
|
||||||
|
query_str = json.dumps(data)
|
||||||
|
url = self.es_url + "/ta_channel/_search"
|
||||||
|
response = requests.get(
|
||||||
|
url, data=query_str, headers=headers, auth=self.es_auth
|
||||||
|
)
|
||||||
|
if not response.ok:
|
||||||
|
print(response.text)
|
||||||
|
response_dict = json.loads(response.text)
|
||||||
|
all_channel_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
|
||||||
|
return all_channel_ids
|
||||||
|
|
||||||
|
def get_outdated_playlists(self, size):
|
||||||
|
"""get daily outdated playlists to refresh"""
|
||||||
|
headers = {"Content-type": "application/json"}
|
||||||
|
now = int(datetime.now().strftime("%s"))
|
||||||
|
now_lte = now - self.refresh_interval * 24 * 60 * 60
|
||||||
|
data = {
|
||||||
|
"size": size,
|
||||||
|
"query": {
|
||||||
|
"bool": {
|
||||||
|
"must": [
|
||||||
|
{"match": {"playlist_active": True}},
|
||||||
|
{"range": {"playlist_last_refresh": {"lte": now_lte}}},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"sort": [{"playlist_last_refresh": {"order": "asc"}}],
|
||||||
|
"_source": False,
|
||||||
|
}
|
||||||
|
query_str = json.dumps(data)
|
||||||
|
url = self.es_url + "/ta_playlist/_search"
|
||||||
|
response = requests.get(
|
||||||
|
url, data=query_str, headers=headers, auth=self.es_auth
|
||||||
|
)
|
||||||
|
if not response.ok:
|
||||||
|
print(response.text)
|
||||||
|
response_dict = json.loads(response.text)
|
||||||
|
all_playlist_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
|
||||||
|
return all_playlist_ids
|
||||||
|
|
||||||
|
def check_outdated(self):
|
||||||
|
"""add missing vids and channels"""
|
||||||
|
video_daily, channel_daily, playlist_daily = self.get_daily()
|
||||||
|
self.all_youtube_ids = self.get_outdated_vids(video_daily)
|
||||||
|
self.all_channel_ids = self.get_outdated_channels(channel_daily)
|
||||||
|
self.all_playlist_ids = self.get_outdated_playlists(playlist_daily)
|
||||||
|
if self.integrate_ryd:
|
||||||
|
self.get_unrated_vids()
|
||||||
|
|
||||||
|
def rescrape_all_channels(self):
|
||||||
|
"""sync new data from channel to all matching videos"""
|
||||||
|
sleep_interval = self.sleep_interval
|
||||||
|
channel_sub_handler = ChannelSubscription()
|
||||||
|
all_channels = channel_sub_handler.get_channels(subscribed_only=False)
|
||||||
|
all_channel_ids = [i["channel_id"] for i in all_channels]
|
||||||
|
|
||||||
|
for channel_id in all_channel_ids:
|
||||||
|
channel = YoutubeChannel(channel_id)
|
||||||
|
subscribed = channel.json_data["channel_subscribed"]
|
||||||
|
channel.get_from_youtube()
|
||||||
|
channel.json_data["channel_subscribed"] = subscribed
|
||||||
|
channel.upload_to_es()
|
||||||
|
channel.sync_to_videos()
|
||||||
|
|
||||||
|
if sleep_interval:
|
||||||
|
sleep(sleep_interval)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def reindex_single_video(youtube_id):
|
||||||
|
"""refresh data for single video"""
|
||||||
|
video = YoutubeVideo(youtube_id)
|
||||||
|
|
||||||
|
# read current state
|
||||||
|
video.get_from_es()
|
||||||
|
player = video.json_data["player"]
|
||||||
|
date_downloaded = video.json_data["date_downloaded"]
|
||||||
|
channel_dict = video.json_data["channel"]
|
||||||
|
playlist = video.json_data.get("playlist")
|
||||||
|
|
||||||
|
# get new
|
||||||
|
video.build_json()
|
||||||
|
if not video.json_data:
|
||||||
|
video.deactivate()
|
||||||
|
|
||||||
|
# add back
|
||||||
|
video.json_data["player"] = player
|
||||||
|
video.json_data["date_downloaded"] = date_downloaded
|
||||||
|
video.json_data["channel"] = channel_dict
|
||||||
|
if playlist:
|
||||||
|
video.json_data["playlist"] = playlist
|
||||||
|
|
||||||
|
video.upload_to_es()
|
||||||
|
|
||||||
|
thumb_handler = ThumbManager()
|
||||||
|
thumb_handler.delete_vid_thumb(youtube_id)
|
||||||
|
to_download = (youtube_id, video.json_data["vid_thumb_url"])
|
||||||
|
thumb_handler.download_vid([to_download], notify=False)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def reindex_single_channel(channel_id):
|
||||||
|
"""refresh channel data and sync to videos"""
|
||||||
|
channel = YoutubeChannel(channel_id)
|
||||||
|
channel.get_from_es()
|
||||||
|
subscribed = channel.json_data["channel_subscribed"]
|
||||||
|
channel.get_from_youtube()
|
||||||
|
channel.json_data["channel_subscribed"] = subscribed
|
||||||
|
channel.upload_to_es()
|
||||||
|
channel.sync_to_videos()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def reindex_single_playlist(playlist_id, all_indexed_ids):
|
||||||
|
"""refresh playlist data"""
|
||||||
|
playlist = YoutubePlaylist(playlist_id)
|
||||||
|
playlist.get_from_es()
|
||||||
|
subscribed = playlist.json_data["playlist_subscribed"]
|
||||||
|
playlist.all_youtube_ids = all_indexed_ids
|
||||||
|
playlist.build_json(scrape=True)
|
||||||
|
if not playlist.json_data:
|
||||||
|
playlist.deactivate()
|
||||||
|
return
|
||||||
|
|
||||||
|
playlist.json_data["playlist_subscribed"] = subscribed
|
||||||
|
playlist.upload_to_es()
|
||||||
|
return
|
||||||
|
|
||||||
|
def reindex(self):
|
||||||
|
"""reindex what's needed"""
|
||||||
|
# videos
|
||||||
|
print(f"reindexing {len(self.all_youtube_ids)} videos")
|
||||||
|
for youtube_id in self.all_youtube_ids:
|
||||||
|
self.reindex_single_video(youtube_id)
|
||||||
|
if self.sleep_interval:
|
||||||
|
sleep(self.sleep_interval)
|
||||||
|
# channels
|
||||||
|
print(f"reindexing {len(self.all_channel_ids)} channels")
|
||||||
|
for channel_id in self.all_channel_ids:
|
||||||
|
self.reindex_single_channel(channel_id)
|
||||||
|
if self.sleep_interval:
|
||||||
|
sleep(self.sleep_interval)
|
||||||
|
# playlist
|
||||||
|
print(f"reindexing {len(self.all_playlist_ids)} playlists")
|
||||||
|
if self.all_playlist_ids:
|
||||||
|
all_indexed = PendingList().get_all_indexed()
|
||||||
|
all_indexed_ids = [i["youtube_id"] for i in all_indexed]
|
||||||
|
for playlist_id in self.all_playlist_ids:
|
||||||
|
self.reindex_single_playlist(playlist_id, all_indexed_ids)
|
||||||
|
if self.sleep_interval:
|
||||||
|
sleep(self.sleep_interval)
|
175
tubearchivist/home/src/index/video.py
Normal file
175
tubearchivist/home/src/index/video.py
Normal file
@ -0,0 +1,175 @@
|
|||||||
|
"""
|
||||||
|
functionality:
|
||||||
|
- get metadata from youtube for a video
|
||||||
|
- index and update in es
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from home.src.index import channel as ta_channel
|
||||||
|
from home.src.index.generic import YouTubeItem
|
||||||
|
from home.src.ta.helper import DurationConverter, clean_string
|
||||||
|
from ryd_client import ryd_client
|
||||||
|
|
||||||
|
|
||||||
|
class YoutubeVideo(YouTubeItem):
|
||||||
|
"""represents a single youtube video"""
|
||||||
|
|
||||||
|
es_path = False
|
||||||
|
index_name = "ta_video"
|
||||||
|
yt_base = "https://www.youtube.com/watch?v="
|
||||||
|
|
||||||
|
def __init__(self, youtube_id):
|
||||||
|
super().__init__(youtube_id)
|
||||||
|
self.channel_id = False
|
||||||
|
self.es_path = f"{self.index_name}/_doc/{youtube_id}"
|
||||||
|
|
||||||
|
def build_json(self):
|
||||||
|
"""build json dict of video"""
|
||||||
|
self.get_from_youtube()
|
||||||
|
if not self.youtube_meta:
|
||||||
|
return
|
||||||
|
|
||||||
|
self._process_youtube_meta()
|
||||||
|
self._add_channel()
|
||||||
|
self._add_stats()
|
||||||
|
self.add_file_path()
|
||||||
|
self.add_player()
|
||||||
|
if self.config["downloads"]["integrate_ryd"]:
|
||||||
|
self._get_ryd_stats()
|
||||||
|
|
||||||
|
return
|
||||||
|
|
||||||
|
def _process_youtube_meta(self):
|
||||||
|
"""extract relevant fields from youtube"""
|
||||||
|
# extract
|
||||||
|
self.channel_id = self.youtube_meta["channel_id"]
|
||||||
|
upload_date = self.youtube_meta["upload_date"]
|
||||||
|
upload_date_time = datetime.strptime(upload_date, "%Y%m%d")
|
||||||
|
published = upload_date_time.strftime("%Y-%m-%d")
|
||||||
|
last_refresh = int(datetime.now().strftime("%s"))
|
||||||
|
# build json_data basics
|
||||||
|
self.json_data = {
|
||||||
|
"title": self.youtube_meta["title"],
|
||||||
|
"description": self.youtube_meta["description"],
|
||||||
|
"category": self.youtube_meta["categories"],
|
||||||
|
"vid_thumb_url": self.youtube_meta["thumbnail"],
|
||||||
|
"tags": self.youtube_meta["tags"],
|
||||||
|
"published": published,
|
||||||
|
"vid_last_refresh": last_refresh,
|
||||||
|
"date_downloaded": last_refresh,
|
||||||
|
"youtube_id": self.youtube_id,
|
||||||
|
"active": True,
|
||||||
|
}
|
||||||
|
|
||||||
|
def _add_channel(self):
|
||||||
|
"""add channel dict to video json_data"""
|
||||||
|
channel = ta_channel.YoutubeChannel(self.channel_id)
|
||||||
|
channel.build_json(upload=True)
|
||||||
|
self.json_data.update({"channel": channel.json_data})
|
||||||
|
|
||||||
|
def _add_stats(self):
|
||||||
|
"""add stats dicst to json_data"""
|
||||||
|
# likes
|
||||||
|
like_count = self.youtube_meta.get("like_count", 0)
|
||||||
|
dislike_count = self.youtube_meta.get("dislike_count", 0)
|
||||||
|
self.json_data.update(
|
||||||
|
{
|
||||||
|
"stats": {
|
||||||
|
"view_count": self.youtube_meta["view_count"],
|
||||||
|
"like_count": like_count,
|
||||||
|
"dislike_count": dislike_count,
|
||||||
|
"average_rating": self.youtube_meta["average_rating"],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
def build_dl_cache_path(self):
|
||||||
|
"""find video path in dl cache"""
|
||||||
|
cache_dir = self.app_conf["cache_dir"]
|
||||||
|
cache_path = f"{cache_dir}/download/"
|
||||||
|
all_cached = os.listdir(cache_path)
|
||||||
|
for file_cached in all_cached:
|
||||||
|
if self.youtube_id in file_cached:
|
||||||
|
vid_path = os.path.join(cache_path, file_cached)
|
||||||
|
return vid_path
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def add_player(self):
|
||||||
|
"""add player information for new videos"""
|
||||||
|
try:
|
||||||
|
# when indexing from download task
|
||||||
|
vid_path = self.build_dl_cache_path()
|
||||||
|
except FileNotFoundError:
|
||||||
|
# when reindexing
|
||||||
|
base = self.app_conf["videos"]
|
||||||
|
vid_path = os.path.join(base, self.json_data["media_url"])
|
||||||
|
|
||||||
|
duration_handler = DurationConverter()
|
||||||
|
duration = duration_handler.get_sec(vid_path)
|
||||||
|
duration_str = duration_handler.get_str(duration)
|
||||||
|
self.json_data.update(
|
||||||
|
{
|
||||||
|
"player": {
|
||||||
|
"watched": False,
|
||||||
|
"duration": duration,
|
||||||
|
"duration_str": duration_str,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
def add_file_path(self):
|
||||||
|
"""build media_url for where file will be located"""
|
||||||
|
channel_name = self.json_data["channel"]["channel_name"]
|
||||||
|
clean_channel_name = clean_string(channel_name)
|
||||||
|
timestamp = self.json_data["published"].replace("-", "")
|
||||||
|
youtube_id = self.json_data["youtube_id"]
|
||||||
|
title = self.json_data["title"]
|
||||||
|
clean_title = clean_string(title)
|
||||||
|
filename = f"{timestamp}_{youtube_id}_{clean_title}.mp4"
|
||||||
|
media_url = os.path.join(clean_channel_name, filename)
|
||||||
|
self.json_data["media_url"] = media_url
|
||||||
|
|
||||||
|
def delete_media_file(self):
|
||||||
|
"""delete video file, meta data"""
|
||||||
|
self.get_from_es()
|
||||||
|
video_base = self.app_conf["videos"]
|
||||||
|
media_url = self.json_data["media_url"]
|
||||||
|
print(f"{self.youtube_id}: delete {media_url} from file system")
|
||||||
|
to_delete = os.path.join(video_base, media_url)
|
||||||
|
os.remove(to_delete)
|
||||||
|
self.del_in_es()
|
||||||
|
|
||||||
|
def _get_ryd_stats(self):
|
||||||
|
"""get optional stats from returnyoutubedislikeapi.com"""
|
||||||
|
try:
|
||||||
|
print(f"{self.youtube_id}: get ryd stats")
|
||||||
|
result = ryd_client.get(self.youtube_id)
|
||||||
|
except requests.exceptions.ConnectionError:
|
||||||
|
print(f"{self.youtube_id}: failed to query ryd api, skipping")
|
||||||
|
return False
|
||||||
|
|
||||||
|
if result["status"] == 404:
|
||||||
|
return False
|
||||||
|
|
||||||
|
dislikes = {
|
||||||
|
"dislike_count": result["dislikes"],
|
||||||
|
"average_rating": result["rating"],
|
||||||
|
}
|
||||||
|
self.json_data["stats"].update(dislikes)
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def index_new_video(youtube_id):
|
||||||
|
"""combined classes to create new video in index"""
|
||||||
|
video = YoutubeVideo(youtube_id)
|
||||||
|
video.build_json()
|
||||||
|
if not video.json_data:
|
||||||
|
raise ValueError("failed to get metadata for " + youtube_id)
|
||||||
|
|
||||||
|
video.upload_to_es()
|
||||||
|
return video.json_data
|
@ -1,600 +0,0 @@
|
|||||||
"""
|
|
||||||
Functionality:
|
|
||||||
- reindexing old documents
|
|
||||||
- syncing updated values between indexes
|
|
||||||
- scan the filesystem to delete or index
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import shutil
|
|
||||||
import subprocess
|
|
||||||
from datetime import datetime
|
|
||||||
from math import ceil
|
|
||||||
from time import sleep
|
|
||||||
|
|
||||||
import requests
|
|
||||||
from home.src.config import AppConfig
|
|
||||||
from home.src.download import ChannelSubscription, PendingList, VideoDownloader
|
|
||||||
from home.src.helper import (
|
|
||||||
RedisArchivist,
|
|
||||||
clean_string,
|
|
||||||
get_total_hits,
|
|
||||||
ignore_filelist,
|
|
||||||
)
|
|
||||||
from home.src.index import (
|
|
||||||
YoutubeChannel,
|
|
||||||
YoutubePlaylist,
|
|
||||||
YoutubeVideo,
|
|
||||||
index_new_video,
|
|
||||||
)
|
|
||||||
from home.src.thumbnails import ThumbManager
|
|
||||||
|
|
||||||
|
|
||||||
class Reindex:
|
|
||||||
"""check for outdated documents and refresh data from youtube"""
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
# config
|
|
||||||
config = AppConfig().config
|
|
||||||
self.sleep_interval = config["downloads"]["sleep_interval"]
|
|
||||||
self.es_url = config["application"]["es_url"]
|
|
||||||
self.es_auth = config["application"]["es_auth"]
|
|
||||||
self.refresh_interval = config["scheduler"]["check_reindex_days"]
|
|
||||||
self.integrate_ryd = config["downloads"]["integrate_ryd"]
|
|
||||||
# scan
|
|
||||||
self.all_youtube_ids = False
|
|
||||||
self.all_channel_ids = False
|
|
||||||
self.all_playlist_ids = False
|
|
||||||
|
|
||||||
def get_daily(self):
|
|
||||||
"""get daily refresh values"""
|
|
||||||
total_videos = get_total_hits(
|
|
||||||
"ta_video", self.es_url, self.es_auth, "active"
|
|
||||||
)
|
|
||||||
video_daily = ceil(total_videos / self.refresh_interval * 1.2)
|
|
||||||
total_channels = get_total_hits(
|
|
||||||
"ta_channel", self.es_url, self.es_auth, "channel_active"
|
|
||||||
)
|
|
||||||
channel_daily = ceil(total_channels / self.refresh_interval * 1.2)
|
|
||||||
total_playlists = get_total_hits(
|
|
||||||
"ta_playlist", self.es_url, self.es_auth, "playlist_active"
|
|
||||||
)
|
|
||||||
playlist_daily = ceil(total_playlists / self.refresh_interval * 1.2)
|
|
||||||
return (video_daily, channel_daily, playlist_daily)
|
|
||||||
|
|
||||||
def get_outdated_vids(self, size):
|
|
||||||
"""get daily videos to refresh"""
|
|
||||||
headers = {"Content-type": "application/json"}
|
|
||||||
now = int(datetime.now().strftime("%s"))
|
|
||||||
now_lte = now - self.refresh_interval * 24 * 60 * 60
|
|
||||||
data = {
|
|
||||||
"size": size,
|
|
||||||
"query": {
|
|
||||||
"bool": {
|
|
||||||
"must": [
|
|
||||||
{"match": {"active": True}},
|
|
||||||
{"range": {"vid_last_refresh": {"lte": now_lte}}},
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"sort": [{"vid_last_refresh": {"order": "asc"}}],
|
|
||||||
"_source": False,
|
|
||||||
}
|
|
||||||
query_str = json.dumps(data)
|
|
||||||
url = self.es_url + "/ta_video/_search"
|
|
||||||
response = requests.get(
|
|
||||||
url, data=query_str, headers=headers, auth=self.es_auth
|
|
||||||
)
|
|
||||||
if not response.ok:
|
|
||||||
print(response.text)
|
|
||||||
response_dict = json.loads(response.text)
|
|
||||||
all_youtube_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
|
|
||||||
return all_youtube_ids
|
|
||||||
|
|
||||||
def get_unrated_vids(self):
|
|
||||||
"""get all videos without rating if ryd integration is enabled"""
|
|
||||||
headers = {"Content-type": "application/json"}
|
|
||||||
data = {
|
|
||||||
"size": 200,
|
|
||||||
"query": {
|
|
||||||
"bool": {
|
|
||||||
"must_not": [{"exists": {"field": "stats.average_rating"}}]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
}
|
|
||||||
query_str = json.dumps(data)
|
|
||||||
url = self.es_url + "/ta_video/_search"
|
|
||||||
response = requests.get(
|
|
||||||
url, data=query_str, headers=headers, auth=self.es_auth
|
|
||||||
)
|
|
||||||
if not response.ok:
|
|
||||||
print(response.text)
|
|
||||||
response_dict = json.loads(response.text)
|
|
||||||
missing_rating = [i["_id"] for i in response_dict["hits"]["hits"]]
|
|
||||||
self.all_youtube_ids = self.all_youtube_ids + missing_rating
|
|
||||||
|
|
||||||
def get_outdated_channels(self, size):
|
|
||||||
"""get daily channels to refresh"""
|
|
||||||
headers = {"Content-type": "application/json"}
|
|
||||||
now = int(datetime.now().strftime("%s"))
|
|
||||||
now_lte = now - self.refresh_interval * 24 * 60 * 60
|
|
||||||
data = {
|
|
||||||
"size": size,
|
|
||||||
"query": {
|
|
||||||
"bool": {
|
|
||||||
"must": [
|
|
||||||
{"match": {"channel_active": True}},
|
|
||||||
{"range": {"channel_last_refresh": {"lte": now_lte}}},
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"sort": [{"channel_last_refresh": {"order": "asc"}}],
|
|
||||||
"_source": False,
|
|
||||||
}
|
|
||||||
query_str = json.dumps(data)
|
|
||||||
url = self.es_url + "/ta_channel/_search"
|
|
||||||
response = requests.get(
|
|
||||||
url, data=query_str, headers=headers, auth=self.es_auth
|
|
||||||
)
|
|
||||||
if not response.ok:
|
|
||||||
print(response.text)
|
|
||||||
response_dict = json.loads(response.text)
|
|
||||||
all_channel_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
|
|
||||||
return all_channel_ids
|
|
||||||
|
|
||||||
def get_outdated_playlists(self, size):
|
|
||||||
"""get daily outdated playlists to refresh"""
|
|
||||||
headers = {"Content-type": "application/json"}
|
|
||||||
now = int(datetime.now().strftime("%s"))
|
|
||||||
now_lte = now - self.refresh_interval * 24 * 60 * 60
|
|
||||||
data = {
|
|
||||||
"size": size,
|
|
||||||
"query": {
|
|
||||||
"bool": {
|
|
||||||
"must": [
|
|
||||||
{"match": {"playlist_active": True}},
|
|
||||||
{"range": {"playlist_last_refresh": {"lte": now_lte}}},
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"sort": [{"playlist_last_refresh": {"order": "asc"}}],
|
|
||||||
"_source": False,
|
|
||||||
}
|
|
||||||
query_str = json.dumps(data)
|
|
||||||
url = self.es_url + "/ta_playlist/_search"
|
|
||||||
response = requests.get(
|
|
||||||
url, data=query_str, headers=headers, auth=self.es_auth
|
|
||||||
)
|
|
||||||
if not response.ok:
|
|
||||||
print(response.text)
|
|
||||||
response_dict = json.loads(response.text)
|
|
||||||
all_playlist_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
|
|
||||||
return all_playlist_ids
|
|
||||||
|
|
||||||
def check_outdated(self):
|
|
||||||
"""add missing vids and channels"""
|
|
||||||
video_daily, channel_daily, playlist_daily = self.get_daily()
|
|
||||||
self.all_youtube_ids = self.get_outdated_vids(video_daily)
|
|
||||||
self.all_channel_ids = self.get_outdated_channels(channel_daily)
|
|
||||||
self.all_playlist_ids = self.get_outdated_playlists(playlist_daily)
|
|
||||||
if self.integrate_ryd:
|
|
||||||
self.get_unrated_vids()
|
|
||||||
|
|
||||||
def rescrape_all_channels(self):
|
|
||||||
"""sync new data from channel to all matching videos"""
|
|
||||||
sleep_interval = self.sleep_interval
|
|
||||||
channel_sub_handler = ChannelSubscription()
|
|
||||||
all_channels = channel_sub_handler.get_channels(subscribed_only=False)
|
|
||||||
all_channel_ids = [i["channel_id"] for i in all_channels]
|
|
||||||
|
|
||||||
counter = 1
|
|
||||||
for channel_id in all_channel_ids:
|
|
||||||
channel_index = YoutubeChannel(channel_id)
|
|
||||||
subscribed = channel_index.channel_dict["channel_subscribed"]
|
|
||||||
channel_index.channel_dict = channel_index.build_channel_dict(
|
|
||||||
scrape=True
|
|
||||||
)
|
|
||||||
channel_index.channel_dict["channel_subscribed"] = subscribed
|
|
||||||
channel_index.upload_to_es()
|
|
||||||
channel_index.sync_to_videos()
|
|
||||||
counter = counter + 1
|
|
||||||
if sleep_interval:
|
|
||||||
sleep(sleep_interval)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def reindex_single_video(youtube_id):
|
|
||||||
"""refresh data for single video"""
|
|
||||||
vid_handler = YoutubeVideo(youtube_id)
|
|
||||||
vid_handler.get_vid_dict()
|
|
||||||
if not vid_handler.vid_dict:
|
|
||||||
# stop if deactivated
|
|
||||||
vid_handler.deactivate()
|
|
||||||
return
|
|
||||||
|
|
||||||
es_vid_dict = vid_handler.get_es_data()
|
|
||||||
player = es_vid_dict["_source"]["player"]
|
|
||||||
date_downloaded = es_vid_dict["_source"]["date_downloaded"]
|
|
||||||
channel_dict = es_vid_dict["_source"]["channel"]
|
|
||||||
channel_name = channel_dict["channel_name"]
|
|
||||||
try:
|
|
||||||
playlist = es_vid_dict["_source"]["playlist"]
|
|
||||||
except KeyError:
|
|
||||||
playlist = False
|
|
||||||
|
|
||||||
vid_handler.build_file_path(channel_name)
|
|
||||||
# add to vid_dict
|
|
||||||
vid_handler.vid_dict["player"] = player
|
|
||||||
vid_handler.vid_dict["date_downloaded"] = date_downloaded
|
|
||||||
vid_handler.vid_dict["channel"] = channel_dict
|
|
||||||
if playlist:
|
|
||||||
vid_handler.vid_dict["playlist"] = playlist
|
|
||||||
# update
|
|
||||||
vid_handler.upload_to_es()
|
|
||||||
thumb_handler = ThumbManager()
|
|
||||||
thumb_handler.delete_vid_thumb(youtube_id)
|
|
||||||
to_download = (youtube_id, vid_handler.vid_dict["vid_thumb_url"])
|
|
||||||
thumb_handler.download_vid([to_download], notify=False)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def reindex_single_channel(channel_id):
|
|
||||||
"""refresh channel data and sync to videos"""
|
|
||||||
channel_handler = YoutubeChannel(channel_id)
|
|
||||||
subscribed = channel_handler.channel_dict["channel_subscribed"]
|
|
||||||
channel_handler.channel_dict = channel_handler.build_channel_dict(
|
|
||||||
scrape=True
|
|
||||||
)
|
|
||||||
channel_handler.channel_dict["channel_subscribed"] = subscribed
|
|
||||||
# update
|
|
||||||
channel_handler.upload_to_es()
|
|
||||||
channel_handler.sync_to_videos()
|
|
||||||
thumb_handler = ThumbManager()
|
|
||||||
thumb_handler.delete_chan_thumb(channel_id)
|
|
||||||
channel_thumb = channel_handler.channel_dict["channel_thumb_url"]
|
|
||||||
channel_banner = channel_handler.channel_dict["channel_banner_url"]
|
|
||||||
to_download = (channel_id, channel_thumb, channel_banner)
|
|
||||||
thumb_handler.download_chan([to_download])
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def reindex_single_playlist(playlist_id, all_indexed_ids):
|
|
||||||
"""refresh playlist data"""
|
|
||||||
playlist_handler = YoutubePlaylist(
|
|
||||||
playlist_id, all_youtube_ids=all_indexed_ids
|
|
||||||
)
|
|
||||||
playlist = playlist_handler.update_playlist()
|
|
||||||
if not playlist:
|
|
||||||
playlist_handler.deactivate()
|
|
||||||
return
|
|
||||||
|
|
||||||
playlist_thumbnail = (playlist_id, playlist["playlist_thumbnail"])
|
|
||||||
thumb_handler = ThumbManager()
|
|
||||||
thumb_handler.download_playlist([playlist_thumbnail])
|
|
||||||
return
|
|
||||||
|
|
||||||
def reindex(self):
|
|
||||||
"""reindex what's needed"""
|
|
||||||
# videos
|
|
||||||
print(f"reindexing {len(self.all_youtube_ids)} videos")
|
|
||||||
for youtube_id in self.all_youtube_ids:
|
|
||||||
self.reindex_single_video(youtube_id)
|
|
||||||
if self.sleep_interval:
|
|
||||||
sleep(self.sleep_interval)
|
|
||||||
# channels
|
|
||||||
print(f"reindexing {len(self.all_channel_ids)} channels")
|
|
||||||
for channel_id in self.all_channel_ids:
|
|
||||||
self.reindex_single_channel(channel_id)
|
|
||||||
if self.sleep_interval:
|
|
||||||
sleep(self.sleep_interval)
|
|
||||||
# playlist
|
|
||||||
print(f"reindexing {len(self.all_playlist_ids)} playlists")
|
|
||||||
if self.all_playlist_ids:
|
|
||||||
all_indexed = PendingList().get_all_indexed()
|
|
||||||
all_indexed_ids = [i["youtube_id"] for i in all_indexed]
|
|
||||||
for playlist_id in self.all_playlist_ids:
|
|
||||||
self.reindex_single_playlist(playlist_id, all_indexed_ids)
|
|
||||||
if self.sleep_interval:
|
|
||||||
sleep(self.sleep_interval)
|
|
||||||
|
|
||||||
|
|
||||||
class FilesystemScanner:
|
|
||||||
"""handle scanning and fixing from filesystem"""
|
|
||||||
|
|
||||||
CONFIG = AppConfig().config
|
|
||||||
ES_URL = CONFIG["application"]["es_url"]
|
|
||||||
ES_AUTH = CONFIG["application"]["es_auth"]
|
|
||||||
VIDEOS = CONFIG["application"]["videos"]
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.all_downloaded = self.get_all_downloaded()
|
|
||||||
self.all_indexed = self.get_all_indexed()
|
|
||||||
self.mismatch = None
|
|
||||||
self.to_rename = None
|
|
||||||
self.to_index = None
|
|
||||||
self.to_delete = None
|
|
||||||
|
|
||||||
def get_all_downloaded(self):
|
|
||||||
"""get a list of all video files downloaded"""
|
|
||||||
channels = os.listdir(self.VIDEOS)
|
|
||||||
all_channels = ignore_filelist(channels)
|
|
||||||
all_channels.sort()
|
|
||||||
all_downloaded = []
|
|
||||||
for channel_name in all_channels:
|
|
||||||
channel_path = os.path.join(self.VIDEOS, channel_name)
|
|
||||||
videos = os.listdir(channel_path)
|
|
||||||
all_videos = ignore_filelist(videos)
|
|
||||||
for video in all_videos:
|
|
||||||
youtube_id = video[9:20]
|
|
||||||
all_downloaded.append((channel_name, video, youtube_id))
|
|
||||||
|
|
||||||
return all_downloaded
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_all_indexed():
|
|
||||||
"""get a list of all indexed videos"""
|
|
||||||
index_handler = PendingList()
|
|
||||||
all_indexed_raw = index_handler.get_all_indexed()
|
|
||||||
all_indexed = []
|
|
||||||
for video in all_indexed_raw:
|
|
||||||
youtube_id = video["youtube_id"]
|
|
||||||
media_url = video["media_url"]
|
|
||||||
published = video["published"]
|
|
||||||
title = video["title"]
|
|
||||||
all_indexed.append((youtube_id, media_url, published, title))
|
|
||||||
return all_indexed
|
|
||||||
|
|
||||||
def list_comarison(self):
|
|
||||||
"""compare the lists to figure out what to do"""
|
|
||||||
self.find_unindexed()
|
|
||||||
self.find_missing()
|
|
||||||
self.find_bad_media_url()
|
|
||||||
|
|
||||||
def find_unindexed(self):
|
|
||||||
"""find video files without a matching document indexed"""
|
|
||||||
all_indexed_ids = [i[0] for i in self.all_indexed]
|
|
||||||
to_index = []
|
|
||||||
for downloaded in self.all_downloaded:
|
|
||||||
if downloaded[2] not in all_indexed_ids:
|
|
||||||
to_index.append(downloaded)
|
|
||||||
|
|
||||||
self.to_index = to_index
|
|
||||||
|
|
||||||
def find_missing(self):
|
|
||||||
"""find indexed videos without matching media file"""
|
|
||||||
all_downloaded_ids = [i[2] for i in self.all_downloaded]
|
|
||||||
to_delete = []
|
|
||||||
for video in self.all_indexed:
|
|
||||||
youtube_id = video[0]
|
|
||||||
if youtube_id not in all_downloaded_ids:
|
|
||||||
to_delete.append(video)
|
|
||||||
|
|
||||||
self.to_delete = to_delete
|
|
||||||
|
|
||||||
def find_bad_media_url(self):
|
|
||||||
"""rename media files not matching the indexed title"""
|
|
||||||
to_fix = []
|
|
||||||
to_rename = []
|
|
||||||
for downloaded in self.all_downloaded:
|
|
||||||
channel, filename, downloaded_id = downloaded
|
|
||||||
# find in indexed
|
|
||||||
for indexed in self.all_indexed:
|
|
||||||
indexed_id, media_url, published, title = indexed
|
|
||||||
if indexed_id == downloaded_id:
|
|
||||||
# found it
|
|
||||||
title_c = clean_string(title)
|
|
||||||
pub = published.replace("-", "")
|
|
||||||
expected_filename = f"{pub}_{indexed_id}_{title_c}.mp4"
|
|
||||||
new_url = os.path.join(channel, expected_filename)
|
|
||||||
if expected_filename != filename:
|
|
||||||
# file to rename
|
|
||||||
to_rename.append(
|
|
||||||
(channel, filename, expected_filename)
|
|
||||||
)
|
|
||||||
if media_url != new_url:
|
|
||||||
# media_url to update in es
|
|
||||||
to_fix.append((indexed_id, new_url))
|
|
||||||
|
|
||||||
break
|
|
||||||
|
|
||||||
self.mismatch = to_fix
|
|
||||||
self.to_rename = to_rename
|
|
||||||
|
|
||||||
def rename_files(self):
|
|
||||||
"""rename media files as identified by find_bad_media_url"""
|
|
||||||
for bad_filename in self.to_rename:
|
|
||||||
channel, filename, expected_filename = bad_filename
|
|
||||||
print(f"renaming [{filename}] to [{expected_filename}]")
|
|
||||||
old_path = os.path.join(self.VIDEOS, channel, filename)
|
|
||||||
new_path = os.path.join(self.VIDEOS, channel, expected_filename)
|
|
||||||
os.rename(old_path, new_path)
|
|
||||||
|
|
||||||
def send_mismatch_bulk(self):
|
|
||||||
"""build bulk update"""
|
|
||||||
bulk_list = []
|
|
||||||
for video_mismatch in self.mismatch:
|
|
||||||
youtube_id, media_url = video_mismatch
|
|
||||||
print(f"{youtube_id}: fixing media url {media_url}")
|
|
||||||
action = {"update": {"_id": youtube_id, "_index": "ta_video"}}
|
|
||||||
source = {"doc": {"media_url": media_url}}
|
|
||||||
bulk_list.append(json.dumps(action))
|
|
||||||
bulk_list.append(json.dumps(source))
|
|
||||||
# add last newline
|
|
||||||
bulk_list.append("\n")
|
|
||||||
query_str = "\n".join(bulk_list)
|
|
||||||
# make the call
|
|
||||||
headers = {"Content-type": "application/x-ndjson"}
|
|
||||||
url = self.ES_URL + "/_bulk"
|
|
||||||
request = requests.post(
|
|
||||||
url, data=query_str, headers=headers, auth=self.ES_AUTH
|
|
||||||
)
|
|
||||||
if not request.ok:
|
|
||||||
print(request.text)
|
|
||||||
|
|
||||||
def delete_from_index(self):
|
|
||||||
"""find indexed but deleted mediafile"""
|
|
||||||
for indexed in self.to_delete:
|
|
||||||
youtube_id = indexed[0]
|
|
||||||
print(f"deleting {youtube_id} from index")
|
|
||||||
url = self.ES_URL + "/ta_video/_doc/" + youtube_id
|
|
||||||
request = requests.delete(url, auth=self.ES_AUTH)
|
|
||||||
if not request.ok:
|
|
||||||
print(request.text)
|
|
||||||
|
|
||||||
|
|
||||||
class ManualImport:
|
|
||||||
"""import and indexing existing video files"""
|
|
||||||
|
|
||||||
CONFIG = AppConfig().config
|
|
||||||
CACHE_DIR = CONFIG["application"]["cache_dir"]
|
|
||||||
IMPORT_DIR = os.path.join(CACHE_DIR, "import")
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.identified = self.import_folder_parser()
|
|
||||||
|
|
||||||
def import_folder_parser(self):
|
|
||||||
"""detect files in import folder"""
|
|
||||||
import_files = os.listdir(self.IMPORT_DIR)
|
|
||||||
to_import = ignore_filelist(import_files)
|
|
||||||
to_import.sort()
|
|
||||||
video_files = [i for i in to_import if not i.endswith(".json")]
|
|
||||||
|
|
||||||
identified = []
|
|
||||||
|
|
||||||
for file_path in video_files:
|
|
||||||
|
|
||||||
file_dict = {"video_file": file_path}
|
|
||||||
file_name, _ = os.path.splitext(file_path)
|
|
||||||
|
|
||||||
matching_json = [
|
|
||||||
i
|
|
||||||
for i in to_import
|
|
||||||
if i.startswith(file_name) and i.endswith(".json")
|
|
||||||
]
|
|
||||||
if matching_json:
|
|
||||||
json_file = matching_json[0]
|
|
||||||
youtube_id = self.extract_id_from_json(json_file)
|
|
||||||
file_dict.update({"json_file": json_file})
|
|
||||||
else:
|
|
||||||
youtube_id = self.extract_id_from_filename(file_name)
|
|
||||||
file_dict.update({"json_file": False})
|
|
||||||
|
|
||||||
file_dict.update({"youtube_id": youtube_id})
|
|
||||||
identified.append(file_dict)
|
|
||||||
|
|
||||||
return identified
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def extract_id_from_filename(file_name):
|
|
||||||
"""
|
|
||||||
look at the file name for the youtube id
|
|
||||||
expects filename ending in [<youtube_id>].<ext>
|
|
||||||
"""
|
|
||||||
id_search = re.search(r"\[([a-zA-Z0-9_-]{11})\]$", file_name)
|
|
||||||
if id_search:
|
|
||||||
youtube_id = id_search.group(1)
|
|
||||||
return youtube_id
|
|
||||||
|
|
||||||
print("failed to extract youtube id for: " + file_name)
|
|
||||||
raise Exception
|
|
||||||
|
|
||||||
def extract_id_from_json(self, json_file):
|
|
||||||
"""open json file and extract id"""
|
|
||||||
json_path = os.path.join(self.CACHE_DIR, "import", json_file)
|
|
||||||
with open(json_path, "r", encoding="utf-8") as f:
|
|
||||||
json_content = f.read()
|
|
||||||
|
|
||||||
youtube_id = json.loads(json_content)["id"]
|
|
||||||
|
|
||||||
return youtube_id
|
|
||||||
|
|
||||||
def process_import(self):
|
|
||||||
"""go through identified media files"""
|
|
||||||
|
|
||||||
all_videos_added = []
|
|
||||||
|
|
||||||
for media_file in self.identified:
|
|
||||||
json_file = media_file["json_file"]
|
|
||||||
video_file = media_file["video_file"]
|
|
||||||
youtube_id = media_file["youtube_id"]
|
|
||||||
|
|
||||||
video_path = os.path.join(self.CACHE_DIR, "import", video_file)
|
|
||||||
|
|
||||||
self.move_to_cache(video_path, youtube_id)
|
|
||||||
|
|
||||||
# identify and archive
|
|
||||||
vid_dict = index_new_video(youtube_id)
|
|
||||||
VideoDownloader([youtube_id]).move_to_archive(vid_dict)
|
|
||||||
youtube_id = vid_dict["youtube_id"]
|
|
||||||
thumb_url = vid_dict["vid_thumb_url"]
|
|
||||||
all_videos_added.append((youtube_id, thumb_url))
|
|
||||||
|
|
||||||
# cleanup
|
|
||||||
if os.path.exists(video_path):
|
|
||||||
os.remove(video_path)
|
|
||||||
if json_file:
|
|
||||||
json_path = os.path.join(self.CACHE_DIR, "import", json_file)
|
|
||||||
os.remove(json_path)
|
|
||||||
|
|
||||||
return all_videos_added
|
|
||||||
|
|
||||||
def move_to_cache(self, video_path, youtube_id):
|
|
||||||
"""move identified video file to cache, convert to mp4"""
|
|
||||||
file_name = os.path.split(video_path)[-1]
|
|
||||||
video_file, ext = os.path.splitext(file_name)
|
|
||||||
|
|
||||||
# make sure youtube_id is in filename
|
|
||||||
if youtube_id not in video_file:
|
|
||||||
video_file = f"{video_file}_{youtube_id}"
|
|
||||||
|
|
||||||
# move, convert if needed
|
|
||||||
if ext == ".mp4":
|
|
||||||
new_file = video_file + ext
|
|
||||||
dest_path = os.path.join(self.CACHE_DIR, "download", new_file)
|
|
||||||
shutil.move(video_path, dest_path)
|
|
||||||
else:
|
|
||||||
print(f"processing with ffmpeg: {video_file}")
|
|
||||||
new_file = video_file + ".mp4"
|
|
||||||
dest_path = os.path.join(self.CACHE_DIR, "download", new_file)
|
|
||||||
subprocess.run(
|
|
||||||
[
|
|
||||||
"ffmpeg",
|
|
||||||
"-i",
|
|
||||||
video_path,
|
|
||||||
dest_path,
|
|
||||||
"-loglevel",
|
|
||||||
"warning",
|
|
||||||
"-stats",
|
|
||||||
],
|
|
||||||
check=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def scan_filesystem():
|
|
||||||
"""grouped function to delete and update index"""
|
|
||||||
filesystem_handler = FilesystemScanner()
|
|
||||||
filesystem_handler.list_comarison()
|
|
||||||
if filesystem_handler.to_rename:
|
|
||||||
print("renaming files")
|
|
||||||
filesystem_handler.rename_files()
|
|
||||||
if filesystem_handler.mismatch:
|
|
||||||
print("fixing media urls in index")
|
|
||||||
filesystem_handler.send_mismatch_bulk()
|
|
||||||
if filesystem_handler.to_delete:
|
|
||||||
print("delete metadata from index")
|
|
||||||
filesystem_handler.delete_from_index()
|
|
||||||
if filesystem_handler.to_index:
|
|
||||||
print("index new videos")
|
|
||||||
for missing_vid in filesystem_handler.to_index:
|
|
||||||
youtube_id = missing_vid[2]
|
|
||||||
index_new_video(youtube_id, missing_vid=missing_vid)
|
|
||||||
|
|
||||||
|
|
||||||
def reindex_old_documents():
|
|
||||||
"""daily refresh of old documents"""
|
|
||||||
# continue if needed
|
|
||||||
reindex_handler = Reindex()
|
|
||||||
reindex_handler.check_outdated()
|
|
||||||
reindex_handler.reindex()
|
|
||||||
# set timestamp
|
|
||||||
now = int(datetime.now().strftime("%s"))
|
|
||||||
RedisArchivist().set_message("last_reindex", now, expire=False)
|
|
0
tubearchivist/home/src/ta/__init__.py
Normal file
0
tubearchivist/home/src/ta/__init__.py
Normal file
@ -2,7 +2,6 @@
|
|||||||
Functionality:
|
Functionality:
|
||||||
- read and write config
|
- read and write config
|
||||||
- load config variables into redis
|
- load config variables into redis
|
||||||
- needs to be a separate module to avoid circular import
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
@ -10,7 +9,7 @@ import os
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
from celery.schedules import crontab
|
from celery.schedules import crontab
|
||||||
from home.src.helper import RedisArchivist
|
from home.src.ta.ta_redis import RedisArchivist
|
||||||
|
|
||||||
|
|
||||||
class AppConfig:
|
class AppConfig:
|
||||||
@ -39,8 +38,7 @@ class AppConfig:
|
|||||||
def get_config_file(self):
|
def get_config_file(self):
|
||||||
"""read the defaults from config.json"""
|
"""read the defaults from config.json"""
|
||||||
with open("home/config.json", "r", encoding="utf-8") as f:
|
with open("home/config.json", "r", encoding="utf-8") as f:
|
||||||
config_str = f.read()
|
config_file = json.load(f)
|
||||||
config_file = json.loads(config_str)
|
|
||||||
|
|
||||||
config_file["application"].update(self.get_config_env())
|
config_file["application"].update(self.get_config_env())
|
||||||
|
|
@ -4,14 +4,12 @@ Loose collection of helper functions
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
|
||||||
import re
|
import re
|
||||||
import string
|
import string
|
||||||
import subprocess
|
import subprocess
|
||||||
import unicodedata
|
import unicodedata
|
||||||
from urllib.parse import parse_qs, urlparse
|
from urllib.parse import parse_qs, urlparse
|
||||||
|
|
||||||
import redis
|
|
||||||
import requests
|
import requests
|
||||||
import yt_dlp
|
import yt_dlp
|
||||||
|
|
||||||
@ -149,153 +147,6 @@ class UrlListParser:
|
|||||||
return channel_id
|
return channel_id
|
||||||
|
|
||||||
|
|
||||||
class RedisArchivist:
|
|
||||||
"""collection of methods to interact with redis"""
|
|
||||||
|
|
||||||
REDIS_HOST = os.environ.get("REDIS_HOST")
|
|
||||||
REDIS_PORT = os.environ.get("REDIS_PORT") or 6379
|
|
||||||
NAME_SPACE = "ta:"
|
|
    CHANNELS = [
        "download",
        "add",
        "rescan",
        "subchannel",
        "subplaylist",
        "playlistscan",
        "setting",
    ]

    def __init__(self):
        self.redis_connection = redis.Redis(
            host=self.REDIS_HOST, port=self.REDIS_PORT
        )

    def set_message(self, key, message, expire=True):
        """write new message to redis"""
        self.redis_connection.execute_command(
            "JSON.SET", self.NAME_SPACE + key, ".", json.dumps(message)
        )

        if expire:
            if isinstance(expire, bool):
                secs = 20
            else:
                secs = expire
            self.redis_connection.execute_command(
                "EXPIRE", self.NAME_SPACE + key, secs
            )

    def get_message(self, key):
        """get message dict from redis"""
        reply = self.redis_connection.execute_command(
            "JSON.GET", self.NAME_SPACE + key
        )
        if reply:
            json_str = json.loads(reply)
        else:
            json_str = {"status": False}

        return json_str

    def del_message(self, key):
        """delete key from redis"""
        response = self.redis_connection.execute_command(
            "DEL", self.NAME_SPACE + key
        )
        return response

    def get_lock(self, lock_key):
        """handle lock for task management"""
        redis_lock = self.redis_connection.lock(self.NAME_SPACE + lock_key)
        return redis_lock

    def get_progress(self):
        """get a list of all progress messages"""
        all_messages = []
        for channel in self.CHANNELS:
            key = "message:" + channel
            reply = self.redis_connection.execute_command(
                "JSON.GET", self.NAME_SPACE + key
            )
            if reply:
                json_str = json.loads(reply)
                all_messages.append(json_str)

        return all_messages

    @staticmethod
    def monitor_cache_dir(cache_dir):
        """
        look at download cache dir directly as alternative progress info
        """
        dl_cache = os.path.join(cache_dir, "download")
        all_cache_file = os.listdir(dl_cache)
        cache_file = ignore_filelist(all_cache_file)
        if cache_file:
            filename = cache_file[0][12:].replace("_", " ").split(".")[0]
            mess_dict = {
                "status": "message:download",
                "level": "info",
                "title": "Downloading: " + filename,
                "message": "",
            }
        else:
            return False

        return mess_dict


class RedisQueue:
    """dynamically interact with the download queue in redis"""

    REDIS_HOST = os.environ.get("REDIS_HOST")
    REDIS_PORT = os.environ.get("REDIS_PORT")
    NAME_SPACE = "ta:"

    if not REDIS_PORT:
        REDIS_PORT = 6379

    def __init__(self, key):
        self.key = self.NAME_SPACE + key
        self.conn = redis.Redis(host=self.REDIS_HOST, port=self.REDIS_PORT)

    def get_all(self):
        """return all elements in list"""
        result = self.conn.execute_command("LRANGE", self.key, 0, -1)
        all_elements = [i.decode() for i in result]
        return all_elements

    def add_list(self, to_add):
        """add list to queue"""
        self.conn.execute_command("RPUSH", self.key, *to_add)

    def add_priority(self, to_add):
        """add single video to front of queue"""
        self.clear_item(to_add)
        self.conn.execute_command("LPUSH", self.key, to_add)

    def get_next(self):
        """return next element in the queue, False if none"""
        result = self.conn.execute_command("LPOP", self.key)
        if not result:
            return False

        next_element = result.decode()
        return next_element

    def clear(self):
        """delete list from redis"""
        self.conn.execute_command("DEL", self.key)

    def clear_item(self, to_clear):
        """remove single item from list if it's there"""
        self.conn.execute_command("LREM", self.key, 0, to_clear)

    def trim(self, size):
        """trim the queue based on settings amount"""
        self.conn.execute_command("LTRIM", self.key, 0, size)


class DurationConverter:
    """
    using ffmpeg to get and parse duration from filepath
158 tubearchivist/home/src/ta/ta_redis.py Normal file
@ -0,0 +1,158 @@
"""
functionality:
- interact with redis
- hold temporary download queue in redis
"""

import json
import os

import redis
from home.src.ta.helper import ignore_filelist


class RedisArchivist:
    """collection of methods to interact with redis"""

    REDIS_HOST = os.environ.get("REDIS_HOST")
    REDIS_PORT = os.environ.get("REDIS_PORT") or 6379
    NAME_SPACE = "ta:"
    CHANNELS = [
        "download",
        "add",
        "rescan",
        "subchannel",
        "subplaylist",
        "playlistscan",
        "setting",
    ]

    def __init__(self):
        self.redis_connection = redis.Redis(
            host=self.REDIS_HOST, port=self.REDIS_PORT
        )

    def set_message(self, key, message, expire=True):
        """write new message to redis"""
        self.redis_connection.execute_command(
            "JSON.SET", self.NAME_SPACE + key, ".", json.dumps(message)
        )

        if expire:
            if isinstance(expire, bool):
                secs = 20
            else:
                secs = expire
            self.redis_connection.execute_command(
                "EXPIRE", self.NAME_SPACE + key, secs
            )

    def get_message(self, key):
        """get message dict from redis"""
        reply = self.redis_connection.execute_command(
            "JSON.GET", self.NAME_SPACE + key
        )
        if reply:
            json_str = json.loads(reply)
        else:
            json_str = {"status": False}

        return json_str

    def del_message(self, key):
        """delete key from redis"""
        response = self.redis_connection.execute_command(
            "DEL", self.NAME_SPACE + key
        )
        return response

    def get_lock(self, lock_key):
        """handle lock for task management"""
        redis_lock = self.redis_connection.lock(self.NAME_SPACE + lock_key)
        return redis_lock

    def get_progress(self):
        """get a list of all progress messages"""
        all_messages = []
        for channel in self.CHANNELS:
            key = "message:" + channel
            reply = self.redis_connection.execute_command(
                "JSON.GET", self.NAME_SPACE + key
            )
            if reply:
                json_str = json.loads(reply)
                all_messages.append(json_str)

        return all_messages

    @staticmethod
    def monitor_cache_dir(cache_dir):
        """
        look at download cache dir directly as alternative progress info
        """
        dl_cache = os.path.join(cache_dir, "download")
        all_cache_file = os.listdir(dl_cache)
        cache_file = ignore_filelist(all_cache_file)
        if cache_file:
            filename = cache_file[0][12:].replace("_", " ").split(".")[0]
            mess_dict = {
                "status": "message:download",
                "level": "info",
                "title": "Downloading: " + filename,
                "message": "",
            }
        else:
            return False

        return mess_dict


class RedisQueue:
    """dynamically interact with the download queue in redis"""

    REDIS_HOST = os.environ.get("REDIS_HOST")
    REDIS_PORT = os.environ.get("REDIS_PORT")
    NAME_SPACE = "ta:"

    if not REDIS_PORT:
        REDIS_PORT = 6379

    def __init__(self, key):
        self.key = self.NAME_SPACE + key
        self.conn = redis.Redis(host=self.REDIS_HOST, port=self.REDIS_PORT)

    def get_all(self):
        """return all elements in list"""
        result = self.conn.execute_command("LRANGE", self.key, 0, -1)
        all_elements = [i.decode() for i in result]
        return all_elements

    def add_list(self, to_add):
        """add list to queue"""
        self.conn.execute_command("RPUSH", self.key, *to_add)

    def add_priority(self, to_add):
        """add single video to front of queue"""
        self.clear_item(to_add)
        self.conn.execute_command("LPUSH", self.key, to_add)

    def get_next(self):
        """return next element in the queue, False if none"""
        result = self.conn.execute_command("LPOP", self.key)
        if not result:
            return False

        next_element = result.decode()
        return next_element

    def clear(self):
        """delete list from redis"""
        self.conn.execute_command("DEL", self.key)

    def clear_item(self, to_clear):
        """remove single item from list if it's there"""
        self.conn.execute_command("LREM", self.key, 0, to_clear)

    def trim(self, size):
        """trim the queue based on settings amount"""
        self.conn.execute_command("LTRIM", self.key, 0, size)
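For orientation, a rough usage sketch of the two classes in the new module, not part of this commit: it assumes a reachable Redis instance with the RedisJSON module loaded, since set_message() writes through JSON.SET, and it uses a placeholder queue key.

# illustrative only: exercise RedisArchivist and RedisQueue from the new module
import os

os.environ.setdefault("REDIS_HOST", "localhost")  # assumed local test instance
os.environ.setdefault("REDIS_PORT", "6379")

from home.src.ta.ta_redis import RedisArchivist, RedisQueue

# progress message for the frontend, expires after 20 seconds by default
mess_dict = {
    "status": "message:download",
    "level": "info",
    "title": "Downloading",
    "message": "queue is being processed",
}
RedisArchivist().set_message("message:download", mess_dict)
print(RedisArchivist().get_message("message:download"))

# temporary download queue: append a batch, push one id to the front, pop next
queue = RedisQueue("dl_queue")  # example key name, not prescribed by the module
queue.add_list(["video-id-one", "video-id-two"])
queue.add_priority("video-id-three")
print(queue.get_next())  # -> "video-id-three"
queue.clear()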
@ -10,22 +10,24 @@ import os

import home.apps as startup_apps
from celery import Celery, shared_task
-from home.src.config import AppConfig, ScheduleBuilder
+from home.src.download.queue import PendingList
-from home.src.download import (
+from home.src.download.subscriptions import (
    ChannelSubscription,
-    PendingList,
    PlaylistSubscription,
-    VideoDownloader,
)
-from home.src.helper import RedisArchivist, RedisQueue, UrlListParser
+from home.src.download.thumbnails import ThumbManager, validate_thumbnails
-from home.src.index import YoutubeChannel, YoutubePlaylist
+from home.src.download.yt_dlp_handler import VideoDownloader
-from home.src.index_management import backup_all_indexes, restore_from_backup
+from home.src.es.index_setup import backup_all_indexes, restore_from_backup
-from home.src.reindex import (
+from home.src.index.channel import YoutubeChannel
+from home.src.index.filesystem import (
    ManualImport,
    reindex_old_documents,
    scan_filesystem,
)
-from home.src.thumbnails import ThumbManager, validate_thumbnails
+from home.src.index.playlist import YoutubePlaylist
+from home.src.ta.config import AppConfig, ScheduleBuilder
+from home.src.ta.helper import UrlListParser
+from home.src.ta.ta_redis import RedisArchivist, RedisQueue

CONFIG = AppConfig().config
REDIS_HOST = os.environ.get("REDIS_HOST")
@ -266,17 +268,16 @@ def subscribe_to(url_str):
@shared_task
def index_channel_playlists(channel_id):
    """add all playlists of channel to index"""
-    channel_handler = YoutubeChannel(channel_id)
+    channel = YoutubeChannel(channel_id)
-    channel_name = channel_handler.channel_dict["channel_name"]
    # notify
    mess_dict = {
        "status": "message:playlistscan",
        "level": "info",
        "title": "Looking for playlists",
-        "message": f'Scanning channel "{channel_name}" in progress',
+        "message": f'Scanning channel "{channel.youtube_id}" in progress',
    }
    RedisArchivist().set_message("message:playlistscan", mess_dict)
-    all_playlists = channel_handler.get_all_playlists()
+    all_playlists = channel.get_all_playlists()

    if not all_playlists:
        print(f"no playlists found for channel {channel_id}")
@ -295,28 +296,29 @@ def index_channel_playlists(channel_id):
        }
        RedisArchivist().set_message("message:playlistscan", mess_dict)
        print("add playlist: " + playlist_title)
-        playlist_handler = YoutubePlaylist(
-            playlist_id, all_youtube_ids=all_youtube_ids
-        )
+        playlist = YoutubePlaylist(playlist_id)
+        playlist.all_youtube_ids = all_youtube_ids
-        playlist_handler.get_playlist_dict()
-        if not playlist_handler.playlist_dict:
+        playlist.build_json()
+        if not playlist.json_data:
            # skip if not available
            continue

        # don't add if no videos downloaded
        downloaded = [
            i
-            for i in playlist_handler.playlist_dict["playlist_entries"]
+            for i in playlist.json_data["playlist_entries"]
            if i["downloaded"]
        ]
        if not downloaded:
            continue
-        playlist_handler.upload_to_es()
-        playlist_handler.add_vids_to_playlist()
+        playlist.upload_to_es()
+        playlist.add_vids_to_playlist()

    if all_playlists:
-        handler = ThumbManager()
-        missing_playlists = handler.get_missing_playlists()
-        handler.download_playlist(missing_playlists)
+        playlist.get_playlist_art()

    return
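Condensed, the refactored flow above replaces the old playlist_handler keyword-argument construction with explicit attribute assignment plus build_json(). The sketch below is illustrative only, not part of the commit; it assumes get_all_playlists() yields (playlist_id, playlist_title) pairs and uses placeholder values for channel_id and all_youtube_ids, since neither is defined inside this hunk.

# illustrative sketch of the refactored playlist indexing flow
from home.src.index.channel import YoutubeChannel
from home.src.index.playlist import YoutubePlaylist

channel_id = "channel-id-example"  # hypothetical id, for illustration only
all_youtube_ids = []  # ids of already indexed videos, built earlier in the task

channel = YoutubeChannel(channel_id)
all_playlists = channel.get_all_playlists()
for playlist_id, playlist_title in all_playlists:
    print("add playlist: " + playlist_title)
    playlist = YoutubePlaylist(playlist_id)
    playlist.all_youtube_ids = all_youtube_ids
    playlist.build_json()
    if not playlist.json_data:
        continue  # playlist not available upstream, skip it
    downloaded = [
        i
        for i in playlist.json_data["playlist_entries"]
        if i["downloaded"]
    ]
    if not downloaded:
        continue  # nothing from this playlist is downloaded yet
    playlist.upload_to_es()
    playlist.add_vids_to_playlist()

if all_playlists:
    playlist.get_playlist_art()  # artwork now handled by the playlist object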
@ -69,7 +69,7 @@
<img src="{% static 'img/icon-gear.svg' %}" alt="gear-icon" title="Settings">
</a>
<a href="{% url 'logout' %}">
-<img src="{% static 'img/icon-exit.svg' %}" alt="exit-icon" title="Logout">
+<img class="alert-hover" src="{% static 'img/icon-exit.svg' %}" alt="exit-icon" title="Logout">
</a>
</div>
</div>
@ -1,7 +1,7 @@
"""
Functionality:
- all views for home app
-- process post data received from frontend via ajax
+- holds base classes to inherit from
"""

import json
@ -14,7 +14,9 @@ from django.contrib.auth.forms import AuthenticationForm
from django.http import JsonResponse
from django.shortcuts import redirect, render
from django.views import View
-from home.forms import (
+from home.src.es.index_setup import get_available_backups
+from home.src.frontend.api_calls import PostData
+from home.src.frontend.forms import (
    AddToQueueForm,
    ApplicationSettingsForm,
    CustomAuthForm,
@ -24,12 +26,12 @@ from home.forms import (
    SubscribeToPlaylistForm,
    UserSettingsForm,
)
-from home.src.config import AppConfig, ScheduleBuilder
+from home.src.frontend.searching import SearchHandler
-from home.src.frontend import PostData
+from home.src.index.generic import Pagination
-from home.src.helper import RedisArchivist, UrlListParser
+from home.src.index.playlist import YoutubePlaylist
-from home.src.index import YoutubePlaylist
+from home.src.ta.config import AppConfig, ScheduleBuilder
-from home.src.index_management import get_available_backups
+from home.src.ta.helper import UrlListParser
-from home.src.searching import Pagination, SearchHandler
+from home.src.ta.ta_redis import RedisArchivist
from home.tasks import extrac_dl, subscribe_to
from rest_framework.authtoken.models import Token

@ -169,8 +171,7 @@ class ArchivistResultsView(ArchivistViewConfig):

    def single_lookup(self, es_path):
        """retrieve a single item from url"""
-        es_url = self.default_conf["application"]["es_url"]
-        search = SearchHandler(f"{es_url}/{es_path}", data=False)
+        search = SearchHandler(es_path, config=self.default_conf)
        result = search.get_data()[0]["source"]
        return result

@ -189,8 +190,9 @@ class ArchivistResultsView(ArchivistViewConfig):

    def find_results(self):
        """add results and pagination to context"""
-        url = self.default_conf["application"]["es_url"] + self.es_search
-        search = SearchHandler(url, self.data)
+        search = SearchHandler(
+            self.es_search, config=self.default_conf, data=self.data
+        )
        self.context["results"] = search.get_data()
        self.pagination_handler.validate(search.max_hits)
        self.context["max_hits"] = search.max_hits
@ -203,7 +205,7 @@ class HomeView(ArchivistResultsView):
    """

    view_origin = "home"
-    es_search = "/ta_video/_search"
+    es_search = "ta_video/_search"

    def get(self, request):
        """handle get requests"""
@ -284,7 +286,7 @@ class DownloadView(ArchivistResultsView):
    """

    view_origin = "downloads"
-    es_search = "/ta_download/_search"
+    es_search = "ta_download/_search"

    def get(self, request):
        """handle get request"""
@ -346,7 +348,7 @@ class ChannelIdView(ArchivistResultsView):
    """

    view_origin = "home"
-    es_search = "/ta_video/_search"
+    es_search = "ta_video/_search"

    def get(self, request, channel_id):
        """get request"""
@ -395,7 +397,7 @@ class ChannelView(ArchivistResultsView):
    """

    view_origin = "channel"
-    es_search = "/ta_channel/_search"
+    es_search = "ta_channel/_search"

    def get(self, request):
        """handle get request"""
@ -445,7 +447,7 @@ class PlaylistIdView(ArchivistResultsView):
    """

    view_origin = "home"
-    es_search = "/ta_video/_search"
+    es_search = "ta_video/_search"

    def get(self, request, playlist_id):
        """handle get request"""
@ -521,7 +523,7 @@ class PlaylistView(ArchivistResultsView):
    """

    view_origin = "playlist"
-    es_search = "/ta_playlist/_search"
+    es_search = "ta_playlist/_search"

    def get(self, request):
        """handle get request"""
@ -592,9 +594,9 @@ class VideoView(View):

    def get(self, request, video_id):
        """get single video"""
-        es_url, colors, cast = self.read_config(user_id=request.user.id)
+        colors, cast = self.read_config(user_id=request.user.id)
-        url = f"{es_url}/ta_video/_doc/{video_id}"
+        path = f"ta_video/_doc/{video_id}"
-        look_up = SearchHandler(url, None)
+        look_up = SearchHandler(path, config=False)
        video_hit = look_up.get_data()
        video_data = video_hit[0]["source"]
        try:
@ -624,11 +626,11 @@ class VideoView(View):
        """build playlist nav if available"""
        all_navs = []
        for playlist_id in playlists:
-            handler = YoutubePlaylist(playlist_id)
+            playlist = YoutubePlaylist(playlist_id)
-            handler.get_playlist_dict()
+            playlist.get_from_es()
-            nav = handler.build_nav(video_id)
+            playlist.build_nav(video_id)
-            if nav:
+            if playlist.nav:
-                all_navs.append(nav)
+                all_navs.append(playlist.nav)

        return all_navs

@ -636,10 +638,9 @@ class VideoView(View):
    def read_config(user_id):
        """read config file"""
        config_handler = AppConfig(user_id)
-        es_url = config_handler.config["application"]["es_url"]
        cast = config_handler.config["application"]["enable_cast"]
        colors = config_handler.colors
-        return es_url, colors, cast
+        return colors, cast

    @staticmethod
    def star_creator(rating):
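Taken together, the view hunks change the SearchHandler calling convention: callers now pass a bare index path without a leading slash plus the loaded config, and SearchHandler resolves the Elasticsearch URL itself. A minimal sketch, illustrative only; that config comes from AppConfig().config and that data is a query body dict are assumptions, not something this diff spells out.

# illustrative only: old vs new SearchHandler call style from the hunks above
from home.src.frontend.searching import SearchHandler
from home.src.ta.config import AppConfig

config = AppConfig().config  # assumed source of the es_url resolved internally
data = {"query": {"match_all": {}}}  # hypothetical query body

# old style, pre-refactor: the caller prefixed the full Elasticsearch URL
#   search = SearchHandler(f"{es_url}/ta_video/_search", data)

# new style: bare index path, config and data passed as keywords
search = SearchHandler("ta_video/_search", config=config, data=data)
results = search.get_data()
print(search.max_hits)

# single document lookups can skip the config, as VideoView does above
look_up = SearchHandler("ta_video/_doc/some-video-id", config=False)  # hypothetical id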
@ -1,12 +1,12 @@
beautifulsoup4==4.10.0
celery==5.2.3
-django-cors-headers==3.11.0
Django==4.0.1
+django-cors-headers==3.11.0
djangorestframework==3.13.1
Pillow==9.0.0
-redis==4.1.0
+redis==4.1.1
requests==2.27.1
ryd-client==0.0.3
uWSGI==2.0.20
whitenoise==5.3.0
-yt_dlp==2021.12.27
+yt_dlp==2022.1.21
@ -286,6 +286,10 @@ button:hover {
    --connected-color: var(--accent-font-light);
}

+.alert-hover:hover {
+    filter: var(--img-filter-error);
+}
+
/* top of page */
.title-bar {
    padding-top: 30px;
@ -9,15 +9,15 @@
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-width="500"
+width="210mm"
-height="500"
+height="210mm"
-viewBox="0 0 132.29197 132.29167"
+viewBox="0 0 210 210"
version="1.1"
-id="svg1303"
+id="svg1566"
inkscape:version="0.92.4 (5da689c313, 2019-01-14)"
-sodipodi:docname="Icons_exit.svg">
+sodipodi:docname="Icons_exit 05.svg">
<defs
-id="defs1297" />
+id="defs1560" />
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
@ -25,20 +25,19 @@
borderopacity="1.0"
inkscape:pageopacity="0.0"
inkscape:pageshadow="2"
-inkscape:zoom="0.66442107"
+inkscape:zoom="0.35355339"
-inkscape:cx="161.45413"
+inkscape:cx="963.7258"
-inkscape:cy="207.61753"
+inkscape:cy="291.01609"
inkscape:document-units="mm"
inkscape:current-layer="layer1"
showgrid="false"
-units="px"
+inkscape:window-width="1920"
-inkscape:window-width="1169"
+inkscape:window-height="1009"
-inkscape:window-height="893"
+inkscape:window-x="-8"
-inkscape:window-x="729"
+inkscape:window-y="-8"
-inkscape:window-y="13"
+inkscape:window-maximized="1" />
-inkscape:window-maximized="0" />
<metadata
-id="metadata1300">
+id="metadata1563">
<rdf:RDF>
<cc:Work
rdf:about="">
|
|||||||
inkscape:label="Ebene 1"
|
inkscape:label="Ebene 1"
|
||||||
inkscape:groupmode="layer"
|
inkscape:groupmode="layer"
|
||||||
id="layer1"
|
id="layer1"
|
||||||
transform="translate(0,-164.70764)">
|
transform="translate(0,-87)">
|
||||||
<g
|
<path
|
||||||
id="g855"
|
style="opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:2.35654187;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke"
|
||||||
transform="matrix(1.9016362,0,0,1.9016362,-197.93838,-58.9418)">
|
d="M 106.49932,87.901069 C 49.504302,87.900974 3.3006913,134.10459 3.3007713,191.0996 c 0,0.30098 0.003,0.60131 0.005,0.90167 v 0 c -0.003,0.29952 -0.006,0.59901 -0.006,0.89912 -8e-5,56.99502 46.2035307,103.19865 103.1985287,103.19854 23.01714,-0.0773 45.34783,-7.84709 63.44155,-22.07425 0,0 9.01874,-8.71006 2.40579,-16.41737 -6.61297,-7.70731 -19.11222,0.3185 -19.11222,0.3185 -13.60985,9.81394 -29.95596,15.11012 -46.73512,15.14236 -44.275428,0 -80.167758,-35.89234 -80.167758,-80.16778 0,-0.30097 0.003,-0.60148 0.006,-0.90166 h -5.2e-4 c -0.003,-0.29934 -0.006,-0.59901 -0.006,-0.89913 0,-44.27545 35.89234,-80.16777 80.167778,-80.16777 16.77916,0.0322 33.12527,5.32843 46.73512,15.14236 0,0 12.49925,8.02581 19.11222,0.3185 6.61295,-7.70732 -2.4058,-16.41739 -2.4058,-16.41739 C 151.84561,95.74815 129.51494,87.97828 106.4978,87.901069 Z m 54.30959,56.450221 -12.13663,11.69622 20.15864,20.93332 -93.932488,-1.4899 c -9.22763,-0.17349 -16.77655,6.07423 -16.92587,14.00904 l 0.002,0.002 c -0.0149,1.82673 -0.0235,3.40102 0,4.99598 l -0.002,0.002 c 0.14932,7.93483 7.69824,14.18254 16.92587,14.00905 l 93.932488,-1.48991 -20.15864,20.93333 12.13663,11.69622 34.0585,-35.35536 11.82982,-12.29208 h 0.003 l -9.9e-4,-0.002 9.9e-4,-9.9e-4 h -0.003 l -11.82982,-12.29208 z"
|
||||||
<path
|
id="path1405"
|
||||||
inkscape:connector-curvature="0"
|
inkscape:connector-curvature="0"
|
||||||
id="rect1208"
|
sodipodi:nodetypes="cccccccsccsccsccscccccccccccccccccccccc" />
|
||||||
style="opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0;stroke-linecap:round;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke"
|
<path
|
||||||
d="m 124.57603,151.92962 c -0.0433,2.30016 2.0751,4.19245 4.75007,4.24278 l 30.26401,0.43007 -6.00195,5.78023 3.43246,3.56154 10.2778,-9.9006 0.002,0.002 3.5183,-3.3908 -3.42991,-3.564 -9.8737,-10.24989 -3.51834,3.39083 5.84388,6.06803 -30.35875,-0.43185 c -2.67494,-0.0503 -4.86301,1.76094 -4.90629,4.06112 z m -17.65039,-32.01644 v 64.95883 h 7.44347 v -58.27707 h 26.3896 v 18.5229 h 7.44296 v -25.20466 z m 33.83307,39.75416 v 25.20467 h 7.44296 v -25.20467 z" />
|
style="opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:2.39729571;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke"
|
||||||
</g>
|
d="m 506.57967,92.503023 c -57.98068,-1e-4 -104.98336,47.002567 -104.98326,104.983257 1.9e-4,57.98049 47.00276,104.98284 104.98326,104.98273 23.42489,-0.0758 46.15146,-7.98387 57.83458,-18.08923 11.68313,-10.10537 12.15613,-18.62993 7.38675,-23.04107 v -0.002 c -4.7711,-4.41269 -12.38099,-1.9587 -17.69245,2.25103 -13.83538,9.99805 -30.45915,15.40285 -47.52888,15.4528 -45.04116,0 -81.55421,-36.51305 -81.5542,-81.55419 0,-45.04114 36.51307,-81.5542 81.5542,-81.5542 17.06933,0.0328 33.21884,5.19482 43.16812,12.86758 9.94929,7.67275 17.33418,9.17607 22.1053,4.76338 v -0.002 c 4.77116,-4.41278 5.55882,-12.9887 -0.73482,-18.60197 -18.40654,-14.47308 -41.1234,-22.377337 -64.5386,-22.455877 z m 55.24881,57.426467 -12.34652,11.8985 20.50728,21.29534 -95.55697,-1.51567 c -9.38721,-0.17649 -17.06669,6.17929 -17.21858,14.25133 l 0.003,0.002 c -0.15192,8.07203 7.28245,14.71295 16.66978,14.88953 l 95.22519,1.50947 -21.06332,20.28455 12.04579,12.49846 36.06808,-34.74464 0.005,0.005 12.34654,-11.89954 -12.03701,-12.50724 z m 35.17874,98.71801 0.69918,0.67386 c 0.13539,-0.22412 0.26991,-0.44874 0.4036,-0.67386 z"
|
||||||
|
id="path1405-6"
|
||||||
|
inkscape:connector-curvature="0"
|
||||||
|
sodipodi:nodetypes="ccccccccsczccccccccccccccccccccccc" />
|
||||||
|
<path
|
||||||
|
style="opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:2.39729571;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke"
|
||||||
|
d="m 740.89945,94.730897 c -57.98068,-9.6e-5 -104.98334,47.002563 -104.98325,104.983253 1.9e-4,57.98049 47.00276,104.98284 104.98325,104.98274 23.42488,-0.0758 46.15145,-7.98387 64.5635,-22.46581 l -17.03461,-16.41553 c -13.83537,9.99805 -30.45916,15.40285 -47.52889,15.4528 -45.04113,0 -81.55419,-36.51306 -81.55419,-81.5542 0,-45.04114 36.51306,-81.55419 81.55419,-81.55419 17.06934,0.0328 33.69814,5.42058 47.54336,15.40423 l 16.99534,-16.3773 c -18.40663,-14.4732 -41.12349,-22.377447 -64.5387,-22.455993 z m 55.24882,57.426473 -12.34653,11.8985 20.50728,21.29534 -95.55696,-1.51567 c -9.38721,-0.17649 -17.06668,6.17928 -17.21858,14.25132 l 0.002,0.002 c -0.1519,8.07203 7.28245,14.71295 16.66978,14.88953 l 95.22519,1.50947 -21.06332,20.28455 12.04578,12.49846 36.06808,-34.74465 0.005,0.005 12.34653,-11.89953 -12.03699,-12.50725 z m 35.17873,98.718 0.69919,0.67386 c 0.13538,-0.22412 0.26991,-0.44874 0.40359,-0.67386 z"
|
||||||
|
id="path1405-9"
|
||||||
|
inkscape:connector-curvature="0"
|
||||||
|
sodipodi:nodetypes="ccccccsccccccccccccccccccccccc" />
|
||||||
</g>
|
</g>
|
||||||
</svg>
|
</svg>
|
||||||
Before Size: 2.5 KiB, After Size: 6.0 KiB