"""
Functionality:
- index new videos into elasticsearch
- extract video info with yt_dlp
- scrape youtube channel page if needed
"""
|
|
|
|
|
|
|
|
import json
|
2021-09-18 13:02:54 +00:00
|
|
|
import os
|
2021-09-05 17:10:14 +00:00
|
|
|
import re
|
|
|
|
from datetime import datetime
|
|
|
|
from time import sleep
|
|
|
|
|
|
|
|
import requests
|
|
|
|
import yt_dlp as youtube_dl
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from home.src.config import AppConfig
|
2021-10-31 09:04:28 +00:00
|
|
|
from home.src.helper import DurationConverter, UrlListParser, clean_string
|
2021-10-11 09:03:25 +00:00
|
|
|
from home.src.thumbnails import ThumbManager
|
2021-09-05 17:10:14 +00:00
|
|
|
|
|
|
|
|
|
|
|
class YoutubeChannel:
    """represents a single youtube channel

    Channel metadata is read from elasticsearch when already indexed,
    otherwise scraped from the channel's youtube about page.
    """

    CONFIG = AppConfig().config
    ES_URL = CONFIG["application"]["es_url"]
    ES_AUTH = CONFIG["application"]["es_auth"]
    CACHE_DIR = CONFIG["application"]["cache_dir"]
    VIDEOS = CONFIG["application"]["videos"]

    def __init__(self, channel_id):
        self.channel_id = channel_id
        # raw ytInitialData payload, set by scrape_channel()
        self.json_data = None
        # where channel_dict came from: "elastic" or "scraped"
        self.source = None
        self.channel_dict = self.build_channel_dict()

    def build_channel_dict(self, scrape=False):
        """combine the dicts build from extracted json payload

        scrape: skip the elasticsearch lookup and force a fresh scrape
        """
        if scrape:
            channel_dict = False
        else:
            channel_dict = self.get_es_channel()

        if not channel_dict:
            print("scrape data from youtube")
            self.scrape_channel()
            channel_dict = self.parse_channel_main()
            channel_dict.update(self.parse_channel_meta())
            self.source = "scraped"

        return channel_dict

    def get_es_channel(self):
        """get from elastic search first if possible

        returns the indexed channel document or False when not found
        """
        channel_id = self.channel_id
        url = f"{self.ES_URL}/ta_channel/_doc/{channel_id}"
        response = requests.get(url, auth=self.ES_AUTH)
        if response.ok:
            channel_source = response.json()["_source"]
            self.source = "elastic"
            return channel_source

        return False

    def scrape_channel(self):
        """scrape channel page for additional infos

        stores the extracted ytInitialData payload on self.json_data,
        raises ConnectionError when the page can't be fetched
        """
        channel_id = self.channel_id
        url = f"https://www.youtube.com/channel/{channel_id}/about?hl=en"
        # consent cookie skips the EU interstitial page
        cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"}
        # FIX: no auth here — previously the elasticsearch credentials
        # (ES_AUTH) were sent to youtube.com as basic auth
        response = requests.get(url, cookies=cookies)
        if response.ok:
            channel_page = response.text
        else:
            print(f"failed to extract channel info for: {channel_id}")
            raise ConnectionError

        soup = BeautifulSoup(channel_page, "html.parser")
        # load script into json: find the tag holding ytInitialData
        script_content = False  # FIX: previously unbound if no tag matched
        all_scripts = soup.find("body").find_all("script")
        for script in all_scripts:
            if "var ytInitialData = " in str(script):
                script_content = str(script)
                break

        if not script_content:
            # page layout changed, nothing to parse
            raise ValueError(f"failed to find ytInitialData for {channel_id}")

        # extract payload between the assignment and the closing tag
        script_content = script_content.split("var ytInitialData = ")[1]
        # rstrip takes a char *set*, safe here because json ends on "}"
        json_raw = script_content.rstrip(";</script>")
        json_data = json.loads(json_raw)
        # add to self
        self.json_data = json_data

    def parse_channel_main(self):
        """extract maintab values from scraped channel json data"""
        main_tab = self.json_data["header"]["c4TabbedHeaderRenderer"]
        channel_name = main_tab["title"]
        # epoch seconds; NOTE: %s is glibc specific, not portable
        last_refresh = int(datetime.now().strftime("%s"))
        # channel_subs: youtube abbreviates counts, e.g. "1.23K subscribers"
        try:
            sub_text_simple = main_tab["subscriberCountText"]["simpleText"]
            sub_text = sub_text_simple.split(" ")[0]
            if sub_text[-1] == "K":
                channel_subs = int(float(sub_text.replace("K", "")) * 1000)
            elif sub_text[-1] == "M":
                channel_subs = int(float(sub_text.replace("M", "")) * 1000000)
            elif int(sub_text) >= 0:
                channel_subs = int(sub_text)
            else:
                message = f"{sub_text} not dealt with"
                print(message)
                # FIX: fall back to 0 — previously channel_subs stayed
                # unbound here and building the dict raised NameError
                channel_subs = 0
        except KeyError:
            channel_subs = 0
        # banner: pick the widest variant available
        try:
            all_banners = main_tab["banner"]["thumbnails"]
            banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"]
        except KeyError:
            banner = False
        # build and return dict
        main_channel_dict = {
            "channel_active": True,
            "channel_last_refresh": last_refresh,
            "channel_subs": channel_subs,
            "channel_banner_url": banner,
            "channel_name": channel_name,
            "channel_id": self.channel_id,
        }
        return main_channel_dict

    def parse_channel_meta(self):
        """extract meta tab values from channel payload"""
        # meta tab
        json_data = self.json_data
        meta_tab = json_data["metadata"]["channelMetadataRenderer"]
        description = meta_tab["description"]
        all_thumbs = meta_tab["avatar"]["thumbnails"]
        thumb_url = sorted(all_thumbs, key=lambda k: k["width"])[-1]["url"]
        # stats tab: locate the About tab to read the view counter
        renderer = "twoColumnBrowseResultsRenderer"
        all_tabs = json_data["contents"][renderer]["tabs"]
        # FIX: default to empty dict so a missing About tab falls through
        # to the KeyError handler below instead of raising NameError
        about_tab = {}
        for tab in all_tabs:
            if "tabRenderer" in tab.keys():
                if tab["tabRenderer"]["title"] == "About":
                    about_tab = tab["tabRenderer"]["content"][
                        "sectionListRenderer"
                    ]["contents"][0]["itemSectionRenderer"]["contents"][0][
                        "channelAboutFullMetadataRenderer"
                    ]
                    break
        try:
            channel_views_text = about_tab["viewCountText"]["simpleText"]
            # strip every non-digit, e.g. "1,234,567 views" -> 1234567
            channel_views = int(re.sub(r"\D", "", channel_views_text))
        except KeyError:
            channel_views = 0

        meta_channel_dict = {
            "channel_description": description,
            "channel_thumb_url": thumb_url,
            "channel_views": channel_views,
        }

        return meta_channel_dict

    def get_channel_art(self):
        """download channel art for new channels"""
        channel_id = self.channel_id
        channel_thumb = self.channel_dict["channel_thumb_url"]
        channel_banner = self.channel_dict["channel_banner_url"]
        ThumbManager().download_chan(
            [(channel_id, channel_thumb, channel_banner)]
        )

    def upload_to_es(self):
        """upload channel data to elastic search"""
        url = f"{self.ES_URL}/ta_channel/_doc/{self.channel_id}"
        response = requests.put(url, json=self.channel_dict, auth=self.ES_AUTH)
        print(f"added {self.channel_id} to es")
        if not response.ok:
            print(response.text)

    def sync_to_videos(self):
        """sync new channel_dict to all videos of channel"""
        headers = {"Content-type": "application/json"}
        channel_id = self.channel_id
        # add ingest pipeline: one set-processor per channel field
        processors = []
        for field, value in self.channel_dict.items():
            line = {"set": {"field": "channel." + field, "value": value}}
            processors.append(line)
        data = {"description": channel_id, "processors": processors}
        payload = json.dumps(data)
        url = self.ES_URL + "/_ingest/pipeline/" + channel_id
        request = requests.put(
            url, data=payload, headers=headers, auth=self.ES_AUTH
        )
        if not request.ok:
            print(request.text)
        # apply pipeline to every video of the channel
        data = {"query": {"match": {"channel.channel_id": channel_id}}}
        payload = json.dumps(data)
        url = self.ES_URL + "/ta_video/_update_by_query?pipeline=" + channel_id
        request = requests.post(
            url, data=payload, headers=headers, auth=self.ES_AUTH
        )
        if not request.ok:
            print(request.text)

    def get_folder_path(self):
        """get folder where media files get stored"""
        channel_name = self.channel_dict["channel_name"]
        folder_name = clean_string(channel_name)
        folder_path = os.path.join(self.VIDEOS, folder_name)
        return folder_path

    def delete_es_videos(self):
        """delete all channel documents from elasticsearch"""
        headers = {"Content-type": "application/json"}
        data = {
            "query": {
                "term": {"channel.channel_id": {"value": self.channel_id}}
            }
        }
        payload = json.dumps(data)
        url = self.ES_URL + "/ta_video/_delete_by_query"
        response = requests.post(
            url, data=payload, headers=headers, auth=self.ES_AUTH
        )
        if not response.ok:
            print(response.text)

    def delete_channel(self):
        """delete channel and all videos"""
        print(f"deleting {self.channel_id} and all matching media files")
        folder_path = self.get_folder_path()
        print("delete all media files")
        # NOTE: assumes a flat channel folder, os.remove fails on subdirs
        all_videos = os.listdir(folder_path)
        for video in all_videos:
            video_path = os.path.join(folder_path, video)
            os.remove(video_path)
        os.rmdir(folder_path)
        ThumbManager().delete_chan_thumb(self.channel_id)

        print("delete indexed videos")
        self.delete_es_videos()
        url = self.ES_URL + "/ta_channel/_doc/" + self.channel_id
        response = requests.delete(url, auth=self.ES_AUTH)
        if not response.ok:
            print(response.text)
|
2021-09-05 17:10:14 +00:00
|
|
|
|
|
|
|
|
|
|
|
class YoutubeVideo:
    """represents a single youtube video

    Metadata is extracted with yt_dlp and indexed into elasticsearch.
    """

    CONFIG = AppConfig().config
    ES_URL = CONFIG["application"]["es_url"]
    ES_AUTH = CONFIG["application"]["es_auth"]
    CACHE_DIR = CONFIG["application"]["cache_dir"]
    VIDEOS = CONFIG["application"]["videos"]

    def __init__(self, youtube_id):
        self.youtube_id = youtube_id
        # set by get_youtubedl_vid_data()
        self.channel_id = None
        # set by get_vid_dict(), False on failure
        self.vid_dict = None

    def get_vid_dict(self):
        """wrapper to loop around youtube_dl to retry on failure"""
        print(f"get video data for {self.youtube_id}")
        vid_dict = False
        for i in range(3):
            try:
                vid_dict = self.get_youtubedl_vid_data()
            except KeyError as e:
                print(e)
                # quadratic backoff: 1s, 4s, 9s
                sleep((i + 1) ** 2)
                continue
            else:
                break

        self.vid_dict = vid_dict

    def get_youtubedl_vid_data(self):
        """parse youtubedl extract info

        returns the basic video dict, or False on extractor failure
        """
        youtube_id = self.youtube_id
        obs = {
            "quiet": True,
            "default_search": "ytsearch",
            "skip_download": True,
        }
        try:
            vid = youtube_dl.YoutubeDL(obs).extract_info(youtube_id)
        except (
            youtube_dl.utils.ExtractorError,
            youtube_dl.utils.DownloadError,
        ):
            print("failed to get info for " + youtube_id)
            return False
        # extract
        self.channel_id = vid["channel_id"]
        upload_date = vid["upload_date"]
        upload_date_time = datetime.strptime(upload_date, "%Y%m%d")
        published = upload_date_time.strftime("%Y-%m-%d")
        # epoch seconds; NOTE: %s is glibc specific, not portable
        last_refresh = int(datetime.now().strftime("%s"))
        # likes: counters are not always present in the payload,
        # .get replaces the previous try/except KeyError pairs
        like_count = vid.get("like_count", 0)
        dislike_count = vid.get("dislike_count", 0)
        # build dicts
        stats = {
            "view_count": vid["view_count"],
            "like_count": like_count,
            "dislike_count": dislike_count,
            "average_rating": vid["average_rating"],
        }
        vid_basic = {
            "title": vid["title"],
            "description": vid["description"],
            "category": vid["categories"],
            "vid_thumb_url": vid["thumbnail"],
            "tags": vid["tags"],
            "published": published,
            "stats": stats,
            "vid_last_refresh": last_refresh,
            "date_downloaded": last_refresh,
            "youtube_id": youtube_id,
            "active": True,
            # channel dict gets attached later by index_new_video
            "channel": False,
        }

        return vid_basic

    def add_player(self, missing_vid):
        """add player information for new videos

        missing_vid: (channel_name, file_name, _) tuple when coming
        from a filesystem scan, falsy when coming from the downloader
        """
        cache_path = self.CACHE_DIR + "/download/"
        videos = self.VIDEOS

        vid_path = False  # FIX: previously unbound if nothing matched
        if missing_vid:
            # coming from scan_filesystem
            channel_name, file_name, _ = missing_vid
            vid_path = os.path.join(videos, channel_name, file_name)
        else:
            # coming from VideoDownload
            all_cached = os.listdir(cache_path)
            for file_cached in all_cached:
                if self.youtube_id in file_cached:
                    vid_path = os.path.join(cache_path, file_cached)
                    break

        if not vid_path:
            # explicit error instead of the previous NameError
            raise FileNotFoundError(
                f"no media file found for {self.youtube_id}"
            )

        duration_handler = DurationConverter()
        duration = duration_handler.get_sec(vid_path)
        duration_str = duration_handler.get_str(duration)
        player = {
            "watched": False,
            "duration": duration,
            "duration_str": duration_str,
        }
        self.vid_dict["player"] = player

    def build_file_path(self, channel_name):
        """build media_url from where file will be located"""
        clean_channel_name = clean_string(channel_name)
        timestamp = self.vid_dict["published"].replace("-", "")
        youtube_id = self.vid_dict["youtube_id"]
        title = self.vid_dict["title"]
        clean_title = clean_string(title)
        filename = f"{timestamp}_{youtube_id}_{clean_title}.mp4"
        media_url = os.path.join(clean_channel_name, filename)
        self.vid_dict["media_url"] = media_url

    def get_es_data(self):
        """get current data from elastic search"""
        url = self.ES_URL + "/ta_video/_doc/" + self.youtube_id
        response = requests.get(url, auth=self.ES_AUTH)
        if not response.ok:
            print(response.text)
        es_vid_dict = json.loads(response.text)
        return es_vid_dict

    def upload_to_es(self):
        """upload video data to elastic search"""
        url = f"{self.ES_URL}/ta_video/_doc/{self.youtube_id}"
        response = requests.put(url, json=self.vid_dict, auth=self.ES_AUTH)
        if not response.ok:
            print(response.text)

    def deactivate(self):
        """deactivate document on extractor error"""
        youtube_id = self.youtube_id
        headers = {"Content-type": "application/json"}
        url = f"{self.ES_URL}/ta_video/_update/{youtube_id}"
        data = {"script": "ctx._source.active = false"}
        json_str = json.dumps(data)
        response = requests.post(
            url, data=json_str, headers=headers, auth=self.ES_AUTH
        )
        print(f"deactivated {youtube_id}")
        if not response.ok:
            print(response.text)

    def delete_media_file(self):
        """delete video file, meta data, thumbnails"""
        # delete media file
        es_vid_dict = self.get_es_data()
        media_url = es_vid_dict["_source"]["media_url"]
        print(f"delete {media_url} from file system")
        to_delete = os.path.join(self.VIDEOS, media_url)
        os.remove(to_delete)
        # delete from index
        url = f"{self.ES_URL}/ta_video/_doc/{self.youtube_id}"
        response = requests.delete(url, auth=self.ES_AUTH)
        if not response.ok:
            print(response.text)
        # delete thumbs from cache
        ThumbManager().delete_vid_thumb(self.youtube_id)
|
2021-10-08 09:18:01 +00:00
|
|
|
|
2021-09-05 17:10:14 +00:00
|
|
|
|
2021-09-22 04:43:38 +00:00
|
|
|
class WatchState:
    """handle watched checkbox for videos and channels"""

    CONFIG = AppConfig().config
    ES_URL = CONFIG["application"]["es_url"]
    ES_AUTH = CONFIG["application"]["es_auth"]
    HEADERS = {"Content-type": "application/json"}

    def __init__(self, youtube_id):
        self.youtube_id = youtube_id
        # timestamp recorded once, reused for every watched_date update
        self.stamp = int(datetime.now().strftime("%s"))

    def mark_as_watched(self):
        """update es with new watched value"""
        if self.dedect_type() == "video":
            self.mark_vid_watched()
        elif self.dedect_type() == "channel":
            self.mark_channel_watched()

        print(f"marked {self.youtube_id} as watched")

    def mark_as_unwatched(self):
        """revert watched state to false"""
        if self.dedect_type() == "video":
            self.mark_vid_watched(revert=True)
        elif self.dedect_type() == "channel":
            self.mark_channel_watched(revert=True)

        print(f"revert {self.youtube_id} as unwatched")

    def dedect_type(self):
        """find youtube id type"""
        first_parsed = UrlListParser(self.youtube_id).process_list()[0]
        return first_parsed["type"]

    def mark_vid_watched(self, revert=False):
        """change watched status of single video"""
        # revert flips the flag, the watched_date is refreshed either way
        doc_update = {
            "doc": {
                "player": {
                    "watched": not revert,
                    "watched_date": self.stamp,
                }
            }
        }
        response = requests.post(
            f"{self.ES_URL}/ta_video/_update/{self.youtube_id}",
            data=json.dumps(doc_update),
            headers=self.HEADERS,
            auth=self.ES_AUTH,
        )
        if not response.ok:
            print(response.text)

    def mark_channel_watched(self, revert=False):
        """change watched status of every video in channel"""
        youtube_id = self.youtube_id
        # create pipeline setting the watched flag and timestamp
        pipeline = {
            "description": youtube_id,
            "processors": [
                {"set": {"field": "player.watched", "value": not revert}},
                {"set": {"field": "player.watched_date", "value": self.stamp}},
            ],
        }
        response = requests.put(
            f"{self.ES_URL}/_ingest/pipeline/{youtube_id}",
            data=json.dumps(pipeline),
            headers=self.HEADERS,
            auth=self.ES_AUTH,
        )
        if not response.ok:
            print(response.text)
            raise ValueError("failed to post ingest pipeline")

        # apply pipeline to all not-yet-watched videos of the channel
        query = {
            "query": {
                "bool": {
                    "must": [
                        {
                            "term": {
                                "channel.channel_id": {"value": youtube_id}
                            }
                        },
                        {"term": {"player.watched": {"value": False}}},
                    ]
                }
            }
        }
        response = requests.post(
            f"{self.ES_URL}/ta_video/_update_by_query?pipeline={youtube_id}",
            data=json.dumps(query),
            headers=self.HEADERS,
            auth=self.ES_AUTH,
        )
        if not response.ok:
            print(response.text)
|
|
|
|
|
|
|
|
|
2021-09-08 05:32:53 +00:00
|
|
|
def index_new_video(youtube_id, missing_vid=False):
    """combine video and channel classes for new video index"""
    video = YoutubeVideo(youtube_id)
    video.get_vid_dict()
    if not video.vid_dict:
        raise ValueError("failed to get metadata for " + youtube_id)

    channel = YoutubeChannel(video.channel_id)
    # add filepath to vid_dict
    video.build_file_path(channel.channel_dict["channel_name"])
    # add channel and player to video
    video.add_player(missing_vid)
    video.vid_dict["channel"] = channel.channel_dict
    # add new channel to es
    if channel.source == "scraped":
        channel.channel_dict["channel_subscribed"] = False
        channel.upload_to_es()
        channel.get_channel_art()
    # upload video to es
    video.upload_to_es()
    # return vid_dict for further processing
    return video.vid_dict
|