tubearchivist/tubearchivist/home/src/index.py

"""
Functionality:
- index new videos into elastisearch
- extract video info with yt_dlp
- scrape youtube channel page if needed
"""
import json
import os
import re
from datetime import datetime
from time import sleep

import requests
import yt_dlp as youtube_dl
from bs4 import BeautifulSoup
from home.src.config import AppConfig
from home.src.helper import clean_string, DurationConverter


class YoutubeChannel:
    """ represents a single youtube channel """

    CONFIG = AppConfig().config
    ES_URL = CONFIG['application']['es_url']
    CACHE_DIR = CONFIG['application']['cache_dir']

    def __init__(self, channel_id):
        self.channel_id = channel_id
        self.json_data = None
        self.source = None
        self.channel_dict = self.build_channel_dict()

    def build_channel_dict(self, scrape=False):
        """ combine the dicts built from extracted json payload """
        if scrape:
            channel_dict = False
        else:
            channel_dict = self.get_es_channel()
        if not channel_dict:
            print('scrape data from youtube')
            self.scrape_channel()
            channel_dict = self.parse_channel_main()
            channel_dict.update(self.parse_channel_meta())
            self.source = 'scraped'
        return channel_dict

    def get_es_channel(self):
        """ get from elastic search first if possible """
        channel_id = self.channel_id
        url = f'{self.ES_URL}/ta_channel/_doc/{channel_id}'
        response = requests.get(url)
        if response.ok:
            channel_source = response.json()['_source']
            self.source = 'elastic'
            return channel_source
        return False

    def scrape_channel(self):
        """ scrape channel page for additional info """
        channel_id = self.channel_id
        url = f'https://www.youtube.com/channel/{channel_id}/about?hl=en'
        response = requests.get(url)
        if response.ok:
            channel_page = response.text
        else:
            print(f'failed to extract channel info for: {channel_id}')
            raise ConnectionError
        soup = BeautifulSoup(channel_page, 'html.parser')
        # find the script tag holding the initial data payload
        script_content = False
        all_scripts = soup.find('body').find_all('script')
        for script in all_scripts:
            if 'var ytInitialData = ' in str(script):
                script_content = str(script)
                break
        if not script_content:
            raise ValueError(f'ytInitialData not found for {channel_id}')
        # extract payload, strip trailing ';</script>' characters
        script_content = script_content.split('var ytInitialData = ')[1]
        json_raw = script_content.rstrip(';</script>')
        json_data = json.loads(json_raw)
        # add to self
        self.json_data = json_data

    def parse_channel_main(self):
        """ extract maintab values from scraped channel json data """
        main_tab = self.json_data['header']['c4TabbedHeaderRenderer']
        channel_name = main_tab['title']
        last_refresh = int(datetime.now().strftime("%s"))
        # channel_subs
        try:
            sub_text_simple = main_tab['subscriberCountText']['simpleText']
            sub_text = sub_text_simple.split(' ')[0]
            if sub_text[-1] == 'K':
                channel_subs = int(float(sub_text.replace('K', '')) * 1000)
            elif sub_text[-1] == 'M':
                channel_subs = int(float(sub_text.replace('M', '')) * 1000000)
            else:
                channel_subs = int(sub_text)
        except KeyError:
            channel_subs = 0
        except ValueError:
            print(f'{sub_text} not dealt with')
            channel_subs = 0
        # banner
        try:
            all_banners = main_tab['banner']['thumbnails']
            banner = sorted(all_banners, key=lambda k: k['width'])[-1]['url']
        except KeyError:
            banner = False
        # build and return dict
        main_channel_dict = {
            'channel_active': True,
            'channel_last_refresh': last_refresh,
            'channel_subs': channel_subs,
            'channel_banner_url': banner,
            'channel_name': channel_name,
            'channel_id': self.channel_id
        }
        return main_channel_dict

    def parse_channel_meta(self):
        """ extract meta tab values from channel payload """
        # meta tab
        json_data = self.json_data
        meta_tab = json_data['metadata']['channelMetadataRenderer']
        description = meta_tab['description']
        all_thumbs = meta_tab['avatar']['thumbnails']
        thumb_url = sorted(all_thumbs, key=lambda k: k['width'])[-1]['url']
        # stats tab, fall back to empty dict if About tab is missing
        about_tab = {}
        renderer = 'twoColumnBrowseResultsRenderer'
        all_tabs = json_data['contents'][renderer]['tabs']
        for tab in all_tabs:
            if 'tabRenderer' in tab.keys():
                if tab['tabRenderer']['title'] == 'About':
                    about_tab = (tab['tabRenderer']['content']
                                 ['sectionListRenderer']['contents'][0]
                                 ['itemSectionRenderer']['contents'][0]
                                 ['channelAboutFullMetadataRenderer'])
                    break
        try:
            channel_views_text = about_tab['viewCountText']['simpleText']
            channel_views = int(re.sub(r"\D", "", channel_views_text))
        except KeyError:
            channel_views = 0
        meta_channel_dict = {
            'channel_description': description,
            'channel_thumb_url': thumb_url,
            'channel_views': channel_views
        }
        return meta_channel_dict

    def upload_to_es(self):
        """ upload channel data to elastic search """
        url = f'{self.ES_URL}/ta_channel/_doc/{self.channel_id}'
        response = requests.put(url, json=self.channel_dict)
        print(f'added {self.channel_id} to es')
        if not response.ok:
            print(response.text)

    def clear_cache(self):
        """ delete banner and thumb from cache if available """
        channel_cache = os.path.join(self.CACHE_DIR, 'channels')
        thumb = os.path.join(channel_cache, self.channel_id + '_thumb.jpg')
        banner = os.path.join(channel_cache, self.channel_id + '_banner.jpg')
        if os.path.exists(thumb):
            os.remove(thumb)
        if os.path.exists(banner):
            os.remove(banner)

    def sync_to_videos(self):
        """ sync new channel_dict to all videos of channel """
        headers = {'Content-type': 'application/json'}
        channel_id = self.channel_id
        # add ingest pipeline
        processors = []
        for field, value in self.channel_dict.items():
            line = {"set": {"field": "channel." + field, "value": value}}
            processors.append(line)
        data = {
            "description": channel_id,
            "processors": processors
        }
        payload = json.dumps(data)
        url = self.ES_URL + '/_ingest/pipeline/' + channel_id
        request = requests.put(url, data=payload, headers=headers)
        if not request.ok:
            print(request.text)
        # apply pipeline
        data = {
            "query": {"match": {"channel.channel_id": channel_id}}
        }
        payload = json.dumps(data)
        url = self.ES_URL + '/ta_video/_update_by_query?pipeline=' + channel_id
        request = requests.post(url, data=payload, headers=headers)
        if not request.ok:
            print(request.text)
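
    # sync_to_videos builds an es ingest pipeline; sketched shape below
    # (single processor shown, one is added per channel_dict field):
    #   PUT <es_url>/_ingest/pipeline/<channel_id>
    #   {"description": "<channel_id>",
    #    "processors": [{"set": {"field": "channel.channel_name",
    #                            "value": "..."}}]}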

    def get_total_hits(self):
        """ get total channels indexed """
        headers = {'Content-type': 'application/json'}
        data = {"query": {"match_all": {}}}
        payload = json.dumps(data)
        url = f'{self.ES_URL}/ta_channel/_search?filter_path=hits.total'
        request = requests.post(url, data=payload, headers=headers)
        if not request.ok:
            print(request.text)
        total_hits = json.loads(request.text)['hits']['total']['value']
        return total_hits
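

# usage sketch, assuming a reachable elastic search instance
# (the channel id below is a placeholder, not a real channel):
#   channel = YoutubeChannel('UCxxxxxxxxxxxxxxxxxxxxxx')
#   channel.upload_to_es()
#   channel.sync_to_videos()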


class YoutubeVideo:
    """ represents a single youtube video """

    CONFIG = AppConfig().config
    ES_URL = CONFIG['application']['es_url']
    CACHE_DIR = CONFIG['application']['cache_dir']

    def __init__(self, youtube_id):
        self.youtube_id = youtube_id
        self.channel_id = None
        self.vid_dict = self.get_wrapper()

    def get_wrapper(self):
        """ wrapper to loop around youtube_dl to retry on failure """
        print(f'get video data for {self.youtube_id}')
        vid_dict = False
        for i in range(3):
            try:
                vid_dict = self.get_youtubedl_vid_data()
            except KeyError as err:
                print(err)
                sleep((i + 1) ** 2)
                continue
            else:
                break
        return vid_dict

    def get_youtubedl_vid_data(self):
        """ parse youtubedl extract info """
        youtube_id = self.youtube_id
        obs = {
            'quiet': True,
            'default_search': 'ytsearch',
            'skip_download': True
        }
        try:
            vid = youtube_dl.YoutubeDL(obs).extract_info(youtube_id)
        except (
            youtube_dl.utils.ExtractorError,
            youtube_dl.utils.DownloadError,
        ):
            print('failed to get info for ' + youtube_id)
            return False
        # extract
        self.channel_id = vid['channel_id']
        upload_date = vid['upload_date']
        upload_date_time = datetime.strptime(upload_date, "%Y%m%d")
        published = upload_date_time.strftime("%Y-%m-%d")
        last_refresh = int(datetime.now().strftime("%s"))
        # likes
        try:
            like_count = vid['like_count']
        except KeyError:
            like_count = 0
        try:
            dislike_count = vid['dislike_count']
        except KeyError:
            dislike_count = 0
        # build dicts
        stats = {
            "view_count": vid['view_count'],
            "like_count": like_count,
            "dislike_count": dislike_count,
            "average_rating": vid['average_rating']
        }
        vid_basic = {
            "title": vid['title'],
            "description": vid['description'],
            "category": vid['categories'],
            "vid_thumb_url": vid['thumbnail'],
            "tags": vid['tags'],
            "published": published,
            "stats": stats,
            "vid_last_refresh": last_refresh,
            "date_downloaded": last_refresh,
            "youtube_id": youtube_id,
            "active": True,
            "channel": False
        }
        return vid_basic

    def add_new_player(self):
        """ add player information for new videos """
        cache_path = self.CACHE_DIR + '/download/'
        all_cached = os.listdir(cache_path)
        player = False
        for file_cached in all_cached:
            if self.youtube_id in file_cached:
                vid_path = os.path.join(cache_path, file_cached)
                duration_handler = DurationConverter()
                duration = duration_handler.get_sec(vid_path)
                duration_str = duration_handler.get_str(duration)
                player = {
                    "watched": False,
                    "duration": duration,
                    "duration_str": duration_str
                }
                break
        self.vid_dict['player'] = player

    def build_file_path(self, channel_name):
        """ build media_url from where file will be located """
        clean_channel_name = clean_string(channel_name)
        timestamp = self.vid_dict['published'].replace('-', '')
        youtube_id = self.vid_dict['youtube_id']
        title = self.vid_dict['title']
        clean_title = clean_string(title)
        filename = f'{timestamp}_{youtube_id}_{clean_title}.mp4'
        media_url = os.path.join(clean_channel_name, filename)
        self.vid_dict['media_url'] = media_url

    def get_es_data(self):
        """ get current data from elastic search """
        url = self.ES_URL + '/ta_video/_doc/' + self.youtube_id
        response = requests.get(url)
        if not response.ok:
            print(response.text)
        es_vid_dict = json.loads(response.text)
        return es_vid_dict

    def upload_to_es(self):
        """ upload video data to elastic search """
        url = f'{self.ES_URL}/ta_video/_doc/{self.youtube_id}'
        response = requests.put(url, json=self.vid_dict)
        if not response.ok:
            print(response.text)

    def delete_cache(self):
        """ delete thumbnail from cache if it exists """
        video_cache = os.path.join(self.CACHE_DIR, 'videos')
        thumb = os.path.join(video_cache, self.youtube_id + '.jpg')
        if os.path.exists(thumb):
            os.remove(thumb)

    def deactivate(self):
        """ deactivate document on extractor error """
        youtube_id = self.youtube_id
        headers = {'Content-type': 'application/json'}
        url = f'{self.ES_URL}/ta_video/_update/{youtube_id}'
        data = {"script": "ctx._source.active = false"}
        json_str = json.dumps(data)
        response = requests.post(url, data=json_str, headers=headers)
        print(f'deactivated {youtube_id}')
        if not response.ok:
            print(response.text)


def index_new_video(youtube_id):
    """ combine video and channel classes for new video index """
    vid_handler = YoutubeVideo(youtube_id)
    if not vid_handler.vid_dict:
        # extraction failed on all retries, nothing to index
        print(f'failed to index {youtube_id}')
        return False
    channel_handler = YoutubeChannel(vid_handler.channel_id)
    # add filepath to vid_dict
    channel_name = channel_handler.channel_dict['channel_name']
    vid_handler.build_file_path(channel_name)
    # add channel and player to video
    vid_handler.add_new_player()
    vid_handler.vid_dict['channel'] = channel_handler.channel_dict
    # add new channel to es
    if channel_handler.source == 'scraped':
        channel_handler.channel_dict['channel_subscribed'] = False
        channel_handler.upload_to_es()
    # upload video to es
    vid_handler.upload_to_es()
    # return vid_dict for further processing
    return vid_handler.vid_dict
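

# typical call site sketch (placeholder youtube id, assumed to be invoked
# from the download task after the media file lands in the cache dir):
#   new_vid_dict = index_new_video('xxxxxxxxxxx')
#   print(new_vid_dict['media_url'])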