""" Functionality: - index new videos into elastisearch - extract video info with yt_dlp - scrape youtube channel page if needed """ import json import re from datetime import datetime from time import sleep import os import requests import yt_dlp as youtube_dl from bs4 import BeautifulSoup from home.src.config import AppConfig from home.src.helper import clean_string, DurationConverter class YoutubeChannel: """ represents a single youtube channel """ CONFIG = AppConfig().config ES_URL = CONFIG['application']['es_url'] CACHE_DIR = CONFIG['application']['cache_dir'] def __init__(self, channel_id): self.channel_id = channel_id self.json_data = None self.source = None self.channel_dict = self.build_channel_dict() def build_channel_dict(self, scrape=False): """ combine the dicts build from extracted json payload """ if scrape: channel_dict = False else: channel_dict = self.get_es_channel() if not channel_dict: print('scrape data from youtube') self.scrape_channel() channel_dict = self.parse_channel_main() channel_dict.update(self.parse_channel_meta()) self.source = 'scraped' return channel_dict def get_es_channel(self): """ get from elastic search first if possible """ channel_id = self.channel_id url = f'{self.ES_URL}/ta_channel/_doc/{channel_id}' response = requests.get(url) if response.ok: channel_source = response.json()['_source'] self.source = 'elastic' return channel_source return False def scrape_channel(self): """ scrape channel page for additional infos """ channel_id = self.channel_id url = f'https://www.youtube.com/channel/{channel_id}/about?hl=en' response = requests.get(url) if response.ok: channel_page = response.text else: print(f'failed to extract channel info for: {channel_id}') raise ConnectionError soup = BeautifulSoup(channel_page, 'html.parser') # load script into json all_scripts = soup.find('body').find_all('script') for script in all_scripts: if 'var ytInitialData = ' in str(script): script_content = str(script) break # extract payload script_content = script_content.split('var ytInitialData = ')[1] json_raw = script_content.rstrip(';') json_data = json.loads(json_raw) # add to self self.json_data = json_data def parse_channel_main(self): """ extract maintab values from scraped channel json data """ main_tab = self.json_data['header']['c4TabbedHeaderRenderer'] channel_name = main_tab['title'] last_refresh = int(datetime.now().strftime("%s")) # channel_subs try: sub_text_simple = main_tab['subscriberCountText']['simpleText'] sub_text = sub_text_simple.split(' ')[0] if sub_text[-1] == 'K': channel_subs = int(float(sub_text.replace('K', ''))*1000) elif sub_text[-1] == 'M': channel_subs = int(float(sub_text.replace('M', ''))*1000000) elif int(sub_text) >= 0: channel_subs = int(sub_text) else: message = f'{sub_text} not dealt with' print(message) except KeyError: channel_subs = 0 # banner try: all_banners = main_tab['banner']['thumbnails'] banner = sorted(all_banners, key=lambda k: k['width'])[-1]['url'] except KeyError: banner = False # build and return dict main_channel_dict = { 'channel_active': True, 'channel_last_refresh': last_refresh, 'channel_subs': channel_subs, 'channel_banner_url': banner, 'channel_name': channel_name, 'channel_id': self.channel_id } return main_channel_dict def parse_channel_meta(self): """ extract meta tab values from channel payload """ # meta tab json_data = self.json_data meta_tab = json_data['metadata']['channelMetadataRenderer'] description = meta_tab['description'] all_thumbs = meta_tab['avatar']['thumbnails'] thumb_url = sorted(all_thumbs, key=lambda k: k['width'])[-1]['url'] # stats tab renderer = 'twoColumnBrowseResultsRenderer' all_tabs = json_data['contents'][renderer]['tabs'] for tab in all_tabs: if 'tabRenderer' in tab.keys(): if tab['tabRenderer']['title'] == 'About': about_tab = (tab['tabRenderer']['content'] ['sectionListRenderer']['contents'][0] ['itemSectionRenderer']['contents'][0] ['channelAboutFullMetadataRenderer']) break try: channel_views_text = about_tab['viewCountText']['simpleText'] channel_views = int(re.sub(r"\D", "", channel_views_text)) except KeyError: channel_views = 0 meta_channel_dict = { 'channel_description': description, 'channel_thumb_url': thumb_url, 'channel_views': channel_views } return meta_channel_dict def upload_to_es(self): """ upload channel data to elastic search """ url = f'{self.ES_URL}/ta_channel/_doc/{self.channel_id}' response = requests.put(url, json=self.channel_dict) print(f'added {self.channel_id} to es') if not response.ok: print(response.text) def clear_cache(self): """ delete banner and thumb from cache if available """ channel_cache = os.path.join(self.CACHE_DIR, 'channels') thumb = os.path.join(channel_cache, self.channel_id + '_thumb.jpg') banner = os.path.join(channel_cache, self.channel_id + '_banner.jpg') if os.path.exists(thumb): os.remove(thumb) if os.path.exists(banner): os.remove(banner) def sync_to_videos(self): """ sync new channel_dict to all videos of channel """ headers = {'Content-type': 'application/json'} channel_id = self.channel_id # add ingest pipeline processors = [] for field, value in self.channel_dict.items(): line = {"set": {"field": "channel." + field, "value": value}} processors.append(line) data = { "description": channel_id, "processors": processors } payload = json.dumps(data) url = self.ES_URL + '/_ingest/pipeline/' + channel_id request = requests.put(url, data=payload, headers=headers) if not request.ok: print(request.text) # apply pipeline data = { "query": {"match": {"channel.channel_id": channel_id}} } payload = json.dumps(data) url = self.ES_URL + '/ta_video/_update_by_query?pipeline=' + channel_id request = requests.post(url, data=payload, headers=headers) if not request.ok: print(request.text) def get_total_hits(self): """ get total channels indexed """ headers = {'Content-type': 'application/json'} data = {"query": {"match_all": {}}} payload = json.dumps(data) url = f'{self.ES_URL}/ta_channel/_search?filter_path=hits.total' request = requests.post(url, data=payload, headers=headers) if not request.ok: print(request.text) total_hits = json.loads(request.text)['hits']['total']['value'] return total_hits class YoutubeVideo: """ represents a signle youtube video """ CONFIG = AppConfig().config ES_URL = CONFIG['application']['es_url'] CACHE_DIR = CONFIG['application']['cache_dir'] def __init__(self, youtube_id): self.youtube_id = youtube_id self.channel_id = None self.vid_dict = self.get_wrapper() def get_wrapper(self): """ wrapper to loop around youtube_dl to retry on failure """ print(f'get video data for {self.youtube_id}') for i in range(3): try: vid_dict = self.get_youtubedl_vid_data() except KeyError as e: print(e) sleep((i + 1) ** 2) continue else: break return vid_dict def get_youtubedl_vid_data(self): """ parse youtubedl extract info """ youtube_id = self.youtube_id obs = { 'quiet': True, 'default_search': 'ytsearch', 'skip_download': True } try: vid = youtube_dl.YoutubeDL(obs).extract_info(youtube_id) except ( youtube_dl.utils.ExtractorError, youtube_dl.utils.DownloadError ): print('failed to get info for ' + youtube_id) return False # extract self.channel_id = vid['channel_id'] upload_date = vid['upload_date'] upload_date_time = datetime.strptime(upload_date, "%Y%m%d") published = upload_date_time.strftime("%Y-%m-%d") last_refresh = int(datetime.now().strftime("%s")) # likes try: like_count = vid['like_count'] except KeyError: like_count = 0 try: dislike_count = vid['dislike_count'] except KeyError: dislike_count = 0 # build dicts stats = { "view_count": vid['view_count'], "like_count": like_count, "dislike_count": dislike_count, "average_rating": vid['average_rating'] } vid_basic = { "title": vid['title'], "description": vid['description'], "category": vid['categories'], "vid_thumb_url": vid['thumbnail'], "tags": vid['tags'], "published": published, "stats": stats, "vid_last_refresh": last_refresh, "date_downloaded": last_refresh, "youtube_id": youtube_id, "active": True, "channel": False } return vid_basic def add_new_player(self): """ add player information for new videos """ cache_path = self.CACHE_DIR + '/download/' all_cached = os.listdir(cache_path) for file_cached in all_cached: if self.youtube_id in file_cached: vid_path = os.path.join(cache_path, file_cached) duration_handler = DurationConverter() duration = duration_handler.get_sec(vid_path) duration_str = duration_handler.get_str(duration) player = { "watched": False, "duration": duration, "duration_str": duration_str } break self.vid_dict['player'] = player def build_file_path(self, channel_name): """ build media_url from where file will be located """ clean_channel_name = clean_string(channel_name) timestamp = self.vid_dict['published'].replace('-', '') youtube_id = self.vid_dict['youtube_id'] title = self.vid_dict['title'] clean_title = clean_string(title) filename = f'{timestamp}_{youtube_id}_{clean_title}.mp4' media_url = os.path.join(clean_channel_name, filename) self.vid_dict['media_url'] = media_url def get_es_data(self): """ get current data from elastic search """ url = self.ES_URL + '/ta_video/_doc/' + self.youtube_id response = requests.get(url) if not response.ok: print(response.text) es_vid_dict = json.loads(response.text) return es_vid_dict def upload_to_es(self): """ upload channel data to elastic search """ url = f'{self.ES_URL}/ta_video/_doc/{self.youtube_id}' response = requests.put(url, json=self.vid_dict) if not response.ok: print(response.text) def delete_cache(self): """ delete thumbnail from cache if exist """ video_cache = os.path.join(self.CACHE_DIR, 'videos') thumb = os.path.join(video_cache, self.youtube_id + '.jpg') if os.path.exists(thumb): os.remove(thumb) def deactivate(self): """ deactivate document on extractor error """ youtube_id = self.youtube_id headers = {'Content-type': 'application/json'} url = f'{self.ES_URL}/ta_video/_update/{youtube_id}' data = {"script": "ctx._source.active = false"} json_str = json.dumps(data) response = requests.post(url, data=json_str, headers=headers) print(f'deactivated {youtube_id}') if not response.ok: print(response.text) def index_new_video(youtube_id): """ combine video and channel classes for new video index """ vid_handler = YoutubeVideo(youtube_id) channel_handler = YoutubeChannel(vid_handler.channel_id) # add filepath to vid_dict channel_name = channel_handler.channel_dict['channel_name'] vid_handler.build_file_path(channel_name) # add channel and player to video vid_handler.add_new_player() vid_handler.vid_dict['channel'] = channel_handler.channel_dict # add new channel to es if channel_handler.source == 'scraped': channel_handler.channel_dict['channel_subscribed'] = False channel_handler.upload_to_es() # upload video to es vid_handler.upload_to_es() # return vid_dict for further processing return vid_handler.vid_dict