tubearchivist/tubearchivist/home/src/index.py

"""
Functionality:
- index new videos into elastisearch
- extract video info with yt_dlp
- scrape youtube channel page if needed
"""
import json
import os
import re
from datetime import datetime
from time import sleep

import requests
import yt_dlp as youtube_dl
from bs4 import BeautifulSoup
from home.src.config import AppConfig
from home.src.helper import clean_string, DurationConverter


class YoutubeChannel:
    """ represents a single youtube channel """

    CONFIG = AppConfig().config
    ES_URL = CONFIG['application']['es_url']
    CACHE_DIR = CONFIG['application']['cache_dir']

    def __init__(self, channel_id):
        self.channel_id = channel_id
        self.json_data = None
        self.source = None
        self.channel_dict = self.build_channel_dict()

    def build_channel_dict(self, scrape=False):
        """ combine the dicts built from extracted json payload """
        if scrape:
            channel_dict = False
        else:
            channel_dict = self.get_es_channel()
        if not channel_dict:
            print('scrape data from youtube')
            self.scrape_channel()
            channel_dict = self.parse_channel_main()
            channel_dict.update(self.parse_channel_meta())
            self.source = 'scraped'
        return channel_dict

    def get_es_channel(self):
        """ get from elastic search first if possible """
        channel_id = self.channel_id
        url = f'{self.ES_URL}/ta_channel/_doc/{channel_id}'
        response = requests.get(url)
        if response.ok:
            channel_source = response.json()['_source']
            self.source = 'elastic'
            return channel_source
        return False

    def scrape_channel(self):
        """ scrape channel page for additional info """
        channel_id = self.channel_id
        url = f'https://www.youtube.com/channel/{channel_id}/about?hl=en'
        response = requests.get(url)
        if response.ok:
            channel_page = response.text
        else:
            print(f'failed to extract channel info for: {channel_id}')
            raise ConnectionError
        soup = BeautifulSoup(channel_page, 'html.parser')
        # find the script tag holding the initial data payload
        script_content = False
        all_scripts = soup.find('body').find_all('script')
        for script in all_scripts:
            if 'var ytInitialData = ' in str(script):
                script_content = str(script)
                break
        if not script_content:
            raise ValueError(f'ytInitialData not found for {channel_id}')
        # extract payload, strip trailing ';</script>' characters
        script_content = script_content.split('var ytInitialData = ')[1]
        json_raw = script_content.rstrip(';</script>')
        json_data = json.loads(json_raw)
        # add to self
        self.json_data = json_data

    def parse_channel_main(self):
        """ extract maintab values from scraped channel json data """
        main_tab = self.json_data['header']['c4TabbedHeaderRenderer']
        channel_name = main_tab['title']
        last_refresh = int(datetime.now().strftime("%s"))
        # channel_subs
        try:
            sub_text_simple = main_tab['subscriberCountText']['simpleText']
            sub_text = sub_text_simple.split(' ')[0]
            if sub_text[-1] == 'K':
                channel_subs = int(float(sub_text.replace('K', '')) * 1000)
            elif sub_text[-1] == 'M':
                channel_subs = int(float(sub_text.replace('M', '')) * 1000000)
            else:
                channel_subs = int(sub_text)
        except KeyError:
            channel_subs = 0
        except ValueError:
            print(f'{sub_text} not dealt with')
            channel_subs = 0
        # banner
        try:
            all_banners = main_tab['banner']['thumbnails']
            banner = sorted(all_banners, key=lambda k: k['width'])[-1]['url']
        except KeyError:
            banner = False
        # build and return dict
        main_channel_dict = {
            'channel_active': True,
            'channel_last_refresh': last_refresh,
            'channel_subs': channel_subs,
            'channel_banner_url': banner,
            'channel_name': channel_name,
            'channel_id': self.channel_id
        }
        return main_channel_dict

    def parse_channel_meta(self):
        """ extract meta tab values from channel payload """
        # meta tab
        json_data = self.json_data
        meta_tab = json_data['metadata']['channelMetadataRenderer']
        description = meta_tab['description']
        all_thumbs = meta_tab['avatar']['thumbnails']
        thumb_url = sorted(all_thumbs, key=lambda k: k['width'])[-1]['url']
        # stats tab, fall back to empty dict if About tab is missing
        about_tab = {}
        renderer = 'twoColumnBrowseResultsRenderer'
        all_tabs = json_data['contents'][renderer]['tabs']
        for tab in all_tabs:
            if 'tabRenderer' in tab.keys():
                if tab['tabRenderer']['title'] == 'About':
                    about_tab = (tab['tabRenderer']['content']
                                 ['sectionListRenderer']['contents'][0]
                                 ['itemSectionRenderer']['contents'][0]
                                 ['channelAboutFullMetadataRenderer'])
                    break
        try:
            channel_views_text = about_tab['viewCountText']['simpleText']
            channel_views = int(re.sub(r"\D", "", channel_views_text))
        except KeyError:
            channel_views = 0
        meta_channel_dict = {
            'channel_description': description,
            'channel_thumb_url': thumb_url,
            'channel_views': channel_views
        }
        return meta_channel_dict

    def upload_to_es(self):
        """ upload channel data to elastic search """
        url = f'{self.ES_URL}/ta_channel/_doc/{self.channel_id}'
        response = requests.put(url, json=self.channel_dict)
        print(f'added {self.channel_id} to es')
        if not response.ok:
            print(response.text)

    def clear_cache(self):
        """ delete banner and thumb from cache if available """
        channel_cache = os.path.join(self.CACHE_DIR, 'channels')
        thumb = os.path.join(channel_cache, self.channel_id + '_thumb.jpg')
        banner = os.path.join(channel_cache, self.channel_id + '_banner.jpg')
        if os.path.exists(thumb):
            os.remove(thumb)
        if os.path.exists(banner):
            os.remove(banner)

    def sync_to_videos(self):
        """ sync new channel_dict to all videos of channel """
        headers = {'Content-type': 'application/json'}
        channel_id = self.channel_id
        # add ingest pipeline
        processors = []
        for field, value in self.channel_dict.items():
            line = {"set": {"field": "channel." + field, "value": value}}
            processors.append(line)
        data = {
            "description": channel_id,
            "processors": processors
        }
        payload = json.dumps(data)
        url = self.ES_URL + '/_ingest/pipeline/' + channel_id
        request = requests.put(url, data=payload, headers=headers)
        if not request.ok:
            print(request.text)
        # apply pipeline
        data = {
            "query": {"match": {"channel.channel_id": channel_id}}
        }
        payload = json.dumps(data)
        url = self.ES_URL + '/ta_video/_update_by_query?pipeline=' + channel_id
        request = requests.post(url, data=payload, headers=headers)
        if not request.ok:
            print(request.text)
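
    # sync_to_videos builds an es ingest pipeline; sketched shape below
    # (single processor shown, one is added per channel_dict field):
    #   PUT <es_url>/_ingest/pipeline/<channel_id>
    #   {"description": "<channel_id>",
    #    "processors": [{"set": {"field": "channel.channel_name",
    #                            "value": "..."}}]}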

    def get_total_hits(self):
        """ get total channels indexed """
        headers = {'Content-type': 'application/json'}
        data = {"query": {"match_all": {}}}
        payload = json.dumps(data)
        url = f'{self.ES_URL}/ta_channel/_search?filter_path=hits.total'
        request = requests.post(url, data=payload, headers=headers)
        if not request.ok:
            print(request.text)
        total_hits = json.loads(request.text)['hits']['total']['value']
        return total_hits
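

# usage sketch, assuming a reachable elastic search instance
# (the channel id below is a placeholder, not a real channel):
#   channel = YoutubeChannel('UCxxxxxxxxxxxxxxxxxxxxxx')
#   channel.upload_to_es()
#   channel.sync_to_videos()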


class YoutubeVideo:
    """ represents a single youtube video """

    CONFIG = AppConfig().config
    ES_URL = CONFIG['application']['es_url']
    CACHE_DIR = CONFIG['application']['cache_dir']

    def __init__(self, youtube_id):
        self.youtube_id = youtube_id
        self.channel_id = None
        self.vid_dict = self.get_wrapper()

    def get_wrapper(self):
        """ wrapper to loop around youtube_dl to retry on failure """
        print(f'get video data for {self.youtube_id}')
        vid_dict = False
        for i in range(3):
            try:
                vid_dict = self.get_youtubedl_vid_data()
            except KeyError as err:
                print(err)
                sleep((i + 1) ** 2)
                continue
            else:
                break
        return vid_dict

    def get_youtubedl_vid_data(self):
        """ parse youtubedl extract info """
        youtube_id = self.youtube_id
        obs = {
            'quiet': True,
            'default_search': 'ytsearch',
            'skip_download': True
        }
        try:
            vid = youtube_dl.YoutubeDL(obs).extract_info(youtube_id)
        except (
            youtube_dl.utils.ExtractorError,
            youtube_dl.utils.DownloadError,
        ):
            print('failed to get info for ' + youtube_id)
            return False
        # extract
        self.channel_id = vid['channel_id']
        upload_date = vid['upload_date']
        upload_date_time = datetime.strptime(upload_date, "%Y%m%d")
        published = upload_date_time.strftime("%Y-%m-%d")
        last_refresh = int(datetime.now().strftime("%s"))
        # likes
        try:
            like_count = vid['like_count']
        except KeyError:
            like_count = 0
        try:
            dislike_count = vid['dislike_count']
        except KeyError:
            dislike_count = 0
        # build dicts
        stats = {
            "view_count": vid['view_count'],
            "like_count": like_count,
            "dislike_count": dislike_count,
            "average_rating": vid['average_rating']
        }
        vid_basic = {
            "title": vid['title'],
            "description": vid['description'],
            "category": vid['categories'],
            "vid_thumb_url": vid['thumbnail'],
            "tags": vid['tags'],
            "published": published,
            "stats": stats,
            "vid_last_refresh": last_refresh,
            "date_downloaded": last_refresh,
            "youtube_id": youtube_id,
            "active": True,
            "channel": False
        }
        return vid_basic

    def add_new_player(self):
        """ add player information for new videos """
        cache_path = self.CACHE_DIR + '/download/'
        all_cached = os.listdir(cache_path)
        player = False
        for file_cached in all_cached:
            if self.youtube_id in file_cached:
                vid_path = os.path.join(cache_path, file_cached)
                duration_handler = DurationConverter()
                duration = duration_handler.get_sec(vid_path)
                duration_str = duration_handler.get_str(duration)
                player = {
                    "watched": False,
                    "duration": duration,
                    "duration_str": duration_str
                }
                break
        self.vid_dict['player'] = player

    def build_file_path(self, channel_name):
        """ build media_url from where file will be located """
        clean_channel_name = clean_string(channel_name)
        timestamp = self.vid_dict['published'].replace('-', '')
        youtube_id = self.vid_dict['youtube_id']
        title = self.vid_dict['title']
        clean_title = clean_string(title)
        filename = f'{timestamp}_{youtube_id}_{clean_title}.mp4'
        media_url = os.path.join(clean_channel_name, filename)
        self.vid_dict['media_url'] = media_url

    def get_es_data(self):
        """ get current data from elastic search """
        url = self.ES_URL + '/ta_video/_doc/' + self.youtube_id
        response = requests.get(url)
        if not response.ok:
            print(response.text)
        es_vid_dict = json.loads(response.text)
        return es_vid_dict

    def upload_to_es(self):
        """ upload video data to elastic search """
        url = f'{self.ES_URL}/ta_video/_doc/{self.youtube_id}'
        response = requests.put(url, json=self.vid_dict)
        if not response.ok:
            print(response.text)

    def delete_cache(self):
        """ delete thumbnail from cache if it exists """
        video_cache = os.path.join(self.CACHE_DIR, 'videos')
        thumb = os.path.join(video_cache, self.youtube_id + '.jpg')
        if os.path.exists(thumb):
            os.remove(thumb)

    def deactivate(self):
        """ deactivate document on extractor error """
        youtube_id = self.youtube_id
        headers = {'Content-type': 'application/json'}
        url = f'{self.ES_URL}/ta_video/_update/{youtube_id}'
        data = {"script": "ctx._source.active = false"}
        json_str = json.dumps(data)
        response = requests.post(url, data=json_str, headers=headers)
        print(f'deactivated {youtube_id}')
        if not response.ok:
            print(response.text)


def index_new_video(youtube_id):
    """ combine video and channel classes for new video index """
    vid_handler = YoutubeVideo(youtube_id)
    if not vid_handler.vid_dict:
        # extraction failed on all retries, nothing to index
        print(f'failed to index {youtube_id}')
        return False
    channel_handler = YoutubeChannel(vid_handler.channel_id)
    # add filepath to vid_dict
    channel_name = channel_handler.channel_dict['channel_name']
    vid_handler.build_file_path(channel_name)
    # add channel and player to video
    vid_handler.add_new_player()
    vid_handler.vid_dict['channel'] = channel_handler.channel_dict
    # add new channel to es
    if channel_handler.source == 'scraped':
        channel_handler.channel_dict['channel_subscribed'] = False
        channel_handler.upload_to_es()
    # upload video to es
    vid_handler.upload_to_es()
    # return vid_dict for further processing
    return vid_handler.vid_dict
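

# typical call site sketch (placeholder youtube id, assumed to be invoked
# from the download task after the media file lands in the cache dir):
#   new_vid_dict = index_new_video('xxxxxxxxxxx')
#   print(new_vid_dict['media_url'])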