tubearchivist/tubearchivist/home/src/searching.py

"""
Functionality:
- handle search to populate results to view
- cache youtube video thumbnails and channel artwork
- parse values in hit_cleanup for frontend
- calculate pagination values
"""
import math
import os
import urllib.parse
from datetime import datetime

import requests
from PIL import Image

from home.src.config import AppConfig


class SearchHandler:
    """ search elastic search """

    CONFIG = AppConfig().config
    CACHE_DIR = CONFIG['application']['cache_dir']

    def __init__(self, url, data, cache=True):
        self.max_hits = None
        self.url = url
        self.data = data
        self.cache = cache
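
    # 'data' is the Elasticsearch query body that get_data() sends as JSON,
    # for example (illustrative only):
    #     {"query": {"match_all": {}}, "size": 10, "from": 0}
    # when 'data' is falsy, the URL is requested as-is, e.g. for a single
    # document lookup.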

    def get_data(self):
        """ get the data """
        if self.data:
            response = requests.get(self.url, json=self.data).json()
        else:
            response = requests.get(self.url).json()

        if 'hits' in response.keys():
            self.max_hits = response['hits']['total']['value']
            return_value = response['hits']['hits']
        else:
            # simulate list for single result to reuse rest of class
            return_value = [response]

        # stop if empty
        if not return_value:
            return False

        all_videos = []
        all_channels = []
        for idx, hit in enumerate(return_value):
            return_value[idx] = self.hit_cleanup(hit)
            if hit['_index'] == 'ta_video':
                video_dict, channel_dict = self.vid_cache_link(hit)
                if video_dict not in all_videos:
                    all_videos.append(video_dict)
                if channel_dict not in all_channels:
                    all_channels.append(channel_dict)
            elif hit['_index'] == 'ta_channel':
                channel_dict = self.channel_cache_link(hit)
                if channel_dict not in all_channels:
                    all_channels.append(channel_dict)

        if self.cache:
            # validate cache
            self.cache_dl_vids(all_videos)
            self.cache_dl_chan(all_channels)

        return return_value

    @staticmethod
    def vid_cache_link(hit):
        """ build video and channel thumb links for the cache """
        vid_thumb = hit['source']['vid_thumb_url']
        youtube_id = hit['source']['youtube_id']
        channel_id_hit = hit['source']['channel']['channel_id']
        chan_thumb = hit['source']['channel']['channel_thumb_url']
        try:
            chan_banner = hit['source']['channel']['channel_banner_url']
        except KeyError:
            chan_banner = False
        video_dict = {
            'youtube_id': youtube_id,
            'vid_thumb': vid_thumb
        }
        channel_dict = {
            'channel_id': channel_id_hit,
            'chan_thumb': chan_thumb,
            'chan_banner': chan_banner
        }
        return video_dict, channel_dict

    @staticmethod
    def channel_cache_link(hit):
        """ build channel thumb links """
        channel_id_hit = hit['source']['channel_id']
        chan_thumb = hit['source']['channel_thumb_url']
        try:
            chan_banner = hit['source']['channel_banner_url']
        except KeyError:
            chan_banner = False
        channel_dict = {
            'channel_id': channel_id_hit,
            'chan_thumb': chan_thumb,
            'chan_banner': chan_banner
        }
        return channel_dict

    def cache_dl_vids(self, all_videos):
        """ download video thumbnails into cache """
        vid_cache = os.path.join(self.CACHE_DIR, 'videos')
        all_vid_cached = os.listdir(vid_cache)
        # videos
        for video_dict in all_videos:
            youtube_id = video_dict['youtube_id']
            if youtube_id + '.jpg' not in all_vid_cached:
                cache_path = os.path.join(vid_cache, youtube_id + '.jpg')
                thumb_url = video_dict['vid_thumb']
                img_raw = requests.get(thumb_url, stream=True).raw
                img = Image.open(img_raw)
                width, height = img.size
                if width / height != 16 / 9:
                    # center crop to a 16:9 aspect ratio before saving
                    new_height = width / 16 * 9
                    offset = (height - new_height) / 2
                    img = img.crop((0, offset, width, height - offset))
                img.save(cache_path)

    def cache_dl_chan(self, all_channels):
        """ download channel thumbs """
        chan_cache = os.path.join(self.CACHE_DIR, 'channels')
        all_chan_cached = os.listdir(chan_cache)
        for channel_dict in all_channels:
            channel_id_cache = channel_dict['channel_id']
            channel_banner_url = channel_dict['chan_banner']
            channel_banner = channel_id_cache + '_banner.jpg'
            channel_thumb_url = channel_dict['chan_thumb']
            channel_thumb = channel_id_cache + '_thumb.jpg'
            # thumb
            if channel_thumb_url and channel_thumb not in all_chan_cached:
                cache_path = os.path.join(chan_cache, channel_thumb)
                img_raw = requests.get(channel_thumb_url, stream=True).content
                with open(cache_path, 'wb') as f:
                    f.write(img_raw)
            # banner
            if channel_banner_url and channel_banner not in all_chan_cached:
                cache_path = os.path.join(chan_cache, channel_banner)
                img_raw = requests.get(channel_banner_url, stream=True).content
                with open(cache_path, 'wb') as f:
                    f.write(img_raw)
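
    # resulting cache layout (derived from the two methods above):
    #     <cache_dir>/videos/<youtube_id>.jpg
    #     <cache_dir>/channels/<channel_id>_thumb.jpg
    #     <cache_dir>/channels/<channel_id>_banner.jpg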

    @staticmethod
    def hit_cleanup(hit):
        """ clean up and parse data from a single hit """
        hit['source'] = hit.pop('_source')
        hit_keys = hit['source'].keys()
        if 'media_url' in hit_keys:
            parsed_url = urllib.parse.quote(hit['source']['media_url'])
            hit['source']['media_url'] = parsed_url

        if 'published' in hit_keys:
            published = hit['source']['published']
            date_pub = datetime.strptime(published, "%Y-%m-%d")
            date_str = datetime.strftime(date_pub, "%d %b, %Y")
            hit['source']['published'] = date_str

        if 'vid_last_refresh' in hit_keys:
            vid_last_refresh = hit['source']['vid_last_refresh']
            date_refresh = datetime.fromtimestamp(vid_last_refresh)
            date_str = datetime.strftime(date_refresh, "%d %b, %Y")
            hit['source']['vid_last_refresh'] = date_str

        if 'channel_last_refresh' in hit_keys:
            refreshed = hit['source']['channel_last_refresh']
            date_refresh = datetime.fromtimestamp(refreshed)
            date_str = datetime.strftime(date_refresh, "%d %b, %Y")
            hit['source']['channel_last_refresh'] = date_str

        if 'channel' in hit_keys:
            channel_keys = hit['source']['channel'].keys()
            if 'channel_last_refresh' in channel_keys:
                refreshed = hit['source']['channel']['channel_last_refresh']
                date_refresh = datetime.fromtimestamp(refreshed)
                date_str = datetime.strftime(date_refresh, "%d %b, %Y")
                hit['source']['channel']['channel_last_refresh'] = date_str

        return hit


class Pagination:
    """
    figure out the pagination based on page size and total_hits
    """

    def __init__(self, page_get, search_get=False):
        config = AppConfig().config
        self.page_size = config['archive']['page_size']
        self.page_get = page_get
        self.search_get = search_get
        self.pagination = self.first_guess()

    def first_guess(self):
        """ build first guess before api call """
        page_get = self.page_get
        if page_get in [0, 1]:
            page_from = 0
            prev_pages = False
        elif page_get > 1:
            page_from = (page_get - 1) * self.page_size
            prev_pages = [
                i for i in range(page_get - 1, page_get - 6, -1) if i > 1
            ]
            prev_pages.reverse()
        pagination = {
            "page_size": self.page_size,
            "page_from": page_from,
            "prev_pages": prev_pages,
            "current_page": page_get
        }
        if self.search_get:
            pagination.update({"search_get": self.search_get})
        return pagination
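
    # worked example (assuming page_size=50): page_get=4 gives
    # page_from = (4 - 1) * 50 = 150 and prev_pages = [2, 3];
    # page 0 or 1 starts at page_from = 0 with no prev_pages.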

    def validate(self, total_hits):
        """ validate pagination with total_hits after making api call """
        page_get = self.page_get
        max_pages = math.ceil(total_hits / self.page_size)
        if page_get < max_pages and max_pages > 1:
            self.pagination['last_page'] = max_pages
        else:
            self.pagination['last_page'] = False
        next_pages = [
            i for i in range(page_get + 1, page_get + 6) if 1 < i < max_pages
        ]
        self.pagination['next_pages'] = next_pages
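

# Illustrative sketch only, not part of the module's interface: how the views
# are expected to combine Pagination and SearchHandler. The Elasticsearch URL
# and the query body below are assumptions for this example; adjust them to
# the running setup.
if __name__ == '__main__':
    pagination_handler = Pagination(page_get=2)
    url = 'http://localhost:9200/ta_video/_search'  # assumed local ES endpoint
    data = {
        'size': pagination_handler.pagination['page_size'],
        'from': pagination_handler.pagination['page_from'],
        'query': {'match_all': {}},
    }
    search = SearchHandler(url, data, cache=False)
    videos = search.get_data()
    if videos:
        pagination_handler.validate(search.max_hits)
        print(len(videos), pagination_handler.pagination)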