tubearchivist/tubearchivist/home/src/searching.py

"""
Functionality:
- handle search to populate results to view
- cache youtube video thumbnails and channel artwork
- parse values in hit_cleanup for frontend
- calculate pagination values
"""

import math
import urllib.parse
from datetime import datetime

from home.src.config import AppConfig
from home.src.es import ElasticWrap
from home.src.helper import RedisArchivist
from home.src.thumbnails import ThumbManager


class SearchHandler:
    """search elastic search"""

    def __init__(self, path, config, data=False):
        self.max_hits = None
        self.path = path
        self.config = config
        self.data = data

    def get_data(self):
        """query es and clean up the returned hits"""
        response, _ = ElasticWrap(self.path, config=self.config).get(self.data)

        if "hits" in response.keys():
            self.max_hits = response["hits"]["total"]["value"]
            return_value = response["hits"]["hits"]
        else:
            # simulate list for single result to reuse rest of class
            return_value = [response]

        # stop if empty
        if not return_value:
            return False

        all_videos = []
        all_channels = []
        for idx, hit in enumerate(return_value):
            return_value[idx] = self.hit_cleanup(hit)
            if hit["_index"] == "ta_video":
                video_dict, channel_dict = self.vid_cache_link(hit)
                if video_dict not in all_videos:
                    all_videos.append(video_dict)
                if channel_dict not in all_channels:
                    all_channels.append(channel_dict)
            elif hit["_index"] == "ta_channel":
                channel_dict = self.channel_cache_link(hit)
                if channel_dict not in all_channels:
                    all_channels.append(channel_dict)

        return return_value

    @staticmethod
    def vid_cache_link(hit):
        """collect video and channel thumbnail links for the cache"""
        vid_thumb = hit["source"]["vid_thumb_url"]
        youtube_id = hit["source"]["youtube_id"]
        channel_id_hit = hit["source"]["channel"]["channel_id"]
        chan_thumb = hit["source"]["channel"]["channel_thumb_url"]
        try:
            chan_banner = hit["source"]["channel"]["channel_banner_url"]
        except KeyError:
            chan_banner = False

        video_dict = {"youtube_id": youtube_id, "vid_thumb": vid_thumb}
        channel_dict = {
            "channel_id": channel_id_hit,
            "chan_thumb": chan_thumb,
            "chan_banner": chan_banner,
        }
        return video_dict, channel_dict

    @staticmethod
    def channel_cache_link(hit):
        """build channel thumb links"""
        channel_id_hit = hit["source"]["channel_id"]
        chan_thumb = hit["source"]["channel_thumb_url"]
        try:
            chan_banner = hit["source"]["channel_banner_url"]
        except KeyError:
            chan_banner = False

        channel_dict = {
            "channel_id": channel_id_hit,
            "chan_thumb": chan_thumb,
            "chan_banner": chan_banner,
        }
        return channel_dict

    @staticmethod
    def hit_cleanup(hit):
        """clean up and parse data from a single hit"""
        # rename _source: django templates can not access keys
        # with a leading underscore
        hit["source"] = hit.pop("_source")
        hit_keys = hit["source"].keys()
        if "media_url" in hit_keys:
            parsed_url = urllib.parse.quote(hit["source"]["media_url"])
            hit["source"]["media_url"] = parsed_url

        if "published" in hit_keys:
            published = hit["source"]["published"]
            date_pub = datetime.strptime(published, "%Y-%m-%d")
            date_str = datetime.strftime(date_pub, "%d %b, %Y")
            hit["source"]["published"] = date_str

        if "vid_last_refresh" in hit_keys:
            vid_last_refresh = hit["source"]["vid_last_refresh"]
            date_refresh = datetime.fromtimestamp(vid_last_refresh)
            date_str = datetime.strftime(date_refresh, "%d %b, %Y")
            hit["source"]["vid_last_refresh"] = date_str

        if "playlist_last_refresh" in hit_keys:
            playlist_last_refresh = hit["source"]["playlist_last_refresh"]
            date_refresh = datetime.fromtimestamp(playlist_last_refresh)
            date_str = datetime.strftime(date_refresh, "%d %b, %Y")
            hit["source"]["playlist_last_refresh"] = date_str

        if "vid_thumb_url" in hit_keys:
            youtube_id = hit["source"]["youtube_id"]
            thumb_path = ThumbManager().vid_thumb_path(youtube_id)
            hit["source"]["vid_thumb_url"] = thumb_path

        if "channel_last_refresh" in hit_keys:
            refreshed = hit["source"]["channel_last_refresh"]
            date_refresh = datetime.fromtimestamp(refreshed)
            date_str = datetime.strftime(date_refresh, "%d %b, %Y")
            hit["source"]["channel_last_refresh"] = date_str

        if "channel" in hit_keys:
            channel_keys = hit["source"]["channel"].keys()
            if "channel_last_refresh" in channel_keys:
                refreshed = hit["source"]["channel"]["channel_last_refresh"]
                date_refresh = datetime.fromtimestamp(refreshed)
                date_str = datetime.strftime(date_refresh, "%d %b, %Y")
                hit["source"]["channel"]["channel_last_refresh"] = date_str

        return hit
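
# A minimal usage sketch, assuming a valid config dict and an existing video
# document; the doc path format follows ElasticWrap and "<youtube_id>" is a
# placeholder, not part of this module:
#   config = AppConfig().config
#   handler = SearchHandler("ta_video/_doc/<youtube_id>", config=config)
#   results = handler.get_data()  # list of cleaned-up hits, or False if empty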


class SearchForm:
    """build query from search form data"""

    CONFIG = AppConfig().config

    def multi_search(self, search_query):
        """searching through index"""
        path = "ta_video,ta_channel,ta_playlist/_search"
        # bool_prefix with the _2gram/_3gram subfields gives
        # search-as-you-type style matching across all three indexes
        data = {
            "size": 30,
            "query": {
                "multi_match": {
                    "query": search_query,
                    "type": "bool_prefix",
                    "operator": "and",
                    "fuzziness": "auto",
                    "fields": [
                        "category",
                        "channel_description",
                        "channel_name._2gram",
                        "channel_name._3gram",
                        "channel_name.search_as_you_type",
                        "playlist_description",
                        "playlist_name._2gram",
                        "playlist_name._3gram",
                        "playlist_name.search_as_you_type",
                        "tags",
                        "title._2gram",
                        "title._3gram",
                        "title.search_as_you_type",
                    ],
                }
            },
        }
        look_up = SearchHandler(path, config=self.CONFIG, data=data)
        search_results = look_up.get_data()
        all_results = self.build_results(search_results)

        return {"results": all_results}

    @staticmethod
    def build_results(search_results):
        """build the all_results dict"""
        video_results = []
        channel_results = []
        playlist_results = []
        if search_results:
            for result in search_results:
                if result["_index"] == "ta_video":
                    video_results.append(result)
                elif result["_index"] == "ta_channel":
                    channel_results.append(result)
                elif result["_index"] == "ta_playlist":
                    playlist_results.append(result)

        all_results = {
            "video_results": video_results,
            "channel_results": channel_results,
            "playlist_results": playlist_results,
        }

        return all_results
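
# Usage sketch, assuming the form passes a plain query string:
#   all_results = SearchForm().multi_search("some search term")
#   videos = all_results["results"]["video_results"]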


class Pagination:
    """
    figure out the pagination based on page size and total_hits
    """

    def __init__(self, page_get, user_id, search_get=False):
        self.user_id = user_id
        self.page_size = self.get_page_size()
        self.page_get = page_get
        self.search_get = search_get
        self.pagination = self.first_guess()

    def get_page_size(self):
        """get default or user modified page_size"""
        key = f"{self.user_id}:page_size"
        page_size = RedisArchivist().get_message(key)["status"]
        if not page_size:
            config = AppConfig().config
            page_size = config["archive"]["page_size"]

        return page_size

    def first_guess(self):
        """build first guess before api call"""
        page_get = self.page_get
        if page_get in [0, 1]:
            page_from = 0
            prev_pages = False
        elif page_get > 1:
            page_from = (page_get - 1) * self.page_size
            # up to five previous pages, filtered and flipped
            # into ascending order
            prev_pages = [
                i for i in range(page_get - 1, page_get - 6, -1) if i > 1
            ]
            prev_pages.reverse()

        pagination = {
            "page_size": self.page_size,
            "page_from": page_from,
            "prev_pages": prev_pages,
            "current_page": page_get,
        }
        if self.search_get:
            pagination.update({"search_get": self.search_get})

        return pagination

    def validate(self, total_hits):
        """validate pagination with total_hits after making api call"""
        page_get = self.page_get
        max_pages = math.ceil(total_hits / self.page_size)

        if page_get < max_pages and max_pages > 1:
            self.pagination["last_page"] = max_pages
        else:
            self.pagination["last_page"] = False

        next_pages = [
            i for i in range(page_get + 1, page_get + 6) if 1 < i < max_pages
        ]
        self.pagination["next_pages"] = next_pages
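
# Usage sketch, assuming total_hits is known after the es call:
#   pagination_handler = Pagination(page_get=2, user_id=0)
#   ... run the search using pagination_handler.pagination["page_from"] ...
#   pagination_handler.validate(total_hits=150)
#   last_page = pagination_handler.pagination["last_page"]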