# tubearchivist/tubearchivist/home/src/index/comments.py

"""
Functionality:
- Download comments
- Index comments in ES
- Retrieve comments from ES
"""

from datetime import datetime, timezone

from home.src.download.yt_dlp_base import YtWrap
from home.src.es.connect import ElasticWrap
from home.src.ta.config import AppConfig
from home.src.ta.ta_redis import RedisArchivist


class Comments:
    """hold all comments functionality

    Downloads the comments of a single video through YtWrap, converts them
    to the document layout stored under ta_comment/_doc/<youtube_id> and
    reads, writes or deletes that document through ElasticWrap.
    """

    def __init__(self, youtube_id, config=False):
        """store the video id, config is loaded lazily if not passed in"""
        self.youtube_id = youtube_id
        self.es_path = f"ta_comment/_doc/{youtube_id}"
        # stays False until build_json produced a document
        self.json_data = False
        self.config = config
        # set by check_config from downloads.comment_max
        self.is_activated = False
        # list of cleaned comments once format_comments has run
        self.comments_format = False

    def build_json(self, notify=False):
        """build json document for es

        notify is either falsy or a (idx, total_videos) tuple used for the
        progress message. Nothing is built, and json_data stays False, when
        comments are deactivated or the extraction returned neither
        comments nor a channel id.
        """
        print(f"{self.youtube_id}: get comments")
        self.check_config()
        if not self.is_activated:
            return

        self._send_notification(notify)
        comments_raw, channel_id = self.get_yt_comments()
        if not comments_raw and not channel_id:
            # nothing usable extracted, don't build an empty document
            return

        self.format_comments(comments_raw)

        self.json_data = {
            "youtube_id": self.youtube_id,
            # epoch seconds, strftime("%s") is not available on all platforms
            "comment_last_refresh": int(datetime.now().timestamp()),
            "comment_channel_id": channel_id,
            "comment_comments": self.comments_format,
        }

    def check_config(self):
        """read config if not attached and set is_activated"""
        if not self.config:
            self.config = AppConfig().config

        # any falsy comment_max value turns comment indexing off
        self.is_activated = bool(self.config["downloads"]["comment_max"])

    @staticmethod
    def _send_notification(notify):
        """send notification for download post process message

        notify is expected to unpack into (idx, total_videos) where idx is
        zero based, a falsy value skips the notification.
        """
        if not notify:
            return

        key = "message:download"
        idx, total_videos = notify
        message = {
            "status": key,
            "level": "info",
            "title": "Download and index comments",
            "message": f"Progress: {idx + 1}/{total_videos}",
        }

        RedisArchivist().set_message(key, message)

    def build_yt_obs(self):
        """
        get extractor config
        max-comments,max-parents,max-replies,max-replies-per-thread

        Requires self.config to be loaded, see check_config.
        """
        downloads = self.config["downloads"]
        # str() keeps a purely numeric setting stored as int working
        max_comments = str(downloads["comment_max"])
        max_comments_list = [i.strip() for i in max_comments.split(",")]
        comment_sort = downloads["comment_sort"]

        yt_obs = {
            "skip_download": True,
            "quiet": False,
            "getcomments": True,
            "extractor_args": {
                "youtube": {
                    "max_comments": max_comments_list,
                    "comment_sort": [comment_sort],
                }
            },
        }

        return yt_obs

    def get_yt_comments(self):
        """get comments from youtube

        Returns a (comments_raw, channel_id) tuple. Both values are False
        when the extraction returned a falsy result.
        """
        yt_obs = self.build_yt_obs()
        info_json = YtWrap(yt_obs).extract(self.youtube_id)
        if not info_json:
            # NOTE(review): assumes a falsy result means the extraction
            # failed, confirm against YtWrap.extract
            return False, False

        comments_raw = info_json.get("comments")
        channel_id = info_json.get("channel_id")
        return comments_raw, channel_id

    def format_comments(self, comments_raw):
        """process comments to match format

        A missing or empty comments_raw results in an empty list.
        """
        self.comments_format = [
            self.clean_comment(comment) for comment in comments_raw or []
        ]

    def clean_comment(self, comment):
        """parse metadata from comment for indexing

        Raises KeyError if the comment is missing one of the expected keys.
        """
        timestamp = comment["timestamp"]
        # timezone aware replacement for the deprecated utcfromtimestamp
        time_text_datetime = datetime.fromtimestamp(timestamp, tz=timezone.utc)

        # NOTE(review): exactly midnight presumably marks a timestamp with
        # date precision only, so the time part is left out, confirm
        if time_text_datetime.hour == 0 and time_text_datetime.minute == 0:
            format_string = "%Y-%m-%d"
        else:
            format_string = "%Y-%m-%d %H:%M"

        time_text = time_text_datetime.strftime(format_string)

        cleaned_comment = {
            "comment_id": comment["id"],
            # drop non-breaking spaces from the text
            "comment_text": comment["text"].replace("\xa0", ""),
            "comment_timestamp": timestamp,
            "comment_time_text": time_text,
            "comment_likecount": comment["like_count"],
            "comment_is_favorited": comment["is_favorited"],
            "comment_author": comment["author"],
            "comment_author_id": comment["author_id"],
            "comment_author_thumbnail": comment["author_thumbnail"],
            "comment_author_is_uploader": comment["author_is_uploader"],
            "comment_parent": comment["parent"],
        }

        return cleaned_comment

    def upload_comments(self):
        """upload comments to es and store comment_count on the video

        Does nothing when comments are deactivated or no document was built.
        """
        if not self.is_activated or not self.json_data:
            return

        _, _ = ElasticWrap(self.es_path).put(self.json_data)

        vid_path = f"ta_video/_update/{self.youtube_id}"
        comment_count = len(self.comments_format or [])
        data = {"doc": {"comment_count": comment_count}}
        _, _ = ElasticWrap(vid_path).post(data=data)

    def delete_comments(self):
        """delete comments from es"""
        print(f"{self.youtube_id}: delete comments")
        _, _ = ElasticWrap(self.es_path).delete(refresh=True)

    def get_es_comments(self):
        """get comments from ES

        Returns the _source of the stored document, False on a 404.
        """
        response, statuscode = ElasticWrap(self.es_path).get()
        if statuscode == 404:
            print(f"comments: not found {self.youtube_id}")
            return False

        return response.get("_source")

    def reindex_comments(self):
        """update comments from youtube

        Already indexed comments are kept when the refresh did not build a
        document or came back without any comments.
        """
        self.check_config()
        if not self.is_activated:
            return

        self.build_json()
        if not self.json_data:
            # nothing extracted, keep whatever is indexed
            return

        es_comments = self.get_es_comments()
        # es_comments is False when the document does not exist yet
        stored = es_comments.get("comment_comments") if es_comments else None

        if not self.comments_format and stored:
            # don't overwrite comments in es
            return

        if es_comments:
            self.delete_comments()

        self.upload_comments()