tubearchivist/tubearchivist/home/src/index/comments.py

177 lines
5.5 KiB
Python

"""
Functionality:
- Download comments
- Index comments in ES
- Retrieve comments from ES
"""
from datetime import datetime
from home.src.download.yt_dlp_base import YtWrap
from home.src.es.connect import ElasticWrap
from home.src.ta.config import AppConfig
from home.src.ta.ta_redis import RedisArchivist
class Comments:
"""hold all comments functionality"""
def __init__(self, youtube_id, config=False):
self.youtube_id = youtube_id
self.es_path = f"ta_comment/_doc/{youtube_id}"
self.json_data = False
self.config = config
self.is_activated = False
self.comments_format = False
def build_json(self, notify=False):
"""build json document for es"""
print(f"{self.youtube_id}: get comments")
self.check_config()
if not self.is_activated:
return
self._send_notification(notify)
comments_raw, channel_id = self.get_yt_comments()
self.format_comments(comments_raw)
self.json_data = {
"youtube_id": self.youtube_id,
"comment_last_refresh": int(datetime.now().strftime("%s")),
"comment_channel_id": channel_id,
"comment_comments": self.comments_format,
}
def check_config(self):
"""read config if not attached"""
if not self.config:
self.config = AppConfig().config
self.is_activated = bool(self.config["downloads"]["comment_max"])
@staticmethod
def _send_notification(notify):
"""send notification for download post process message"""
if not notify:
return
key = "message:download"
idx, total_videos = notify
message = {
"status": key,
"level": "info",
"title": "Download and index comments",
"message": f"Progress: {idx + 1}/{total_videos}",
}
RedisArchivist().set_message(key, message)
def build_yt_obs(self):
"""
get extractor config
max-comments,max-parents,max-replies,max-replies-per-thread
"""
max_comments = self.config["downloads"]["comment_max"]
max_comments_list = [i.strip() for i in max_comments.split(",")]
comment_sort = self.config["downloads"]["comment_sort"]
yt_obs = {
"skip_download": True,
"quiet": False,
"getcomments": True,
"extractor_args": {
"youtube": {
"max_comments": max_comments_list,
"comment_sort": [comment_sort],
}
},
}
return yt_obs
def get_yt_comments(self):
"""get comments from youtube"""
yt_obs = self.build_yt_obs()
info_json = YtWrap(yt_obs).extract(self.youtube_id)
comments_raw = info_json.get("comments")
channel_id = info_json.get("channel_id")
return comments_raw, channel_id
def format_comments(self, comments_raw):
"""process comments to match format"""
comments = []
for comment in comments_raw:
cleaned_comment = self.clean_comment(comment)
comments.append(cleaned_comment)
self.comments_format = comments
def clean_comment(self, comment):
"""parse metadata from comment for indexing"""
time_text_datetime = datetime.utcfromtimestamp(comment["timestamp"])
if time_text_datetime.hour == 0 and time_text_datetime.minute == 0:
format_string = "%Y-%m-%d"
else:
format_string = "%Y-%m-%d %H:%M"
time_text = time_text_datetime.strftime(format_string)
cleaned_comment = {
"comment_id": comment["id"],
"comment_text": comment["text"].replace("\xa0", ""),
"comment_timestamp": comment["timestamp"],
"comment_time_text": time_text,
"comment_likecount": comment["like_count"],
"comment_is_favorited": comment["is_favorited"],
"comment_author": comment["author"],
"comment_author_id": comment["author_id"],
"comment_author_thumbnail": comment["author_thumbnail"],
"comment_author_is_uploader": comment["author_is_uploader"],
"comment_parent": comment["parent"],
}
return cleaned_comment
def upload_comments(self):
"""upload comments to es"""
if not self.is_activated:
return
_, _ = ElasticWrap(self.es_path).put(self.json_data)
vid_path = f"ta_video/_update/{self.youtube_id}"
data = {"doc": {"comment_count": len(self.comments_format)}}
_, _ = ElasticWrap(vid_path).post(data=data)
def delete_comments(self):
"""delete comments from es"""
print(f"{self.youtube_id}: delete comments")
_, _ = ElasticWrap(self.es_path).delete(refresh=True)
def get_es_comments(self):
"""get comments from ES"""
response, statuscode = ElasticWrap(self.es_path).get()
if statuscode == 404:
print(f"comments: not found {self.youtube_id}")
return False
return response.get("_source")
def reindex_comments(self):
"""update comments from youtube"""
self.check_config()
if not self.is_activated:
return
self.build_json()
es_comments = self.get_es_comments()
if not self.comments_format and es_comments["comment_comments"]:
# don't overwrite comments in es
return
self.delete_comments()
self.upload_comments()