# tubearchivist/tubearchivist/home/src/index/comments.py
#
# 217 lines
# 6.7 KiB
# Python

"""
Functionality:
- Download comments
- Index comments in ES
- Retrieve comments from ES
"""
from datetime import datetime, timezone

from home.src.download.yt_dlp_base import YtWrap
from home.src.es.connect import ElasticWrap
from home.src.ta.config import AppConfig
class Comments:
    """interact with comments per video

    downloads the comments of a single video, converts them into the
    document stored at ta_comment/_doc/<youtube_id> and handles upload,
    deletion and retrieval of that document
    """

    def __init__(self, youtube_id, config=False):
        self.youtube_id = youtube_id
        self.es_path = f"ta_comment/_doc/{youtube_id}"
        # document to upload, stays False until build_json succeeded
        self.json_data = False
        # application config, loaded by check_config if not passed in
        self.config = config
        # set by check_config from the comment_max config value
        self.is_activated = False
        # list of cleaned comments, stays False until format_comments ran
        self.comments_format = False

    def build_json(self):
        """build json document for es

        json_data is left untouched if comments are deactivated or if
        neither comments nor a channel id could be extracted
        """
        print(f"{self.youtube_id}: get comments")
        self.check_config()
        if not self.is_activated:
            return

        comments_raw, channel_id = self.get_yt_comments()
        if not comments_raw and not channel_id:
            return

        self.format_comments(comments_raw)
        self.json_data = {
            "youtube_id": self.youtube_id,
            "comment_last_refresh": int(datetime.now().timestamp()),
            "comment_channel_id": channel_id,
            "comment_comments": self.comments_format,
        }

    def check_config(self):
        """read config if not attached, set is_activated from comment_max"""
        if not self.config:
            self.config = AppConfig().config

        self.is_activated = bool(self.config["downloads"]["comment_max"])

    def build_yt_obs(self):
        """
        get extractor config
        max-comments,max-parents,max-replies,max-replies-per-thread
        """
        max_comments = self.config["downloads"]["comment_max"]
        max_comments_list = [i.strip() for i in max_comments.split(",")]
        comment_sort = self.config["downloads"]["comment_sort"]

        yt_obs = {
            "check_formats": None,
            "skip_download": True,
            "getcomments": True,
            "ignoreerrors": True,
            "extractor_args": {
                "youtube": {
                    "max_comments": max_comments_list,
                    "comment_sort": [comment_sort],
                    "player_client": ["ios", "web"],  # workaround yt-dlp #9554
                }
            },
        }

        return yt_obs

    def get_yt_comments(self):
        """get comments from youtube

        returns tuple of raw comments and channel id,
        (False, False) if nothing could be extracted
        """
        yt_obs = self.build_yt_obs()
        info_json = YtWrap(yt_obs, config=self.config).extract(self.youtube_id)
        if not info_json:
            return False, False

        comments_raw = info_json.get("comments")
        channel_id = info_json.get("channel_id")
        return comments_raw, channel_id

    def format_comments(self, comments_raw):
        """process comments to match format

        stores the cleaned comments in comments_format, comments that
        clean_comment rejects are skipped, no raw comments gives []
        """
        cleaned = (self.clean_comment(comment) for comment in comments_raw or [])
        self.comments_format = [comment for comment in cleaned if comment]

    def clean_comment(self, comment):
        """parse metadata from comment for indexing

        returns the cleaned comment dict or False if the comment has no
        text, the passed in comment is not modified
        """
        if not comment.get("text"):
            # comment text can be empty
            print(f"{self.youtube_id}: Failed to extract text, {comment}")
            return False

        # timezone aware replacement for the deprecated utcfromtimestamp
        published = datetime.fromtimestamp(
            comment["timestamp"], tz=timezone.utc
        )
        if published.hour == 0 and published.minute == 0:
            # timestamp at 00:00 is shown as date only
            format_string = "%Y-%m-%d"
        else:
            format_string = "%Y-%m-%d %H:%M"

        # fall back to the author id without touching the input dict
        author = comment.get("author") or comment.get("author_id", "Unknown")

        cleaned_comment = {
            "comment_id": comment["id"],
            "comment_text": comment["text"].replace("\xa0", ""),
            "comment_timestamp": comment["timestamp"],
            "comment_time_text": published.strftime(format_string),
            "comment_likecount": comment.get("like_count"),
            "comment_is_favorited": comment.get("is_favorited", False),
            "comment_author": author,
            "comment_author_id": comment["author_id"],
            "comment_author_thumbnail": comment["author_thumbnail"],
            "comment_author_is_uploader": comment.get(
                "author_is_uploader", False
            ),
            "comment_parent": comment["parent"],
        }

        return cleaned_comment

    def upload_comments(self):
        """upload comments to es and update comment_count of the video

        does nothing if comments are deactivated or no document was built
        """
        if not self.is_activated or not self.json_data:
            # without a built document there is nothing valid to upload
            return

        print(f"{self.youtube_id}: upload comments")
        _, _ = ElasticWrap(self.es_path).put(self.json_data)

        vid_path = f"ta_video/_update/{self.youtube_id}"
        data = {"doc": {"comment_count": len(self.comments_format)}}
        _, _ = ElasticWrap(vid_path).post(data=data)

    def delete_comments(self):
        """delete comments from es"""
        print(f"{self.youtube_id}: delete comments")
        _, _ = ElasticWrap(self.es_path).delete(refresh=True)

    def get_es_comments(self):
        """get comments from ES, False if the request returns a 404"""
        response, statuscode = ElasticWrap(self.es_path).get()
        if statuscode == 404:
            print(f"comments: not found {self.youtube_id}")
            return False

        return response.get("_source")

    def reindex_comments(self):
        """update comments from youtube

        the document in es is only replaced if the fresh download
        returned at least one usable comment
        """
        self.check_config()
        if not self.is_activated:
            return

        self.build_json()
        if not self.json_data or not self.comments_format:
            # don't overwrite comments in es with an empty result
            return

        self.delete_comments()
        self.upload_comments()
class CommentList:
    """handle comments for a group of videos"""

    def __init__(self, video_ids, task=False):
        self.video_ids = video_ids
        self.task = task
        self.config = AppConfig().config

    def index(self):
        """download and upload comments per video id, notify task if set"""
        comments_enabled = self.config["downloads"].get("comment_max")
        if not comments_enabled:
            return

        video_count = len(self.video_ids)
        for position, video_id in enumerate(self.video_ids):
            if self.task:
                self.notify(position, video_count)

            video_comments = Comments(video_id, config=self.config)
            video_comments.build_json()
            if not video_comments.json_data:
                # nothing was built for this video, skip the upload
                continue

            video_comments.upload_comments()

    def notify(self, idx, total_videos):
        """send progress of the zero based idx to the attached task"""
        current = idx + 1
        self.task.send_progress(
            [f"Add comments for new videos {current}/{total_videos}"],
            progress=current / total_videos,
        )