implement basic comment archive

simon 2022-11-12 11:48:48 +07:00
parent 3c5302336f
commit 5b7e3e877b
GPG Key ID: 2C15AA5E89985DD4
2 changed files with 167 additions and 0 deletions


@@ -460,6 +460,72 @@
},
"number_of_replicas": "0"
}
},
{
"index_name": "comments",
"expected_map": {
"youtube_id": {
"type": "keyword"
},
"comment_last_refresh": {
"type": "date"
},
"comment_comments": {
"properties": {
"comment_id": {
"type": "keyword"
},
"comment_text": {
"type" : "text"
},
"comment_timestamp": {
"type": "date"
},
"comment_time_text": {
"type" : "text"
},
"comment_likecount": {
"type": "long"
},
"comment_is_favorited": {
"type": "boolean"
},
"comment_author": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower"
}
}
},
"comment_author_id": {
"type": "keyword"
},
"comment_author_thumbnail": {
"type": "keyword"
},
"comment_author_is_uploader": {
"type": "boolean"
},
"comment_parent": {
"type": "keyword"
}
}
}
},
"expected_set": {
"analysis": {
"normalizer": {
"to_lower": {
"type": "custom",
"filter": ["lowercase"]
}
}
},
"number_of_replicas": "0"
}
}
]
}
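For reference, a single document that fits this mapping would look roughly like the sketch below; the values are purely illustrative, and the field shape follows what clean_comment() in the new comments.py produces. The to_lower normalizer means comment_author.keyword is stored lowercased, so author filters and aggregations behave case-insensitively.

# illustrative only: shape of one indexed comment document
example_doc = {
    "youtube_id": "abc123xyz00",
    "comment_last_refresh": 1668211200,
    "comment_comments": [
        {
            "comment_id": "UgzExampleCommentId",
            "comment_text": "great video",
            "comment_timestamp": 1668200000,
            "comment_time_text": "2022-11-11 20:53:20",
            "comment_likecount": 3,
            "comment_is_favorited": False,
            "comment_author": "Some Viewer",
            "comment_author_id": "UCexampleChannelId",
            "comment_author_thumbnail": "https://example.com/thumb.jpg",
            "comment_author_is_uploader": False,
            "comment_parent": "root",
        }
    ],
}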


@@ -0,0 +1,101 @@
"""
Functionality:
- Download comments
- Index comments in ES
- Retrieve comments from ES
"""

from datetime import datetime

from home.src.download.yt_dlp_base import YtWrap
from home.src.es.connect import ElasticWrap


class Comments:
    """hold all comments functionality"""

    def __init__(self, youtube_id):
        self.youtube_id = youtube_id
        self.es_path = f"ta_comments/_doc/{youtube_id}"
        # max-comments,max-parents,max-replies,max-replies-per-thread
        self.max_comments = "all,100,all,30"
        self.json_data = False

    def build_json(self):
        """build json document for es"""
        comments_raw = self.get_comments()
        comments_format = self.format_comments(comments_raw)
        self.json_data = {
            "youtube_id": self.youtube_id,
            # unix timestamp (seconds) of this refresh
            "comment_last_refresh": int(datetime.now().timestamp()),
            "comment_comments": comments_format,
        }

    def build_yt_obs(self):
        """
        get extractor config
        max-comments,max-parents,max-replies,max-replies-per-thread
        """
        max_comments_list = [i.strip() for i in self.max_comments.split(",")]
        comment_sort = "top"
        yt_obs = {
            "skip_download": True,
            "quiet": False,
            "getcomments": True,
            "extractor_args": {
                "youtube": {
                    "max_comments": max_comments_list,
                    "comment_sort": [comment_sort],
                }
            },
        }
        return yt_obs

    def get_comments(self):
        """get comments from youtube"""
        print(f"comments: get comments with format {self.max_comments}")
        yt_obs = self.build_yt_obs()
        info_json = YtWrap(yt_obs).extract(self.youtube_id)
        comments_raw = info_json.get("comments")
        return comments_raw

    def format_comments(self, comments_raw):
        """process comments to match format"""
        comments = []
        # guard against None when no comments are returned
        for comment in comments_raw or []:
            cleaned_comment = self.clean_comment(comment)
            comments.append(cleaned_comment)
        return comments

    def clean_comment(self, comment):
        """parse metadata from comment for indexing"""
        time_text_datetime = datetime.utcfromtimestamp(comment["timestamp"])
        time_text = time_text_datetime.strftime("%Y-%m-%d %H:%M:%S")
        cleaned_comment = {
            "comment_id": comment["id"],
            "comment_text": comment["text"].replace("\xa0", ""),
            "comment_timestamp": comment["timestamp"],
            "comment_time_text": time_text,
            "comment_likecount": comment["like_count"],
            "comment_is_favorited": comment["is_favorited"],
            "comment_author": comment["author"],
            "comment_author_id": comment["author_id"],
            "comment_author_thumbnail": comment["author_thumbnail"],
            "comment_author_is_uploader": comment["author_is_uploader"],
            "comment_parent": comment["parent"],
        }
        return cleaned_comment

    def upload_comments(self):
        """upload comments to es"""
        _, _ = ElasticWrap(self.es_path).put(self.json_data)

    def delete_comments(self):
        """delete comments from es"""
        _, _ = ElasticWrap(self.es_path).delete()
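
A minimal usage sketch, assuming a reachable Elasticsearch instance behind ElasticWrap; the video ID below is hypothetical:

comments = Comments("abc123xyz00")
comments.build_json()
if comments.json_data:
    comments.upload_comments()

Since es_path targets ta_comments/_doc/<youtube_id>, re-running the archive for a video should replace its existing comment document rather than add a duplicate, assuming ElasticWrap.put maps to a document PUT.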