Index Comments, #build

Changed:
- added comment download and index
- [API] added comment api endpoints
This commit is contained in:
simon 2022-11-18 11:36:51 +07:00
commit 2fa907c478
No known key found for this signature in database
GPG Key ID: 2C15AA5E89985DD4
18 changed files with 472 additions and 7 deletions

View File

@ -12,6 +12,7 @@ Note:
**Video**
- [Video List](#video-list-view)
- [Video Single](#video-item-view)
- [Video Comments](#video-comment-view)
- [Video Single Progress](#video-progress-view)
- [Video Single Sponsorblock](#sponsor-block-view) WIP
@ -78,6 +79,9 @@ Pass page number as a query parameter: `page=2`. Defaults to *0*, `page=1` is re
## Video Item View
/api/video/\<video_id>/
## Video Comment View
/api/video/\<video_id>/comment/
## Video Progress View
/api/video/\<video_id>/progress

View File

@ -48,6 +48,8 @@ class SearchProcess:
processed = self._process_playlist(result["_source"])
if index == "ta_download":
processed = self._process_download(result["_source"])
if index == "ta_comment":
processed = self._process_comment(result["_source"])
return processed
@ -123,3 +125,17 @@ class SearchProcess:
}
)
return dict(sorted(download_dict.items()))
def _process_comment(self, comment_dict):
    """run on all comments, create reply thread

    Reads the flat list in comment_dict["comment_comments"] and returns
    the list of top level comments, each one extended in place with a
    "comment_replies" list.

    A reply is attached to the top level comment whose "comment_id"
    equals the reply's "comment_parent". If that parent is not part of
    the list, the reply falls back to the most recent top level comment.
    A reply showing up before any top level comment is kept as a top
    level entry instead of raising an IndexError.
    """
    threads = []
    roots_by_id = {}
    for comment in comment_dict["comment_comments"]:
        parent_id = comment["comment_parent"]
        if parent_id == "root" or not threads:
            # second case: orphaned reply ahead of any root comment
            comment.update({"comment_replies": []})
            threads.append(comment)
            roots_by_id[comment.get("comment_id")] = comment
            continue

        # unknown parent id: keep the previous "append to last root" rule
        parent = roots_by_id.get(parent_id, threads[-1])
        parent["comment_replies"].append(comment)

    return threads

View File

@ -18,6 +18,7 @@ from api.views import (
TaskApiView,
VideoApiListView,
VideoApiView,
VideoCommentView,
VideoProgressView,
VideoSponsorView,
)
@ -41,6 +42,11 @@ urlpatterns = [
VideoProgressView.as_view(),
name="api-video-progress",
),
path(
"video/<slug:video_id>/comment/",
VideoCommentView.as_view(),
name="api-video-comment",
),
path(
"video/<slug:video_id>/sponsor/",
VideoSponsorView.as_view(),

View File

@ -145,6 +145,22 @@ class VideoProgressView(ApiBaseView):
return Response(self.response)
class VideoCommentView(ApiBaseView):
    """resolves to /api/video/<video_id>/comment/
    serve the indexed comments of a single video
    GET: return all comments from video with reply threads
    """

    search_base = "ta_comment/_doc/"

    def get(self, request, video_id):
        """look up the comment document for video_id and return it"""
        # pylint: disable=unused-argument
        self.get_document(video_id)
        payload, http_status = self.response, self.status_code

        return Response(payload, status=http_status)
class VideoSponsorView(ApiBaseView):
"""resolves to /api/video/<video_id>/sponsor/
handle sponsor block integration

View File

@ -262,4 +262,4 @@ CORS_ALLOW_HEADERS = list(default_headers) + [
# TA application settings
TA_UPSTREAM = "https://github.com/tubearchivist/tubearchivist"
TA_VERSION = "v0.2.4"
TA_VERSION = "v0.2.5-unstable"

View File

@ -27,6 +27,8 @@
"subtitle": false,
"subtitle_source": false,
"subtitle_index": false,
"comment_max": false,
"comment_sort": "top",
"cookie_import": false,
"throttledratelimit": false,
"integrate_ryd": false,

View File

@ -15,6 +15,7 @@ from home.src.download.subscriptions import PlaylistSubscription
from home.src.download.yt_dlp_base import CookieHandler, YtWrap
from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.index.channel import YoutubeChannel
from home.src.index.comments import Comments
from home.src.index.playlist import YoutubePlaylist
from home.src.index.video import YoutubeVideo, index_new_video
from home.src.ta.config import AppConfig
@ -39,6 +40,7 @@ class DownloadPostProcess:
self.auto_delete_all()
self.auto_delete_overwrites()
self.validate_playlists()
self.get_comments()
def auto_delete_all(self):
"""handle auto delete"""
@ -139,6 +141,16 @@ class DownloadPostProcess:
RedisArchivist().set_message(key, mess_dict, expire=expire)
def get_comments(self):
    """fetch and index comments for every id in self.download.videos"""
    config = self.download.config
    if not config["downloads"]["comment_max"]:
        # comment download is switched off in the config
        return

    for video_id in self.download.videos:
        handler = Comments(video_id, config=config)
        handler.build_json()
        handler.upload_comments()
class VideoDownloader:
"""
@ -155,6 +167,7 @@ class VideoDownloader:
self.config = AppConfig().config
self._build_obs()
self.channels = set()
self.videos = set()
def run_queue(self):
"""setup download queue in redis loop until no more items"""
@ -187,6 +200,7 @@ class VideoDownloader:
youtube_id, video_overwrites=self.video_overwrites
)
self.channels.add(vid_dict["channel"]["channel_id"])
self.videos.add(vid_dict["youtube_id"])
mess_dict = {
"status": self.MSG,
"level": "info",

View File

@ -193,6 +193,9 @@
}
}
},
"comment_count": {
"type": "long"
},
"stats" : {
"properties" : {
"average_rating" : {
@ -460,6 +463,75 @@
},
"number_of_replicas": "0"
}
},
{
"index_name": "comment",
"expected_map": {
"youtube_id": {
"type": "keyword"
},
"comment_last_refresh": {
"type": "date"
},
"comment_channel_id": {
"type": "keyword"
},
"comment_comments": {
"properties": {
"comment_id": {
"type": "keyword"
},
"comment_text": {
"type" : "text"
},
"comment_timestamp": {
"type": "date"
},
"comment_time_text": {
"type" : "text"
},
"comment_likecount": {
"type": "long"
},
"comment_is_favorited": {
"type": "boolean"
},
"comment_author": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "to_lower"
}
}
},
"comment_author_id": {
"type": "keyword"
},
"comment_author_thumbnail": {
"type": "keyword"
},
"comment_author_is_uploader": {
"type": "boolean"
},
"comment_parent": {
"type": "keyword"
}
}
}
},
"expected_set": {
"analysis": {
"normalizer": {
"to_lower": {
"type": "custom",
"filter": ["lowercase"]
}
}
},
"number_of_replicas": "0"
}
}
]
}

View File

@ -92,6 +92,12 @@ class ApplicationSettingsForm(forms.Form):
("1", "enable subtitle index"),
]
COMMENT_SORT_CHOICES = [
("", "-- change comments sort settings --"),
("top", "sort comments by top"),
("new", "sort comments by new"),
]
COOKIE_IMPORT_CHOICES = [
("", "-- change cookie settings"),
("0", "disable cookie"),
@ -120,6 +126,10 @@ class ApplicationSettingsForm(forms.Form):
downloads_subtitle_index = forms.ChoiceField(
widget=forms.Select, choices=SUBTITLE_INDEX_CHOICES, required=False
)
downloads_comment_max = forms.CharField(required=False)
downloads_comment_sort = forms.ChoiceField(
widget=forms.Select, choices=COMMENT_SORT_CHOICES, required=False
)
downloads_cookie_import = forms.ChoiceField(
widget=forms.Select, choices=COOKIE_IMPORT_CHOICES, required=False
)

View File

@ -50,7 +50,7 @@ class ChannelScraper:
url = f"https://www.youtube.com/channel/{self.channel_id}/about?hl=en"
cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"}
response = requests.get(
url, cookies=cookies, headers=requests_headers()
url, cookies=cookies, headers=requests_headers(), timeout=10
)
if response.ok:
channel_page = response.text
@ -275,6 +275,15 @@ class YoutubeChannel(YouTubeItem):
}
_, _ = ElasticWrap("ta_video/_delete_by_query").post(data)
def delete_es_comments(self):
    """remove all comment documents of this channel from es"""
    # comment documents store the channel id in comment_channel_id
    channel_filter = {"comment_channel_id": {"value": self.youtube_id}}
    query = {"query": {"term": channel_filter}}
    _, _ = ElasticWrap("ta_comment/_delete_by_query").post(query)
def delete_playlists(self):
"""delete all indexed playlist from es"""
all_playlists = self.get_indexed_playlists()
@ -301,6 +310,7 @@ class YoutubeChannel(YouTubeItem):
self.delete_playlists()
print(f"{self.youtube_id}: delete indexed videos")
self.delete_es_videos()
self.delete_es_comments()
self.del_in_es()
def index_channel_playlists(self):

View File

@ -0,0 +1,158 @@
"""
Functionality:
- Download comments
- Index comments in ES
- Retrieve comments from ES
"""
from datetime import datetime, timezone

from home.src.download.yt_dlp_base import YtWrap
from home.src.es.connect import ElasticWrap
from home.src.ta.config import AppConfig
class Comments:
    """hold all comments functionality

    One instance handles the comments of a single video: download from
    youtube through YtWrap, reshape to the ta_comment document format and
    store, read or delete that document in ES.

    Attributes:
        youtube_id: id of the video the comments belong to.
        es_path: ES document path, ta_comment/_doc/<youtube_id>.
        json_data: document built by build_json, False until then.
        config: application config, read from AppConfig if not passed in.
        is_activated: True once check_config found a truthy comment_max.
        comments_format: list of cleaned comments, False until formatted.
    """

    def __init__(self, youtube_id, config=False):
        self.youtube_id = youtube_id
        self.es_path = f"ta_comment/_doc/{youtube_id}"
        self.json_data = False
        self.config = config
        self.is_activated = False
        self.comments_format = False

    def build_json(self):
        """build json document for es, no-op if comments are deactivated"""
        print(f"{self.youtube_id}: get comments")
        self.check_config()
        if not self.is_activated:
            return

        comments_raw, channel_id = self.get_yt_comments()
        self.format_comments(comments_raw)

        self.json_data = {
            "youtube_id": self.youtube_id,
            # timestamp() instead of the platform specific strftime("%s")
            "comment_last_refresh": int(datetime.now().timestamp()),
            "comment_channel_id": channel_id,
            "comment_comments": self.comments_format,
        }

    def check_config(self):
        """read config if not attached, set is_activated from comment_max"""
        if not self.config:
            self.config = AppConfig().config

        self.is_activated = bool(self.config["downloads"]["comment_max"])

    def build_yt_obs(self):
        """
        get extractor config
        max-comments,max-parents,max-replies,max-replies-per-thread
        """
        # str() keeps a numeric config value from breaking the split
        max_comments = str(self.config["downloads"]["comment_max"])
        max_comments_list = [i.strip() for i in max_comments.split(",")]
        comment_sort = self.config["downloads"]["comment_sort"]

        yt_obs = {
            "skip_download": True,
            "quiet": False,
            "getcomments": True,
            "extractor_args": {
                "youtube": {
                    "max_comments": max_comments_list,
                    "comment_sort": [comment_sort],
                }
            },
        }

        return yt_obs

    def get_yt_comments(self):
        """get comments from youtube

        Returns a tuple of the raw comment list and the channel id, either
        one can be None if missing from the extracted info.
        """
        yt_obs = self.build_yt_obs()
        info_json = YtWrap(yt_obs).extract(self.youtube_id)
        if not info_json:
            # NOTE(review): presumably extract() returns a falsy value when
            # the extraction fails - confirm against YtWrap
            return None, None

        comments_raw = info_json.get("comments")
        channel_id = info_json.get("channel_id")

        return comments_raw, channel_id

    def format_comments(self, comments_raw):
        """process comments to match format, None results in an empty list"""
        self.comments_format = [
            self.clean_comment(comment) for comment in comments_raw or []
        ]

    def clean_comment(self, comment):
        """parse metadata from comment for indexing"""
        time_text_datetime = datetime.fromtimestamp(
            comment["timestamp"], tz=timezone.utc
        )

        # a timestamp at exactly midnight utc is shown as date only
        if time_text_datetime.hour == 0 and time_text_datetime.minute == 0:
            format_string = "%Y-%m-%d"
        else:
            format_string = "%Y-%m-%d %H:%M"

        time_text = time_text_datetime.strftime(format_string)

        cleaned_comment = {
            "comment_id": comment["id"],
            "comment_text": comment["text"].replace("\xa0", ""),
            "comment_timestamp": comment["timestamp"],
            "comment_time_text": time_text,
            "comment_likecount": comment["like_count"],
            "comment_is_favorited": comment["is_favorited"],
            "comment_author": comment["author"],
            "comment_author_id": comment["author_id"],
            "comment_author_thumbnail": comment["author_thumbnail"],
            "comment_author_is_uploader": comment["author_is_uploader"],
            "comment_parent": comment["parent"],
        }

        return cleaned_comment

    def upload_comments(self):
        """upload comments to es and store comment_count on the video"""
        if not self.is_activated or not self.json_data:
            # nothing was built, don't send an empty body to es
            return

        _, _ = ElasticWrap(self.es_path).put(self.json_data)

        vid_path = f"ta_video/_update/{self.youtube_id}"
        data = {"doc": {"comment_count": len(self.comments_format)}}
        _, _ = ElasticWrap(vid_path).post(data=data)

    def delete_comments(self):
        """delete comments from es"""
        print(f"{self.youtube_id}: delete comments")
        _, _ = ElasticWrap(self.es_path).delete(refresh=True)

    def get_es_comments(self):
        """get comments from ES, False if the document does not exist"""
        response, statuscode = ElasticWrap(self.es_path).get()
        if statuscode == 404:
            print(f"comments: not found {self.youtube_id}")
            return False

        return response.get("_source")

    def reindex_comments(self):
        """update comments from youtube

        Replaces the indexed document with a fresh download. Existing
        comments in ES are kept when the download came back empty.
        """
        self.check_config()
        if not self.is_activated:
            return

        self.build_json()
        es_comments = self.get_es_comments()
        # es_comments is False on 404, must not be subscripted
        has_indexed = bool(es_comments and es_comments.get("comment_comments"))

        if not self.comments_format and has_indexed:
            # don't overwrite comments in es
            return

        if es_comments:
            self.delete_comments()

        self.upload_comments()

View File

@ -16,6 +16,7 @@ from home.src.download.yt_dlp_base import CookieHandler
from home.src.download.yt_dlp_handler import VideoDownloader
from home.src.es.connect import ElasticWrap
from home.src.index.channel import YoutubeChannel
from home.src.index.comments import Comments
from home.src.index.playlist import YoutubePlaylist
from home.src.index.video import YoutubeVideo
from home.src.ta.config import AppConfig
@ -147,8 +148,7 @@ class Reindex:
if integrate_ryd:
self._get_unrated_vids()
@staticmethod
def _reindex_single_video(youtube_id):
def _reindex_single_video(self, youtube_id):
"""refresh data for single video"""
video = YoutubeVideo(youtube_id)
@ -182,6 +182,8 @@ class Reindex:
thumb_handler.delete_video_thumb()
thumb_handler.download_video_thumb(video.json_data["vid_thumb_url"])
Comments(youtube_id, config=self.config).reindex_comments()
return
@staticmethod

View File

@ -11,6 +11,7 @@ import requests
from django.conf import settings
from home.src.es.connect import ElasticWrap
from home.src.index import channel as ta_channel
from home.src.index import comments as ta_comments
from home.src.index import playlist as ta_playlist
from home.src.index.generic import YouTubeItem
from home.src.index.subtitle import YoutubeSubtitle
@ -302,6 +303,7 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
self.del_in_playlists()
self.del_in_es()
self.delete_subtitles()
self.delete_comments()
def del_in_playlists(self):
"""remove downloaded in playlist"""
@ -326,6 +328,13 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
print(f"{self.youtube_id}: delete subtitles")
YoutubeSubtitle(self).delete(subtitles=subtitles)
def delete_comments(self):
    """drop the indexed comments of this video if comments are activated"""
    handler = ta_comments.Comments(self.youtube_id, config=self.config)
    handler.check_config()
    if not handler.is_activated:
        return

    handler.delete_comments()
def _get_ryd_stats(self):
"""get optional stats from returnyoutubedislikeapi.com"""
# pylint: disable=broad-except

View File

@ -114,6 +114,24 @@
{{ app_form.downloads_subtitle_index }}
</div>
</div>
<div class="settings-group">
<h2 id="comments">Comments</h2>
<div class="settings-item">
<p>Download and index comments: <span class="settings-current">{{ config.downloads.comment_max }}</span><br>
<i>Follow the yt-dlp max_comments documentation, <a href="https://github.com/yt-dlp/yt-dlp#youtube" target="_blank">max-comments,max-parents,max-replies,max-replies-per-thread</a>:</i><br>
<p>Example configurations:</p>
<ul>
<li><span class="settings-current">all,100,all,30</span>: Get 100 max-parents and 30 max-replies-per-thread.</li>
<li><span class="settings-current">1000,all,all,50</span>: Get a total of 1000 comments over all, 50 replies per thread.</li>
</ul>
{{ app_form.downloads_comment_max }}</p>
</div>
<div class="settings-item">
<p>Selected comment sort method: <span class="settings-current">{{ config.downloads.comment_sort }}</span><br>
<i>Select the sort order used when downloading comments:</i><br>
{{ app_form.downloads_comment_sort }}</p>
</div>
</div>
<div class="settings-group">
<h2 id="format">Cookie</h2>
<div class="settings-item">

View File

@ -123,6 +123,14 @@
</div>
{% endfor %}
{% endif %}
{% if video.comment_count %}
<div class="comments-section">
<h3>Comments: {{video.comment_count}}</h3>
<div id="comments-list" class="comments-list">
</div>
</div>
<script>getComments('{{ video.youtube_id }}')</script>
{% endif %}
</div>
<script>
var videoData = getVideoData('{{ video.youtube_id }}');

View File

@ -619,7 +619,8 @@ video:-webkit-full-screen {
margin-top: 1rem;
}
.description-box {
.description-box,
.comments-section {
margin-top: 1rem;
padding: 15px;
background-color: var(--highlight-bg);
@ -778,10 +779,14 @@ video:-webkit-full-screen {
margin-left: 5px;
}
.thumb-icon {
display: flex;
}
.thumb-icon img,
.rating-stars img {
width: 20px;
margin: 0;
margin: 0 5px;
filter: var(--img-filter);
}
@ -819,6 +824,37 @@ video:-webkit-full-screen {
width: 100%;
}
.comment-box {
padding-bottom: 1rem;
}
.comments-replies {
padding-left: 3rem;
margin-top: 1rem;
}
.comment-highlight {
background-color: var(--main-font);
padding: 3px;
color: var(--accent-font-dark);
font-family: Sen-bold, sans-serif;
width: fit-content;
}
.comment-meta {
display: flex;
}
.space-carrot {
margin: 0 5px;
}
.comment-like img {
width: 20px;
margin-left: 5px;
filter: var(--img-filter-error);
}
/* multi search page */
.multi-search-box {
padding-right: 20px;

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Generator: Adobe Illustrator 26.5.0, SVG Export Plug-In . SVG Version: 6.00 Build 0) -->
<svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
viewBox="0 0 500 500" style="enable-background:new 0 0 500 500;" xml:space="preserve">
<path d="M499.9,159L499.9,159c0.1-1.7,0.1-3.4,0.1-5.2c0-69.5-58.6-129.7-130.9-129.7c-52.9,0-98.4,34-119,77.4h0
c-20.7-43.4-66.2-77.4-119-77.4C58.6,24.1,0,84.4,0,153.9c0,1.7,0.1,3.4,0.1,5.2h0c0,0-7.4,82.6,84.5,172.7
c41.8,41.9,88.5,81.6,165.4,144.1c76.9-62.5,123.6-102.3,165.4-144.1C507.2,241.6,499.9,159,499.9,159z"/>
</svg>

After

Width:  |  Height:  |  Size: 684 B

View File

@ -287,7 +287,7 @@ function resetToken() {
document.getElementById('text-reveal').replaceWith(message);
}
// restore from snapshot
// restore from snapshot
function restoreSnapshot(snapshotId) {
console.log('restore ' + snapshotId);
let apiEndpoint = '/api/snapshot/' + snapshotId + '/';
@ -1104,6 +1104,82 @@ function createFulltext(fullText) {
return fullTextDiv;
}
// request the comments of a video from the api and render them
function getComments(videoId) {
  const apiEndpoint = `/api/video/${videoId}/comment/`;
  writeComments(apiRequest(apiEndpoint, 'GET').data);
}
// append every root comment, followed by its replies, to #comments-list
function writeComments(allComments) {
  const commentsListBox = document.getElementById('comments-list');
  for (const rootComment of allComments) {
    const commentBox = createCommentBox(rootComment, true);
    const replies = rootComment.comment_replies;
    // a reply container is only added when there is at least one reply
    if (replies && replies.length > 0) {
      const commentReplyBox = document.createElement('div');
      commentReplyBox.setAttribute('class', 'comments-replies');
      for (const commentReply of replies) {
        commentReplyBox.appendChild(createCommentBox(commentReply, false));
      }
      commentBox.appendChild(commentReplyBox);
    }
    commentsListBox.appendChild(commentBox);
  }
}
// build the element for a single comment: author, text and a meta line
// with time, like count and favorited marker
// isRoot: true adds the class root-comment, false adds reply-comment
function createCommentBox(comment, isRoot) {
  let commentBox = document.createElement('div');
  commentBox.setAttribute('class', 'comment-box');

  // classList.add is a method, assigning to it never applied the class
  commentBox.classList.add(isRoot ? 'root-comment' : 'reply-comment');

  let commentAuthor = document.createElement('h3');
  commentAuthor.innerText = comment.comment_author;
  if (comment.comment_author_is_uploader) {
    // highlight comments written by the uploader of the video
    commentAuthor.setAttribute('class', 'comment-highlight');
  }
  commentBox.appendChild(commentAuthor);

  // innerText keeps the comment text from being parsed as markup
  let commentText = document.createElement('p');
  commentText.innerText = comment.comment_text;
  commentBox.appendChild(commentText);

  const spacer = '<span class="space-carrot">|</span>';
  let commentMeta = document.createElement('div');
  commentMeta.setAttribute('class', 'comment-meta');

  commentMeta.innerHTML = `<span>${comment.comment_time_text}</span>`;

  if (comment.comment_likecount > 0) {
    let numberFormatted = formatNumbers(comment.comment_likecount);
    commentMeta.innerHTML += `${spacer}<span class="thumb-icon"><img src="/static/img/icon-thumb.svg"> ${numberFormatted}</span>`;
  }

  if (comment.comment_is_favorited) {
    commentMeta.innerHTML += `${spacer}<span class="comment-like"><img src="/static/img/icon-heart.svg"></span>`;
  }

  commentBox.appendChild(commentMeta);

  return commentBox;
}
// generic
function sendPost(payload) {