diff --git a/tubearchivist/api/README.md b/tubearchivist/api/README.md index 6e5efb4..593089d 100644 --- a/tubearchivist/api/README.md +++ b/tubearchivist/api/README.md @@ -12,6 +12,7 @@ Note: **Video** - [Video List](#video-list-view) - [Video Single](#video-item-view) +- [Video Comments](#video-comment-view) - [Video Single Progress](#video-progress-view) - [Video Single Sponsorblock](#sponsor-block-view) WIP @@ -78,6 +79,9 @@ Pass page number as a query parameter: `page=2`. Defaults to *0*, `page=1` is re ## Video Item View /api/video/\/ +## Video Comment View +/api/video/\/comment/ + ## Video Progress View /api/video/\/progress diff --git a/tubearchivist/api/src/search_processor.py b/tubearchivist/api/src/search_processor.py index 7a41163..7b594f6 100644 --- a/tubearchivist/api/src/search_processor.py +++ b/tubearchivist/api/src/search_processor.py @@ -48,6 +48,8 @@ class SearchProcess: processed = self._process_playlist(result["_source"]) if index == "ta_download": processed = self._process_download(result["_source"]) + if index == "ta_comment": + processed = self._process_comment(result["_source"]) return processed @@ -123,3 +125,17 @@ class SearchProcess: } ) return dict(sorted(download_dict.items())) + + def _process_comment(self, comment_dict): + """run on all comments, create reply thread""" + all_comments = comment_dict["comment_comments"] + processed_comments = [] + + for comment in all_comments: + if comment["comment_parent"] == "root": + comment.update({"comment_replies": []}) + processed_comments.append(comment) + else: + processed_comments[-1]["comment_replies"].append(comment) + + return processed_comments diff --git a/tubearchivist/api/urls.py b/tubearchivist/api/urls.py index e84ec05..7fb48a0 100644 --- a/tubearchivist/api/urls.py +++ b/tubearchivist/api/urls.py @@ -18,6 +18,7 @@ from api.views import ( TaskApiView, VideoApiListView, VideoApiView, + VideoCommentView, VideoProgressView, VideoSponsorView, ) @@ -41,6 +42,11 @@ urlpatterns = [ 
VideoProgressView.as_view(), name="api-video-progress", ), + path( + "video//comment/", + VideoCommentView.as_view(), + name="api-video-comment", + ), path( "video//sponsor/", VideoSponsorView.as_view(), diff --git a/tubearchivist/api/views.py b/tubearchivist/api/views.py index 2574db6..dab62e3 100644 --- a/tubearchivist/api/views.py +++ b/tubearchivist/api/views.py @@ -145,6 +145,22 @@ class VideoProgressView(ApiBaseView): return Response(self.response) +class VideoCommentView(ApiBaseView): + """resolves to /api/video//comment/ + handle video comments + GET: return all comments from video with reply threads + """ + + search_base = "ta_comment/_doc/" + + def get(self, request, video_id): + """get video comments""" + # pylint: disable=unused-argument + self.get_document(video_id) + + return Response(self.response, status=self.status_code) + + class VideoSponsorView(ApiBaseView): """resolves to /api/video//sponsor/ handle sponsor block integration diff --git a/tubearchivist/config/settings.py b/tubearchivist/config/settings.py index 5721e98..1f350f0 100644 --- a/tubearchivist/config/settings.py +++ b/tubearchivist/config/settings.py @@ -262,4 +262,4 @@ CORS_ALLOW_HEADERS = list(default_headers) + [ # TA application settings TA_UPSTREAM = "https://github.com/tubearchivist/tubearchivist" -TA_VERSION = "v0.2.4" +TA_VERSION = "v0.2.5-unstable" diff --git a/tubearchivist/home/config.json b/tubearchivist/home/config.json index a953a49..c8450d0 100644 --- a/tubearchivist/home/config.json +++ b/tubearchivist/home/config.json @@ -27,6 +27,8 @@ "subtitle": false, "subtitle_source": false, "subtitle_index": false, + "comment_max": false, + "comment_sort": "top", "cookie_import": false, "throttledratelimit": false, "integrate_ryd": false, diff --git a/tubearchivist/home/src/download/yt_dlp_handler.py b/tubearchivist/home/src/download/yt_dlp_handler.py index 4dcfe02..61a8c2f 100644 --- a/tubearchivist/home/src/download/yt_dlp_handler.py +++ 
b/tubearchivist/home/src/download/yt_dlp_handler.py @@ -15,6 +15,7 @@ from home.src.download.subscriptions import PlaylistSubscription from home.src.download.yt_dlp_base import CookieHandler, YtWrap from home.src.es.connect import ElasticWrap, IndexPaginate from home.src.index.channel import YoutubeChannel +from home.src.index.comments import Comments from home.src.index.playlist import YoutubePlaylist from home.src.index.video import YoutubeVideo, index_new_video from home.src.ta.config import AppConfig @@ -39,6 +40,7 @@ class DownloadPostProcess: self.auto_delete_all() self.auto_delete_overwrites() self.validate_playlists() + self.get_comments() def auto_delete_all(self): """handle auto delete""" @@ -139,6 +141,16 @@ class DownloadPostProcess: RedisArchivist().set_message(key, mess_dict, expire=expire) + def get_comments(self): + """get comments from youtube""" + if not self.download.config["downloads"]["comment_max"]: + return + + for video_id in self.download.videos: + comment = Comments(video_id, config=self.download.config) + comment.build_json() + comment.upload_comments() + class VideoDownloader: """ @@ -155,6 +167,7 @@ class VideoDownloader: self.config = AppConfig().config self._build_obs() self.channels = set() + self.videos = set() def run_queue(self): """setup download queue in redis loop until no more items""" @@ -187,6 +200,7 @@ class VideoDownloader: youtube_id, video_overwrites=self.video_overwrites ) self.channels.add(vid_dict["channel"]["channel_id"]) + self.videos.add(vid_dict["youtube_id"]) mess_dict = { "status": self.MSG, "level": "info", diff --git a/tubearchivist/home/src/es/index_mapping.json b/tubearchivist/home/src/es/index_mapping.json index f023eef..7270563 100644 --- a/tubearchivist/home/src/es/index_mapping.json +++ b/tubearchivist/home/src/es/index_mapping.json @@ -193,6 +193,9 @@ } } }, + "comment_count": { + "type": "long" + }, "stats" : { "properties" : { "average_rating" : { @@ -460,6 +463,75 @@ }, "number_of_replicas": "0" } + 
}, + { + "index_name": "comment", + "expected_map": { + "youtube_id": { + "type": "keyword" + }, + "comment_last_refresh": { + "type": "date" + }, + "comment_channel_id": { + "type": "keyword" + }, + "comment_comments": { + "properties": { + "comment_id": { + "type": "keyword" + }, + "comment_text": { + "type" : "text" + }, + "comment_timestamp": { + "type": "date" + }, + "comment_time_text": { + "type" : "text" + }, + "comment_likecount": { + "type": "long" + }, + "comment_is_favorited": { + "type": "boolean" + }, + "comment_author": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256, + "normalizer": "to_lower" + } + } + }, + "comment_author_id": { + "type": "keyword" + }, + "comment_author_thumbnail": { + "type": "keyword" + }, + "comment_author_is_uploader": { + "type": "boolean" + }, + "comment_parent": { + "type": "keyword" + } + } + } + }, + "expected_set": { + "analysis": { + "normalizer": { + "to_lower": { + "type": "custom", + "filter": ["lowercase"] + } + } + }, + "number_of_replicas": "0" + } } ] } \ No newline at end of file diff --git a/tubearchivist/home/src/frontend/forms.py b/tubearchivist/home/src/frontend/forms.py index 1e4365c..79cecbe 100644 --- a/tubearchivist/home/src/frontend/forms.py +++ b/tubearchivist/home/src/frontend/forms.py @@ -92,6 +92,12 @@ class ApplicationSettingsForm(forms.Form): ("1", "enable subtitle index"), ] + COMMENT_SORT_CHOICES = [ + ("", "-- change comments sort settings --"), + ("top", "sort comments by top"), + ("new", "sort comments by new"), + ] + COOKIE_IMPORT_CHOICES = [ ("", "-- change cookie settings"), ("0", "disable cookie"), @@ -120,6 +126,10 @@ class ApplicationSettingsForm(forms.Form): downloads_subtitle_index = forms.ChoiceField( widget=forms.Select, choices=SUBTITLE_INDEX_CHOICES, required=False ) + downloads_comment_max = forms.CharField(required=False) + downloads_comment_sort = forms.ChoiceField( + widget=forms.Select, choices=COMMENT_SORT_CHOICES, required=False 
+ ) downloads_cookie_import = forms.ChoiceField( widget=forms.Select, choices=COOKIE_IMPORT_CHOICES, required=False ) diff --git a/tubearchivist/home/src/index/channel.py b/tubearchivist/home/src/index/channel.py index 7a554a9..7108d43 100644 --- a/tubearchivist/home/src/index/channel.py +++ b/tubearchivist/home/src/index/channel.py @@ -50,7 +50,7 @@ class ChannelScraper: url = f"https://www.youtube.com/channel/{self.channel_id}/about?hl=en" cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"} response = requests.get( - url, cookies=cookies, headers=requests_headers() + url, cookies=cookies, headers=requests_headers(), timeout=10 ) if response.ok: channel_page = response.text @@ -275,6 +275,15 @@ class YoutubeChannel(YouTubeItem): } _, _ = ElasticWrap("ta_video/_delete_by_query").post(data) + def delete_es_comments(self): + """delete all comments from this channel""" + data = { + "query": { + "term": {"comment_channel_id": {"value": self.youtube_id}} + } + } + _, _ = ElasticWrap("ta_comment/_delete_by_query").post(data) + def delete_playlists(self): """delete all indexed playlist from es""" all_playlists = self.get_indexed_playlists() @@ -301,6 +310,7 @@ class YoutubeChannel(YouTubeItem): self.delete_playlists() print(f"{self.youtube_id}: delete indexed videos") self.delete_es_videos() + self.delete_es_comments() self.del_in_es() def index_channel_playlists(self): diff --git a/tubearchivist/home/src/index/comments.py b/tubearchivist/home/src/index/comments.py new file mode 100644 index 0000000..0ac2436 --- /dev/null +++ b/tubearchivist/home/src/index/comments.py @@ -0,0 +1,158 @@ +""" +Functionality: +- Download comments +- Index comments in ES +- Retrieve comments from ES +""" + +from datetime import datetime + +from home.src.download.yt_dlp_base import YtWrap +from home.src.es.connect import ElasticWrap +from home.src.ta.config import AppConfig + + +class Comments: + """hold all comments functionality""" + + def __init__(self, youtube_id, config=False): + 
self.youtube_id = youtube_id + self.es_path = f"ta_comment/_doc/{youtube_id}" + self.json_data = False + self.config = config + self.is_activated = False + self.comments_format = False + + def build_json(self): + """build json document for es""" + print(f"{self.youtube_id}: get comments") + self.check_config() + + if not self.is_activated: + return + + comments_raw, channel_id = self.get_yt_comments() + self.format_comments(comments_raw) + + self.json_data = { + "youtube_id": self.youtube_id, + "comment_last_refresh": int(datetime.now().strftime("%s")), + "comment_channel_id": channel_id, + "comment_comments": self.comments_format, + } + + def check_config(self): + """read config if not attached""" + if not self.config: + self.config = AppConfig().config + + self.is_activated = bool(self.config["downloads"]["comment_max"]) + + def build_yt_obs(self): + """ + get extractor config + max-comments,max-parents,max-replies,max-replies-per-thread + """ + max_comments = self.config["downloads"]["comment_max"] + max_comments_list = [i.strip() for i in max_comments.split(",")] + comment_sort = self.config["downloads"]["comment_sort"] + + yt_obs = { + "skip_download": True, + "quiet": False, + "getcomments": True, + "extractor_args": { + "youtube": { + "max_comments": max_comments_list, + "comment_sort": [comment_sort], + } + }, + } + + return yt_obs + + def get_yt_comments(self): + """get comments from youtube""" + yt_obs = self.build_yt_obs() + info_json = YtWrap(yt_obs).extract(self.youtube_id) + comments_raw = info_json.get("comments") + channel_id = info_json.get("channel_id") + return comments_raw, channel_id + + def format_comments(self, comments_raw): + """process comments to match format""" + comments = [] + + for comment in comments_raw: + cleaned_comment = self.clean_comment(comment) + comments.append(cleaned_comment) + + self.comments_format = comments + + def clean_comment(self, comment): + """parse metadata from comment for indexing""" + time_text_datetime = 
datetime.utcfromtimestamp(comment["timestamp"]) + + if time_text_datetime.hour == 0 and time_text_datetime.minute == 0: + format_string = "%Y-%m-%d" + else: + format_string = "%Y-%m-%d %H:%M" + + time_text = time_text_datetime.strftime(format_string) + + cleaned_comment = { + "comment_id": comment["id"], + "comment_text": comment["text"].replace("\xa0", ""), + "comment_timestamp": comment["timestamp"], + "comment_time_text": time_text, + "comment_likecount": comment["like_count"], + "comment_is_favorited": comment["is_favorited"], + "comment_author": comment["author"], + "comment_author_id": comment["author_id"], + "comment_author_thumbnail": comment["author_thumbnail"], + "comment_author_is_uploader": comment["author_is_uploader"], + "comment_parent": comment["parent"], + } + + return cleaned_comment + + def upload_comments(self): + """upload comments to es""" + if not self.is_activated: + return + + _, _ = ElasticWrap(self.es_path).put(self.json_data) + + vid_path = f"ta_video/_update/{self.youtube_id}" + data = {"doc": {"comment_count": len(self.comments_format)}} + _, _ = ElasticWrap(vid_path).post(data=data) + + def delete_comments(self): + """delete comments from es""" + print(f"{self.youtube_id}: delete comments") + _, _ = ElasticWrap(self.es_path).delete(refresh=True) + + def get_es_comments(self): + """get comments from ES""" + response, statuscode = ElasticWrap(self.es_path).get() + if statuscode == 404: + print(f"comments: not found {self.youtube_id}") + return False + + return response.get("_source") + + def reindex_comments(self): + """update comments from youtube""" + self.check_config() + if not self.is_activated: + return + + self.build_json() + es_comments = self.get_es_comments() + + if not self.comments_format and es_comments["comment_comments"]: + # don't overwrite comments in es + return + + self.delete_comments() + self.upload_comments() diff --git a/tubearchivist/home/src/index/reindex.py b/tubearchivist/home/src/index/reindex.py index 
ac0c363..b8b89f0 100644 --- a/tubearchivist/home/src/index/reindex.py +++ b/tubearchivist/home/src/index/reindex.py @@ -16,6 +16,7 @@ from home.src.download.yt_dlp_base import CookieHandler from home.src.download.yt_dlp_handler import VideoDownloader from home.src.es.connect import ElasticWrap from home.src.index.channel import YoutubeChannel +from home.src.index.comments import Comments from home.src.index.playlist import YoutubePlaylist from home.src.index.video import YoutubeVideo from home.src.ta.config import AppConfig @@ -147,8 +148,7 @@ class Reindex: if integrate_ryd: self._get_unrated_vids() - @staticmethod - def _reindex_single_video(youtube_id): + def _reindex_single_video(self, youtube_id): """refresh data for single video""" video = YoutubeVideo(youtube_id) @@ -182,6 +182,8 @@ class Reindex: thumb_handler.delete_video_thumb() thumb_handler.download_video_thumb(video.json_data["vid_thumb_url"]) + Comments(youtube_id, config=self.config).reindex_comments() + return @staticmethod diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py index 9264080..7780c63 100644 --- a/tubearchivist/home/src/index/video.py +++ b/tubearchivist/home/src/index/video.py @@ -11,6 +11,7 @@ import requests from django.conf import settings from home.src.es.connect import ElasticWrap from home.src.index import channel as ta_channel +from home.src.index import comments as ta_comments from home.src.index import playlist as ta_playlist from home.src.index.generic import YouTubeItem from home.src.index.subtitle import YoutubeSubtitle @@ -302,6 +303,7 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): self.del_in_playlists() self.del_in_es() self.delete_subtitles() + self.delete_comments() def del_in_playlists(self): """remove downloaded in playlist""" @@ -326,6 +328,13 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle): print(f"{self.youtube_id}: delete subtitles") YoutubeSubtitle(self).delete(subtitles=subtitles) + def delete_comments(self): + 
"""delete comments from es""" + comments = ta_comments.Comments(self.youtube_id, config=self.config) + comments.check_config() + if comments.is_activated: + comments.delete_comments() + def _get_ryd_stats(self): """get optional stats from returnyoutubedislikeapi.com""" # pylint: disable=broad-except diff --git a/tubearchivist/home/templates/home/settings.html b/tubearchivist/home/templates/home/settings.html index 049f79c..b02e1bd 100644 --- a/tubearchivist/home/templates/home/settings.html +++ b/tubearchivist/home/templates/home/settings.html @@ -114,6 +114,24 @@ {{ app_form.downloads_subtitle_index }} +
+

Comments

+
+

Download and index comments: {{ config.downloads.comment_max }}
+ Follow the yt-dlp max_comments documentation. Enter a comma-separated list in the order max-comments,max-parents,max-replies,max-replies-per-thread:
+

Example configurations:

+
    +
  • all,100,all,30: Get 100 max-parents and 30 max-replies-per-thread.
  • +
  • 1000,all,all,50: Get a total of 1000 comments overall, with at most 50 replies per thread.
  • +
+ {{ app_form.downloads_comment_max }}

+
+
+

Selected comment sort method: {{ config.downloads.comment_sort }}
+ Select the sort order used when fetching comments from YouTube:
+ {{ app_form.downloads_comment_sort }}

+
+

Cookie

diff --git a/tubearchivist/home/templates/home/video.html b/tubearchivist/home/templates/home/video.html index 705f6c6..a5fc044 100644 --- a/tubearchivist/home/templates/home/video.html +++ b/tubearchivist/home/templates/home/video.html @@ -123,6 +123,14 @@
{% endfor %} {% endif %} + {% if video.comment_count %} +
+

Comments: {{video.comment_count}}

+
+
+
+ + {% endif %}