From 0e8d3b76c180adfd0c64b8edf016d0ac0bb13938 Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 8 Nov 2022 08:19:09 +0700 Subject: [PATCH 01/17] better wording for scheduler frequency, #358 --- docs/Settings.md | 2 +- tubearchivist/home/templates/home/settings.html | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/Settings.md b/docs/Settings.md index 72e69e7..0bc920d 100644 --- a/docs/Settings.md +++ b/docs/Settings.md @@ -102,7 +102,7 @@ Examples: NOTE: - Changes in the scheduler settings require a container restart to take effect. - Cron format as *number*/*number* are none standard cron and are not supported by the scheduler, for example **0 0/12 \*** is invalid, use **0 \*/12 \*** instead. -- Avoid an unnecessary frequent schedule to not get blocked by YouTube. For that reason * or wildcards for minutes are not supported. +- Avoid an unnecessary frequent schedule to not get blocked by YouTube. For that reason, the scheduler doesn't support schedules that trigger more than once per hour. ## Rescan Subscriptions That's the equivalent task as run from the downloads page looking through your channel and playlist and add missing videos to the download queue. diff --git a/tubearchivist/home/templates/home/settings.html b/tubearchivist/home/templates/home/settings.html index dd94879..049f79c 100644 --- a/tubearchivist/home/templates/home/settings.html +++ b/tubearchivist/home/templates/home/settings.html @@ -186,7 +186,7 @@

Note:

From e912cf7fb62941d87420dc47ccf96a312c70bba6 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 10 Nov 2022 09:53:49 +0700 Subject: [PATCH 02/17] rename to *common errors*, better issue template --- .github/ISSUE_TEMPLATE/BUG-REPORT.yml | 4 +++- .github/ISSUE_TEMPLATE/INSTALLATION-HELP.yml | 2 +- README.md | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/BUG-REPORT.yml b/.github/ISSUE_TEMPLATE/BUG-REPORT.yml index dd0b723..b9f051a 100644 --- a/.github/ISSUE_TEMPLATE/BUG-REPORT.yml +++ b/.github/ISSUE_TEMPLATE/BUG-REPORT.yml @@ -11,10 +11,12 @@ body: - type: checkboxes id: latest attributes: - label: Latest and Greatest + label: I've read the documentation: options: - label: I'm running the latest version of Tube Archivist and have read the [release notes](https://github.com/tubearchivist/tubearchivist/releases/latest). required: true + - label: I have read through the [wiki](https://github.com/tubearchivist/tubearchivist/wiki) and the [readme](https://github.com/tubearchivist/tubearchivist#installing-and-updating), particularly the [common errors](https://github.com/tubearchivist/tubearchivist#common-errors) section. + required: true - type: input id: os diff --git a/.github/ISSUE_TEMPLATE/INSTALLATION-HELP.yml b/.github/ISSUE_TEMPLATE/INSTALLATION-HELP.yml index 8459327..07519dc 100644 --- a/.github/ISSUE_TEMPLATE/INSTALLATION-HELP.yml +++ b/.github/ISSUE_TEMPLATE/INSTALLATION-HELP.yml @@ -15,7 +15,7 @@ body: options: - label: I have read and understand the [installation instructions](https://github.com/tubearchivist/tubearchivist#installing-and-updating). required: true - - label: My issue is not described in the [potential pitfalls](https://github.com/tubearchivist/tubearchivist#potential-pitfalls) section. + - label: My issue is not described in the [common errors](https://github.com/tubearchivist/tubearchivist#common-errors) section. required: true - type: input diff --git a/README.md b/README.md index 4a406d7..bf029fd 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ * [Extended Universe](#extended-universe) * [Installing and updating](#installing-and-updating) * [Getting Started](#getting-started) -* [Potential pitfalls](#potential-pitfalls) +* [Common Errors](#common-errors) * [Roadmap](#roadmap) * [Known limitations](#known-limitations) * [Donate](#donate) @@ -165,7 +165,7 @@ You will see the current version number of **Tube Archivist** in the footer of t There is a Helm Chart available at https://github.com/insuusvenerati/helm-charts. Mostly self-explanatory but feel free to ask questions in the discord / subreddit. -## Potential pitfalls +## Common Errors ### vm.max_map_count **Elastic Search** in Docker requires the kernel setting of the host machine `vm.max_map_count` to be set to at least 262144. From 5b7e3e877b37750a2d9dffe85ac22268b96ce4ee Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 12 Nov 2022 11:48:48 +0700 Subject: [PATCH 03/17] implement basic comment archive --- tubearchivist/home/src/es/index_mapping.json | 66 ++++++++++++ tubearchivist/home/src/index/comments.py | 101 +++++++++++++++++++ 2 files changed, 167 insertions(+) create mode 100644 tubearchivist/home/src/index/comments.py diff --git a/tubearchivist/home/src/es/index_mapping.json b/tubearchivist/home/src/es/index_mapping.json index f023eef..bed6973 100644 --- a/tubearchivist/home/src/es/index_mapping.json +++ b/tubearchivist/home/src/es/index_mapping.json @@ -460,6 +460,72 @@ }, "number_of_replicas": "0" } + }, + { + "index_name": "comments", + "expected_map": { + "youtube_id": { + "type": "keyword" + }, + "comment_last_refresh": { + "type": "date" + }, + "comment_comments": { + "properties": { + "comment_id": { + "type": "keyword" + }, + "comment_text": { + "type" : "text" + }, + "comment_timestamp": { + "type": "date" + }, + "comment_time_text": { + "type" : "text" + }, + "comment_likecount": { + "type": "long" + }, + "comment_is_favorited": { + "type": "boolean" + }, + "comment_author": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256, + "normalizer": "to_lower" + } + } + }, + "comment_author_id": { + "type": "keyword" + }, + "comment_author_thumbnail": { + "type": "keyword" + }, + "comment_author_is_uploader": { + "type": "boolean" + }, + "comment_parent": { + "type": "keyword" + } + } + } + }, + "expected_set": { + "analysis": { + "normalizer": { + "to_lower": { + "type": "custom", + "filter": ["lowercase"] + } + } + }, + "number_of_replicas": "0" + } } ] } \ No newline at end of file diff --git a/tubearchivist/home/src/index/comments.py b/tubearchivist/home/src/index/comments.py new file mode 100644 index 0000000..05f9a6e --- /dev/null +++ b/tubearchivist/home/src/index/comments.py @@ -0,0 +1,101 @@ +""" +Functionality: +- Download comments +- Index comments in ES +- Retrieve comments from ES +""" + +from datetime import datetime + +from home.src.download.yt_dlp_base import YtWrap +from home.src.es.connect import ElasticWrap + + +class Comments: + """hold all comments functionality""" + + def __init__(self, youtube_id): + self.youtube_id = youtube_id + self.es_path = f"ta_comments/_doc/{youtube_id}" + self.max_comments = "all,100,all,30" + self.json_data = False + + def build_json(self): + """build json document for es""" + comments_raw = self.get_comments() + comments_format = self.format_comments(comments_raw) + + self.json_data = { + "youtube_id": self.youtube_id, + "comment_last_refresh": int(datetime.now().strftime("%s")), + "comment_comments": comments_format, + } + + def build_yt_obs(self): + """ + get extractor config + max-comments,max-parents,max-replies,max-replies-per-thread + """ + max_comments_list = [i.strip() for i in self.max_comments.split(",")] + comment_sort = "top" + + yt_obs = { + "skip_download": True, + "quiet": False, + "getcomments": True, + "extractor_args": { + "youtube": { + "max_comments": max_comments_list, + "comment_sort": [comment_sort], + } + }, + } + + return yt_obs + + def get_comments(self): + """get comments from youtube""" + print(f"comments: get comments with format {self.max_comments}") + yt_obs = self.build_yt_obs() + info_json = YtWrap(yt_obs).extract(self.youtube_id) + comments_raw = info_json.get("comments") + return comments_raw + + def format_comments(self, comments_raw): + """process comments to match format""" + comments = [] + + for comment in comments_raw: + cleaned_comment = self.clean_comment(comment) + comments.append(cleaned_comment) + + return comments + + def clean_comment(self, comment): + """parse metadata from comment for indexing""" + time_text_datetime = datetime.utcfromtimestamp(comment["timestamp"]) + time_text = time_text_datetime.strftime("%Y-%m-%d %H:%M:%S") + + cleaned_comment = { + "comment_id": comment["id"], + "comment_text": comment["text"].replace("\xa0", ""), + "comment_timestamp": comment["timestamp"], + "comment_time_text": time_text, + "comment_likecount": comment["like_count"], + "comment_is_favorited": comment["is_favorited"], + "comment_author": comment["author"], + "comment_author_id": comment["author_id"], + "comment_author_thumbnail": comment["author_thumbnail"], + "comment_author_is_uploader": comment["author_is_uploader"], + "comment_parent": comment["parent"], + } + + return cleaned_comment + + def upload_comments(self): + """upload comments to es""" + _, _ = ElasticWrap(self.es_path).put(self.json_data) + + def delete_comments(self): + """delete comments from es""" + _, _ = ElasticWrap(self.es_path).delete() From f6b6185fb213bc055abf9fdd1f32e2f008c3a94a Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 12 Nov 2022 12:40:14 +0700 Subject: [PATCH 04/17] rename ta_comment index, implement get comments from es --- tubearchivist/home/src/es/index_mapping.json | 2 +- tubearchivist/home/src/index/comments.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/tubearchivist/home/src/es/index_mapping.json b/tubearchivist/home/src/es/index_mapping.json index bed6973..4a325ea 100644 --- a/tubearchivist/home/src/es/index_mapping.json +++ b/tubearchivist/home/src/es/index_mapping.json @@ -462,7 +462,7 @@ } }, { - "index_name": "comments", + "index_name": "comment", "expected_map": { "youtube_id": { "type": "keyword" diff --git a/tubearchivist/home/src/index/comments.py b/tubearchivist/home/src/index/comments.py index 05f9a6e..a7423ee 100644 --- a/tubearchivist/home/src/index/comments.py +++ b/tubearchivist/home/src/index/comments.py @@ -16,13 +16,13 @@ class Comments: def __init__(self, youtube_id): self.youtube_id = youtube_id - self.es_path = f"ta_comments/_doc/{youtube_id}" + self.es_path = f"ta_comment/_doc/{youtube_id}" self.max_comments = "all,100,all,30" self.json_data = False def build_json(self): """build json document for es""" - comments_raw = self.get_comments() + comments_raw = self.get_yt_comments() comments_format = self.format_comments(comments_raw) self.json_data = { @@ -53,7 +53,7 @@ class Comments: return yt_obs - def get_comments(self): + def get_yt_comments(self): """get comments from youtube""" print(f"comments: get comments with format {self.max_comments}") yt_obs = self.build_yt_obs() @@ -99,3 +99,12 @@ class Comments: def delete_comments(self): """delete comments from es""" _, _ = ElasticWrap(self.es_path).delete() + + def get_es_comments(self): + """get comments from ES""" + response, statuscode = ElasticWrap(self.es_path).get() + if statuscode == 404: + print(f"comments: not found {self.youtube_id}") + return False + + return response From fb046bed5bd1f897eddde5e4a4db2a5b2c59d6bd Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 12 Nov 2022 12:42:08 +0700 Subject: [PATCH 05/17] [API] implement get comments API view --- tubearchivist/api/README.md | 4 ++++ tubearchivist/api/src/search_processor.py | 16 ++++++++++++++++ tubearchivist/api/urls.py | 6 ++++++ tubearchivist/api/views.py | 16 ++++++++++++++++ 4 files changed, 42 insertions(+) diff --git a/tubearchivist/api/README.md b/tubearchivist/api/README.md index 6e5efb4..593089d 100644 --- a/tubearchivist/api/README.md +++ b/tubearchivist/api/README.md @@ -12,6 +12,7 @@ Note: **Video** - [Video List](#video-list-view) - [Video Single](#video-item-view) +- [Video Comments](#video-comment-view) - [Video Single Progress](#video-progress-view) - [Video Single Sponsorblock](#sponsor-block-view) WIP @@ -78,6 +79,9 @@ Pass page number as a query parameter: `page=2`. Defaults to *0*, `page=1` is re ## Video Item View /api/video/\/ +## Video Comment View +/api/video/\/comment/ + ## Video Progress View /api/video/\/progress diff --git a/tubearchivist/api/src/search_processor.py b/tubearchivist/api/src/search_processor.py index 7a41163..7b594f6 100644 --- a/tubearchivist/api/src/search_processor.py +++ b/tubearchivist/api/src/search_processor.py @@ -48,6 +48,8 @@ class SearchProcess: processed = self._process_playlist(result["_source"]) if index == "ta_download": processed = self._process_download(result["_source"]) + if index == "ta_comment": + processed = self._process_comment(result["_source"]) return processed @@ -123,3 +125,17 @@ class SearchProcess: } ) return dict(sorted(download_dict.items())) + + def _process_comment(self, comment_dict): + """run on all comments, create reply thread""" + all_comments = comment_dict["comment_comments"] + processed_comments = [] + + for comment in all_comments: + if comment["comment_parent"] == "root": + comment.update({"comment_replies": []}) + processed_comments.append(comment) + else: + processed_comments[-1]["comment_replies"].append(comment) + + return processed_comments diff --git a/tubearchivist/api/urls.py b/tubearchivist/api/urls.py index e84ec05..7fb48a0 100644 --- a/tubearchivist/api/urls.py +++ b/tubearchivist/api/urls.py @@ -18,6 +18,7 @@ from api.views import ( TaskApiView, VideoApiListView, VideoApiView, + VideoCommentView, VideoProgressView, VideoSponsorView, ) @@ -41,6 +42,11 @@ urlpatterns = [ VideoProgressView.as_view(), name="api-video-progress", ), + path( + "video//comment/", + VideoCommentView.as_view(), + name="api-video-comment", + ), path( "video//sponsor/", VideoSponsorView.as_view(), diff --git a/tubearchivist/api/views.py b/tubearchivist/api/views.py index 2574db6..dab62e3 100644 --- a/tubearchivist/api/views.py +++ b/tubearchivist/api/views.py @@ -145,6 +145,22 @@ class VideoProgressView(ApiBaseView): return Response(self.response) +class VideoCommentView(ApiBaseView): + """resolves to /api/video//comment/ + handle video comments + GET: return all comments from video with reply threads + """ + + search_base = "ta_comment/_doc/" + + def get(self, request, video_id): + """get video comments""" + # pylint: disable=unused-argument + self.get_document(video_id) + + return Response(self.response, status=self.status_code) + + class VideoSponsorView(ApiBaseView): """resolves to /api/video//sponsor/ handle sponsor block integration From 79d134cea895c7a77d73c0a6a794c89a38e601c8 Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 14 Nov 2022 11:59:59 +0700 Subject: [PATCH 06/17] clean format comments date string --- tubearchivist/home/src/index/comments.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tubearchivist/home/src/index/comments.py b/tubearchivist/home/src/index/comments.py index a7423ee..e06b973 100644 --- a/tubearchivist/home/src/index/comments.py +++ b/tubearchivist/home/src/index/comments.py @@ -74,7 +74,13 @@ class Comments: def clean_comment(self, comment): """parse metadata from comment for indexing""" time_text_datetime = datetime.utcfromtimestamp(comment["timestamp"]) - time_text = time_text_datetime.strftime("%Y-%m-%d %H:%M:%S") + + if time_text_datetime.hour == 0 and time_text_datetime.minute == 0: + format_string = "%Y-%m-%d" + else: + format_string = "%Y-%m-%d %H:%M" + + time_text = time_text_datetime.strftime(format_string) cleaned_comment = { "comment_id": comment["id"], From ad016df10504346b8733e7bc6c047d422a8c103d Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 14 Nov 2022 16:31:49 +0700 Subject: [PATCH 07/17] implement comment frontend --- tubearchivist/home/templates/home/video.html | 5 ++ tubearchivist/static/css/style.css | 37 +++++++++- tubearchivist/static/img/icon-heart.svg | 8 ++ tubearchivist/static/script.js | 77 +++++++++++++++++++- 4 files changed, 125 insertions(+), 2 deletions(-) create mode 100644 tubearchivist/static/img/icon-heart.svg diff --git a/tubearchivist/home/templates/home/video.html b/tubearchivist/home/templates/home/video.html index 705f6c6..df53a71 100644 --- a/tubearchivist/home/templates/home/video.html +++ b/tubearchivist/home/templates/home/video.html @@ -123,6 +123,11 @@ {% endfor %} {% endif %} +
+

Comments

+
+
+
+ {% endif %}