new channel parser, extractor lang, #build

Changed:
- Changed channel metadata extractor to yt-dlp
- Added channel tags
- Added extractor lang config
2023-05-07 10:02:54 +07:00 · 2023-05-07 10:02:54 +07:00 · 0fef751ab5
parent cf37800c2b 206921baf0
commit 0fef751ab5
12 changed files with 129 additions and 171 deletions
--- a/tubearchivist/config/settings.py
+++ b/tubearchivist/config/settings.py
@ -256,4 +256,4 @@ CORS_ALLOW_HEADERS = list(default_headers) + [

 # TA application settings
 TA_UPSTREAM = "https://github.com/tubearchivist/tubearchivist"
-TA_VERSION = "v0.3.5"
+TA_VERSION = "v0.3.6-unstable"
--- a/tubearchivist/home/config.json
+++ b/tubearchivist/home/config.json
@ -33,6 +33,7 @@
        "comment_sort": "top",
        "cookie_import": false,
        "throttledratelimit": false,
+        "extractor_lang": false,
        "integrate_ryd": false,
        "integrate_sponsorblock": false
    },
--- a/tubearchivist/home/src/es/backup.py
+++ b/tubearchivist/home/src/es/backup.py
@ -32,6 +32,7 @@ class ElasticBackup:
        if not self.reason:
            raise ValueError("missing backup reason in ElasticBackup")

+        if self.task:
            self.task.send_progress(["Scanning your index."])
        for index in self.index_config:
            index_name = index["index_name"]
@ -42,6 +43,7 @@ class ElasticBackup:

            self.backup_index(index_name)

+        if self.task:
            self.task.send_progress(["Compress files to zip archive."])
        self.zip_it()
        if self.reason == "auto":
--- a/tubearchivist/home/src/es/index_mapping.json
+++ b/tubearchivist/home/src/es/index_mapping.json
@ -39,6 +39,16 @@
                "channel_last_refresh": {
                    "type": "date"
                },
+                "channel_tags": {
+                    "type": "text",
+                    "analyzer": "english",
+                    "fields": {
+                        "keyword": {
+                            "type": "keyword",
+                            "ignore_above": 256
+                        }
+                    }
+                },
                "channel_overwrites": {
                    "properties": {
                        "download_format": {
@ -121,6 +131,16 @@
                        "channel_last_refresh": {
                            "type": "date"
                        },
+                        "channel_tags": {
+                            "type": "text",
+                            "analyzer": "english",
+                            "fields": {
+                                "keyword": {
+                                    "type": "keyword",
+                                    "ignore_above": 256
+                                }
+                            }
+                        },
                        "channel_overwrites": {
                            "properties": {
                                "download_format": {
--- a/tubearchivist/home/src/frontend/forms.py
+++ b/tubearchivist/home/src/frontend/forms.py
@ -122,6 +122,7 @@ class ApplicationSettingsForm(forms.Form):
    downloads_autodelete_days = forms.IntegerField(required=False)
    downloads_format = forms.CharField(required=False)
    downloads_format_sort = forms.CharField(required=False)
+    downloads_extractor_lang = forms.CharField(required=False)
    downloads_add_metadata = forms.ChoiceField(
        widget=forms.Select, choices=METADATA_CHOICES, required=False
    )
--- a/tubearchivist/home/src/index/channel.py
+++ b/tubearchivist/home/src/index/channel.py
@ -6,158 +6,15 @@ functionality:

 import json
 import os
-import re
 from datetime import datetime

-import requests
-from bs4 import BeautifulSoup
 from home.src.download import queue  # partial import
 from home.src.download.thumbnails import ThumbManager
 from home.src.download.yt_dlp_base import YtWrap
 from home.src.es.connect import ElasticWrap, IndexPaginate
 from home.src.index.generic import YouTubeItem
 from home.src.index.playlist import YoutubePlaylist
-from home.src.ta.helper import clean_string, requests_headers
-
-
-class ChannelScraper:
-    """custom scraper using bs4 to scrape channel about page
-    will be able to be integrated into yt-dlp
-    once #2237 and #2350 are merged upstream
-    """
-
-    def __init__(self, channel_id):
-        self.channel_id = channel_id
-        self.soup = False
-        self.yt_json = False
-        self.json_data = False
-
-    def get_json(self):
-        """main method to return channel dict"""
-        self.get_soup()
-        self._extract_yt_json()
-        if self._is_deactivated():
-            return False
-
-        self._parse_channel_main()
-        self._parse_channel_meta()
-        return self.json_data
-
-    def get_soup(self):
-        """return soup from youtube"""
-        print(f"{self.channel_id}: scrape channel data from youtube")
-        url = f"https://www.youtube.com/channel/{self.channel_id}/about?hl=en"
-        cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"}
-        response = requests.get(
-            url, cookies=cookies, headers=requests_headers(), timeout=10
-        )
-        if response.ok:
-            channel_page = response.text
-        else:
-            print(f"{self.channel_id}: failed to extract channel info")
-            raise ConnectionError
-        self.soup = BeautifulSoup(channel_page, "html.parser")
-
-    def _extract_yt_json(self):
-        """parse soup and get ytInitialData json"""
-        all_scripts = self.soup.find("body").find_all("script")
-        for script in all_scripts:
-            if "var ytInitialData = " in str(script):
-                script_content = str(script)
-                break
-        # extract payload
-        script_content = script_content.split("var ytInitialData = ")[1]
-        json_raw = script_content.rstrip(";</script>")
-        self.yt_json = json.loads(json_raw)
-
-    def _is_deactivated(self):
-        """check if channel is deactivated"""
-        alerts = self.yt_json.get("alerts")
-        if not alerts:
-            return False
-
-        for alert in alerts:
-            alert_text = alert["alertRenderer"]["text"]["simpleText"]
-            print(f"{self.channel_id}: failed to extract, {alert_text}")
-            return True
-
-    def _parse_channel_main(self):
-        """extract maintab values from scraped channel json data"""
-        main_tab = self.yt_json["header"]["c4TabbedHeaderRenderer"]
-        # build and return dict
-        self.json_data = {
-            "channel_active": True,
-            "channel_last_refresh": int(datetime.now().timestamp()),
-            "channel_subs": self._get_channel_subs(main_tab),
-            "channel_name": main_tab["title"],
-            "channel_banner_url": self._get_thumbnails(main_tab, "banner"),
-            "channel_tvart_url": self._get_thumbnails(main_tab, "tvBanner"),
-            "channel_id": self.channel_id,
-            "channel_subscribed": False,
-        }
-
-    @staticmethod
-    def _get_thumbnails(main_tab, thumb_name):
-        """extract banner url from main_tab"""
-        try:
-            all_banners = main_tab[thumb_name]["thumbnails"]
-            banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"]
-        except KeyError:
-            banner = False
-
-        return banner
-
-    @staticmethod
-    def _get_channel_subs(main_tab):
-        """process main_tab to get channel subs as int"""
-        try:
-            sub_text_simple = main_tab["subscriberCountText"]["simpleText"]
-            sub_text = sub_text_simple.split(" ")[0]
-            if sub_text[-1] == "K":
-                channel_subs = int(float(sub_text.replace("K", "")) * 1000)
-            elif sub_text[-1] == "M":
-                channel_subs = int(float(sub_text.replace("M", "")) * 1000000)
-            elif int(sub_text) >= 0:
-                channel_subs = int(sub_text)
-            else:
-                message = f"{sub_text} not dealt with"
-                print(message)
-        except KeyError:
-            channel_subs = 0
-
-        return channel_subs
-
-    def _parse_channel_meta(self):
-        """extract meta tab values from channel payload"""
-        # meta tab
-        meta_tab = self.yt_json["metadata"]["channelMetadataRenderer"]
-        all_thumbs = meta_tab["avatar"]["thumbnails"]
-        thumb_url = sorted(all_thumbs, key=lambda k: k["width"])[-1]["url"]
-        # stats tab
-        renderer = "twoColumnBrowseResultsRenderer"
-        all_tabs = self.yt_json["contents"][renderer]["tabs"]
-        for tab in all_tabs:
-            if "tabRenderer" in tab.keys():
-                if tab["tabRenderer"]["title"] == "About":
-                    about_tab = tab["tabRenderer"]["content"][
-                        "sectionListRenderer"
-                    ]["contents"][0]["itemSectionRenderer"]["contents"][0][
-                        "channelAboutFullMetadataRenderer"
-                    ]
-                    break
-        try:
-            channel_views_text = about_tab["viewCountText"]["simpleText"]
-            channel_views = int(re.sub(r"\D", "", channel_views_text))
-        except KeyError:
-            channel_views = 0
-
-        self.json_data.update(
-            {
-                "channel_description": meta_tab["description"],
-                "channel_thumb_url": thumb_url,
-                "channel_views": channel_views,
-            }
-        )
+from home.src.ta.helper import clean_string


 class YoutubeChannel(YouTubeItem):
@ -166,36 +23,94 @@ class YoutubeChannel(YouTubeItem):
    es_path = False
    index_name = "ta_channel"
    yt_base = "https://www.youtube.com/channel/"
+    yt_obs = {
+        "extract_flat": True,
+        "allow_playlist_files": True,
+    }

    def __init__(self, youtube_id, task=False):
        super().__init__(youtube_id)
-        self.es_path = f"{self.index_name}/_doc/{youtube_id}"
        self.all_playlists = False
        self.task = task

+    def build_yt_url(self):
+        """overwrite base to use channel about page"""
+        return f"{self.yt_base}{self.youtube_id}/about"
+
    def build_json(self, upload=False, fallback=False):
        """get from es or from youtube"""
        self.get_from_es()
        if self.json_data:
            return

-        self.get_from_youtube(fallback)
+        self.get_from_youtube()
+        if not self.youtube_meta and fallback:
+            self._video_fallback(fallback)
+        else:
+            self._process_youtube_meta()
+            self.get_channel_art()

        if upload:
            self.upload_to_es()
-        return

-    def get_from_youtube(self, fallback=False):
-        """use bs4 to scrape channel about page"""
-        self.json_data = ChannelScraper(self.youtube_id).get_json()
+    def _process_youtube_meta(self):
+        """extract relevant fields"""
+        self.youtube_meta["thumbnails"].reverse()
+        channel_subs = self.youtube_meta.get("channel_follower_count") or 0
+        self.json_data = {
+            "channel_active": True,
+            "channel_description": self.youtube_meta.get("description", False),
+            "channel_id": self.youtube_id,
+            "channel_last_refresh": int(datetime.now().timestamp()),
+            "channel_name": self.youtube_meta["uploader"],
+            "channel_subs": channel_subs,
+            "channel_subscribed": False,
+            "channel_tags": self._parse_tags(self.youtube_meta.get("tags")),
+            "channel_banner_url": self._get_banner_art(),
+            "channel_thumb_url": self._get_thumb_art(),
+            "channel_tvart_url": self._get_tv_art(),
+            "channel_views": self.youtube_meta.get("view_count", 0),
+        }

-        if not self.json_data and fallback:
-            self._video_fallback(fallback)
+    def _parse_tags(self, tags):
+        """parse channel tags"""
+        if not tags:
+            return False

-        if not self.json_data:
-            return
+        joined = " ".join(tags)
+        return [i.strip() for i in joined.split('"') if i and not i == " "]

-        self.get_channel_art()
+    def _get_thumb_art(self):
+        """extract thumb art"""
+        for i in self.youtube_meta["thumbnails"]:
+            if not i.get("width"):
+                continue
+            if i.get("width") == i.get("height"):
+                return i["url"]
+
+        return False
+
+    def _get_tv_art(self):
+        """extract tv artwork"""
+        for i in self.youtube_meta["thumbnails"]:
+            if i.get("id") == "avatar_uncropped":
+                return i["url"]
+            if not i.get("width"):
+                continue
+            if i["width"] // i["height"] < 2 and not i["width"] == i["height"]:
+                return i["url"]
+
+        return False
+
+    def _get_banner_art(self):
+        """extract banner artwork"""
+        for i in self.youtube_meta["thumbnails"]:
+            if not i.get("width"):
+                continue
+            if i["width"] // i["height"] > 5:
+                return i["url"]
+
+        return False

    def _video_fallback(self, fallback):
        """use video metadata as fallback"""
@ -209,6 +124,7 @@ class YoutubeChannel(YouTubeItem):
            "channel_tvart_url": False,
            "channel_id": self.youtube_id,
            "channel_subscribed": False,
+            "channel_tags": False,
            "channel_description": False,
            "channel_thumb_url": False,
            "channel_views": 0,
--- a/tubearchivist/home/src/index/generic.py
+++ b/tubearchivist/home/src/index/generic.py
@ -15,8 +15,8 @@ class YouTubeItem:
    """base class for youtube"""

    es_path = False
-    index_name = False
-    yt_base = False
+    index_name = ""
+    yt_base = ""
    yt_obs = {
        "skip_download": True,
        "noplaylist": True,
@ -24,18 +24,27 @@ class YouTubeItem:

    def __init__(self, youtube_id):
        self.youtube_id = youtube_id
+        self.es_path = f"{self.index_name}/_doc/{youtube_id}"
        self.config = AppConfig().config
        self.app_conf = self.config["application"]
        self.youtube_meta = False
        self.json_data = False

+    def build_yt_url(self):
+        """build youtube url"""
+        return self.yt_base + self.youtube_id
+
    def get_from_youtube(self):
        """use yt-dlp to get meta data from youtube"""
        print(f"{self.youtube_id}: get metadata from youtube")
-        url = self.yt_base + self.youtube_id
-        response = YtWrap(self.yt_obs, self.config).extract(url)
+        obs_request = self.yt_obs.copy()
+        if self.config["downloads"]["extractor_lang"]:
+            langs = self.config["downloads"]["extractor_lang"]
+            langs_list = [i.strip() for i in langs.split(",")]
+            obs_request["extractor_args"] = {"youtube": {"lang": langs_list}}

-        self.youtube_meta = response
+        url = self.build_yt_url()
+        self.youtube_meta = YtWrap(obs_request, self.config).extract(url)

    def get_from_es(self):
        """get indexed data from elastic search"""
--- a/tubearchivist/home/src/index/playlist.py
+++ b/tubearchivist/home/src/index/playlist.py
@ -26,7 +26,6 @@ class YoutubePlaylist(YouTubeItem):

    def __init__(self, youtube_id):
        super().__init__(youtube_id)
-        self.es_path = f"{self.index_name}/_doc/{youtube_id}"
        self.all_members = False
        self.nav = False
        self.all_youtube_ids = []
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@ -138,7 +138,6 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
        self.channel_id = False
        self.video_overwrites = video_overwrites
        self.video_type = video_type
-        self.es_path = f"{self.index_name}/_doc/{youtube_id}"
        self.offline_import = False

    def build_json(self, youtube_meta_overwrite=False, media_path=False):
--- a/tubearchivist/home/templates/home/channel_id_about.html
+++ b/tubearchivist/home/templates/home/channel_id_about.html
@ -81,11 +81,18 @@
            <button onclick="textExpand()" id="text-expand-button">Show more</button>
        </div>
    {% endif %}
+    {% if channel_info.channel_tags %}
        <div class="description-box">
-        <h2>Customize {{ channel_info.channel_name }}</h2>
+            <div class="video-tag-box">
+                {% for tag in channel_info.channel_tags %}
+                    <span class="video-tag">{{ tag }}</span>
+                {% endfor %}
            </div>
+        </div>
+    {% endif %}
    <div id="overwrite-form" class="info-box">
        <div class="info-box-item">
+            <h2>Customize {{ channel_info.channel_name }}</h2>
            <form class="overwrite-form" action="/channel/{{ channel_info.channel_id }}/about/" method="POST">
                {% csrf_token %}
                <div class="overwrite-form-item">
--- a/tubearchivist/home/templates/home/settings.html
+++ b/tubearchivist/home/templates/home/settings.html
@ -108,6 +108,11 @@
                {{ app_form.downloads_format_sort }}
                <br>
            </div>
+            <div class="settings-item">
+                <p>Prefer translated metadata language: <span class="settings-current">{{ config.downloads.extractor_lang }}</span></p>
+                <i>This will change the language this video gets indexed as. That will only be available if the uploader provides translations. Add as two letter ISO language code, check the <a href="https://github.com/yt-dlp/yt-dlp#youtube" target="_blank">documentation</a> which languages are available.</i><br>
+                {{ app_form.downloads_extractor_lang}}
+            </div>
            <div class="settings-item">
                <p>Current metadata embed setting: <span class="settings-current">{{ config.downloads.add_metadata }}</span></p>
                <i>Metadata is not embedded into the downloaded files by default.</i><br>
--- a/tubearchivist/requirements.txt
+++ b/tubearchivist/requirements.txt
@ -1,12 +1,11 @@
-beautifulsoup4==4.12.2
 celery==5.2.7
-Django==4.2
+Django==4.2.1
 django-auth-ldap==4.3.0
 django-cors-headers==3.14.0
 djangorestframework==3.14.0
 Pillow==9.5.0
 redis==4.5.4
-requests==2.29.0
+requests==2.30.0
 ryd-client==0.0.6
 uWSGI==2.0.21
 whitenoise==6.4.0