new channel parser, extractor lang, #build

Changed:
- Changed channel metadata extractor to yt-dlp
- Added channel tags
- Added extractor lang config
This commit is contained in:
simon 2023-05-07 10:02:54 +07:00
commit 0fef751ab5
12 changed files with 129 additions and 171 deletions

View File

@ -256,4 +256,4 @@ CORS_ALLOW_HEADERS = list(default_headers) + [
# TA application settings
TA_UPSTREAM = "https://github.com/tubearchivist/tubearchivist"
TA_VERSION = "v0.3.5"
TA_VERSION = "v0.3.6-unstable"

View File

@ -33,6 +33,7 @@
"comment_sort": "top",
"cookie_import": false,
"throttledratelimit": false,
"extractor_lang": false,
"integrate_ryd": false,
"integrate_sponsorblock": false
},

View File

@ -32,7 +32,8 @@ class ElasticBackup:
if not self.reason:
raise ValueError("missing backup reason in ElasticBackup")
self.task.send_progress(["Scanning your index."])
if self.task:
self.task.send_progress(["Scanning your index."])
for index in self.index_config:
index_name = index["index_name"]
print(f"backup: export in progress for {index_name}")
@ -42,7 +43,8 @@ class ElasticBackup:
self.backup_index(index_name)
self.task.send_progress(["Compress files to zip archive."])
if self.task:
self.task.send_progress(["Compress files to zip archive."])
self.zip_it()
if self.reason == "auto":
self.rotate_backup()

View File

@ -39,6 +39,16 @@
"channel_last_refresh": {
"type": "date"
},
"channel_tags": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"channel_overwrites": {
"properties": {
"download_format": {
@ -121,6 +131,16 @@
"channel_last_refresh": {
"type": "date"
},
"channel_tags": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"channel_overwrites": {
"properties": {
"download_format": {

View File

@ -122,6 +122,7 @@ class ApplicationSettingsForm(forms.Form):
downloads_autodelete_days = forms.IntegerField(required=False)
downloads_format = forms.CharField(required=False)
downloads_format_sort = forms.CharField(required=False)
downloads_extractor_lang = forms.CharField(required=False)
downloads_add_metadata = forms.ChoiceField(
widget=forms.Select, choices=METADATA_CHOICES, required=False
)

View File

@ -6,158 +6,15 @@ functionality:
import json
import os
import re
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from home.src.download import queue # partial import
from home.src.download.thumbnails import ThumbManager
from home.src.download.yt_dlp_base import YtWrap
from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.index.generic import YouTubeItem
from home.src.index.playlist import YoutubePlaylist
from home.src.ta.helper import clean_string, requests_headers
class ChannelScraper:
    """custom scraper using bs4 to scrape channel about page
    will be able to be integrated into yt-dlp
    once #2237 and #2350 are merged upstream

    Usage: ``ChannelScraper(channel_id).get_json()`` returns the channel
    dict, or False when youtube reports an alert for the channel.
    """

    def __init__(self, channel_id):
        self.channel_id = channel_id
        # all three are populated step by step from get_json()
        self.soup = False
        self.yt_json = False
        self.json_data = False

    def get_json(self):
        """main method to return channel dict

        Returns False instead of the dict when the page carries alerts.
        Raises ConnectionError when the page request fails and ValueError
        when the page has no ytInitialData payload.
        """
        self.get_soup()
        self._extract_yt_json()
        if self._is_deactivated():
            return False

        self._parse_channel_main()
        self._parse_channel_meta()
        return self.json_data

    def get_soup(self):
        """return soup from youtube

        Raises ConnectionError when the response status is not ok.
        """
        print(f"{self.channel_id}: scrape channel data from youtube")
        url = f"https://www.youtube.com/channel/{self.channel_id}/about?hl=en"
        cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"}
        response = requests.get(
            url, cookies=cookies, headers=requests_headers(), timeout=10
        )
        if not response.ok:
            print(f"{self.channel_id}: failed to extract channel info")
            raise ConnectionError

        self.soup = BeautifulSoup(response.text, "html.parser")

    def _extract_yt_json(self):
        """parse soup and get ytInitialData json

        Raises ValueError when no script tag carries the payload.
        """
        marker = "var ytInitialData = "
        all_scripts = self.soup.find("body").find_all("script")
        for script in all_scripts:
            script_content = str(script)
            if marker in script_content:
                break
        else:
            # previously fell through with an unbound local
            raise ValueError(
                f"{self.channel_id}: ytInitialData not found in channel page"
            )

        # extract payload: decode the first JSON value after the marker and
        # ignore whatever trails it (";</script>"), instead of trimming with
        # rstrip() which strips a character set, not a suffix
        payload = script_content.split(marker)[1].lstrip()
        self.yt_json, _ = json.JSONDecoder().raw_decode(payload)

    def _is_deactivated(self):
        """check if channel is deactivated

        Prints every alert text and returns True if any alerts exist.
        """
        alerts = self.yt_json.get("alerts")
        if not alerts:
            return False

        for alert in alerts:
            alert_text = alert["alertRenderer"]["text"]["simpleText"]
            print(f"{self.channel_id}: failed to extract, {alert_text}")

        return True

    def _parse_channel_main(self):
        """extract maintab values from scraped channel json data"""
        main_tab = self.yt_json["header"]["c4TabbedHeaderRenderer"]
        # build and return dict
        self.json_data = {
            "channel_active": True,
            "channel_last_refresh": int(datetime.now().timestamp()),
            "channel_subs": self._get_channel_subs(main_tab),
            "channel_name": main_tab["title"],
            "channel_banner_url": self._get_thumbnails(main_tab, "banner"),
            "channel_tvart_url": self._get_thumbnails(main_tab, "tvBanner"),
            "channel_id": self.channel_id,
            "channel_subscribed": False,
        }

    @staticmethod
    def _get_thumbnails(main_tab, thumb_name):
        """extract banner url from main_tab

        Returns the url of the widest thumbnail, or False when the entry is
        missing or its thumbnail list is empty.
        """
        try:
            all_banners = main_tab[thumb_name]["thumbnails"]
            banner = sorted(all_banners, key=lambda k: k["width"])[-1]["url"]
        except (KeyError, IndexError):
            banner = False

        return banner

    @staticmethod
    def _get_channel_subs(main_tab):
        """process main_tab to get channel subs as int

        Understands plain integers and K / M / B suffixed values such as
        "1.5K subscribers". Returns 0 when the count is missing or cannot
        be parsed.
        """
        try:
            sub_text_simple = main_tab["subscriberCountText"]["simpleText"]
        except KeyError:
            return 0

        sub_text = sub_text_simple.split(" ")[0]
        multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}
        # slicing instead of indexing keeps an empty string from raising
        multiplier = multipliers.get(sub_text[-1:], 1)
        try:
            if multiplier == 1:
                channel_subs = int(sub_text)
            else:
                # round() avoids float truncation, e.g. 1.13 * 1000
                channel_subs = round(float(sub_text[:-1]) * multiplier)
        except (ValueError, OverflowError):
            print(f"{sub_text} not dealt with")
            return 0

        if channel_subs < 0:
            print(f"{sub_text} not dealt with")
            return 0

        return channel_subs

    def _parse_channel_meta(self):
        """extract meta tab values from channel payload

        Adds description, thumb url and view count to self.json_data. The
        view count is 0 when the About tab or its view text is missing.
        """
        # meta tab
        meta_tab = self.yt_json["metadata"]["channelMetadataRenderer"]
        all_thumbs = meta_tab["avatar"]["thumbnails"]
        thumb_url = sorted(all_thumbs, key=lambda k: k["width"])[-1]["url"]
        # stats tab
        renderer = "twoColumnBrowseResultsRenderer"
        all_tabs = self.yt_json["contents"][renderer]["tabs"]
        # stays empty when no About tab is found, which maps to 0 views below
        about_tab = {}
        for tab in all_tabs:
            tab_renderer = tab.get("tabRenderer")
            if tab_renderer and tab_renderer.get("title") == "About":
                about_tab = tab_renderer["content"]["sectionListRenderer"][
                    "contents"
                ][0]["itemSectionRenderer"]["contents"][0][
                    "channelAboutFullMetadataRenderer"
                ]
                break

        try:
            channel_views_text = about_tab["viewCountText"]["simpleText"]
            channel_views = int(re.sub(r"\D", "", channel_views_text))
        except KeyError:
            channel_views = 0

        self.json_data.update(
            {
                "channel_description": meta_tab["description"],
                "channel_thumb_url": thumb_url,
                "channel_views": channel_views,
            }
        )
from home.src.ta.helper import clean_string
class YoutubeChannel(YouTubeItem):
@ -166,36 +23,94 @@ class YoutubeChannel(YouTubeItem):
es_path = False
index_name = "ta_channel"
yt_base = "https://www.youtube.com/channel/"
yt_obs = {
"extract_flat": True,
"allow_playlist_files": True,
}
def __init__(self, youtube_id, task=False):
super().__init__(youtube_id)
self.es_path = f"{self.index_name}/_doc/{youtube_id}"
self.all_playlists = False
self.task = task
def build_yt_url(self):
"""overwrite base to use channel about page"""
return f"{self.yt_base}{self.youtube_id}/about"
def build_json(self, upload=False, fallback=False):
"""get from es or from youtube"""
self.get_from_es()
if self.json_data:
return
self.get_from_youtube(fallback)
self.get_from_youtube()
if not self.youtube_meta and fallback:
self._video_fallback(fallback)
else:
self._process_youtube_meta()
self.get_channel_art()
if upload:
self.upload_to_es()
return
def get_from_youtube(self, fallback=False):
"""use bs4 to scrape channel about page"""
self.json_data = ChannelScraper(self.youtube_id).get_json()
def _process_youtube_meta(self):
"""extract relevant fields"""
self.youtube_meta["thumbnails"].reverse()
channel_subs = self.youtube_meta.get("channel_follower_count") or 0
self.json_data = {
"channel_active": True,
"channel_description": self.youtube_meta.get("description", False),
"channel_id": self.youtube_id,
"channel_last_refresh": int(datetime.now().timestamp()),
"channel_name": self.youtube_meta["uploader"],
"channel_subs": channel_subs,
"channel_subscribed": False,
"channel_tags": self._parse_tags(self.youtube_meta.get("tags")),
"channel_banner_url": self._get_banner_art(),
"channel_thumb_url": self._get_thumb_art(),
"channel_tvart_url": self._get_tv_art(),
"channel_views": self.youtube_meta.get("view_count", 0),
}
if not self.json_data and fallback:
self._video_fallback(fallback)
def _parse_tags(self, tags):
"""parse channel tags"""
if not tags:
return False
if not self.json_data:
return
joined = " ".join(tags)
return [i.strip() for i in joined.split('"') if i and not i == " "]
self.get_channel_art()
def _get_thumb_art(self):
"""extract thumb art"""
for i in self.youtube_meta["thumbnails"]:
if not i.get("width"):
continue
if i.get("width") == i.get("height"):
return i["url"]
return False
def _get_tv_art(self):
"""extract tv artwork"""
for i in self.youtube_meta["thumbnails"]:
if i.get("id") == "avatar_uncropped":
return i["url"]
if not i.get("width"):
continue
if i["width"] // i["height"] < 2 and not i["width"] == i["height"]:
return i["url"]
return False
def _get_banner_art(self):
"""extract banner artwork"""
for i in self.youtube_meta["thumbnails"]:
if not i.get("width"):
continue
if i["width"] // i["height"] > 5:
return i["url"]
return False
def _video_fallback(self, fallback):
"""use video metadata as fallback"""
@ -209,6 +124,7 @@ class YoutubeChannel(YouTubeItem):
"channel_tvart_url": False,
"channel_id": self.youtube_id,
"channel_subscribed": False,
"channel_tags": False,
"channel_description": False,
"channel_thumb_url": False,
"channel_views": 0,

View File

@ -15,8 +15,8 @@ class YouTubeItem:
"""base class for youtube"""
es_path = False
index_name = False
yt_base = False
index_name = ""
yt_base = ""
yt_obs = {
"skip_download": True,
"noplaylist": True,
@ -24,18 +24,27 @@ class YouTubeItem:
def __init__(self, youtube_id):
self.youtube_id = youtube_id
self.es_path = f"{self.index_name}/_doc/{youtube_id}"
self.config = AppConfig().config
self.app_conf = self.config["application"]
self.youtube_meta = False
self.json_data = False
def build_yt_url(self):
"""build youtube url"""
return self.yt_base + self.youtube_id
def get_from_youtube(self):
"""use yt-dlp to get meta data from youtube"""
print(f"{self.youtube_id}: get metadata from youtube")
url = self.yt_base + self.youtube_id
response = YtWrap(self.yt_obs, self.config).extract(url)
obs_request = self.yt_obs.copy()
if self.config["downloads"]["extractor_lang"]:
langs = self.config["downloads"]["extractor_lang"]
langs_list = [i.strip() for i in langs.split(",")]
obs_request["extractor_args"] = {"youtube": {"lang": langs_list}}
self.youtube_meta = response
url = self.build_yt_url()
self.youtube_meta = YtWrap(obs_request, self.config).extract(url)
def get_from_es(self):
"""get indexed data from elastic search"""

View File

@ -26,7 +26,6 @@ class YoutubePlaylist(YouTubeItem):
def __init__(self, youtube_id):
super().__init__(youtube_id)
self.es_path = f"{self.index_name}/_doc/{youtube_id}"
self.all_members = False
self.nav = False
self.all_youtube_ids = []

View File

@ -138,7 +138,6 @@ class YoutubeVideo(YouTubeItem, YoutubeSubtitle):
self.channel_id = False
self.video_overwrites = video_overwrites
self.video_type = video_type
self.es_path = f"{self.index_name}/_doc/{youtube_id}"
self.offline_import = False
def build_json(self, youtube_meta_overwrite=False, media_path=False):

View File

@ -81,11 +81,18 @@
<button onclick="textExpand()" id="text-expand-button">Show more</button>
</div>
{% endif %}
<div class="description-box">
<h2>Customize {{ channel_info.channel_name }}</h2>
</div>
{% if channel_info.channel_tags %}
<div class="description-box">
<div class="video-tag-box">
{% for tag in channel_info.channel_tags %}
<span class="video-tag">{{ tag }}</span>
{% endfor %}
</div>
</div>
{% endif %}
<div id="overwrite-form" class="info-box">
<div class="info-box-item">
<h2>Customize {{ channel_info.channel_name }}</h2>
<form class="overwrite-form" action="/channel/{{ channel_info.channel_id }}/about/" method="POST">
{% csrf_token %}
<div class="overwrite-form-item">

View File

@ -108,6 +108,11 @@
{{ app_form.downloads_format_sort }}
<br>
</div>
<div class="settings-item">
<p>Prefer translated metadata language: <span class="settings-current">{{ config.downloads.extractor_lang }}</span></p>
<i>This will change the language this video gets indexed as. That will only be available if the uploader provides translations. Add as two letter ISO language code, check the <a href="https://github.com/yt-dlp/yt-dlp#youtube" target="_blank">documentation</a> which languages are available.</i><br>
{{ app_form.downloads_extractor_lang}}
</div>
<div class="settings-item">
<p>Current metadata embed setting: <span class="settings-current">{{ config.downloads.add_metadata }}</span></p>
<i>Metadata is not embedded into the downloaded files by default.</i><br>

View File

@ -1,12 +1,11 @@
beautifulsoup4==4.12.2
celery==5.2.7
Django==4.2
Django==4.2.1
django-auth-ldap==4.3.0
django-cors-headers==3.14.0
djangorestframework==3.14.0
Pillow==9.5.0
redis==4.5.4
requests==2.29.0
requests==2.30.0
ryd-client==0.0.6
uWSGI==2.0.21
whitenoise==6.4.0