rewrite url_str extractor to convert channel names into channel ids, #40

This commit is contained in:
simon 2021-10-31 16:04:28 +07:00
parent c92800701a
commit 1ba3090db1
5 changed files with 90 additions and 32 deletions

View File

@ -10,9 +10,9 @@ Accessible at `/channel/` of your Tube Archivist, the **Overview Page** shows a
The **Subscribe to Channels** button <img src="assets/icon-add.png?raw=true" alt="add icon" width="20px" style="margin:0 5px;"> opens a text field to subscribe to a channel. You have a few options: The **Subscribe to Channels** button <img src="assets/icon-add.png?raw=true" alt="add icon" width="20px" style="margin:0 5px;"> opens a text field to subscribe to a channel. You have a few options:
- Enter the YouTube channel ID, a 24 character alphanumeric string. For example *UCBa659QWEk1AI4Tg--mrJ2A* - Enter the YouTube channel ID, a 24 character alphanumeric string. For example *UCBa659QWEk1AI4Tg--mrJ2A*
- Enter the URL to the channel page on YouTube. For example *https://www.youtube.com/channel/UCBa659QWEk1AI4Tg--mrJ2A* - Enter the URL to the channel page on YouTube. For example *https://www.youtube.com/channel/UCBa659QWEk1AI4Tg--mrJ2A*
- Enter the URL to a channel name page on YouTube. For example *https://www.youtube.com/c/TomScottGo*
- Enter the video URL for any video and let Tube Archivist extract the channel ID for you. For example *https://www.youtube.com/watch?v=2tdiKTSdE9Y* - Enter the video URL for any video and let Tube Archivist extract the channel ID for you. For example *https://www.youtube.com/watch?v=2tdiKTSdE9Y*
- Add one per line. - Add one per line.
- **Note**: Adding a link to a YouTube channel name is not yet supported, for example: *https://www.youtube.com/c/TomScottGo* will fail.
The search icon <img src="assets/icon-search.png?raw=true" alt="search icon" width="20px" style="margin:0 5px;"> opens a text box to search for indexed channel names. Possible matches will show as you type. The search icon <img src="assets/icon-search.png?raw=true" alt="search icon" width="20px" style="margin:0 5px;"> opens a text box to search for indexed channel names. Possible matches will show as you type.

View File

@ -18,9 +18,9 @@ The **Add to Download Queue** icon <img src="assets/icon-add.png?raw=true" alt="
- Add a YouTube video ID. For example *2tdiKTSdE9Y*. - Add a YouTube video ID. For example *2tdiKTSdE9Y*.
- Add a link to a YouTube video by providing the shortened URL, for example *https://youtu.be/2tdiKTSdE9Y*. - Add a link to a YouTube video by providing the shortened URL, for example *https://youtu.be/2tdiKTSdE9Y*.
- Add a Channel ID or Channel URL to add every available video to the download queue. This will ignore the channel page size as described before and is meant for an initial download of the whole channel. You can still ignore selected videos before starting the download. - Add a Channel ID or Channel URL to add every available video to the download queue. This will ignore the channel page size as described before and is meant for an initial download of the whole channel. You can still ignore selected videos before starting the download.
- Add a link to a YouTube channel name, for example *https://www.youtube.com/c/TomScottGo*.
- Add a playlist ID or URL to add every available video in the list to the download queue, for example *https://www.youtube.com/playlist?list=PL96C35uN7xGLLeET0dOWaKHkAlPsrkcha* or *PL96C35uN7xGLLeET0dOWaKHkAlPsrkcha*. Note that when you add a link to a video in a playlist, Tube Archivist assumes you want to download only the specific video and not the whole playlist, for example *https://www.youtube.com/watch?v=CINVwWHlzTY&list=PL96C35uN7xGLLeET0dOWaKHkAlPsrkcha* will only add one video *CINVwWHlzTY* to the queue. - Add a playlist ID or URL to add every available video in the list to the download queue, for example *https://www.youtube.com/playlist?list=PL96C35uN7xGLLeET0dOWaKHkAlPsrkcha* or *PL96C35uN7xGLLeET0dOWaKHkAlPsrkcha*. Note that when you add a link to a video in a playlist, Tube Archivist assumes you want to download only the specific video and not the whole playlist, for example *https://www.youtube.com/watch?v=CINVwWHlzTY&list=PL96C35uN7xGLLeET0dOWaKHkAlPsrkcha* will only add one video *CINVwWHlzTY* to the queue.
- Add one link per line. - Add one link per line.
- **Note**: Adding a link to a YouTube channel name is not yet supported, for example: *https://www.youtube.com/c/TomScottGo* will fail.
## The Download Queue ## The Download Queue
Below the three buttons you find the download queue. New items will get added at the bottom of the queue, the next video to download once you click on **Start Download** will be the first in the list. Below the three buttons you find the download queue. New items will get added at the bottom of the queue, the next video to download once you click on **Start Download** will be the first in the list.

View File

@ -9,9 +9,11 @@ import re
import string import string
import subprocess import subprocess
import unicodedata import unicodedata
from urllib.parse import parse_qs, urlparse
import redis import redis
import requests import requests
import yt_dlp as youtube_dl
def get_total_hits(index, es_url, es_auth, match_field): def get_total_hits(index, es_url, es_auth, match_field):
@ -51,34 +53,91 @@ def ignore_filelist(filelist):
return cleaned return cleaned
def process_url_list(url_str): class UrlListParser:
"""parse url_list to find valid youtube video or channel ids""" """take a multi line string and detect valid youtube ids"""
to_replace = ["watch?v=", "playlist?list="]
url_list = re.split("\n+", url_str[0]) def __init__(self, url_str):
self.url_list = [i.strip() for i in url_str.split()]
def process_list(self):
"""loop through the list"""
youtube_ids = [] youtube_ids = []
for url in url_list: for url in self.url_list:
if "/c/" in url or "/user/" in url: parsed = urlparse(url)
raise ValueError("user name is not unique, use channel ID") print(f"processing: {url}")
print(parsed)
url_clean = url.strip().strip("/").split("/")[-1] if not parsed.netloc:
for i in to_replace: # is not a url
url_clean = url_clean.replace(i, "") id_type = self.find_valid_id(url)
url_no_param = url_clean.split("&")[0] youtube_id = url
str_len = len(url_no_param) elif parsed.path:
if str_len == 11: # is a url
link_type = "video" youtube_id, id_type = self.detect_from_url(parsed)
elif str_len == 24:
link_type = "channel"
elif str_len == 34:
link_type = "playlist"
else: else:
# unable to parse # not detected
raise ValueError("not a valid url: " + url) raise ValueError(f"failed to detect {url}")
youtube_ids.append({"url": url_no_param, "type": link_type}) youtube_ids.append({"url": youtube_id, "type": id_type})
return youtube_ids return youtube_ids
def detect_from_url(self, parsed):
    """resolve a parsed url into a (youtube_id, id_type) tuple"""
    if parsed.netloc == "youtu.be":
        # shortened link, the path itself is the video id
        return parsed.path.strip("/"), "video"

    # query string is checked before the path, v= wins over list=
    query = parse_qs(parsed.query) if parsed.query else {}
    for key, id_type in (("v", "video"), ("list", "playlist")):
        if key in query:
            return query[key][0], id_type

    if parsed.path.startswith("/channel/"):
        # channel id is the path segment right after /channel/
        return parsed.path.split("/")[2], "channel"

    # everything else: detect channel with yt_dlp
    return self.extract_channel_name(parsed.geturl()), "channel"
@staticmethod
def find_valid_id(id_str):
"""dedect valid id from length of string"""
str_len = len(id_str)
if str_len == 11:
id_type = "video"
elif str_len == 24:
id_type = "channel"
elif str_len == 34:
id_type = "playlist"
else:
# unable to parse
raise ValueError("not a valid id_str: " + id_str)
return id_type
@staticmethod
def extract_channel_name(url):
    """find channel id from channel name with yt-dlp help

    returns the "channel_id" value of the info yt-dlp extracts for url,
    raises ValueError if the result is empty or has no "channel_id", so a
    failed lookup surfaces as the same error type the parser raises for
    any other input it can not detect
    """
    obs = {
        "default_search": "ytsearch",
        "quiet": True,
        "skip_download": True,
        "extract_flat": True,
        "playlistend": 0,
    }
    url_info = youtube_dl.YoutubeDL(obs).extract_info(url, download=False)
    try:
        channel_id = url_info["channel_id"]
    except (KeyError, TypeError) as error:
        # KeyError: result without channel_id, TypeError: result is None
        raise ValueError(
            f"failed to extract channel id from {url}"
        ) from error

    return channel_id
class RedisArchivist: class RedisArchivist:
"""collection of methods to interact with redis""" """collection of methods to interact with redis"""

View File

@ -15,7 +15,7 @@ import requests
import yt_dlp as youtube_dl import yt_dlp as youtube_dl
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from home.src.config import AppConfig from home.src.config import AppConfig
from home.src.helper import DurationConverter, clean_string, process_url_list from home.src.helper import DurationConverter, UrlListParser, clean_string
from home.src.thumbnails import ThumbManager from home.src.thumbnails import ThumbManager
@ -446,9 +446,8 @@ class WatchState:
def dedect_type(self): def dedect_type(self):
"""find youtube id type""" """find youtube id type"""
url_process = process_url_list([self.youtube_id]) url_process = UrlListParser(self.youtube_id).process_list()
url_type = url_process[0]["type"] url_type = url_process[0]["type"]
return url_type return url_type
def mark_vid_watched(self, revert=False): def mark_vid_watched(self, revert=False):

View File

@ -26,7 +26,7 @@ from home.forms import (
) )
from home.src.config import AppConfig from home.src.config import AppConfig
from home.src.download import ChannelSubscription, PendingList from home.src.download import ChannelSubscription, PendingList
from home.src.helper import RedisArchivist, RedisQueue, process_url_list from home.src.helper import RedisArchivist, RedisQueue, UrlListParser
from home.src.index import WatchState, YoutubeChannel, YoutubeVideo from home.src.index import WatchState, YoutubeChannel, YoutubeVideo
from home.src.searching import Pagination, SearchForm, SearchHandler from home.src.searching import Pagination, SearchForm, SearchHandler
from home.tasks import ( from home.tasks import (
@ -303,13 +303,13 @@ class DownloadView(View):
"""handle post requests""" """handle post requests"""
to_queue = AddToQueueForm(data=request.POST) to_queue = AddToQueueForm(data=request.POST)
if to_queue.is_valid(): if to_queue.is_valid():
vid_url_list = [request.POST.get("vid_url")] url_str = request.POST.get("vid_url")
print(vid_url_list) print(url_str)
try: try:
youtube_ids = process_url_list(vid_url_list) youtube_ids = UrlListParser(url_str).process_list()
except ValueError: except ValueError:
# failed to process # failed to process
print(f"failed to parse: {vid_url_list}") print(f"failed to parse: {url_str}")
mess_dict = { mess_dict = {
"status": "downloading", "status": "downloading",
"level": "error", "level": "error",
@ -521,8 +521,8 @@ class ChannelView(View):
"""handle http post requests""" """handle http post requests"""
subscribe_form = SubscribeToChannelForm(data=request.POST) subscribe_form = SubscribeToChannelForm(data=request.POST)
if subscribe_form.is_valid(): if subscribe_form.is_valid():
vid_url_list = [request.POST.get("subscribe")] url_str = request.POST.get("subscribe")
youtube_ids = process_url_list(vid_url_list) youtube_ids = UrlListParser(url_str).process_list()
print(youtube_ids) print(youtube_ids)
subscribe_to.delay(youtube_ids) subscribe_to.delay(youtube_ids)
@ -819,7 +819,7 @@ class PostData:
youtube_id = self.exec_val youtube_id = self.exec_val
print("add vid to dl queue: " + youtube_id) print("add vid to dl queue: " + youtube_id)
PendingList().delete_from_pending(youtube_id) PendingList().delete_from_pending(youtube_id)
youtube_ids = process_url_list([youtube_id]) youtube_ids = UrlListParser(youtube_id).process_list()
extrac_dl.delay(youtube_ids) extrac_dl.delay(youtube_ids)
return {"success": True} return {"success": True}