rewrite url_str extractor to convert channel names into channel ids, #40

This commit is contained in:
simon 2021-10-31 16:04:28 +07:00
parent c92800701a
commit 1ba3090db1
5 changed files with 90 additions and 32 deletions

View File

@ -10,9 +10,9 @@ Accessible at `/channel/` of your Tube Archivist, the **Overview Page** shows a
The **Subscribe to Channels** button <img src="assets/icon-add.png?raw=true" alt="add icon" width="20px" style="margin:0 5px;"> opens a text field to subscribe to a channel. You have a few options:
- Enter the YouTube channel ID, a 24 character alphanumeric string. For example *UCBa659QWEk1AI4Tg--mrJ2A*
- Enter the URL to the channel page on YouTube. For example *https://www.youtube.com/channel/UCBa659QWEk1AI4Tg--mrJ2A*
- Enter the channel name, for example *https://www.youtube.com/c/TomScottGo*.
- Enter the video URL for any video and let Tube Archivist extract the channel ID for you. For example *https://www.youtube.com/watch?v=2tdiKTSdE9Y*
- Add one per line.
- **Note**: Adding a link to a YouTube channel name is not yet supported, for example: *https://www.youtube.com/c/TomScottGo* will fail.
The search icon <img src="assets/icon-search.png?raw=true" alt="search icon" width="20px" style="margin:0 5px;"> opens a text box to search for indexed channel names. Possible matches will show as you type.

View File

@ -18,9 +18,9 @@ The **Add to Download Queue** icon <img src="assets/icon-add.png?raw=true" alt="
- Add a YouTube video ID. For example *2tdiKTSdE9Y*.
- Add a link to a YouTube video by providing the shortened URL, for example *https://youtu.be/2tdiKTSdE9Y*.
- Add a Channel ID or Channel URL to add every available video to the download queue. This will ignore the channel page size as described before and is meant for an initial download of the whole channel. You can still ignore selected videos before starting the download.
- Add a channel name, for example *https://www.youtube.com/c/TomScottGo*.
- Add a playlist ID or URL to add every available video in the list to the download queue, for example *https://www.youtube.com/playlist?list=PL96C35uN7xGLLeET0dOWaKHkAlPsrkcha* or *PL96C35uN7xGLLeET0dOWaKHkAlPsrkcha*. Note that when you add a link to a video in a playlist, Tube Archivist assumes you want to download only the specific video and not the whole playlist, for example *https://www.youtube.com/watch?v=CINVwWHlzTY&list=PL96C35uN7xGLLeET0dOWaKHkAlPsrkcha* will only add one video *CINVwWHlzTY* to the queue.
- Add one link per line.
- **Note**: Adding a link to a YouTube channel name is not yet supported, for example: *https://www.youtube.com/c/TomScottGo* will fail.
## The Download Queue
Below the three buttons you find the download queue. New items will get added at the bottom of the queue, the next video to download once you click on **Start Download** will be the first in the list.

View File

@ -9,9 +9,11 @@ import re
import string
import subprocess
import unicodedata
from urllib.parse import parse_qs, urlparse
import redis
import requests
import yt_dlp as youtube_dl
def get_total_hits(index, es_url, es_auth, match_field):
@ -51,33 +53,90 @@ def ignore_filelist(filelist):
return cleaned
class UrlListParser:
    """Take a multi line string and detect valid youtube ids.

    Accepts whitespace/newline separated entries which may be bare ids,
    full youtube URLs, shortened youtu.be links, playlist links or
    channel-name URLs (resolved to a channel id via yt-dlp).
    """

    def __init__(self, url_str):
        # split on any whitespace so one-per-line and space separated both work
        self.url_list = [i.strip() for i in url_str.split()]

    def process_list(self):
        """Loop through the list of entries.

        Returns a list of dicts: [{"url": <youtube_id>, "type": <id_type>}]
        where id_type is one of "video", "channel", "playlist".
        Raises ValueError for entries that cannot be parsed.
        """
        youtube_ids = []
        for url in self.url_list:
            parsed = urlparse(url)
            # debug output
            print(f"processing: {url}")
            print(parsed)
            if not parsed.netloc:
                # is not a url: validate the bare id by its length
                id_type = self.find_valid_id(url)
                youtube_id = url
            elif parsed.path:
                # is a url: inspect path and query string
                youtube_id, id_type = self.detect_from_url(parsed)
            else:
                # not detected
                raise ValueError(f"failed to detect {url}")

            youtube_ids.append({"url": youtube_id, "type": id_type})

        return youtube_ids

    def detect_from_url(self, parsed):
        """Detect id and type from a parsed url.

        Returns a (youtube_id, id_type) tuple.
        """
        if parsed.netloc == "youtu.be":
            # shortened video link: id is the path component
            youtube_id = parsed.path.strip("/")
            return youtube_id, "video"

        if parsed.query:
            # detect from query string
            query_parsed = parse_qs(parsed.query)
            if "v" in query_parsed:
                # watch?v=<video_id>; takes precedence over a list param
                youtube_id = query_parsed["v"][0]
                return youtube_id, "video"

            if "list" in query_parsed:
                # playlist?list=<playlist_id>
                youtube_id = query_parsed["list"][0]
                return youtube_id, "playlist"

        if parsed.path.startswith("/channel/"):
            # channel id directly in url
            youtube_id = parsed.path.split("/")[2]
            return youtube_id, "channel"

        # fall back: detect channel id with yt-dlp (e.g. /c/<name> urls)
        youtube_id = self.extract_channel_name(parsed.geturl())
        return youtube_id, "channel"

    @staticmethod
    def find_valid_id(id_str):
        """Detect the id type from the length of a bare id string.

        11 chars -> video, 24 -> channel, 34 -> playlist.
        Raises ValueError for any other length.
        """
        str_len = len(id_str)
        if str_len == 11:
            id_type = "video"
        elif str_len == 24:
            id_type = "channel"
        elif str_len == 34:
            id_type = "playlist"
        else:
            # unable to parse
            raise ValueError("not a valid id_str: " + id_str)

        return id_type

    @staticmethod
    def extract_channel_name(url):
        """Find the channel id from a channel-name url with yt-dlp help.

        Performs a metadata-only extraction (no download, flat playlist,
        zero entries) and returns the resolved channel_id.
        """
        obs = {
            "default_search": "ytsearch",
            "quiet": True,
            "skip_download": True,
            "extract_flat": True,
            "playlistend": 0,
        }
        url_info = youtube_dl.YoutubeDL(obs).extract_info(url, download=False)
        channel_id = url_info["channel_id"]
        return channel_id
class RedisArchivist:

View File

@ -15,7 +15,7 @@ import requests
import yt_dlp as youtube_dl
from bs4 import BeautifulSoup
from home.src.config import AppConfig
from home.src.helper import DurationConverter, clean_string, process_url_list
from home.src.helper import DurationConverter, UrlListParser, clean_string
from home.src.thumbnails import ThumbManager
@ -446,9 +446,8 @@ class WatchState:
def dedect_type(self):
    """Find the youtube id type ("video", "channel" or "playlist").

    Delegates to UrlListParser for self.youtube_id and returns the
    detected type of the first (only) parsed entry.
    NOTE(review): "dedect" is a typo for "detect", but the name is kept
    because external callers reference it.
    """
    url_process = UrlListParser(self.youtube_id).process_list()
    url_type = url_process[0]["type"]
    return url_type
def mark_vid_watched(self, revert=False):

View File

@ -26,7 +26,7 @@ from home.forms import (
)
from home.src.config import AppConfig
from home.src.download import ChannelSubscription, PendingList
from home.src.helper import RedisArchivist, RedisQueue, process_url_list
from home.src.helper import RedisArchivist, RedisQueue, UrlListParser
from home.src.index import WatchState, YoutubeChannel, YoutubeVideo
from home.src.searching import Pagination, SearchForm, SearchHandler
from home.tasks import (
@ -303,13 +303,13 @@ class DownloadView(View):
"""handle post requests"""
to_queue = AddToQueueForm(data=request.POST)
if to_queue.is_valid():
vid_url_list = [request.POST.get("vid_url")]
print(vid_url_list)
url_str = request.POST.get("vid_url")
print(url_str)
try:
youtube_ids = process_url_list(vid_url_list)
youtube_ids = UrlListParser(url_str).process_list()
except ValueError:
# failed to process
print(f"failed to parse: {vid_url_list}")
print(f"failed to parse: {url_str}")
mess_dict = {
"status": "downloading",
"level": "error",
@ -521,8 +521,8 @@ class ChannelView(View):
"""handle http post requests"""
subscribe_form = SubscribeToChannelForm(data=request.POST)
if subscribe_form.is_valid():
vid_url_list = [request.POST.get("subscribe")]
youtube_ids = process_url_list(vid_url_list)
url_str = request.POST.get("subscribe")
youtube_ids = UrlListParser(url_str).process_list()
print(youtube_ids)
subscribe_to.delay(youtube_ids)
@ -819,7 +819,7 @@ class PostData:
youtube_id = self.exec_val
print("add vid to dl queue: " + youtube_id)
PendingList().delete_from_pending(youtube_id)
youtube_ids = process_url_list([youtube_id])
youtube_ids = UrlListParser(youtube_id).process_list()
extrac_dl.delay(youtube_ids)
return {"success": True}