rewrite url_str extractor to convert channel names into channel ids, #40

2025-07-26 10:48:16 +00:00 · 2021-10-31 16:04:28 +07:00 · 2021-10-31 16:04:28 +07:00 · 1ba3090db1
commit 1ba3090db1
parent c92800701a
5 changed files with 90 additions and 32 deletions
--- a/docs/Channels.md
+++ b/docs/Channels.md
@ -10,9 +10,9 @@ Accessible at `/channel/` of your Tube Archivist, the **Overview Page** shows a
 The **Subscribe to Channels** button <img src="assets/icon-add.png?raw=true" alt="add icon" width="20px" style="margin:0 5px;"> opens a text field to subscribe to a channel. You have a few options:
 - Enter the YouTube channel ID, a 24 character string. For example *UCBa659QWEk1AI4Tg--mrJ2A*
 - Enter the URL to the channel page on YouTube. For example *https://www.youtube.com/channel/UCBa659QWEk1AI4Tg--mrJ2A*
 - Enter the URL of the channel name page on YouTube. For example *https://www.youtube.com/c/TomScottGo*
 - Enter the video URL for any video and let Tube Archivist extract the channel ID for you. For example *https://www.youtube.com/watch?v=2tdiKTSdE9Y*
 - Add one per line.
 - **Note**: Adding a link to a YouTube channel name is not yet supported, for example: *https://www.youtube.com/c/TomScottGo* will fail.
 The search icon <img src="assets/icon-search.png?raw=true" alt="search icon" width="20px" style="margin:0 5px;"> opens a text box to search for indexed channel names. Possible matches will show as you type. 
--- a/docs/Downloads.md
+++ b/docs/Downloads.md
@ -18,9 +18,9 @@ The **Add to Download Queue** icon <img src="assets/icon-add.png?raw=true" alt="
 - Add a YouTube video ID. For example *2tdiKTSdE9Y*.
 - Add a link to a YouTube video by providing the shortened URL, for example *https://youtu.be/2tdiKTSdE9Y*.
 - Add a Channel ID or Channel URL to add every available video to the download queue. This will ignore the channel page size as described before and is meant for an initial download of the whole channel. You can still ignore selected videos before starting the download.
 - Add a link to a channel name, for example *https://www.youtube.com/c/TomScottGo*.
 - Add a playlist ID or URL to add every available video in the list to the download queue, for example *https://www.youtube.com/playlist?list=PL96C35uN7xGLLeET0dOWaKHkAlPsrkcha* or *PL96C35uN7xGLLeET0dOWaKHkAlPsrkcha*. Note that when you add a link to a video in a playlist, Tube Archivist assumes you want to download only the specific video and not the whole playlist, for example *https://www.youtube.com/watch?v=CINVwWHlzTY&list=PL96C35uN7xGLLeET0dOWaKHkAlPsrkcha* will only add one video *CINVwWHlzTY* to the queue.
 - Add one link per line.
 - **Note**: Adding a link to a YouTube channel name is not yet supported, for example: *https://www.youtube.com/c/TomScottGo* will fail.
 ## The Download Queue
 Below the three buttons you find the download queue. New items will get added at the bottom of the queue, the next video to download once you click on **Start Download** will be the first in the list.
--- a/tubearchivist/home/src/helper.py
+++ b/tubearchivist/home/src/helper.py
@ -9,9 +9,11 @@ import re
 import string
 import subprocess
 import unicodedata
 from urllib.parse import parse_qs, urlparse
 import redis
 import requests
 import yt_dlp as youtube_dl
 def get_total_hits(index, es_url, es_auth, match_field):
@ -51,34 +53,91 @@ def ignore_filelist(filelist):
    return cleaned
-def process_url_list(url_str):
+class UrlListParser:
-    """parse url_list to find valid youtube video or channel ids"""
+    """take a multi line string and detect valid youtube ids"""
-    to_replace = ["watch?v=", "playlist?list="]
+
-    url_list = re.split("\n+", url_str[0])
+    def __init__(self, url_str):
        self.url_list = [i.strip() for i in url_str.split()]
    def process_list(self):
        """loop through the list"""
        youtube_ids = []
-    for url in url_list:
+        for url in self.url_list:
-        if "/c/" in url or "/user/" in url:
+            parsed = urlparse(url)
-            raise ValueError("user name is not unique, use channel ID")
+            print(f"processing: {url}")
-
+            print(parsed)
-        url_clean = url.strip().strip("/").split("/")[-1]
+            if not parsed.netloc:
-        for i in to_replace:
+                # is not a url
-            url_clean = url_clean.replace(i, "")
+                id_type = self.find_valid_id(url)
-        url_no_param = url_clean.split("&")[0]
+                youtube_id = url
-        str_len = len(url_no_param)
+            elif parsed.path:
-        if str_len == 11:
+                # is a url
-            link_type = "video"
+                youtube_id, id_type = self.detect_from_url(parsed)
        elif str_len == 24:
            link_type = "channel"
        elif str_len == 34:
            link_type = "playlist"
            else:
-            # unable to parse
+                # not detected
-            raise ValueError("not a valid url: " + url)
+                raise ValueError(f"failed to detect {url}")
-        youtube_ids.append({"url": url_no_param, "type": link_type})
+            youtube_ids.append({"url": youtube_id, "type": id_type})
        return youtube_ids
    def detect_from_url(self, parsed):
        """resolve a parsed url into a (youtube_id, id_type) tuple

        parsed is the result of urlparse; id_type is one of
        "video", "playlist" or "channel"
        """
        # shortened link: the whole path is the video id
        if parsed.netloc == "youtu.be":
            return parsed.path.strip("/"), "video"
        # query string: a video id takes precedence over a playlist id
        if parsed.query:
            params = parse_qs(parsed.query)
            for key, id_type in (("v", "video"), ("list", "playlist")):
                if key in params:
                    return params[key][0], id_type
        # channel id is the path segment right after /channel/
        if parsed.path.startswith("/channel/"):
            segments = parsed.path.split("/")
            return segments[2], "channel"
        # everything else: detect the channel id with yt_dlp
        channel_id = self.extract_channel_name(parsed.geturl())
        return channel_id, "channel"
    @staticmethod
    def find_valid_id(id_str):
        """dedect valid id from length of string"""
        str_len = len(id_str)
        if str_len == 11:
            id_type = "video"
        elif str_len == 24:
            id_type = "channel"
        elif str_len == 34:
            id_type = "playlist"
        else:
            # unable to parse
            raise ValueError("not a valid id_str: " + id_str)
        return id_type
    @staticmethod
    def extract_channel_name(url):
        """look up the channel id for a url through yt-dlp

        returns the "channel_id" value of the info yt-dlp extracts for url
        """
        # options handed to yt-dlp as is, extract_info runs with download=False
        ydl_opts = dict(
            default_search="ytsearch",
            quiet=True,
            skip_download=True,
            extract_flat=True,
            playlistend=0,
        )
        ydl = youtube_dl.YoutubeDL(ydl_opts)
        metadata = ydl.extract_info(url, download=False)
        return metadata["channel_id"]
 class RedisArchivist:
    """collection of methods to interact with redis"""
--- a/tubearchivist/home/src/index.py
+++ b/tubearchivist/home/src/index.py
@ -15,7 +15,7 @@ import requests
 import yt_dlp as youtube_dl
 from bs4 import BeautifulSoup
 from home.src.config import AppConfig
-from home.src.helper import DurationConverter, clean_string, process_url_list
+from home.src.helper import DurationConverter, UrlListParser, clean_string
 from home.src.thumbnails import ThumbManager
@ -446,9 +446,8 @@ class WatchState:
    def dedect_type(self):
        """find youtube id type"""
-        url_process = process_url_list([self.youtube_id])
+        url_process = UrlListParser(self.youtube_id).process_list()
        url_type = url_process[0]["type"]
        return url_type
    def mark_vid_watched(self, revert=False):
--- a/tubearchivist/home/views.py
+++ b/tubearchivist/home/views.py
@ -26,7 +26,7 @@ from home.forms import (
 )
 from home.src.config import AppConfig
 from home.src.download import ChannelSubscription, PendingList
-from home.src.helper import RedisArchivist, RedisQueue, process_url_list
+from home.src.helper import RedisArchivist, RedisQueue, UrlListParser
 from home.src.index import WatchState, YoutubeChannel, YoutubeVideo
 from home.src.searching import Pagination, SearchForm, SearchHandler
 from home.tasks import (
@ -303,13 +303,13 @@ class DownloadView(View):
        """handle post requests"""
        to_queue = AddToQueueForm(data=request.POST)
        if to_queue.is_valid():
-            vid_url_list = [request.POST.get("vid_url")]
+            url_str = request.POST.get("vid_url")
-            print(vid_url_list)
+            print(url_str)
            try:
-                youtube_ids = process_url_list(vid_url_list)
+                youtube_ids = UrlListParser(url_str).process_list()
            except ValueError:
                # failed to process
-                print(f"failed to parse: {vid_url_list}")
+                print(f"failed to parse: {url_str}")
                mess_dict = {
                    "status": "downloading",
                    "level": "error",
@ -521,8 +521,8 @@ class ChannelView(View):
        """handle http post requests"""
        subscribe_form = SubscribeToChannelForm(data=request.POST)
        if subscribe_form.is_valid():
-            vid_url_list = [request.POST.get("subscribe")]
+            url_str = request.POST.get("subscribe")
-            youtube_ids = process_url_list(vid_url_list)
+            youtube_ids = UrlListParser(url_str).process_list()
            print(youtube_ids)
            subscribe_to.delay(youtube_ids)
@ -819,7 +819,7 @@ class PostData:
        youtube_id = self.exec_val
        print("add vid to dl queue: " + youtube_id)
        PendingList().delete_from_pending(youtube_id)
-        youtube_ids = process_url_list([youtube_id])
+        youtube_ids = UrlListParser(youtube_id).process_list()
        extrac_dl.delay(youtube_ids)
        return {"success": True}