tubearchivist/tubearchivist/home/src/ta/urlparser.py

135 lines
4.3 KiB
Python

"""
Functionality:
- detect valid youtube ids and links from multi line string
- identify vid_type if possible
"""
from urllib.parse import parse_qs, urlparse
from home.src.download.yt_dlp_base import YtWrap
from home.src.index.video_constants import VideoTypeEnum
class Parser:
"""take a multi line string and detect valid youtube ids"""
def __init__(self, url_str):
self.url_list = [i.strip() for i in url_str.split()]
def parse(self):
"""parse the list"""
ids = []
for url in self.url_list:
parsed = urlparse(url)
if parsed.netloc:
# is url
identified = self.process_url(parsed)
else:
# is not url
identified = self._find_valid_id(url)
if "vid_type" not in identified:
identified.update(self._detect_vid_type(parsed.path))
ids.append(identified)
return ids
def process_url(self, parsed):
"""process as url"""
if parsed.netloc == "youtu.be":
# shortened
youtube_id = parsed.path.strip("/")
return self._validate_expected(youtube_id, "video")
query_parsed = parse_qs(parsed.query)
if "v" in query_parsed:
# video from v query str
youtube_id = query_parsed["v"][0]
return self._validate_expected(youtube_id, "video")
if "list" in query_parsed:
# playlist from list query str
youtube_id = query_parsed["list"][0]
return self._validate_expected(youtube_id, "playlist")
all_paths = parsed.path.strip("/").split("/")
if all_paths[0] == "shorts":
# is shorts video
item = self._validate_expected(all_paths[1], "video")
item.update({"vid_type": VideoTypeEnum.SHORTS.value})
return item
if all_paths[0] == "channel":
return self._validate_expected(all_paths[1], "channel")
# detect channel
channel_id = self._extract_channel_name(parsed.geturl())
return {"type": "channel", "url": channel_id}
def _validate_expected(self, youtube_id, expected_type):
"""raise value error if not matching"""
matched = self._find_valid_id(youtube_id)
if matched["type"] != expected_type:
raise ValueError(
f"{youtube_id} not of expected type {expected_type}"
)
return {"type": expected_type, "url": youtube_id}
def _find_valid_id(self, id_str):
"""detect valid id from length of string"""
if id_str in ("LL", "WL"):
return {"type": "playlist", "url": id_str}
if id_str.startswith("@"):
url = f"https://www.youtube.com/{id_str}"
channel_id = self._extract_channel_name(url)
return {"type": "channel", "url": channel_id}
len_id_str = len(id_str)
if len_id_str == 11:
item_type = "video"
elif len_id_str == 24:
item_type = "channel"
elif len_id_str in (34, 26, 18):
item_type = "playlist"
else:
raise ValueError(f"not a valid id_str: {id_str}")
return {"type": item_type, "url": id_str}
@staticmethod
def _extract_channel_name(url):
"""find channel id from channel name with yt-dlp help"""
obs_request = {
"check_formats": None,
"skip_download": True,
"extract_flat": True,
"playlistend": 0,
}
url_info = YtWrap(obs_request).extract(url)
channel_id = url_info.get("channel_id", False)
if channel_id:
return channel_id
url = url_info.get("url", False)
if url:
# handle old channel name redirect with url path split
channel_id = urlparse(url).path.strip("/").split("/")[1]
return channel_id
print(f"failed to extract channel id from {url}")
raise ValueError
def _detect_vid_type(self, path):
"""try to match enum from path, needs to be serializable"""
last = path.strip("/").split("/")[-1]
try:
vid_type = VideoTypeEnum(last).value
except ValueError:
vid_type = VideoTypeEnum.UNKNOWN.value
return {"vid_type": vid_type}