tubearchivist/tubearchivist/home/src/ta/helper.py

"""
Loose collection of helper functions
- don't import AppConfig class here to avoid circular imports
"""

import random
import re
import string
import subprocess
import unicodedata
from datetime import datetime
from urllib.parse import parse_qs, urlparse

from home.src.download.yt_dlp_base import YtWrap


def clean_string(file_name):
    """reduce a string to a whitelisted set of ascii characters

    The input is NFKD normalized so accented letters decompose into a base
    letter plus combining marks, then everything that can not be encoded
    as ASCII is dropped. Surrounding whitespace is stripped at that point,
    before the whitelist filter runs. Of the remainder only letters, digits,
    space and the characters "-_.()" are kept, and runs of two or more
    spaces are collapsed into a single space.
    """
    allowed = frozenset("-_.() " + string.ascii_letters + string.digits)
    decomposed = unicodedata.normalize("NFKD", file_name)
    ascii_text = decomposed.encode("ASCII", "ignore").decode().strip()
    kept = "".join(filter(allowed.__contains__, ascii_text))
    return re.sub(r"[ ]{2,}", " ", kept)


def ignore_filelist(filelist):
    """filter temp and hidden entries out of an os.listdir style list

    Drops every name starting with a dot as well as a fixed set of known
    system entries. Returns a new list, the order of the remaining names
    is preserved.
    """
    skip_names = ("Icon\r\r", "Temporary Items", "Network Trash Folder")
    return [
        name
        for name in filelist
        if not name.startswith(".") and name not in skip_names
    ]


def randomizor(length):
    """build a random string of digits and ascii letters

    Draws one character per position with random.choice, the result has
    exactly length characters.
    """
    alphabet = string.digits + string.ascii_letters
    picked = [random.choice(alphabet) for _ in range(length)]
    return "".join(picked)


def requests_headers():
    """return a headers dict with a randomly versioned chrome user agent

    Meant for requests made outside of yt-dlp. A chrome version is picked
    at random from a fixed list on every call and placed into a
    Windows 10 / x64 user agent string. The returned dict only has the
    "User-Agent" key.
    """

    chrome_versions = (
        "90.0.4430.212",
        "90.0.4430.24",
        "90.0.4430.70",
        "90.0.4430.72",
        "90.0.4430.85",
        "90.0.4430.93",
        "91.0.4472.101",
        "91.0.4472.106",
        "91.0.4472.114",
        "91.0.4472.124",
        "91.0.4472.164",
        "91.0.4472.19",
        "91.0.4472.77",
        "92.0.4515.107",
        "92.0.4515.115",
        "92.0.4515.131",
        "92.0.4515.159",
        "92.0.4515.43",
        "93.0.4556.0",
        "93.0.4577.15",
        "93.0.4577.63",
        "93.0.4577.82",
        "94.0.4606.41",
        "94.0.4606.54",
        "94.0.4606.61",
        "94.0.4606.71",
        "94.0.4606.81",
        "94.0.4606.85",
        "95.0.4638.17",
        "95.0.4638.50",
        "95.0.4638.54",
        "95.0.4638.69",
        "95.0.4638.74",
        "96.0.4664.18",
        "96.0.4664.45",
        "96.0.4664.55",
        "96.0.4664.93",
        "97.0.4692.20",
    )
    version = random.choice(chrome_versions)
    # the individual product tokens are separated by single spaces
    user_agent = " ".join(
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            f"Chrome/{version}",
            "Safari/537.36",
        ]
    )

    return {"User-Agent": user_agent}


def date_praser(timestamp):
    """return formatted date string

    Parameters:
        timestamp: either epoch seconds as int or float, converted with
            datetime.fromtimestamp (local time), or a date string in
            "%Y-%m-%d" format.

    Returns:
        the date formatted as "%d %b, %Y", for example "05 Mar, 2021".

    Raises:
        TypeError: if timestamp is neither a number nor a string.
        ValueError: if a string does not match "%Y-%m-%d".
    """
    if isinstance(timestamp, (int, float)):
        # float epochs are accepted as well, fromtimestamp handles both
        date_obj = datetime.fromtimestamp(timestamp)
    elif isinstance(timestamp, str):
        date_obj = datetime.strptime(timestamp, "%Y-%m-%d")
    else:
        # fail explicitly instead of hitting an unbound date_obj below
        raise TypeError(
            f"timestamp must be int, float or str, not {type(timestamp)}"
        )

    return date_obj.strftime("%d %b, %Y")


class UrlListParser:
    """turn a whitespace separated string of urls or ids into youtube ids

    Every item is either a bare id, classified by its length, or a
    youtube.com / youtu.be url. process_list returns one dict per item
    with the keys "url" (the extracted id) and "type" ("video",
    "channel" or "playlist").
    """

    def __init__(self, url_str):
        # split() without arguments already drops all surrounding whitespace
        self.url_list = url_str.split()

    def process_list(self):
        """classify every item, raises ValueError on the first invalid one"""
        results = []
        for url in self.url_list:
            parsed = urlparse(url)
            print(f"processing: {url}")
            print(parsed)
            youtube_id, id_type = self._classify(url, parsed)
            results.append({"url": youtube_id, "type": id_type})

        return results

    def _classify(self, url, parsed):
        """return (youtube_id, id_type) for a single item of the list"""
        if not parsed.netloc:
            # no host part, treat the item as a bare id
            return url, self.find_valid_id(url)

        if not ("youtube.com" in url or "youtu.be" in url):
            raise ValueError(f"{url} is not a youtube link")

        if not parsed.path:
            # youtube host but nothing to detect an id from
            raise ValueError(f"failed to detect {url}")

        return self.detect_from_url(parsed)

    def detect_from_url(self, parsed):
        """return (youtube_id, id_type) from a parsed youtube url

        Checked in this order: youtu.be short link, "v" query parameter,
        "list" query parameter, /channel/ path. Anything else is handed
        to yt-dlp and treated as a channel.
        """
        if parsed.netloc == "youtu.be":
            # short link, the path is the video id
            video_id = parsed.path.strip("/")
            self.find_valid_id(video_id)  # only called to validate
            return video_id, "video"

        if parsed.query:
            params = parse_qs(parsed.query)
            if "v" in params:
                video_id = params["v"][0]
                self.find_valid_id(video_id)  # only called to validate
                return video_id, "video"

            if "list" in params:
                # playlist ids from the query string are not validated
                return params["list"][0], "playlist"

        if parsed.path.startswith("/channel/"):
            # third path segment is the channel id
            channel_id = parsed.path.split("/")[2]
            self.find_valid_id(channel_id)  # only called to validate
            return channel_id, "channel"

        # no id in the url itself, look up the channel with yt_dlp
        return self.extract_channel_name(parsed.geturl()), "channel"

    @staticmethod
    def find_valid_id(id_str):
        """detect id type from the length of the string

        Returns "video", "channel" or "playlist", raises ValueError if
        the string matches none of the known lengths or special names.
        """
        type_by_length = {
            11: "video",
            24: "channel",
            34: "playlist",
            18: "playlist",
        }
        id_type = type_by_length.get(len(id_str))
        if id_type is None and id_str in ("LL", "WL"):
            # special playlist names
            id_type = "playlist"

        if id_type is None:
            # unable to parse
            raise ValueError("not a valid id_str: " + id_str)

        return id_type

    @staticmethod
    def extract_channel_name(url):
        """look up the channel id for a url with the help of yt-dlp

        Raises ValueError if the extracted info has no "channel_id" key.
        """
        yt_options = {
            "skip_download": True,
            "extract_flat": True,
            "playlistend": 0,
        }
        url_info = YtWrap(yt_options).extract(url)
        try:
            return url_info["channel_id"]
        except KeyError as error:
            print(f"failed to extract channel id from {url}")
            raise ValueError from error


class DurationConverter:
    """
    read a media duration with ffprobe and format it for display
    """

    @staticmethod
    def get_sec(file_path):
        """return the duration of file_path in whole seconds

        Runs ffprobe and parses its output. Returns 0 if ffprobe reports
        "N/A", a non zero exit code raises subprocess.CalledProcessError
        because of check=True.
        """
        command = [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            file_path,
        ]
        probe = subprocess.run(command, capture_output=True, check=True)
        raw_value = probe.stdout.decode().strip()
        if raw_value == "N/A":
            return 0

        # the value is parsed as float first, then truncated
        return int(float(raw_value))

    @staticmethod
    def get_str(duration_sec):
        """format seconds as [HH:]MM:SS, returns "NA" for a falsy value

        The hours part is only added when it is non zero, minutes and
        seconds are always present and zero padded to two digits.
        """
        if not duration_sec:
            # failed to extract
            return "NA"

        hours = duration_sec // 3600
        remainder = duration_sec - (hours * 3600)
        minutes = remainder // 60
        secs = remainder - (minutes * 60)

        parts = []
        if hours:
            parts.append(str(hours).zfill(2))
        parts.append(str(minutes).zfill(2) if minutes else "00")
        parts.append(str(secs).zfill(2))
        return ":".join(parts)