2021-09-05 17:10:14 +00:00
|
|
|
"""
|
|
|
|
Loose collection of helper functions
|
|
|
|
- don't import AppConfig class here to avoid circular imports
|
|
|
|
"""
|
|
|
|
|
2022-11-27 08:41:59 +00:00
|
|
|
import json
|
2022-11-25 11:49:36 +00:00
|
|
|
import os
|
2022-03-26 04:49:53 +00:00
|
|
|
import random
|
2021-09-05 17:10:14 +00:00
|
|
|
import re
|
|
|
|
import string
|
|
|
|
import unicodedata
|
2022-03-29 09:47:21 +00:00
|
|
|
from datetime import datetime
|
2023-04-09 06:27:47 +00:00
|
|
|
from urllib.parse import urlparse
|
2021-09-05 17:10:14 +00:00
|
|
|
|
2023-01-02 14:36:08 +00:00
|
|
|
import requests
|
2021-09-05 17:10:14 +00:00
|
|
|
|
|
|
|
|
2023-04-16 06:59:15 +00:00
|
|
|
def clean_string(file_name: str) -> str:
    """reduce a string to ASCII-only, whitelisted file name characters"""
    allowed = "-_.() " + string.ascii_letters + string.digits
    # decompose accented characters so the base letter survives the strip
    decomposed = unicodedata.normalize("NFKD", file_name)
    ascii_text = decomposed.encode("ASCII", "ignore").decode().strip()
    kept = "".join(char for char in ascii_text if char in allowed)
    # collapse runs of spaces left behind by removed characters
    collapsed: str = re.sub(r"[ ]{2,}", " ", kept)

    return collapsed
|
|
|
|
|
|
|
|
|
2023-04-16 06:59:15 +00:00
|
|
|
def ignore_filelist(filelist: list[str]) -> list[str]:
    """drop hidden files and OS temp entries from an os.listdir result"""
    ignored = ["Icon\r\r", "Temporary Items", "Network Trash Folder"]

    return [
        file_name
        for file_name in filelist
        if not file_name.startswith(".") and file_name not in ignored
    ]
|
|
|
|
|
|
|
|
|
2023-04-16 06:59:15 +00:00
|
|
|
def randomizor(length: int) -> str:
    """generate a random alphanumeric string of the given length

    NOTE: backed by the random module, not cryptographically secure
    """
    alphabet: str = string.digits + string.ascii_letters
    chars = [random.choice(alphabet) for _ in range(length)]

    return "".join(chars)
|
|
|
|
|
|
|
|
|
2023-04-16 06:59:15 +00:00
|
|
|
def requests_headers() -> dict[str, str]:
    """build header with random user agent for requests outside of yt-dlp"""
    # pool of plausible Chrome release versions to rotate through
    versions = (
        "90.0.4430.212",
        "90.0.4430.24",
        "90.0.4430.70",
        "90.0.4430.72",
        "90.0.4430.85",
        "90.0.4430.93",
        "91.0.4472.101",
        "91.0.4472.106",
        "91.0.4472.114",
        "91.0.4472.124",
        "91.0.4472.164",
        "91.0.4472.19",
        "91.0.4472.77",
        "92.0.4515.107",
        "92.0.4515.115",
        "92.0.4515.131",
        "92.0.4515.159",
        "92.0.4515.43",
        "93.0.4556.0",
        "93.0.4577.15",
        "93.0.4577.63",
        "93.0.4577.82",
        "94.0.4606.41",
        "94.0.4606.54",
        "94.0.4606.61",
        "94.0.4606.71",
        "94.0.4606.81",
        "94.0.4606.85",
        "95.0.4638.17",
        "95.0.4638.50",
        "95.0.4638.54",
        "95.0.4638.69",
        "95.0.4638.74",
        "96.0.4664.18",
        "96.0.4664.45",
        "96.0.4664.55",
        "96.0.4664.93",
        "97.0.4692.20",
    )
    picked = random.choice(versions)
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        f"Chrome/{picked} Safari/537.36"
    )

    return {"User-Agent": user_agent}
|
|
|
|
|
|
|
|
|
2023-04-16 06:59:15 +00:00
|
|
|
def date_praser(timestamp: int | str) -> str:
|
2022-03-29 09:47:21 +00:00
|
|
|
"""return formatted date string"""
|
|
|
|
if isinstance(timestamp, int):
|
|
|
|
date_obj = datetime.fromtimestamp(timestamp)
|
|
|
|
elif isinstance(timestamp, str):
|
|
|
|
date_obj = datetime.strptime(timestamp, "%Y-%m-%d")
|
|
|
|
|
|
|
|
return datetime.strftime(date_obj, "%d %b, %Y")
|
|
|
|
|
|
|
|
|
2023-04-16 06:59:15 +00:00
|
|
|
def time_parser(timestamp: str) -> float:
    """return seconds from timestamp, False on empty

    accepts plain seconds ("123"), "MM:SS" or "HH:MM:SS";
    raises ValueError on malformed input
    """
    if not timestamp:
        return False

    if timestamp.isnumeric():
        return int(timestamp)

    # original used maxsplit=3 which allowed a 4th field and then
    # failed on unpacking; also crashed on two-field "MM:SS" input
    parts = [float(field) for field in timestamp.split(":")]
    while len(parts) < 3:
        parts.insert(0, 0)

    hours, minutes, seconds = parts
    return hours * 60 * 60 + minutes * 60 + seconds
|
|
|
|
|
|
|
|
|
2023-04-16 06:59:15 +00:00
|
|
|
def clear_dl_cache(config: dict) -> int:
    """delete leftover files from the download cache, return count removed"""
    print("clear download cache")
    download_dir = os.path.join(config["application"]["cache_dir"], "download")
    found = os.listdir(download_dir)
    for file_name in found:
        os.remove(os.path.join(download_dir, file_name))

    return len(found)
|
|
|
|
|
2022-11-25 11:49:36 +00:00
|
|
|
|
2023-04-16 06:59:15 +00:00
|
|
|
def get_mapping() -> dict:
    """read expected index mapping and settings from index_mapping.json

    path is relative to the expected app working directory; returns None
    if the "index_config" key is missing from the file
    """
    mapping_path = "home/src/es/index_mapping.json"
    with open(mapping_path, "r", encoding="utf-8") as f:
        index_config: dict = json.load(f).get("index_config")

    return index_config
|
|
|
|
|
|
|
|
|
2023-04-16 06:59:15 +00:00
|
|
|
def is_shorts(youtube_id: str) -> bool:
    """check whether youtube_id is a shorts video via a HEAD request

    the shorts URL returns 200 for shorts and a redirect otherwise
    (inferred from the status check — confirm against current YouTube
    behavior)
    """
    url = f"https://www.youtube.com/shorts/{youtube_id}"
    response = requests.head(url, headers=requests_headers(), timeout=10)

    return response.status_code == 200
|
|
|
|
|
|
|
|
|
2023-04-16 06:59:15 +00:00
|
|
|
def ta_host_parser(ta_host: str) -> tuple[list[str], list[str]]:
    """parse ta_host env var for ALLOWED_HOSTS and CSRF_TRUSTED_ORIGINS

    ta_host: space separated hosts, with or without scheme;
    returns (allowed_hosts, csrf_trusted_origins) in input order
    """
    allowed_hosts: list[str] = []
    csrf_trusted_origins: list[str] = []
    for host in ta_host.split():
        host_clean = host.strip()
        # match full scheme prefixes only: the old startswith("http")
        # check wrongly treated hostnames like "httpd.internal" as
        # already having a scheme, yielding hostname None
        if not host_clean.startswith(("http://", "https://")):
            host_clean = f"http://{host_clean}"

        parsed = urlparse(host_clean)
        allowed_hosts.append(f"{parsed.hostname}")
        csrf_trusted_origins.append(f"{parsed.scheme}://{parsed.hostname}")

    return allowed_hosts, csrf_trusted_origins
|