add random headers for requests outside of yt-dlp

This commit is contained in:
simon 2022-03-26 11:49:53 +07:00
parent 6d874f4b7a
commit fcadb5ead8
No known key found for this signature in database
GPG Key ID: 2C15AA5E89985DD4
3 changed files with 66 additions and 4 deletions

View File

@ -17,7 +17,7 @@ from home.src.download.thumbnails import ThumbManager
from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.index.generic import YouTubeItem
from home.src.index.playlist import YoutubePlaylist
from home.src.ta.helper import clean_string
from home.src.ta.helper import clean_string, requests_headers
from home.src.ta.ta_redis import RedisArchivist
@ -46,7 +46,9 @@ class ChannelScraper:
print(f"{self.channel_id}: scrape channel data from youtube")
url = f"https://www.youtube.com/channel/{self.channel_id}/about?hl=en"
cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"}
response = requests.get(url, cookies=cookies)
response = requests.get(
url, cookies=cookies, headers=requests_headers()
)
if response.ok:
channel_page = response.text
else:

View File

@ -12,7 +12,11 @@ import requests
from home.src.es.connect import ElasticWrap
from home.src.index import channel as ta_channel
from home.src.index.generic import YouTubeItem
from home.src.ta.helper import DurationConverter, clean_string
from home.src.ta.helper import (
DurationConverter,
clean_string,
requests_headers,
)
from ryd_client import ryd_client
@ -115,9 +119,12 @@ class YoutubeSubtitle:
dest_path = os.path.join(videos_base, subtitle["media_url"])
source = subtitle["source"]
lang = subtitle.get("lang")
response = requests.get(subtitle["url"])
response = requests.get(
subtitle["url"], headers=requests_headers()
)
if not response.ok:
print(f"{self.video.youtube_id}: failed to download subtitle")
print(response.text)
continue
parser = SubtitleParser(response.text, lang, source)

View File

@ -3,6 +3,7 @@ Loose collection of helper functions
- don't import AppConfig class here to avoid circular imports
"""
import random
import re
import string
import subprocess
@ -35,6 +36,58 @@ def ignore_filelist(filelist):
return cleaned
def requests_headers():
    """return a headers dict carrying a randomly picked chrome user agent

    intended for plain requests calls that are made outside of yt-dlp
    """
    # chrome builds to pick from, whitespace separated, oldest first
    known_builds = """
        90.0.4430.212 90.0.4430.24 90.0.4430.70 90.0.4430.72
        90.0.4430.85 90.0.4430.93
        91.0.4472.101 91.0.4472.106 91.0.4472.114 91.0.4472.124
        91.0.4472.164 91.0.4472.19 91.0.4472.77
        92.0.4515.107 92.0.4515.115 92.0.4515.131 92.0.4515.159
        92.0.4515.43
        93.0.4556.0 93.0.4577.15 93.0.4577.63 93.0.4577.82
        94.0.4606.41 94.0.4606.54 94.0.4606.61 94.0.4606.71
        94.0.4606.81 94.0.4606.85
        95.0.4638.17 95.0.4638.50 95.0.4638.54 95.0.4638.69
        95.0.4638.74
        96.0.4664.18 96.0.4664.45 96.0.4664.55 96.0.4664.93
        97.0.4692.20
    """.split()

    picked_build = random.choice(known_builds)

    # single space separated product tokens make up the final agent string
    user_agent = " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            f"Chrome/{picked_build}",
            "Safari/537.36",
        )
    )

    return {"User-Agent": user_agent}
class UrlListParser:
"""take a multi line string and detect valid youtube ids"""