From fcadb5ead84cd6065e461da3c5d8d8bd3bc6b588 Mon Sep 17 00:00:00 2001
From: simon
Date: Sat, 26 Mar 2022 11:49:53 +0700
Subject: [PATCH] add random headers for requests outside of yt-dlp

---
 tubearchivist/home/src/index/channel.py |  6 ++-
 tubearchivist/home/src/index/video.py   | 11 ++++-
 tubearchivist/home/src/ta/helper.py     | 53 +++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 4 deletions(-)

diff --git a/tubearchivist/home/src/index/channel.py b/tubearchivist/home/src/index/channel.py
index 66a4fb3..953078d 100644
--- a/tubearchivist/home/src/index/channel.py
+++ b/tubearchivist/home/src/index/channel.py
@@ -17,7 +17,7 @@ from home.src.download.thumbnails import ThumbManager
 from home.src.es.connect import ElasticWrap, IndexPaginate
 from home.src.index.generic import YouTubeItem
 from home.src.index.playlist import YoutubePlaylist
-from home.src.ta.helper import clean_string
+from home.src.ta.helper import clean_string, requests_headers
 from home.src.ta.ta_redis import RedisArchivist
 
 
@@ -46,7 +46,9 @@ class ChannelScraper:
         print(f"{self.channel_id}: scrape channel data from youtube")
         url = f"https://www.youtube.com/channel/{self.channel_id}/about?hl=en"
         cookies = {"CONSENT": "YES+xxxxxxxxxxxxxxxxxxxxxxxxxxx"}
-        response = requests.get(url, cookies=cookies)
+        response = requests.get(
+            url, cookies=cookies, headers=requests_headers()
+        )
         if response.ok:
             channel_page = response.text
         else:
diff --git a/tubearchivist/home/src/index/video.py b/tubearchivist/home/src/index/video.py
index d59650f..1dafb7b 100644
--- a/tubearchivist/home/src/index/video.py
+++ b/tubearchivist/home/src/index/video.py
@@ -12,7 +12,11 @@ import requests
 from home.src.es.connect import ElasticWrap
 from home.src.index import channel as ta_channel
 from home.src.index.generic import YouTubeItem
-from home.src.ta.helper import DurationConverter, clean_string
+from home.src.ta.helper import (
+    DurationConverter,
+    clean_string,
+    requests_headers,
+)
 from ryd_client import ryd_client
 
 
@@ -115,9 +119,12 @@ class YoutubeSubtitle:
             dest_path = os.path.join(videos_base, subtitle["media_url"])
             source = subtitle["source"]
             lang = subtitle.get("lang")
-            response = requests.get(subtitle["url"])
+            response = requests.get(
+                subtitle["url"], headers=requests_headers()
+            )
             if not response.ok:
                 print(f"{self.video.youtube_id}: failed to download subtitle")
+                print(response.text)
                 continue
 
             parser = SubtitleParser(response.text, lang, source)
diff --git a/tubearchivist/home/src/ta/helper.py b/tubearchivist/home/src/ta/helper.py
index 04cea5e..3a1af52 100644
--- a/tubearchivist/home/src/ta/helper.py
+++ b/tubearchivist/home/src/ta/helper.py
@@ -3,6 +3,7 @@ Loose collection of helper functions
 - don't import AppConfig class here to avoid circular imports
 """
 
+import random
 import re
 import string
 import subprocess
@@ -35,6 +36,58 @@ def ignore_filelist(filelist):
     return cleaned
 
 
+def requests_headers():
+    """build header with random user agent for requests outside of yt-dlp"""
+
+    chrome_versions = (
+        "90.0.4430.212",
+        "90.0.4430.24",
+        "90.0.4430.70",
+        "90.0.4430.72",
+        "90.0.4430.85",
+        "90.0.4430.93",
+        "91.0.4472.101",
+        "91.0.4472.106",
+        "91.0.4472.114",
+        "91.0.4472.124",
+        "91.0.4472.164",
+        "91.0.4472.19",
+        "91.0.4472.77",
+        "92.0.4515.107",
+        "92.0.4515.115",
+        "92.0.4515.131",
+        "92.0.4515.159",
+        "92.0.4515.43",
+        "93.0.4556.0",
+        "93.0.4577.15",
+        "93.0.4577.63",
+        "93.0.4577.82",
+        "94.0.4606.41",
+        "94.0.4606.54",
+        "94.0.4606.61",
+        "94.0.4606.71",
+        "94.0.4606.81",
+        "94.0.4606.85",
+        "95.0.4638.17",
+        "95.0.4638.50",
+        "95.0.4638.54",
+        "95.0.4638.69",
+        "95.0.4638.74",
+        "96.0.4664.18",
+        "96.0.4664.45",
+        "96.0.4664.55",
+        "96.0.4664.93",
+        "97.0.4692.20",
+    )
+    template = (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        + "AppleWebKit/537.36 (KHTML, like Gecko) "
+        + f"Chrome/{random.choice(chrome_versions)} Safari/537.36"
+    )
+
+    return {"User-Agent": template}
+
+
 class UrlListParser:
     """take a multi line string and detect valid youtube ids"""