refactor and consolidate Reindex class

simon 2022-03-23 15:48:38 +07:00
parent 1f7d6871cf
commit fda520ad44
No known key found for this signature in database
GPG Key ID: 2C15AA5E89985DD4
3 changed files with 98 additions and 149 deletions
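
The thread running through the diff below: every handler stops issuing raw requests calls against Elasticsearch and goes through the ElasticWrap class imported from home.src.es.connect instead. ElasticWrap itself is not part of this commit; judging from the call sites (response, _ = ElasticWrap(path).get(data=data)), it takes an index path relative to the ES base URL and returns the parsed JSON body plus the HTTP status code. A minimal sketch of such a wrapper — URL and credentials are assumed placeholders, not the project's actual values:

    import requests

    class ElasticWrap:
        """sketch only: the real class lives in home/src/es/connect.py
        and reads es_url/es_auth from AppConfig"""

        def __init__(self, path):
            self.url = f"http://localhost:9200/{path}"  # placeholder base URL
            self.auth = ("elastic", "changeme")  # placeholder credentials

        def get(self, data=False):
            """GET with optional JSON body, return (parsed_json, status)"""
            response = requests.get(self.url, json=data or None, auth=self.auth)
            if not response.ok:
                print(response.text)
            return response.json(), response.status_code

        def post(self, data=False):
            """POST a JSON body, return (parsed_json, status)"""
            response = requests.post(self.url, json=data or None, auth=self.auth)
            if not response.ok:
                print(response.text)
            return response.json(), response.status_code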

View File

@@ -10,7 +10,6 @@ import os
 import re
 import shutil
 import subprocess
-from datetime import datetime
 
 import requests
 from home.src.download.queue import PendingList
@@ -319,10 +318,7 @@ def scan_filesystem():
 
 def reindex_old_documents():
     """daily refresh of old documents"""
-    # continue if needed
-    reindex_handler = Reindex()
-    reindex_handler.check_outdated()
-    reindex_handler.reindex()
-    # set timestamp
-    now = int(datetime.now().strftime("%s"))
-    RedisArchivist().set_message("last_reindex", now, expire=False)
+    handler = Reindex()
+    handler.check_outdated()
+    handler.reindex()
+    RedisArchivist().set_message("last_reindex", handler.now, expire=False)
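
A side note on the timestamp the new code stores once as handler.now: the "%s" directive is a platform (glibc) strftime extension rather than a documented Python directive, so it is not portable to Windows. A sketch of the portable equivalent:

    from datetime import datetime

    # "%s" delegates to the C library's strftime; works on Linux (glibc),
    # fails on platforms whose strftime lacks the extension
    epoch = int(datetime.now().strftime("%s"))

    # portable spelling of the same epoch timestamp
    epoch = int(datetime.now().timestamp())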

View File

@@ -4,85 +4,60 @@ functionality:
 - index and update in es
 """
-import json
 from datetime import datetime
 from math import ceil
 from time import sleep
 
-import requests
 from home.src.download.queue import PendingList
 from home.src.download.thumbnails import ThumbManager
+from home.src.es.connect import ElasticWrap
 from home.src.index.channel import YoutubeChannel
 from home.src.index.playlist import YoutubePlaylist
 from home.src.index.video import YoutubeVideo
 from home.src.ta.config import AppConfig
-from home.src.ta.helper import get_total_hits
 
 
 class Reindex:
     """check for outdated documents and refresh data from youtube"""
 
+    MATCH_FIELD = {
+        "ta_video": "active",
+        "ta_channel": "channel_active",
+        "ta_playlist": "playlist_active",
+    }
+    MULTIPLY = 1.2
+
     def __init__(self):
         # config
-        config = AppConfig().config
-        self.sleep_interval = config["downloads"]["sleep_interval"]
-        self.es_url = config["application"]["es_url"]
-        self.es_auth = config["application"]["es_auth"]
-        self.refresh_interval = config["scheduler"]["check_reindex_days"]
-        self.integrate_ryd = config["downloads"]["integrate_ryd"]
+        self.now = int(datetime.now().strftime("%s"))
+        self.config = AppConfig().config
+        self.interval = self.config["scheduler"]["check_reindex_days"]
         # scan
         self.all_youtube_ids = False
         self.all_channel_ids = False
         self.all_playlist_ids = False
 
-    def get_daily(self):
+    def _get_daily(self):
         """get daily refresh values"""
-        total_videos = get_total_hits(
-            "ta_video", self.es_url, self.es_auth, "active"
-        )
-        video_daily = ceil(total_videos / self.refresh_interval * 1.2)
-        total_channels = get_total_hits(
-            "ta_channel", self.es_url, self.es_auth, "channel_active"
-        )
-        channel_daily = ceil(total_channels / self.refresh_interval * 1.2)
-        total_playlists = get_total_hits(
-            "ta_playlist", self.es_url, self.es_auth, "playlist_active"
-        )
-        playlist_daily = ceil(total_playlists / self.refresh_interval * 1.2)
+        total_videos = self._get_total_hits("ta_video")
+        video_daily = ceil(total_videos / self.interval * self.MULTIPLY)
+        total_channels = self._get_total_hits("ta_channel")
+        channel_daily = ceil(total_channels / self.interval * self.MULTIPLY)
+        total_playlists = self._get_total_hits("ta_playlist")
+        playlist_daily = ceil(total_playlists / self.interval * self.MULTIPLY)
         return (video_daily, channel_daily, playlist_daily)
 
-    def get_outdated_vids(self, size):
-        """get daily videos to refresh"""
-        headers = {"Content-type": "application/json"}
-        now = int(datetime.now().strftime("%s"))
-        now_lte = now - self.refresh_interval * 24 * 60 * 60
-        data = {
-            "size": size,
-            "query": {
-                "bool": {
-                    "must": [
-                        {"match": {"active": True}},
-                        {"range": {"vid_last_refresh": {"lte": now_lte}}},
-                    ]
-                }
-            },
-            "sort": [{"vid_last_refresh": {"order": "asc"}}],
-            "_source": False,
-        }
-        query_str = json.dumps(data)
-        url = self.es_url + "/ta_video/_search"
-        response = requests.get(
-            url, data=query_str, headers=headers, auth=self.es_auth
-        )
-        if not response.ok:
-            print(response.text)
-        response_dict = json.loads(response.text)
-        all_youtube_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
-        return all_youtube_ids
+    def _get_total_hits(self, index):
+        """get total hits from index"""
+        match_field = self.MATCH_FIELD[index]
+        path = f"{index}/_search?filter_path=hits.total"
+        data = {"query": {"match": {match_field: True}}}
+        response, _ = ElasticWrap(path).post(data=data)
+        total_hits = response["hits"]["total"]["value"]
+        return total_hits
 
-    def get_unrated_vids(self):
-        """get all videos without rating if ryd integration is enabled"""
-        headers = {"Content-type": "application/json"}
+    def _get_unrated_vids(self):
+        """get max 200 videos without rating if ryd integration is enabled"""
         data = {
             "size": 200,
             "query": {
@@ -91,86 +66,78 @@ class Reindex:
                 }
             },
         }
-        query_str = json.dumps(data)
-        url = self.es_url + "/ta_video/_search"
-        response = requests.get(
-            url, data=query_str, headers=headers, auth=self.es_auth
-        )
-        if not response.ok:
-            print(response.text)
-        response_dict = json.loads(response.text)
-        missing_rating = [i["_id"] for i in response_dict["hits"]["hits"]]
+        response, _ = ElasticWrap("ta_video/_search").get(data=data)
+
+        missing_rating = [i["_id"] for i in response["hits"]["hits"]]
         self.all_youtube_ids = self.all_youtube_ids + missing_rating
 
-    def get_outdated_channels(self, size):
-        """get daily channels to refresh"""
-        headers = {"Content-type": "application/json"}
-        now = int(datetime.now().strftime("%s"))
-        now_lte = now - self.refresh_interval * 24 * 60 * 60
+    def _get_outdated_vids(self, size):
+        """get daily videos to refresh"""
+        now_lte = self.now - self.interval * 24 * 60 * 60
+        must_list = [
+            {"match": {"active": True}},
+            {"range": {"vid_last_refresh": {"lte": now_lte}}},
+        ]
         data = {
             "size": size,
-            "query": {
-                "bool": {
-                    "must": [
-                        {"match": {"channel_active": True}},
-                        {"range": {"channel_last_refresh": {"lte": now_lte}}},
-                    ]
-                }
-            },
+            "query": {"bool": {"must": must_list}},
+            "sort": [{"vid_last_refresh": {"order": "asc"}}],
+            "_source": False,
+        }
+        response, _ = ElasticWrap("ta_video/_search").get(data=data)
+
+        all_youtube_ids = [i["_id"] for i in response["hits"]["hits"]]
+        return all_youtube_ids
+
+    def _get_outdated_channels(self, size):
+        """get daily channels to refresh"""
+        now_lte = self.now - self.interval * 24 * 60 * 60
+        must_list = [
+            {"match": {"channel_active": True}},
+            {"range": {"channel_last_refresh": {"lte": now_lte}}},
+        ]
+        data = {
+            "size": size,
+            "query": {"bool": {"must": must_list}},
             "sort": [{"channel_last_refresh": {"order": "asc"}}],
             "_source": False,
         }
-        query_str = json.dumps(data)
-        url = self.es_url + "/ta_channel/_search"
-        response = requests.get(
-            url, data=query_str, headers=headers, auth=self.es_auth
-        )
-        if not response.ok:
-            print(response.text)
-        response_dict = json.loads(response.text)
-        all_channel_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
+        response, _ = ElasticWrap("ta_channel/_search").get(data=data)
+
+        all_channel_ids = [i["_id"] for i in response["hits"]["hits"]]
         return all_channel_ids
 
-    def get_outdated_playlists(self, size):
+    def _get_outdated_playlists(self, size):
         """get daily outdated playlists to refresh"""
-        headers = {"Content-type": "application/json"}
-        now = int(datetime.now().strftime("%s"))
-        now_lte = now - self.refresh_interval * 24 * 60 * 60
+        now_lte = self.now - self.interval * 24 * 60 * 60
+        must_list = [
+            {"match": {"playlist_active": True}},
+            {"range": {"playlist_last_refresh": {"lte": now_lte}}},
+        ]
         data = {
             "size": size,
-            "query": {
-                "bool": {
-                    "must": [
-                        {"match": {"playlist_active": True}},
-                        {"range": {"playlist_last_refresh": {"lte": now_lte}}},
-                    ]
-                }
-            },
+            "query": {"bool": {"must": must_list}},
             "sort": [{"playlist_last_refresh": {"order": "asc"}}],
             "_source": False,
         }
-        query_str = json.dumps(data)
-        url = self.es_url + "/ta_playlist/_search"
-        response = requests.get(
-            url, data=query_str, headers=headers, auth=self.es_auth
-        )
-        if not response.ok:
-            print(response.text)
-        response_dict = json.loads(response.text)
-        all_playlist_ids = [i["_id"] for i in response_dict["hits"]["hits"]]
+        response, _ = ElasticWrap("ta_playlist/_search").get(data=data)
+
+        all_playlist_ids = [i["_id"] for i in response["hits"]["hits"]]
         return all_playlist_ids
 
     def check_outdated(self):
         """add missing vids and channels"""
-        video_daily, channel_daily, playlist_daily = self.get_daily()
-        self.all_youtube_ids = self.get_outdated_vids(video_daily)
-        self.all_channel_ids = self.get_outdated_channels(channel_daily)
-        self.all_playlist_ids = self.get_outdated_playlists(playlist_daily)
-        if self.integrate_ryd:
-            self.get_unrated_vids()
+        video_daily, channel_daily, playlist_daily = self._get_daily()
+        self.all_youtube_ids = self._get_outdated_vids(video_daily)
+        self.all_channel_ids = self._get_outdated_channels(channel_daily)
+        self.all_playlist_ids = self._get_outdated_playlists(playlist_daily)
+
+        integrate_ryd = self.config["downloads"]["integrate_ryd"]
+        if integrate_ryd:
+            self._get_unrated_vids()
 
     @staticmethod
-    def reindex_single_video(youtube_id):
+    def _reindex_single_video(youtube_id):
         """refresh data for single video"""
         video = YoutubeVideo(youtube_id)
@@ -204,20 +171,21 @@ class Reindex:
             return
 
     @staticmethod
-    def reindex_single_channel(channel_id):
+    def _reindex_single_channel(channel_id):
         """refresh channel data and sync to videos"""
         channel = YoutubeChannel(channel_id)
         channel.get_from_es()
         subscribed = channel.json_data["channel_subscribed"]
-        overwrites = channel.json_data["channel_overwrites"]
+        overwrites = channel.json_data.get("channel_overwrites", False)
         channel.get_from_youtube()
         channel.json_data["channel_subscribed"] = subscribed
-        channel.json_data["channel_overwrites"] = overwrites
+        if overwrites:
+            channel.json_data["channel_overwrites"] = overwrites
         channel.upload_to_es()
         channel.sync_to_videos()
 
     @staticmethod
-    def reindex_single_playlist(playlist_id, all_indexed_ids):
+    def _reindex_single_playlist(playlist_id, all_indexed_ids):
         """refresh playlist data"""
         playlist = YoutubePlaylist(playlist_id)
         playlist.get_from_es()
@@ -234,18 +202,19 @@ class Reindex:
 
     def reindex(self):
         """reindex what's needed"""
+        sleep_interval = self.config["downloads"]["sleep_interval"]
         # videos
         print(f"reindexing {len(self.all_youtube_ids)} videos")
         for youtube_id in self.all_youtube_ids:
-            self.reindex_single_video(youtube_id)
-            if self.sleep_interval:
-                sleep(self.sleep_interval)
+            self._reindex_single_video(youtube_id)
+            if sleep_interval:
+                sleep(sleep_interval)
         # channels
         print(f"reindexing {len(self.all_channel_ids)} channels")
         for channel_id in self.all_channel_ids:
-            self.reindex_single_channel(channel_id)
-            if self.sleep_interval:
-                sleep(self.sleep_interval)
+            self._reindex_single_channel(channel_id)
+            if sleep_interval:
+                sleep(sleep_interval)
         # playlist
         print(f"reindexing {len(self.all_playlist_ids)} playlists")
         if self.all_playlist_ids:
@@ -253,6 +222,6 @@ class Reindex:
             handler.get_indexed()
             all_indexed_ids = [i["youtube_id"] for i in handler.all_videos]
             for playlist_id in self.all_playlist_ids:
-                self.reindex_single_playlist(playlist_id, all_indexed_ids)
-                if self.sleep_interval:
-                    sleep(self.sleep_interval)
+                self._reindex_single_playlist(playlist_id, all_indexed_ids)
+                if sleep_interval:
+                    sleep(sleep_interval)
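
The MULTIPLY = 1.2 constant that replaces the repeated inline * 1.2 sets each daily quota 20% above an even split of the index, so a backlog of outdated documents drains over time instead of holding steady. A worked example with hypothetical numbers:

    from math import ceil

    total_videos = 9000  # hypothetical ta_video index size
    interval = 90        # the scheduler's check_reindex_days
    MULTIPLY = 1.2       # 20% headroom over the even split

    # an even split would refresh 100/day; headroom bumps it to 120/day
    video_daily = ceil(total_videos / interval * MULTIPLY)
    assert video_daily == 120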

View File

@@ -3,31 +3,15 @@ Loose collection of helper functions
 - don't import AppConfig class here to avoid circular imports
 """
-import json
 import re
 import string
 import subprocess
 import unicodedata
 from urllib.parse import parse_qs, urlparse
 
-import requests
 import yt_dlp
 
 
-def get_total_hits(index, es_url, es_auth, match_field):
-    """get total hits from index"""
-    headers = {"Content-type": "application/json"}
-    data = {"query": {"match": {match_field: True}}}
-    payload = json.dumps(data)
-    url = f"{es_url}/{index}/_search?filter_path=hits.total"
-    request = requests.post(url, data=payload, headers=headers, auth=es_auth)
-    if not request.ok:
-        print(request.text)
-    total_json = json.loads(request.text)
-    total_hits = total_json["hits"]["total"]["value"]
-    return total_hits
-
-
 def clean_string(file_name):
     """clean string to only asci characters"""
     whitelist = "-_.() " + string.ascii_letters + string.digits