mirror of https://github.com/tubearchivist/tubearchivist-frontend.git, synced 2024-11-22 11:50:14 +00:00
refactor PendingList class into subclasses
commit 05b8dbc02f
parent 1498fadf27
@@ -5,36 +5,119 @@ Functionality:
 """
 
 import json
-import os
 from datetime import datetime
 
-import requests
 import yt_dlp
-from home.src.download.subscriptions import ChannelSubscription
-from home.src.es.connect import IndexPaginate
+from home.src.download.subscriptions import (
+    ChannelSubscription,
+    PlaylistSubscription,
+)
+from home.src.download.thumbnails import ThumbManager
+from home.src.es.connect import ElasticWrap, IndexPaginate
 from home.src.index.playlist import YoutubePlaylist
-from home.src.ta.config import AppConfig
-from home.src.ta.helper import DurationConverter, ignore_filelist
+from home.src.ta.helper import DurationConverter
 from home.src.ta.ta_redis import RedisArchivist
 
 
-class PendingList:
-    """manage the pending videos list"""
+class PendingIndex:
+    """base class holding all export methods"""
 
-    CONFIG = AppConfig().config
-    ES_URL = CONFIG["application"]["es_url"]
-    ES_AUTH = CONFIG["application"]["es_auth"]
-    VIDEOS = CONFIG["application"]["videos"]
-
     def __init__(self):
-        self.all_channel_ids = False
-        self.all_downloaded = False
-        self.missing_from_playlists = []
+        self.all_pending = False
+        self.all_ignored = False
+        self.all_videos = False
+        self.all_channels = False
+        self.all_overwrites = False
+        self.to_skip = False
 
-    def parse_url_list(self, youtube_ids):
+    def get_download(self):
+        """get a list of all pending videos in ta_download"""
+        data = {
+            "query": {"match_all": {}},
+            "sort": [{"timestamp": {"order": "asc"}}],
+        }
+        all_results = IndexPaginate("ta_download", data).get_results()
+
+        self.all_pending = []
+        self.all_ignored = []
+        self.to_skip = []
+
+        for result in all_results:
+            self.to_skip.append(result["youtube_id"])
+            if result["status"] == "pending":
+                self.all_pending.append(result)
+            elif result["status"] == "ignore":
+                self.all_ignored.append(result)
+
+    def get_indexed(self):
+        """get a list of all videos indexed"""
+        data = {
+            "query": {"match_all": {}},
+            "sort": [{"published": {"order": "desc"}}],
+        }
+        self.all_videos = IndexPaginate("ta_video", data).get_results()
+        for video in self.all_videos:
+            self.to_skip.append(video["youtube_id"])
+
+    def get_channels(self):
+        """get a list of all channels indexed"""
+        self.all_channels = []
+        self.all_overwrites = {}
+        data = {
+            "query": {"match_all": {}},
+            "sort": [{"channel_id": {"order": "asc"}}],
+        }
+        channels = IndexPaginate("ta_channel", data).get_results()
+
+        for channel in channels:
+            channel_id = channel["channel_id"]
+            self.all_channels.append(channel_id)
+            if channel.get("channel_overwrites"):
+                self.all_overwrites.update(
+                    {channel_id: channel.get("channel_overwrites")}
+                )
+
+
+class PendingInteract:
+    """interact with items in download queue"""
+
+    def __init__(self, video_id=False, status=False):
+        self.video_id = video_id
+        self.status = status
+
+    def delete_item(self):
+        """delete single item from pending"""
+        path = f"ta_download/_doc/{self.video_id}"
+        _, _ = ElasticWrap(path).delete()
+
+    def delete_by_status(self):
+        """delete all matching item by status"""
+        data = {"query": {"term": {"status": {"value": self.status}}}}
+        path = "ta_download/_delete_by_query"
+        _, _ = ElasticWrap(path).post(data=data)
+
+    def update_status(self):
+        """update status field of pending item"""
+        data = {"doc": {"status": self.status}}
+        path = f"ta_download/_update/{self.video_id}"
+        _, _ = ElasticWrap(path).post(data=data)
+
+
+class PendingList(PendingIndex):
+    """manage the pending videos list"""
+
+    def __init__(self, youtube_ids):
+        super().__init__()
+        self.youtube_ids = youtube_ids
+        self.to_skip = False
+        self.missing_videos = False
+
+    def parse_url_list(self):
         """extract youtube ids from list"""
-        missing_videos = []
-        for entry in youtube_ids:
+        self.missing_videos = []
+        self.get_download()
+        self.get_indexed()
+        for entry in self.youtube_ids:
             # notify
             mess_dict = {
                 "status": "message:add",
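The first hunk splits the old monolithic PendingList into PendingIndex, a base class holding the shared Elasticsearch export helpers, and PendingInteract for single-item queue operations. A minimal sketch of how PendingInteract might be driven, assuming a hypothetical video id that already sits in the ta_download index:

    # hypothetical id for illustration; any id present in ta_download works
    item = PendingInteract(video_id="abc123xyz_w", status="ignore")
    item.update_status()  # posts {"doc": {"status": "ignore"}} to ta_download/_update/<video_id>

    # drop every queue item still flagged "ignore" via _delete_by_query
    PendingInteract(status="ignore").delete_by_status()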
@@ -43,97 +126,89 @@ class PendingList:
                 "message": "Extracting lists",
             }
             RedisArchivist().set_message("message:add", mess_dict)
-            # extract
-            url = entry["url"]
-            url_type = entry["type"]
-            if url_type == "video":
-                missing_videos.append(url)
-            elif url_type == "channel":
-                video_results = ChannelSubscription().get_last_youtube_videos(
-                    url, limit=False
-                )
-                youtube_ids = [i[0] for i in video_results]
-                missing_videos = missing_videos + youtube_ids
-            elif url_type == "playlist":
-                self.missing_from_playlists.append(entry)
-                playlist = YoutubePlaylist(url)
-                playlist.build_json()
-                video_results = playlist.json_data.get("playlist_entries")
-                youtube_ids = [i["youtube_id"] for i in video_results]
-                missing_videos = missing_videos + youtube_ids
-
-        return missing_videos
-
-    def add_to_pending(self, missing_videos, ignore=False):
-        """build the bulk json data from pending"""
-        # check if channel is indexed
-        channel_handler = ChannelSubscription()
-        all_indexed = channel_handler.get_channels(subscribed_only=False)
-        self.all_channel_ids = [i["channel_id"] for i in all_indexed]
-        # check if already there
-        self.all_downloaded = self.get_all_downloaded()
-
-        bulk_list, all_videos_added = self.build_bulk(missing_videos, ignore)
+            self._process_entry(entry)
+
+    def _process_entry(self, entry):
+        """process single entry from url list"""
+        if entry["type"] == "video":
+            self._add_video(entry["url"])
+        elif entry["type"] == "channel":
+            self._parse_channel(entry["url"])
+        elif entry["type"] == "playlist":
+            self._parse_playlist(entry["url"])
+            new_thumbs = PlaylistSubscription().process_url_str(
+                [entry], subscribed=False
+            )
+            ThumbManager().download_playlist(new_thumbs)
+        else:
+            raise ValueError(f"invalid url_type: {entry}")
+
+    def _add_video(self, url):
+        """add video to list"""
+        if url not in self.missing_videos and url not in self.to_skip:
+            self.missing_videos.append(url)
+
+    def _parse_channel(self, url):
+        """add all videos of channel to list"""
+        video_results = ChannelSubscription().get_last_youtube_videos(
+            url, limit=False
+        )
+        youtube_ids = [i[0] for i in video_results]
+        for video_id in youtube_ids:
+            self._add_video(video_id)
+
+    def _parse_playlist(self, url):
+        """add all videos of playlist to list"""
+        playlist = YoutubePlaylist(url)
+        playlist.build_json()
+        video_results = playlist.json_data.get("playlist_entries")
+        youtube_ids = [i["youtube_id"] for i in video_results]
+        for video_id in youtube_ids:
+            self._add_video(video_id)
+
+    def add_to_pending(self, status="pending"):
+        """add missing videos to pending list"""
+        self.get_channels()
+        bulk_list = []
+
+        thumb_handler = ThumbManager()
+        for idx, youtube_id in enumerate(self.missing_videos):
+            video_details = self._get_youtube_details(youtube_id)
+            if not video_details:
+                continue
+
+            video_details["status"] = status
+            action = {"create": {"_id": youtube_id, "_index": "ta_download"}}
+            bulk_list.append(json.dumps(action))
+            bulk_list.append(json.dumps(video_details))
+
+            thumb_needed = [(youtube_id, video_details["vid_thumb_url"])]
+            thumb_handler.download_vid(thumb_needed)
+            self._notify_add(idx)
+
         # add last newline
         bulk_list.append("\n")
         query_str = "\n".join(bulk_list)
-        headers = {"Content-type": "application/x-ndjson"}
-        url = self.ES_URL + "/_bulk"
-        request = requests.post(
-            url, data=query_str, headers=headers, auth=self.ES_AUTH
-        )
-        if not request.ok:
-            print(request)
-            raise ValueError("failed to add video to download queue")
-
-        return all_videos_added
-
-    def build_bulk(self, missing_videos, ignore=False):
-        """build the bulk lists"""
-        bulk_list = []
-        all_videos_added = []
-
-        for idx, youtube_id in enumerate(missing_videos):
-            # check if already downloaded
-            if youtube_id in self.all_downloaded:
-                continue
-
-            video = self.get_youtube_details(youtube_id)
-            # skip on download error
-            if not video:
-                continue
-
-            channel_indexed = video["channel_id"] in self.all_channel_ids
-            video["channel_indexed"] = channel_indexed
-            if ignore:
-                video["status"] = "ignore"
-            else:
-                video["status"] = "pending"
-            action = {"create": {"_id": youtube_id, "_index": "ta_download"}}
-            bulk_list.append(json.dumps(action))
-            bulk_list.append(json.dumps(video))
-            all_videos_added.append((youtube_id, video["vid_thumb_url"]))
-            # notify
-            progress = f"{idx + 1}/{len(missing_videos)}"
-            mess_dict = {
-                "status": "message:add",
-                "level": "info",
-                "title": "Adding new videos to download queue.",
-                "message": "Progress: " + progress,
-            }
-            if idx + 1 == len(missing_videos):
-                RedisArchivist().set_message(
-                    "message:add", mess_dict, expire=4
-                )
-            else:
-                RedisArchivist().set_message("message:add", mess_dict)
-            if idx + 1 % 25 == 0:
-                print("adding to queue progress: " + progress)
-
-        return bulk_list, all_videos_added
-
-    @staticmethod
-    def get_youtube_details(youtube_id):
+        _, _ = ElasticWrap("_bulk").post(query_str, ndjson=True)
+
+    def _notify_add(self, idx):
+        """send notification for adding videos to download queue"""
+        progress = f"{idx + 1}/{len(self.missing_videos)}"
+        mess_dict = {
+            "status": "message:add",
+            "level": "info",
+            "title": "Adding new videos to download queue.",
+            "message": "Progress: " + progress,
+        }
+        if idx + 1 == len(self.missing_videos):
+            RedisArchivist().set_message("message:add", mess_dict, expire=4)
+        else:
+            RedisArchivist().set_message("message:add", mess_dict)
+
+        if idx + 1 % 25 == 0:
+            print("adding to queue progress: " + progress)
+
+    def _get_youtube_details(self, youtube_id):
        """get details from youtubedl for single pending video"""
         obs = {
             "default_search": "ytsearch",
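In this hunk, add_to_pending() now posts through ElasticWrap instead of calling requests against ES_URL directly. The payload it assembles follows the Elasticsearch bulk NDJSON convention: one action line, then one document line per video, and a terminating newline. A trimmed sketch of that pair structure, with hypothetical values:

    import json

    action = {"create": {"_id": "abc123xyz_w", "_index": "ta_download"}}
    source = {"youtube_id": "abc123xyz_w", "status": "pending"}  # trimmed document
    bulk_list = [json.dumps(action), json.dumps(source), "\n"]
    query_str = "\n".join(bulk_list)  # newline-terminated NDJSON body

One caveat carried over from the old code: in _notify_add, the check if idx + 1 % 25 == 0 parses as idx + (1 % 25), so a progress print every 25 videos would need (idx + 1) % 25 == 0.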
@@ -151,113 +226,29 @@ class PendingList:
         # stop if video is streaming live now
         if vid["is_live"]:
             return False
-        # parse response
-        seconds = vid["duration"]
-        duration_str = DurationConverter.get_str(seconds)
+        return self._parse_youtube_details(vid)
+
+    def _parse_youtube_details(self, vid):
+        """parse response"""
+        vid_id = vid.get("id")
+        duration_str = DurationConverter.get_str(vid["duration"])
         if duration_str == "NA":
-            print(f"skip extracting duration for: {youtube_id}")
-        upload_date = vid["upload_date"]
-        upload_dt = datetime.strptime(upload_date, "%Y%m%d")
-        published = upload_dt.strftime("%Y-%m-%d")
+            print(f"skip extracting duration for: {vid_id}")
+        published = datetime.strptime(vid["upload_date"], "%Y%m%d").strftime(
+            "%Y-%m-%d"
+        )
 
         # build dict
         youtube_details = {
-            "youtube_id": youtube_id,
+            "youtube_id": vid_id,
             "channel_name": vid["channel"],
             "vid_thumb_url": vid["thumbnail"],
             "title": vid["title"],
             "channel_id": vid["channel_id"],
+            "channel_indexed": vid["channel_id"] in self.all_channels,
             "duration": duration_str,
             "published": published,
             "timestamp": int(datetime.now().strftime("%s")),
         }
         return youtube_details
-
-    @staticmethod
-    def get_all_pending():
-        """get a list of all pending videos in ta_download"""
-        data = {
-            "query": {"match_all": {}},
-            "sort": [{"timestamp": {"order": "asc"}}],
-        }
-        all_results = IndexPaginate("ta_download", data).get_results()
-
-        all_pending = []
-        all_ignore = []
-
-        for result in all_results:
-            if result["status"] == "pending":
-                all_pending.append(result)
-            elif result["status"] == "ignore":
-                all_ignore.append(result)
-
-        return all_pending, all_ignore
-
-    @staticmethod
-    def get_all_indexed():
-        """get a list of all videos indexed"""
-        data = {
-            "query": {"match_all": {}},
-            "sort": [{"published": {"order": "desc"}}],
-        }
-        all_indexed = IndexPaginate("ta_video", data).get_results()
-
-        return all_indexed
-
-    def get_all_downloaded(self):
-        """get a list of all videos in archive"""
-        channel_folders = os.listdir(self.VIDEOS)
-        all_channel_folders = ignore_filelist(channel_folders)
-        all_downloaded = []
-        for channel_folder in all_channel_folders:
-            channel_path = os.path.join(self.VIDEOS, channel_folder)
-            videos = os.listdir(channel_path)
-            all_videos = ignore_filelist(videos)
-            youtube_vids = [i[9:20] for i in all_videos]
-            for youtube_id in youtube_vids:
-                all_downloaded.append(youtube_id)
-        return all_downloaded
-
-    def delete_from_pending(self, youtube_id):
-        """delete the youtube_id from ta_download"""
-        url = f"{self.ES_URL}/ta_download/_doc/{youtube_id}"
-        response = requests.delete(url, auth=self.ES_AUTH)
-        if not response.ok:
-            print(response.text)
-
-    def delete_pending(self, status):
-        """delete download queue based on status value"""
-        data = {"query": {"term": {"status": {"value": status}}}}
-        payload = json.dumps(data)
-        url = self.ES_URL + "/ta_download/_delete_by_query"
-        headers = {"Content-type": "application/json"}
-        response = requests.post(
-            url, data=payload, headers=headers, auth=self.ES_AUTH
-        )
-        if not response.ok:
-            print(response.text)
-
-    def ignore_from_pending(self, ignore_list):
-        """build the bulk query string"""
-        stamp = int(datetime.now().strftime("%s"))
-        bulk_list = []
-
-        for youtube_id in ignore_list:
-            action = {"update": {"_id": youtube_id, "_index": "ta_download"}}
-            source = {"doc": {"status": "ignore", "timestamp": stamp}}
-            bulk_list.append(json.dumps(action))
-            bulk_list.append(json.dumps(source))
-
-        # add last newline
-        bulk_list.append("\n")
-        query_str = "\n".join(bulk_list)
-
-        headers = {"Content-type": "application/x-ndjson"}
-        url = self.ES_URL + "/_bulk"
-        request = requests.post(
-            url, data=query_str, headers=headers, auth=self.ES_AUTH
-        )
-        if not request.ok:
-            print(request)
-            raise ValueError("failed to set video to ignore")
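With the refactor in place, the intended end-to-end flow might look like this sketch, assuming url entries shaped the way _process_entry expects (a "type" plus a "url" key):

    entries = [{"type": "video", "url": "abc123xyz_w"}]  # hypothetical input
    pending = PendingList(youtube_ids=entries)
    pending.parse_url_list()  # dedupes against ta_download and ta_video via to_skip
    pending.add_to_pending(status="pending")  # bulk-creates the queue documents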