tubearchivist/tubearchivist/home/src/download/queue.py

"""
Functionality:
- handle download queue
- linked with ta_download index
"""

import json
from datetime import datetime

from home.src.download.subscriptions import (
    ChannelSubscription,
    PlaylistSubscription,
)
from home.src.download.thumbnails import ThumbManager
from home.src.download.yt_dlp_base import YtWrap
from home.src.es.connect import ElasticWrap, IndexPaginate
from home.src.index.playlist import YoutubePlaylist
from home.src.ta.config import AppConfig
from home.src.ta.helper import DurationConverter
from home.src.ta.ta_redis import RedisArchivist


class PendingIndex:
    """base class holding all export methods

    Every attribute starts out as False, meaning "not loaded yet", and is
    replaced with a list or dict by the matching get_* method.
    """

    def __init__(self):
        self.all_pending = False
        self.all_ignored = False
        self.all_videos = False
        self.all_channels = False
        self.channel_overwrites = False
        self.video_overwrites = False
        self.to_skip = False

    def get_download(self):
        """get a list of all pending videos in ta_download

        Resets all_pending, all_ignored and to_skip. Every queue item is
        added to to_skip regardless of its status, only items with status
        "pending" or "ignore" are kept in their respective list.
        """
        data = {
            "query": {"match_all": {}},
            "sort": [{"timestamp": {"order": "asc"}}],
        }
        all_results = IndexPaginate("ta_download", data).get_results()

        self.all_pending = []
        self.all_ignored = []
        self.to_skip = []

        for result in all_results:
            self.to_skip.append(result["youtube_id"])
            if result["status"] == "pending":
                self.all_pending.append(result)
            elif result["status"] == "ignore":
                self.all_ignored.append(result)

    def get_indexed(self):
        """get a list of all videos indexed

        Stores the ta_video results in all_videos and adds their ids to
        to_skip. Ids collected by an earlier get_download call are kept.
        """
        data = {
            "query": {"match_all": {}},
            "sort": [{"published": {"order": "desc"}}],
        }
        self.all_videos = IndexPaginate("ta_video", data).get_results()

        if not isinstance(self.to_skip, list):
            # get_download has not run yet, to_skip is still the False
            # sentinel from __init__: start with an empty skip list
            self.to_skip = []

        self.to_skip.extend(video["youtube_id"] for video in self.all_videos)

    def get_channels(self):
        """get a list of all channels indexed

        Fills all_channels with every channel id and channel_overwrites
        with the channels that carry a non empty channel_overwrites value,
        then refreshes video_overwrites for the pending videos.
        """
        self.all_channels = []
        self.channel_overwrites = {}
        data = {
            "query": {"match_all": {}},
            "sort": [{"channel_id": {"order": "asc"}}],
        }
        channels = IndexPaginate("ta_channel", data).get_results()

        for channel in channels:
            channel_id = channel["channel_id"]
            self.all_channels.append(channel_id)
            overwrites = channel.get("channel_overwrites")
            if overwrites:
                self.channel_overwrites[channel_id] = overwrites

        self._map_overwrites()

    def _map_overwrites(self):
        """map video ids to channel ids overwrites

        Rebuilds video_overwrites from all_pending and channel_overwrites.
        Either attribute may still be the False sentinel from __init__, in
        that case it is treated as empty instead of raising.
        """
        self.video_overwrites = {}
        channel_overwrites = self.channel_overwrites or {}
        for video in self.all_pending or []:
            video_id = video["youtube_id"]
            channel_id = video["channel_id"]
            overwrites = channel_overwrites.get(channel_id, False)
            if overwrites:
                self.video_overwrites[video_id] = overwrites


class PendingInteract:
    """interact with items in download queue

    video_id selects a single document of the ta_download index, status is
    the value to match or to write, depending on the method called.
    """

    def __init__(self, video_id=False, status=False):
        self.status = status
        self.video_id = video_id

    def delete_item(self):
        """remove the queue document of self.video_id, with refresh"""
        _, _ = ElasticWrap(f"ta_download/_doc/{self.video_id}").delete(
            refresh=True
        )

    def delete_by_status(self):
        """remove every queue document whose status equals self.status"""
        status_term = {"term": {"status": {"value": self.status}}}
        _, _ = ElasticWrap("ta_download/_delete_by_query").post(
            data={"query": status_term}
        )

    def update_status(self):
        """write self.status to the status field of self.video_id"""
        endpoint = f"ta_download/_update/{self.video_id}"
        payload = {"doc": {"status": self.status}}
        _, _ = ElasticWrap(endpoint).post(data=payload)


class PendingList(PendingIndex):
    """manage the pending videos list

    parse_url_list() resolves self.youtube_ids into self.missing_videos,
    add_to_pending() then looks up every missing video and bulk creates
    the documents in the ta_download index.
    """

    # options handed to YtWrap for every single video lookup
    yt_obs = {
        "default_search": "ytsearch",
        "quiet": True,
        "check_formats": "selected",
        "noplaylist": True,
        "writethumbnail": True,
        "simulate": True,
        "socket_timeout": 3,
    }

    def __init__(self, youtube_ids=False):
        super().__init__()
        self.config = AppConfig().config
        self.youtube_ids = youtube_ids
        self.to_skip = False
        self.missing_videos = False

    def parse_url_list(self):
        """extract youtube ids from list

        Expects self.youtube_ids to be an iterable of dicts with the keys
        "type" and "url". Loads the queue and the indexed videos first so
        already known ids can be skipped, the result is collected in
        self.missing_videos.
        """
        self.missing_videos = []
        self.get_download()
        self.get_indexed()
        for entry in self.youtube_ids:
            # notify
            mess_dict = {
                "status": "message:add",
                "level": "info",
                "title": "Adding to download queue.",
                "message": "Extracting lists",
            }
            RedisArchivist().set_message("message:add", mess_dict)
            self._process_entry(entry)

    def _process_entry(self, entry):
        """process single entry from url list

        Dispatches on entry["type"]: "video", "channel" or "playlist".
        A playlist entry is additionally passed to PlaylistSubscription
        with subscribed=False and the returned thumbnails get downloaded.

        Raises ValueError for any other type.
        """
        if entry["type"] == "video":
            self._add_video(entry["url"])
        elif entry["type"] == "channel":
            self._parse_channel(entry["url"])
        elif entry["type"] == "playlist":
            self._parse_playlist(entry["url"])
            new_thumbs = PlaylistSubscription().process_url_str(
                [entry], subscribed=False
            )
            ThumbManager().download_playlist(new_thumbs)
        else:
            raise ValueError(f"invalid url_type: {entry}")

    def _add_video(self, url):
        """add video to list

        The id is only added if it is neither collected already nor part
        of self.to_skip, otherwise a message is printed.
        """
        if url not in self.missing_videos and url not in self.to_skip:
            self.missing_videos.append(url)
        else:
            print(f"{url}: skipped adding already indexed video to download.")

    def _parse_channel(self, url):
        """add all videos of channel to list"""
        video_results = ChannelSubscription().get_last_youtube_videos(
            url, limit=False
        )
        # first item of every result is used as the video id
        for video_result in video_results:
            self._add_video(video_result[0])

    def _parse_playlist(self, url):
        """add all videos of playlist to list"""
        playlist = YoutubePlaylist(url)
        playlist.build_json()
        video_results = playlist.json_data.get("playlist_entries")
        for video_result in video_results:
            self._add_video(video_result["youtube_id"])

    def add_to_pending(self, status="pending"):
        """add missing videos to pending list

        Looks up the details of every id in self.missing_videos, downloads
        its thumbnail and creates the documents in ta_download with a
        single bulk request. status is stored on every new document.
        """
        self.get_channels()
        bulk_list = []

        thumb_handler = ThumbManager()
        for idx, youtube_id in enumerate(self.missing_videos):
            video_details = self.get_youtube_details(youtube_id)
            if not video_details:
                # NOTE(review): skipped videos never reach _notify_add, if
                # the last id is skipped the message with expire=4 is not
                # sent - confirm if that is intended
                continue

            video_details["status"] = status
            action = {"create": {"_id": youtube_id, "_index": "ta_download"}}
            bulk_list.append(json.dumps(action))
            bulk_list.append(json.dumps(video_details))

            thumb_needed = [(youtube_id, video_details["vid_thumb_url"])]
            thumb_handler.download_vid(thumb_needed)
            self._notify_add(idx)

        if bulk_list:
            # add last newline
            bulk_list.append("\n")
            query_str = "\n".join(bulk_list)
            _, _ = ElasticWrap("_bulk").post(query_str, ndjson=True)

    def _notify_add(self, idx):
        """send notification for adding videos to download queue

        idx is the zero based position of the current video in
        self.missing_videos. The message for the last video is stored with
        expire=4, every 25th video is additionally printed to stdout.
        """
        position = idx + 1
        total = len(self.missing_videos)
        progress = f"{position}/{total}"
        mess_dict = {
            "status": "message:add",
            "level": "info",
            "title": "Adding new videos to download queue.",
            "message": "Progress: " + progress,
        }
        if position == total:
            RedisArchivist().set_message("message:add", mess_dict, expire=4)
        else:
            RedisArchivist().set_message("message:add", mess_dict)

        # "idx + 1 % 25" parses as "idx + (1 % 25)" and never equals zero,
        # take the modulo of the one based position instead
        if position % 25 == 0:
            print("adding to queue progress: " + progress)

    def get_youtube_details(self, youtube_id):
        """get details from youtubedl for single pending video

        Returns the parsed details dict, or False if the extraction
        returned nothing, the returned id differs from youtube_id or the
        video is flagged as live.
        """
        vid = YtWrap(self.yt_obs, self.config).extract(youtube_id)
        if not vid:
            return False

        if vid.get("id") != youtube_id:
            # skip premium videos with different id
            print(f"{youtube_id}: skipping premium video, id not matching")
            return False
        # stop if video is streaming live now, a missing key counts as
        # not live instead of raising KeyError
        if vid.get("is_live"):
            return False

        return self._parse_youtube_details(vid)

    def _parse_youtube_details(self, vid):
        """parse response

        Builds the ta_download document from the extracted video dict.
        Requires self.all_channels to be loaded, see get_channels.
        """
        vid_id = vid.get("id")
        duration_str = DurationConverter.get_str(vid["duration"])
        if duration_str == "NA":
            print(f"skip extracting duration for: {vid_id}")
        # upload_date is parsed as YYYYMMDD and stored as YYYY-MM-DD
        published = datetime.strptime(vid["upload_date"], "%Y%m%d").strftime(
            "%Y-%m-%d"
        )

        # build dict
        youtube_details = {
            "youtube_id": vid_id,
            "channel_name": vid["channel"],
            "vid_thumb_url": vid["thumbnail"],
            "title": vid["title"],
            "channel_id": vid["channel_id"],
            "channel_indexed": vid["channel_id"] in self.all_channels,
            "duration": duration_str,
            "published": published,
            # epoch seconds, timestamp() instead of the platform specific
            # "%s" strftime code
            "timestamp": int(datetime.now().timestamp()),
        }
        return youtube_details
update doc strings to represent new module structure 2022-01-23 12:32:08 +00:00			`"""`
			`Functionality:`
			`- handle download queue`
			`- linked with ta_dowload index`
			`"""`
major refactor, split up modules 2022-01-22 15:13:37 +00:00
			`import json`
			`from datetime import datetime`

refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`from home.src.download.subscriptions import (`
			`ChannelSubscription,`
			`PlaylistSubscription,`
			`)`
			`from home.src.download.thumbnails import ThumbManager`
integrate new YtWrap class in download queue and subscriptions classes 2022-05-24 03:36:39 +00:00			`from home.src.download.yt_dlp_base import YtWrap`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`from home.src.es.connect import ElasticWrap, IndexPaginate`
major refactor, split up modules 2022-01-22 15:13:37 +00:00			`from home.src.index.playlist import YoutubePlaylist`
add cookiefile to PendingList and VideoDownloader 2022-04-30 10:27:57 +00:00			`from home.src.ta.config import AppConfig`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`from home.src.ta.helper import DurationConverter`
major refactor, split up modules 2022-01-22 15:13:37 +00:00			`from home.src.ta.ta_redis import RedisArchivist`


refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`class PendingIndex:`
			`"""base class holding all export methods"""`
major refactor, split up modules 2022-01-22 15:13:37 +00:00
			`def __init__(self):`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`self.all_pending = False`
			`self.all_ignored = False`
			`self.all_videos = False`
			`self.all_channels = False`
rename channel_overwrites attribute 2022-03-19 08:03:38 +00:00			`self.channel_overwrites = False`
map channel overwrite to video id for later efficient mapping 2022-03-18 14:39:33 +00:00			`self.video_overwrites = False`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`self.to_skip = False`

			`def get_download(self):`
			`"""get a list of all pending videos in ta_download"""`
			`data = {`
			`"query": {"match_all": {}},`
			`"sort": [{"timestamp": {"order": "asc"}}],`
			`}`
			`all_results = IndexPaginate("ta_download", data).get_results()`

			`self.all_pending = []`
			`self.all_ignored = []`
			`self.to_skip = []`

			`for result in all_results:`
			`self.to_skip.append(result["youtube_id"])`
			`if result["status"] == "pending":`
			`self.all_pending.append(result)`
			`elif result["status"] == "ignore":`
			`self.all_ignored.append(result)`

			`def get_indexed(self):`
			`"""get a list of all videos indexed"""`
			`data = {`
			`"query": {"match_all": {}},`
			`"sort": [{"published": {"order": "desc"}}],`
			`}`
			`self.all_videos = IndexPaginate("ta_video", data).get_results()`
			`for video in self.all_videos:`
			`self.to_skip.append(video["youtube_id"])`

			`def get_channels(self):`
			`"""get a list of all channels indexed"""`
			`self.all_channels = []`
rename channel_overwrites attribute 2022-03-19 08:03:38 +00:00			`self.channel_overwrites = {}`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`data = {`
			`"query": {"match_all": {}},`
			`"sort": [{"channel_id": {"order": "asc"}}],`
			`}`
			`channels = IndexPaginate("ta_channel", data).get_results()`

			`for channel in channels:`
			`channel_id = channel["channel_id"]`
			`self.all_channels.append(channel_id)`
			`if channel.get("channel_overwrites"):`
rename channel_overwrites attribute 2022-03-19 08:03:38 +00:00			`self.channel_overwrites.update(`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`{channel_id: channel.get("channel_overwrites")}`
			`)`
major refactor, split up modules 2022-01-22 15:13:37 +00:00
map channel overwrite to video id for later efficient mapping 2022-03-18 14:39:33 +00:00			`self._map_overwrites()`

			`def _map_overwrites(self):`
			`"""map video ids to channel ids overwrites"""`
			`self.video_overwrites = {}`
			`for video in self.all_pending:`
			`video_id = video["youtube_id"]`
			`channel_id = video["channel_id"]`
rename channel_overwrites attribute 2022-03-19 08:03:38 +00:00			`overwrites = self.channel_overwrites.get(channel_id, False)`
map channel overwrite to video id for later efficient mapping 2022-03-18 14:39:33 +00:00			`if overwrites:`
			`self.video_overwrites.update({video_id: overwrites})`

refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00
			`class PendingInteract:`
			`"""interact with items in download queue"""`

			`def __init__(self, video_id=False, status=False):`
			`self.video_id = video_id`
			`self.status = status`

			`def delete_item(self):`
			`"""delete single item from pending"""`
			`path = f"ta_download/_doc/{self.video_id}"`
refresh for PendingInteract delete to avoid race condition, #217 2022-05-02 11:20:56 +00:00			`_, _ = ElasticWrap(path).delete(refresh=True)`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00
			`def delete_by_status(self):`
			`"""delete all matching item by status"""`
			`data = {"query": {"term": {"status": {"value": self.status}}}}`
			`path = "ta_download/_delete_by_query"`
			`_, _ = ElasticWrap(path).post(data=data)`

			`def update_status(self):`
			`"""update status field of pending item"""`
			`data = {"doc": {"status": self.status}}`
			`path = f"ta_download/_update/{self.video_id}"`
			`_, _ = ElasticWrap(path).post(data=data)`


			`class PendingList(PendingIndex):`
			`"""manage the pending videos list"""`

add cookiefile to PendingList and VideoDownloader 2022-04-30 10:27:57 +00:00			`yt_obs = {`
			`"default_search": "ytsearch",`
			`"quiet": True,`
			`"check_formats": "selected",`
			`"noplaylist": True,`
			`"writethumbnail": True,`
			`"simulate": True,`
add socket_timeout when passing check_formats to yt-dlp 2022-05-02 02:48:52 +00:00			`"socket_timeout": 3,`
add cookiefile to PendingList and VideoDownloader 2022-04-30 10:27:57 +00:00			`}`

use the refactored PendingList class 2022-03-18 11:27:25 +00:00			`def __init__(self, youtube_ids=False):`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`super().__init__()`
refactor use cookie io_stream 2022-05-24 08:51:58 +00:00			`self.config = AppConfig().config`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`self.youtube_ids = youtube_ids`
			`self.to_skip = False`
			`self.missing_videos = False`

			`def parse_url_list(self):`
major refactor, split up modules 2022-01-22 15:13:37 +00:00			`"""extract youtube ids from list"""`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`self.missing_videos = []`
			`self.get_download()`
			`self.get_indexed()`
			`for entry in self.youtube_ids:`
major refactor, split up modules 2022-01-22 15:13:37 +00:00			`# notify`
			`mess_dict = {`
			`"status": "message:add",`
			`"level": "info",`
			`"title": "Adding to download queue.",`
			`"message": "Extracting lists",`
			`}`
			`RedisArchivist().set_message("message:add", mess_dict)`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`self._process_entry(entry)`

			`def _process_entry(self, entry):`
			`"""process single entry from url list"""`
			`if entry["type"] == "video":`
			`self._add_video(entry["url"])`
			`elif entry["type"] == "channel":`
			`self._parse_channel(entry["url"])`
			`elif entry["type"] == "playlist":`
			`self._parse_playlist(entry["url"])`
			`new_thumbs = PlaylistSubscription().process_url_str(`
			`[entry], subscribed=False`
			`)`
			`ThumbManager().download_playlist(new_thumbs)`
			`else:`
			`raise ValueError(f"invalid url_type: {entry}")`

			`def _add_video(self, url):`
			`"""add video to list"""`
			`if url not in self.missing_videos and url not in self.to_skip:`
			`self.missing_videos.append(url)`
log message when skipping video 2022-05-28 04:11:09 +00:00			`else:`
			`print(f"{url}: skipped adding already indexed video to download.")`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00
			`def _parse_channel(self, url):`
			`"""add all videos of channel to list"""`
			`video_results = ChannelSubscription().get_last_youtube_videos(`
			`url, limit=False`
major refactor, split up modules 2022-01-22 15:13:37 +00:00			`)`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`youtube_ids = [i[0] for i in video_results]`
			`for video_id in youtube_ids:`
			`self._add_video(video_id)`

			`def _parse_playlist(self, url):`
			`"""add all videos of playlist to list"""`
			`playlist = YoutubePlaylist(url)`
			`playlist.build_json()`
			`video_results = playlist.json_data.get("playlist_entries")`
			`youtube_ids = [i["youtube_id"] for i in video_results]`
			`for video_id in youtube_ids:`
			`self._add_video(video_id)`

			`def add_to_pending(self, status="pending"):`
			`"""add missing videos to pending list"""`
			`self.get_channels()`
major refactor, split up modules 2022-01-22 15:13:37 +00:00			`bulk_list = []`

refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`thumb_handler = ThumbManager()`
			`for idx, youtube_id in enumerate(self.missing_videos):`
use the refactored PendingList class 2022-03-18 11:27:25 +00:00			`video_details = self.get_youtube_details(youtube_id)`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`if not video_details:`
major refactor, split up modules 2022-01-22 15:13:37 +00:00			`continue`

refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`video_details["status"] = status`
major refactor, split up modules 2022-01-22 15:13:37 +00:00			`action = {"create": {"_id": youtube_id, "_index": "ta_download"}}`
			`bulk_list.append(json.dumps(action))`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`bulk_list.append(json.dumps(video_details))`

			`thumb_needed = [(youtube_id, video_details["vid_thumb_url"])]`
			`thumb_handler.download_vid(thumb_needed)`
			`self._notify_add(idx)`

skip premium videos, clean exit for empty bulk_list, #237 2022-05-05 16:38:10 +00:00			`if bulk_list:`
			`# add last newline`
			`bulk_list.append("\n")`
			`query_str = "\n".join(bulk_list)`
			`_, _ = ElasticWrap("_bulk").post(query_str, ndjson=True)`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00
			`def _notify_add(self, idx):`
			`"""send notification for adding videos to download queue"""`
			`progress = f"{idx + 1}/{len(self.missing_videos)}"`
			`mess_dict = {`
			`"status": "message:add",`
			`"level": "info",`
			`"title": "Adding new videos to download queue.",`
			`"message": "Progress: " + progress,`
			`}`
			`if idx + 1 == len(self.missing_videos):`
			`RedisArchivist().set_message("message:add", mess_dict, expire=4)`
			`else:`
			`RedisArchivist().set_message("message:add", mess_dict)`
major refactor, split up modules 2022-01-22 15:13:37 +00:00
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`if idx + 1 % 25 == 0:`
			`print("adding to queue progress: " + progress)`
major refactor, split up modules 2022-01-22 15:13:37 +00:00
use the refactored PendingList class 2022-03-18 11:27:25 +00:00			`def get_youtube_details(self, youtube_id):`
major refactor, split up modules 2022-01-22 15:13:37 +00:00			`"""get details from youtubedl for single pending video"""`
refactor use cookie io_stream 2022-05-24 08:51:58 +00:00			`vid = YtWrap(self.yt_obs, self.config).extract(youtube_id)`
integrate new YtWrap class in download queue and subscriptions classes 2022-05-24 03:36:39 +00:00			`if not vid:`
skip premium videos, clean exit for empty bulk_list, #237 2022-05-05 16:38:10 +00:00			`return False`
integrate new YtWrap class in download queue and subscriptions classes 2022-05-24 03:36:39 +00:00
skip premium videos, clean exit for empty bulk_list, #237 2022-05-05 16:38:10 +00:00			`if vid.get("id") != youtube_id:`
			`# skip premium videos with different id`
			`print(f"{youtube_id}: skipping premium video, id not matching")`
major refactor, split up modules 2022-01-22 15:13:37 +00:00			`return False`
			`# stop if video is streaming live now`
			`if vid["is_live"]:`
			`return False`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00
			`return self._parse_youtube_details(vid)`

			`def _parse_youtube_details(self, vid):`
			`"""parse response"""`
			`vid_id = vid.get("id")`
			`duration_str = DurationConverter.get_str(vid["duration"])`
major refactor, split up modules 2022-01-22 15:13:37 +00:00			`if duration_str == "NA":`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`print(f"skip extracting duration for: {vid_id}")`
			`published = datetime.strptime(vid["upload_date"], "%Y%m%d").strftime(`
			`"%Y-%m-%d"`
			`)`

major refactor, split up modules 2022-01-22 15:13:37 +00:00			`# build dict`
			`youtube_details = {`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`"youtube_id": vid_id,`
major refactor, split up modules 2022-01-22 15:13:37 +00:00			`"channel_name": vid["channel"],`
			`"vid_thumb_url": vid["thumbnail"],`
			`"title": vid["title"],`
			`"channel_id": vid["channel_id"],`
refacor PendingList class into subclasses 2022-03-18 10:19:21 +00:00			`"channel_indexed": vid["channel_id"] in self.all_channels,`
major refactor, split up modules 2022-01-22 15:13:37 +00:00			`"duration": duration_str,`
			`"published": published,`
			`"timestamp": int(datetime.now().strftime("%s")),`
			`}`
			`return youtube_details`