switch to using flat indexing of media, only index metadata for media once, resolves #38 and dramatically reduces crawl requests to youtube

2021-02-02 17:24:19 +11:00 · 2021-02-02 17:24:19 +11:00 · 749df3f7bb
parent 2c2f53e5b2
commit 749df3f7bb
6 changed files with 117 additions and 58 deletions
--- a/tubesync/sync/models.py
+++ b/tubesync/sync/models.py
@ -113,6 +113,12 @@ class Source(models.Model):
        SOURCE_TYPE_YOUTUBE_CHANNEL_ID: 'https://www.youtube.com/channel/{key}',
        SOURCE_TYPE_YOUTUBE_PLAYLIST: 'https://www.youtube.com/playlist?list={key}',
    }
+    # Format used to create indexable URLs. These are the URLs passed to the
+    # indexer when a source is indexed for its media, "{key}" is replaced with
+    # the source key by create_index_url()
+    INDEX_URLS = {
+        SOURCE_TYPE_YOUTUBE_CHANNEL: 'https://www.youtube.com/c/{key}/videos',
+        SOURCE_TYPE_YOUTUBE_CHANNEL_ID: 'https://www.youtube.com/channel/{key}/videos',
+        SOURCE_TYPE_YOUTUBE_PLAYLIST: 'https://www.youtube.com/playlist?list={key}',
+    }
    # Callback functions to get a list of media from the source
    INDEXERS = {
        SOURCE_TYPE_YOUTUBE_CHANNEL: get_youtube_media_info,
@ -341,10 +347,19 @@ class Source(models.Model):
        url = obj.URLS.get(source_type)
        return url.format(key=key)

+    @classmethod
+    def create_index_url(obj, source_type, key):
+        '''
+            Builds the URL used to index a source by substituting the key into
+            the INDEX_URLS format string registered for the source type.
+        '''
+        return obj.INDEX_URLS.get(source_type).format(key=key)
+
    @property
    def url(self):
        return Source.create_url(self.source_type, self.key)

+    @property
+    def index_url(self):
+        '''
+            The URL passed to the indexer when indexing this source.
+        '''
+        return Source.create_index_url(
+            source_type=self.source_type,
+            key=self.key
+        )
+
    @property
    def format_summary(self):
        if self.source_resolution == Source.SOURCE_RESOLUTION_AUDIO:
@ -437,25 +452,8 @@ class Source(models.Model):
        indexer = self.INDEXERS.get(self.source_type, None)
        if not callable(indexer):
            raise Exception(f'Source type f"{self.source_type}" has no indexer')
-        response = indexer(self.url)
-
-        # Account for nested playlists, such as a channel of playlists of playlists
-        def _recurse_playlists(playlist):
-            videos = []
-            if not playlist:
-                return videos
-            entries = playlist.get('entries', [])
-            for entry in entries:
-                if not entry:
-                    continue
-                subentries = entry.get('entries', [])
-                if subentries:
-                    videos = videos + _recurse_playlists(entry)
-                else:
-                    videos.append(entry)
-            return videos
-
-        return _recurse_playlists(response)
+        response = indexer(self.index_url)
+        return response.get('entries', [])


 def get_media_thumb_path(instance, filename):
@ -481,6 +479,12 @@ class Media(models.Model):
        Source.SOURCE_TYPE_YOUTUBE_CHANNEL_ID: 'https://www.youtube.com/watch?v={key}',
        Source.SOURCE_TYPE_YOUTUBE_PLAYLIST: 'https://www.youtube.com/watch?v={key}',
    }
+    # Callback functions to get the metadata for a single media item, keyed by
+    # the source type of the source the media belongs to
+    INDEXERS = {
+        Source.SOURCE_TYPE_YOUTUBE_CHANNEL: get_youtube_media_info,
+        Source.SOURCE_TYPE_YOUTUBE_CHANNEL_ID: get_youtube_media_info,
+        Source.SOURCE_TYPE_YOUTUBE_PLAYLIST: get_youtube_media_info,
+    }
    # Maps standardised names to names used in source metdata
    METADATA_FIELDS = {
        'upload_date': {
@ -904,6 +908,10 @@ class Media(models.Model):
            'hdr': display_format['hdr'],
        }

+    @property
+    def has_metadata(self):
+        '''
+            True when metadata has been stored for this media item, that is
+            when the metadata attribute is anything other than None.
+        '''
+        if self.metadata is None:
+            return False
+        return True
+
    @property
    def loaded_metadata(self):
        try:
@ -1180,6 +1188,16 @@ class Media(models.Model):
        # Return the download paramaters
        return format_str, self.source.extension

+    def index_metadata(self):
+        '''
+            Index the media metadata returning a dict of info. The indexer is
+            looked up by the source type of the parent source and is called
+            with the URL of this media item. Raises an Exception if no
+            callable indexer is registered for the source type.
+        '''
+        indexer = self.INDEXERS.get(self.source.source_type, None)
+        if not callable(indexer):
+            raise Exception(f'Media with source type "{self.source.source_type}" '
+                            f'has no indexer')
+        return indexer(self.url)
+

 class MediaServer(models.Model):
    '''
--- a/tubesync/sync/signals.py
+++ b/tubesync/sync/signals.py
@ -8,8 +8,9 @@ from background_task.models import Task
 from common.logger import log
 from .models import Source, Media, MediaServer
 from .tasks import (delete_task_by_source, delete_task_by_media, index_source_task,
-                    download_media_thumbnail, map_task_to_instance,
-                    check_source_directory_exists, download_media, rescan_media_server)
+                    download_media_thumbnail, download_media_metadata,
+                    map_task_to_instance, check_source_directory_exists,
+                    download_media, rescan_media_server)
 from .utils import delete_file


@ -93,6 +94,7 @@ def task_task_failed(sender, task_id, completed_task, **kwargs):
 def media_post_save(sender, instance, created, **kwargs):
    # Triggered after media is saved, Recalculate the "can_download" flag, this may
    # need to change if the source specifications have been changed
+    if instance.metadata:
        post_save.disconnect(media_post_save, sender=Media)
        if instance.get_format_str():
            if not instance.can_download:
@ -103,6 +105,16 @@ def media_post_save(sender, instance, created, **kwargs):
                instance.can_download = False
                instance.save()
        post_save.connect(media_post_save, sender=Media)
+    # If the media is missing metadata schedule it to be downloaded
+    if not instance.metadata:
+        log.info(f'Scheduling task to download metadata for: {instance.url}')
+        verbose_name = _('Downloading metadata for "{}"')
+        download_media_metadata(
+            str(instance.pk),
+            priority=10,
+            verbose_name=verbose_name.format(instance.pk),
+            remove_existing_tasks=True
+        )
    # If the media is missing a thumbnail schedule it to be downloaded
    if not instance.thumb_file_exists:
        instance.thumb = None
--- a/tubesync/sync/tasks.py
+++ b/tubesync/sync/tasks.py
@ -179,30 +179,6 @@ def index_source_task(source_id):
        except Media.DoesNotExist:
            media = Media(key=key)
        media.source = source
-        media.metadata = json.dumps(video)
-        upload_date = media.upload_date
-        # Media must have a valid upload date
-        if upload_date:
-            media.published = timezone.make_aware(upload_date)
-        else:
-            log.error(f'Media has no upload date, skipping: {source} / {media}')
-            continue
-        # If the source has a download cap date check the upload date is allowed
-        max_cap_age = source.download_cap_date
-        if max_cap_age:
-            if media.published < max_cap_age:
-                # Media was published after the cap date, skip it
-                log.warn(f'Media: {source} / {media} is older than cap age '
-                         f'{max_cap_age}, skipping')
-                continue
-        # If the source has a cut-off check the upload date is within the allowed delta
-        if source.delete_old_media and source.days_to_keep > 0:
-            delta = timezone.now() - timedelta(days=source.days_to_keep)
-            if media.published < delta:
-                # Media was published after the cutoff date, skip it
-                log.warn(f'Media: {source} / {media} is older than '
-                         f'{source.days_to_keep} days, skipping')
-                continue
        try:
            media.save()
            log.info(f'Indexed media: {source} / {media}')
@ -234,6 +210,56 @@ def check_source_directory_exists(source_id):
        source.make_directory()


+@background(schedule=0)
+def download_media_metadata(media_id):
+    '''
+        Downloads the metadata for a media item and stores it on the media as
+        JSON. Once the metadata is available the published date is set from
+        the upload date, the media is flagged to be skipped if it has no upload
+        date or is older than the limits set on its source and, if it is not
+        skipped, the "can_download" flag is recalculated.
+    '''
+    try:
+        media = Media.objects.get(pk=media_id)
+    except Media.DoesNotExist:
+        # Task triggered but the media no longer exists, do nothing
+        log.error(f'Task download_media_metadata(pk={media_id}) called but no '
+                  f'media exists with ID: {media_id}')
+        return
+    source = media.source
+    metadata = media.index_metadata()
+    media.metadata = json.dumps(metadata)
+    upload_date = media.upload_date
+    # Media must have a valid upload date
+    if upload_date:
+        media.published = timezone.make_aware(upload_date)
+    else:
+        log.error(f'Media has no upload date, skipping: {source} / {media}')
+        media.skip = True
+    # The age checks below compare against the published date. Media without an
+    # upload date may have no published date at all and comparing None with a
+    # datetime raises a TypeError, so only run the checks when a date is set
+    published = media.published
+    # If the source has a download cap date check the upload date is allowed
+    max_cap_age = source.download_cap_date
+    if published and max_cap_age:
+        if published < max_cap_age:
+            # Media was published before the cap date, skip it
+            log.warn(f'Media: {source} / {media} is older than cap age '
+                     f'{max_cap_age}, skipping')
+            media.skip = True
+    # If the source has a cut-off check the upload date is within the allowed delta
+    if published and source.delete_old_media and source.days_to_keep > 0:
+        delta = timezone.now() - timedelta(days=source.days_to_keep)
+        if published < delta:
+            # Media was published before the cutoff date, skip it
+            log.warn(f'Media: {source} / {media} is older than '
+                     f'{source.days_to_keep} days, skipping')
+            media.skip = True
+    # Check we can download the media item, skipped media keeps its current flag
+    if not media.skip:
+        media.can_download = bool(media.get_format_str())
+    # Save the media
+    media.save()
+    log.info(f'Saved {len(media.metadata)} bytes of metadata for: '
+             f'{source} / {media_id}')
+
+
@background(schedule=0)
 def download_media_thumbnail(media_id, url):
    '''
--- a/tubesync/sync/templates/sync/media-item.html
+++ b/tubesync/sync/templates/sync/media-item.html
@ -109,7 +109,7 @@
      {% else %}
      <tr title="Can the media be downloaded?">
        <td class="hide-on-small-only">Can download?</td>
-        <td><span class="hide-on-med-and-up">Can download?<br></span><strong>{% if youtube_dl_format %}<i class="fas fa-check"></i>{% else %}<i class="fas fa-times"></i>{% endif %}</strong></td>
+        <td><span class="hide-on-med-and-up">Can download?<br></span><strong>{% if media.can_download %}<i class="fas fa-check"></i>{% else %}<i class="fas fa-times"></i>{% endif %}</strong></td>
      </tr>
      {% endif %}
      <tr title="The available media formats">
--- a/tubesync/sync/templates/sync/media.html
+++ b/tubesync/sync/templates/sync/media.html
@ -24,8 +24,10 @@
            {% else %}
              {% if m.skip %}
              <span class="error-text"><i class="fas fa-times" title="Skipping media"></i> Skipped</span>
+              {% elif not m.has_metadata %}
+              <i class="far fa-clock" title="Waiting for metadata"></i> Fetching metadata
              {% elif m.can_download %}
-              <i class="far fa-clock" title="Waiting to download or downloading"></i> {{ m.published|date:'Y-m-d' }}
+              <i class="far fa-clock" title="Waiting to download or downloading"></i> Downloading
              {% else %}
              <span class="error-text"><i class="fas fa-exclamation-triangle" title="No matching formats to download"></i> No matching formats</span>
              {% endif %}
--- a/tubesync/sync/youtube.py
+++ b/tubesync/sync/youtube.py
@ -37,7 +37,8 @@ def get_media_info(url):
        'skip_download': True,
        'forcejson': True,
        'simulate': True,
-        'logger': log
+        'logger': log,
+        'extract_flat': True,
    })
    response = {}
    with youtube_dl.YoutubeDL(opts) as y: