switch to using flat indexing of media, only index metadata for media once, resolves #38 and dramatically reduces crawl requests to youtube

This commit is contained in:
meeb 2021-02-02 17:24:19 +11:00
parent 2c2f53e5b2
commit 749df3f7bb
6 changed files with 117 additions and 58 deletions

View File

@ -113,6 +113,12 @@ class Source(models.Model):
SOURCE_TYPE_YOUTUBE_CHANNEL_ID: 'https://www.youtube.com/channel/{key}',
SOURCE_TYPE_YOUTUBE_PLAYLIST: 'https://www.youtube.com/playlist?list={key}',
}
# Format used to create indexable URLs. Each template is filled in with the
# source key via str.format(key=...). Channel sources point at the channel's
# "/videos" listing, playlist sources use the plain playlist URL.
INDEX_URLS = {
SOURCE_TYPE_YOUTUBE_CHANNEL: 'https://www.youtube.com/c/{key}/videos',
SOURCE_TYPE_YOUTUBE_CHANNEL_ID: 'https://www.youtube.com/channel/{key}/videos',
SOURCE_TYPE_YOUTUBE_PLAYLIST: 'https://www.youtube.com/playlist?list={key}',
}
# Callback functions to get a list of media from the source
INDEXERS = {
SOURCE_TYPE_YOUTUBE_CHANNEL: get_youtube_media_info,
@ -341,10 +347,19 @@ class Source(models.Model):
url = obj.URLS.get(source_type)
return url.format(key=key)
@classmethod
def create_index_url(obj, source_type, key):
    '''
        Builds the URL used to index a source by filling the INDEX_URLS
        template registered for source_type with the given key.
    '''
    return obj.INDEX_URLS.get(source_type).format(key=key)
@property
def url(self):
    '''
        The URL of this source, built from its source type and key.
    '''
    source_type, key = self.source_type, self.key
    return Source.create_url(source_type, key)
@property
def index_url(self):
    '''
        The URL used when indexing this source, built from its source type
        and key.
    '''
    source_type, key = self.source_type, self.key
    return Source.create_index_url(source_type, key)
@property
def format_summary(self):
if self.source_resolution == Source.SOURCE_RESOLUTION_AUDIO:
@ -437,25 +452,8 @@ class Source(models.Model):
indexer = self.INDEXERS.get(self.source_type, None)
if not callable(indexer):
raise Exception(f'Source type f"{self.source_type}" has no indexer')
response = indexer(self.url)
# Account for nested playlists, such as a channel of playlists of playlists
def _recurse_playlists(playlist):
videos = []
if not playlist:
return videos
entries = playlist.get('entries', [])
for entry in entries:
if not entry:
continue
subentries = entry.get('entries', [])
if subentries:
videos = videos + _recurse_playlists(entry)
else:
videos.append(entry)
return videos
return _recurse_playlists(response)
response = indexer(self.index_url)
return response.get('entries', [])
def get_media_thumb_path(instance, filename):
@ -481,6 +479,12 @@ class Media(models.Model):
Source.SOURCE_TYPE_YOUTUBE_CHANNEL_ID: 'https://www.youtube.com/watch?v={key}',
Source.SOURCE_TYPE_YOUTUBE_PLAYLIST: 'https://www.youtube.com/watch?v={key}',
}
# Callback functions to get the metadata for a media item
INDEXERS = {
Source.SOURCE_TYPE_YOUTUBE_CHANNEL: get_youtube_media_info,
Source.SOURCE_TYPE_YOUTUBE_CHANNEL_ID: get_youtube_media_info,
Source.SOURCE_TYPE_YOUTUBE_PLAYLIST: get_youtube_media_info,
}
# Maps standardised names to names used in source metadata
METADATA_FIELDS = {
'upload_date': {
@ -904,6 +908,10 @@ class Media(models.Model):
'hdr': display_format['hdr'],
}
@property
def has_metadata(self):
    '''
        True when the metadata attribute of this media item is anything
        other than None, False otherwise.
    '''
    if self.metadata is None:
        return False
    return True
@property
def loaded_metadata(self):
try:
@ -1180,6 +1188,16 @@ class Media(models.Model):
# Return the download parameters
return format_str, self.source.extension
def index_metadata(self):
    '''
        Index the media metadata returning a dict of info.

        Looks up the indexer callback registered in INDEXERS for the source
        type of this media item's source and calls it with the media URL,
        returning whatever the indexer returns. Raises an Exception if no
        callable indexer is registered for the source type.
    '''
    indexer = self.INDEXERS.get(self.source.source_type, None)
    if not callable(indexer):
        # The message previously read 'Meida ... f"<type>"': a typo plus a
        # stray literal "f" left inside the f-string
        raise Exception(f'Media with source type "{self.source.source_type}" '
                        f'has no indexer')
    return indexer(self.url)
class MediaServer(models.Model):
'''

View File

@ -8,8 +8,9 @@ from background_task.models import Task
from common.logger import log
from .models import Source, Media, MediaServer
from .tasks import (delete_task_by_source, delete_task_by_media, index_source_task,
download_media_thumbnail, map_task_to_instance,
check_source_directory_exists, download_media, rescan_media_server)
download_media_thumbnail, download_media_metadata,
map_task_to_instance, check_source_directory_exists,
download_media, rescan_media_server)
from .utils import delete_file
@ -93,6 +94,7 @@ def task_task_failed(sender, task_id, completed_task, **kwargs):
def media_post_save(sender, instance, created, **kwargs):
# Triggered after media is saved, Recalculate the "can_download" flag, this may
# need to change if the source specifications have been changed
if instance.metadata:
post_save.disconnect(media_post_save, sender=Media)
if instance.get_format_str():
if not instance.can_download:
@ -103,6 +105,16 @@ def media_post_save(sender, instance, created, **kwargs):
instance.can_download = False
instance.save()
post_save.connect(media_post_save, sender=Media)
# If the media is missing metadata schedule it to be downloaded
if not instance.metadata:
log.info(f'Scheduling task to download metadata for: {instance.url}')
verbose_name = _('Downloading metadata for "{}"')
download_media_metadata(
str(instance.pk),
priority=10,
verbose_name=verbose_name.format(instance.pk),
remove_existing_tasks=True
)
# If the media is missing a thumbnail schedule it to be downloaded
if not instance.thumb_file_exists:
instance.thumb = None

View File

@ -179,30 +179,6 @@ def index_source_task(source_id):
except Media.DoesNotExist:
media = Media(key=key)
media.source = source
media.metadata = json.dumps(video)
upload_date = media.upload_date
# Media must have a valid upload date
if upload_date:
media.published = timezone.make_aware(upload_date)
else:
log.error(f'Media has no upload date, skipping: {source} / {media}')
continue
# If the source has a download cap date check the upload date is allowed
max_cap_age = source.download_cap_date
if max_cap_age:
if media.published < max_cap_age:
# Media was published after the cap date, skip it
log.warn(f'Media: {source} / {media} is older than cap age '
f'{max_cap_age}, skipping')
continue
# If the source has a cut-off check the upload date is within the allowed delta
if source.delete_old_media and source.days_to_keep > 0:
delta = timezone.now() - timedelta(days=source.days_to_keep)
if media.published < delta:
# Media was published after the cutoff date, skip it
log.warn(f'Media: {source} / {media} is older than '
f'{source.days_to_keep} days, skipping')
continue
try:
media.save()
log.info(f'Indexed media: {source} / {media}')
@ -234,6 +210,56 @@ def check_source_directory_exists(source_id):
source.make_directory()
@background(schedule=0)
def download_media_metadata(media_id):
    '''
        Downloads the metadata for a media item.

        Loads the Media row with primary key media_id, fetches its metadata
        with Media.index_metadata() and stores it JSON encoded on the media.
        The media is flagged as skipped when it has no upload date, when it
        was published before the source download cap date or when it is older
        than the source retention period. For media which is not skipped the
        can_download flag is recalculated. If no media exists with the given
        ID an error is logged and nothing else happens.
    '''
    try:
        media = Media.objects.get(pk=media_id)
    except Media.DoesNotExist:
        # Task triggered but the media no longer exists, do nothing
        log.error(f'Task download_media_metadata(pk={media_id}) called but no '
                  f'media exists with ID: {media_id}')
        return
    source = media.source
    metadata = media.index_metadata()
    media.metadata = json.dumps(metadata)
    upload_date = media.upload_date
    # Media must have a valid upload date
    if upload_date:
        media.published = timezone.make_aware(upload_date)
    else:
        log.error(f'Media has no upload date, skipping: {source} / {media}')
        media.skip = True
    # The age checks below need a publish date. When the upload date was missing
    # media.published may still be unset and comparing None with a datetime
    # raises a TypeError, which would abort the task before the skip flag is
    # saved, so the date checks are only applied when a publish date exists
    published = media.published
    # If the source has a download cap date check the upload date is allowed
    max_cap_age = source.download_cap_date
    if max_cap_age and published and published < max_cap_age:
        # Media was published before the cap date, skip it
        log.warn(f'Media: {source} / {media} is older than cap age '
                 f'{max_cap_age}, skipping')
        media.skip = True
    # If the source has a cut-off check the upload date is within the allowed delta
    if source.delete_old_media and source.days_to_keep > 0:
        delta = timezone.now() - timedelta(days=source.days_to_keep)
        if published and published < delta:
            # Media was published before the cutoff date, skip it
            log.warn(f'Media: {source} / {media} is older than '
                     f'{source.days_to_keep} days, skipping')
            media.skip = True
    # Check we can download the media item
    if not media.skip:
        if media.get_format_str():
            media.can_download = True
        else:
            media.can_download = False
    # Save the media
    media.save()
    log.info(f'Saved {len(media.metadata)} bytes of metadata for: '
             f'{source} / {media_id}')
@background(schedule=0)
def download_media_thumbnail(media_id, url):
'''

View File

@ -109,7 +109,7 @@
{% else %}
<tr title="Can the media be downloaded?">
<td class="hide-on-small-only">Can download?</td>
<td><span class="hide-on-med-and-up">Can download?<br></span><strong>{% if youtube_dl_format %}<i class="fas fa-check"></i>{% else %}<i class="fas fa-times"></i>{% endif %}</strong></td>
<td><span class="hide-on-med-and-up">Can download?<br></span><strong>{% if media.can_download %}<i class="fas fa-check"></i>{% else %}<i class="fas fa-times"></i>{% endif %}</strong></td>
</tr>
{% endif %}
<tr title="The available media formats">

View File

@ -24,8 +24,10 @@
{% else %}
{% if m.skip %}
<span class="error-text"><i class="fas fa-times" title="Skipping media"></i> Skipped</span>
{% elif not m.has_metadata %}
<i class="far fa-clock" title="Waiting for metadata"></i> Fetching metadata
{% elif m.can_download %}
<i class="far fa-clock" title="Waiting to download or downloading"></i> {{ m.published|date:'Y-m-d' }}
<i class="far fa-clock" title="Waiting to download or downloading"></i> Downloading
{% else %}
<span class="error-text"><i class="fas fa-exclamation-triangle" title="No matching formats to download"></i> No matching formats</span>
{% endif %}

View File

@ -37,7 +37,8 @@ def get_media_info(url):
'skip_download': True,
'forcejson': True,
'simulate': True,
'logger': log
'logger': log,
'extract_flat': True,
})
response = {}
with youtube_dl.YoutubeDL(opts) as y: