switch to using flat indexing of media, only index metadata for media once, resolves #38 and dramatically reduces crawl requests to youtube
This commit is contained in:
parent
2c2f53e5b2
commit
749df3f7bb
|
@ -113,6 +113,12 @@ class Source(models.Model):
|
|||
SOURCE_TYPE_YOUTUBE_CHANNEL_ID: 'https://www.youtube.com/channel/{key}',
|
||||
SOURCE_TYPE_YOUTUBE_PLAYLIST: 'https://www.youtube.com/playlist?list={key}',
|
||||
}
|
||||
# Format used to create indexable URLs
|
||||
INDEX_URLS = {
|
||||
SOURCE_TYPE_YOUTUBE_CHANNEL: 'https://www.youtube.com/c/{key}/videos',
|
||||
SOURCE_TYPE_YOUTUBE_CHANNEL_ID: 'https://www.youtube.com/channel/{key}/videos',
|
||||
SOURCE_TYPE_YOUTUBE_PLAYLIST: 'https://www.youtube.com/playlist?list={key}',
|
||||
}
|
||||
# Callback functions to get a list of media from the source
|
||||
INDEXERS = {
|
||||
SOURCE_TYPE_YOUTUBE_CHANNEL: get_youtube_media_info,
|
||||
|
@ -341,10 +347,19 @@ class Source(models.Model):
|
|||
url = obj.URLS.get(source_type)
|
||||
return url.format(key=key)
|
||||
|
||||
@classmethod
|
||||
def create_index_url(obj, source_type, key):
|
||||
url = obj.INDEX_URLS.get(source_type)
|
||||
return url.format(key=key)
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
return Source.create_url(self.source_type, self.key)
|
||||
|
||||
@property
|
||||
def index_url(self):
|
||||
return Source.create_index_url(self.source_type, self.key)
|
||||
|
||||
@property
|
||||
def format_summary(self):
|
||||
if self.source_resolution == Source.SOURCE_RESOLUTION_AUDIO:
|
||||
|
@ -437,25 +452,8 @@ class Source(models.Model):
|
|||
indexer = self.INDEXERS.get(self.source_type, None)
|
||||
if not callable(indexer):
|
||||
raise Exception(f'Source type f"{self.source_type}" has no indexer')
|
||||
response = indexer(self.url)
|
||||
|
||||
# Account for nested playlists, such as a channel of playlists of playlists
|
||||
def _recurse_playlists(playlist):
|
||||
videos = []
|
||||
if not playlist:
|
||||
return videos
|
||||
entries = playlist.get('entries', [])
|
||||
for entry in entries:
|
||||
if not entry:
|
||||
continue
|
||||
subentries = entry.get('entries', [])
|
||||
if subentries:
|
||||
videos = videos + _recurse_playlists(entry)
|
||||
else:
|
||||
videos.append(entry)
|
||||
return videos
|
||||
|
||||
return _recurse_playlists(response)
|
||||
response = indexer(self.index_url)
|
||||
return response.get('entries', [])
|
||||
|
||||
|
||||
def get_media_thumb_path(instance, filename):
|
||||
|
@ -481,6 +479,12 @@ class Media(models.Model):
|
|||
Source.SOURCE_TYPE_YOUTUBE_CHANNEL_ID: 'https://www.youtube.com/watch?v={key}',
|
||||
Source.SOURCE_TYPE_YOUTUBE_PLAYLIST: 'https://www.youtube.com/watch?v={key}',
|
||||
}
|
||||
# Callback functions to get a list of media from the source
|
||||
INDEXERS = {
|
||||
Source.SOURCE_TYPE_YOUTUBE_CHANNEL: get_youtube_media_info,
|
||||
Source.SOURCE_TYPE_YOUTUBE_CHANNEL_ID: get_youtube_media_info,
|
||||
Source.SOURCE_TYPE_YOUTUBE_PLAYLIST: get_youtube_media_info,
|
||||
}
|
||||
# Maps standardised names to names used in source metdata
|
||||
METADATA_FIELDS = {
|
||||
'upload_date': {
|
||||
|
@ -904,6 +908,10 @@ class Media(models.Model):
|
|||
'hdr': display_format['hdr'],
|
||||
}
|
||||
|
||||
@property
|
||||
def has_metadata(self):
|
||||
return self.metadata is not None
|
||||
|
||||
@property
|
||||
def loaded_metadata(self):
|
||||
try:
|
||||
|
@ -1180,6 +1188,16 @@ class Media(models.Model):
|
|||
# Return the download paramaters
|
||||
return format_str, self.source.extension
|
||||
|
||||
def index_metadata(self):
|
||||
'''
|
||||
Index the media metadata returning a dict of info.
|
||||
'''
|
||||
indexer = self.INDEXERS.get(self.source.source_type, None)
|
||||
if not callable(indexer):
|
||||
raise Exception(f'Meida with source type f"{self.source.source_type}" '
|
||||
f'has no indexer')
|
||||
return indexer(self.url)
|
||||
|
||||
|
||||
class MediaServer(models.Model):
|
||||
'''
|
||||
|
|
|
@ -8,8 +8,9 @@ from background_task.models import Task
|
|||
from common.logger import log
|
||||
from .models import Source, Media, MediaServer
|
||||
from .tasks import (delete_task_by_source, delete_task_by_media, index_source_task,
|
||||
download_media_thumbnail, map_task_to_instance,
|
||||
check_source_directory_exists, download_media, rescan_media_server)
|
||||
download_media_thumbnail, download_media_metadata,
|
||||
map_task_to_instance, check_source_directory_exists,
|
||||
download_media, rescan_media_server)
|
||||
from .utils import delete_file
|
||||
|
||||
|
||||
|
@ -93,6 +94,7 @@ def task_task_failed(sender, task_id, completed_task, **kwargs):
|
|||
def media_post_save(sender, instance, created, **kwargs):
|
||||
# Triggered after media is saved, Recalculate the "can_download" flag, this may
|
||||
# need to change if the source specifications have been changed
|
||||
if instance.metadata:
|
||||
post_save.disconnect(media_post_save, sender=Media)
|
||||
if instance.get_format_str():
|
||||
if not instance.can_download:
|
||||
|
@ -103,6 +105,16 @@ def media_post_save(sender, instance, created, **kwargs):
|
|||
instance.can_download = False
|
||||
instance.save()
|
||||
post_save.connect(media_post_save, sender=Media)
|
||||
# If the media is missing metadata schedule it to be downloaded
|
||||
if not instance.metadata:
|
||||
log.info(f'Scheduling task to download metadata for: {instance.url}')
|
||||
verbose_name = _('Downloading metadata for "{}"')
|
||||
download_media_metadata(
|
||||
str(instance.pk),
|
||||
priority=10,
|
||||
verbose_name=verbose_name.format(instance.pk),
|
||||
remove_existing_tasks=True
|
||||
)
|
||||
# If the media is missing a thumbnail schedule it to be downloaded
|
||||
if not instance.thumb_file_exists:
|
||||
instance.thumb = None
|
||||
|
|
|
@ -179,30 +179,6 @@ def index_source_task(source_id):
|
|||
except Media.DoesNotExist:
|
||||
media = Media(key=key)
|
||||
media.source = source
|
||||
media.metadata = json.dumps(video)
|
||||
upload_date = media.upload_date
|
||||
# Media must have a valid upload date
|
||||
if upload_date:
|
||||
media.published = timezone.make_aware(upload_date)
|
||||
else:
|
||||
log.error(f'Media has no upload date, skipping: {source} / {media}')
|
||||
continue
|
||||
# If the source has a download cap date check the upload date is allowed
|
||||
max_cap_age = source.download_cap_date
|
||||
if max_cap_age:
|
||||
if media.published < max_cap_age:
|
||||
# Media was published after the cap date, skip it
|
||||
log.warn(f'Media: {source} / {media} is older than cap age '
|
||||
f'{max_cap_age}, skipping')
|
||||
continue
|
||||
# If the source has a cut-off check the upload date is within the allowed delta
|
||||
if source.delete_old_media and source.days_to_keep > 0:
|
||||
delta = timezone.now() - timedelta(days=source.days_to_keep)
|
||||
if media.published < delta:
|
||||
# Media was published after the cutoff date, skip it
|
||||
log.warn(f'Media: {source} / {media} is older than '
|
||||
f'{source.days_to_keep} days, skipping')
|
||||
continue
|
||||
try:
|
||||
media.save()
|
||||
log.info(f'Indexed media: {source} / {media}')
|
||||
|
@ -234,6 +210,56 @@ def check_source_directory_exists(source_id):
|
|||
source.make_directory()
|
||||
|
||||
|
||||
@background(schedule=0)
|
||||
def download_media_metadata(media_id):
|
||||
'''
|
||||
Downloads the metadata for a media item.
|
||||
'''
|
||||
try:
|
||||
media = Media.objects.get(pk=media_id)
|
||||
except Media.DoesNotExist:
|
||||
# Task triggered but the media no longer exists, do nothing
|
||||
log.error(f'Task download_media_metadata(pk={media_id}) called but no '
|
||||
f'media exists with ID: {media_id}')
|
||||
return
|
||||
source = media.source
|
||||
metadata = media.index_metadata()
|
||||
media.metadata = json.dumps(metadata)
|
||||
upload_date = media.upload_date
|
||||
# Media must have a valid upload date
|
||||
if upload_date:
|
||||
media.published = timezone.make_aware(upload_date)
|
||||
else:
|
||||
log.error(f'Media has no upload date, skipping: {source} / {media}')
|
||||
media.skip = True
|
||||
# If the source has a download cap date check the upload date is allowed
|
||||
max_cap_age = source.download_cap_date
|
||||
if max_cap_age:
|
||||
if media.published < max_cap_age:
|
||||
# Media was published after the cap date, skip it
|
||||
log.warn(f'Media: {source} / {media} is older than cap age '
|
||||
f'{max_cap_age}, skipping')
|
||||
media.skip = True
|
||||
# If the source has a cut-off check the upload date is within the allowed delta
|
||||
if source.delete_old_media and source.days_to_keep > 0:
|
||||
delta = timezone.now() - timedelta(days=source.days_to_keep)
|
||||
if media.published < delta:
|
||||
# Media was published after the cutoff date, skip it
|
||||
log.warn(f'Media: {source} / {media} is older than '
|
||||
f'{source.days_to_keep} days, skipping')
|
||||
media.skip = True
|
||||
# Check we can download the media item
|
||||
if not media.skip:
|
||||
if media.get_format_str():
|
||||
media.can_download = True
|
||||
else:
|
||||
media.can_download = False
|
||||
# Save the media
|
||||
media.save()
|
||||
log.info(f'Saved {len(media.metadata)} bytes of metadata for: '
|
||||
f'{source} / {media_id}')
|
||||
|
||||
|
||||
@background(schedule=0)
|
||||
def download_media_thumbnail(media_id, url):
|
||||
'''
|
||||
|
|
|
@ -109,7 +109,7 @@
|
|||
{% else %}
|
||||
<tr title="Can the media be downloaded?">
|
||||
<td class="hide-on-small-only">Can download?</td>
|
||||
<td><span class="hide-on-med-and-up">Can download?<br></span><strong>{% if youtube_dl_format %}<i class="fas fa-check"></i>{% else %}<i class="fas fa-times"></i>{% endif %}</strong></td>
|
||||
<td><span class="hide-on-med-and-up">Can download?<br></span><strong>{% if media.can_download %}<i class="fas fa-check"></i>{% else %}<i class="fas fa-times"></i>{% endif %}</strong></td>
|
||||
</tr>
|
||||
{% endif %}
|
||||
<tr title="The available media formats">
|
||||
|
|
|
@ -24,8 +24,10 @@
|
|||
{% else %}
|
||||
{% if m.skip %}
|
||||
<span class="error-text"><i class="fas fa-times" title="Skipping media"></i> Skipped</span>
|
||||
{% elif not m.has_metadata %}
|
||||
<i class="far fa-clock" title="Waiting for metadata"></i> Fetching metadata
|
||||
{% elif m.can_download %}
|
||||
<i class="far fa-clock" title="Waiting to download or downloading"></i> {{ m.published|date:'Y-m-d' }}
|
||||
<i class="far fa-clock" title="Waiting to download or downloading"></i> Downloading
|
||||
{% else %}
|
||||
<span class="error-text"><i class="fas fa-exclamation-triangle" title="No matching formats to download"></i> No matching formats</span>
|
||||
{% endif %}
|
||||
|
|
|
@ -37,7 +37,8 @@ def get_media_info(url):
|
|||
'skip_download': True,
|
||||
'forcejson': True,
|
||||
'simulate': True,
|
||||
'logger': log
|
||||
'logger': log,
|
||||
'extract_flat': True,
|
||||
})
|
||||
response = {}
|
||||
with youtube_dl.YoutubeDL(opts) as y:
|
||||
|
|
Loading…
Reference in New Issue