Merge pull request #425 from locke4/main

Add support for regex video title filtering
2023-11-20 16:53:58 +11:00
parent 7f4e8586b7 d1cb7ef76c
commit 33b471175a
10 changed files with 127 additions and 30 deletions
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -4,6 +4,7 @@ env:
  IMAGE_NAME: tubesync
 on:
  workflow_dispatch:
  push:
    branches:
      - main
--- a/tubesync/common/templates/pagination.html
+++ b/tubesync/common/templates/pagination.html
@@ -3,7 +3,7 @@
  <div class="col s12">
    <div class="pagination">
      {% for i in paginator.page_range %}
-        <a class="pagenum{% if i == page_obj.number %} currentpage{% endif %}" href="?{% if filter %}filter={{ filter }}&{% endif %}page={{ i }}{% if show_skipped %}&show_skipped=yes{% endif %}">{{ i }}</a>
+        <a class="pagenum{% if i == page_obj.number %} currentpage{% endif %}" href="?{% if filter %}filter={{ filter }}&{% endif %}page={{ i }}{% if show_skipped %}&show_skipped=yes{% endif %}{% if only_skipped %}&only_skipped=yes{% endif %}">{{ i }}</a>
      {% endfor %}
    </div>
  </div>
--- a/tubesync/sync/migrations/0020_auto_20231024_1825.py
+++ b/tubesync/sync/migrations/0020_auto_20231024_1825.py
@@ -0,0 +1,29 @@
 # Generated by Django 3.2.22 on 2023-10-24 17:25
 import django.core.validators
 from django.db import migrations, models
 class Migration(migrations.Migration):
    # Schema changes for the 'source' model: one new field (filter_text)
    # and updated declarations for two existing subtitle fields.
    # Runs after the migration named below.
    dependencies = [
        ('sync', '0019_add_delete_removed_media'),
    ]
    operations = [
        # New optional text field holding a regex used to filter video titles;
        # existing rows receive the empty-string default.
        migrations.AddField(
            model_name='source',
            name='filter_text',
            field=models.CharField(
                blank=True,
                default='',
                help_text='Regex compatible filter string for video titles',
                max_length=100,
                verbose_name='filter string',
            ),
        ),
        # Current declaration of the auto-generated subtitles flag
        migrations.AlterField(
            model_name='source',
            name='auto_subtitles',
            field=models.BooleanField(
                default=False,
                help_text='Accept auto-generated subtitles',
                verbose_name='accept auto-generated subs',
            ),
        ),
        # Current declaration of the subtitle language list; the validator
        # only accepts a comma-separated list of (optionally '-' prefixed) names.
        migrations.AlterField(
            model_name='source',
            name='sub_langs',
            field=models.CharField(
                default='en',
                help_text='List of subtitles langs to download, comma-separated. Example: en,fr or all,-fr,-live_chat',
                max_length=30,
                validators=[
                    django.core.validators.RegexValidator(
                        message='Subtitle langs must be a comma-separated list of langs. example: en,fr or all,-fr,-live_chat',
                        regex='^(\\-?[\\_\\.a-zA-Z]+,)*(\\-?[\\_\\.a-zA-Z]+){1}$',
                    ),
                ],
                verbose_name='subs langs',
            ),
        ),
    ]
--- a/tubesync/sync/models.py
+++ b/tubesync/sync/models.py
@@ -1,6 +1,7 @@
 import os
 import uuid
 import json
 import re
 from xml.etree import ElementTree
 from collections import OrderedDict
 from datetime import datetime, timedelta
@@ -287,6 +288,13 @@ class Source(models.Model):
        help_text=_('If "delete old media" is ticked, the number of days after which '
                    'to automatically delete media')
    )
    # Optional regular expression that is searched for in media titles
    # (see is_regex_match); may be left blank and defaults to an empty string.
    filter_text = models.CharField(
        _('filter string'),
        max_length=100,
        default='',
        blank=True,
        help_text=_('Regex compatible filter string for video titles')
    )
    delete_removed_media = models.BooleanField(
        _('delete removed media'),
        default=False,
@@ -538,6 +546,9 @@ class Source(models.Model):
        except Exception as e:
            return ''
    def is_regex_match(self, media_item_title):
        '''
            Return True when this source's filter_text, used as a regular
            expression, is found anywhere in media_item_title, otherwise
            return False. The pattern is searched for, not anchored, so it
            may match at any position in the title.
        '''
        found = re.search(self.filter_text, media_item_title)
        return found is not None
    def index_media(self):
        '''
            Index the media source returning a list of media metadata as dicts.
--- a/tubesync/sync/signals.py
+++ b/tubesync/sync/signals.py
@@ -104,36 +104,57 @@ def media_post_save(sender, instance, created, **kwargs):
    # already been downloaded
    if not instance.downloaded:
        max_cap_age = instance.source.download_cap_date
-        published = instance.published
+        filter_text = instance.source.filter_text
-        if not published:
+        published = instance.published 
-            if not instance.skip:
+
-                log.warn(f'Media: {instance.source} / {instance} has no published date '
+        if instance.skip:
-                         f'set, marking to be skipped')
+            #currently marked to be skipped, check if skip conditions still apply
            if not published:
                # No published date to compare against, so the media stays skipped
                log.debug(f'Media: {instance.source} / {instance} has no published date '
                        f'set but is already marked to be skipped')
            else:
                # Unskip only when every configured condition (download cap
                # date and/or title filter) is satisfied. The source is always
                # reached through instance.source, as in the rest of this handler.
                if max_cap_age and filter_text:
                    if (published > max_cap_age) and (instance.source.is_regex_match(instance.title)):
                        # Media was published after the cap date and matches the filter text, but is set to be skipped
                        log.info(f'Media: {instance.source} / {instance} has a valid '
                                f'publishing date and matches the filter text, marking to be unskipped')
                        instance.skip = False
                        cap_changed = True
                    else:
                        # At least one condition still fails, leave the media skipped
                        log.info(f'Media: {instance.source} / {instance} is too old for the download cap date '
                                f'or does not match the filter text, already marked to be skipped')
                elif max_cap_age:
                    if published > max_cap_age:
                        # Media was published after the cap date but is set to be skipped
                        log.info(f'Media: {instance.source} / {instance} has a valid '
                                f'publishing date, marking to be unskipped')
                        instance.skip = False
                        cap_changed = True
                elif filter_text:
                    if instance.source.is_regex_match(instance.title):
                        # Media matches the filter text but is set to be skipped
                        log.info(f'Media: {instance.source} / {instance} matches the filter text, marking to be unskipped')
                        instance.skip = False
                        cap_changed = True
        else:
            if not published:
                log.info(f'Media: {instance.source} / {instance} has no published date, marking to be skipped')
                instance.skip = True
                cap_changed = True
            else:
-                log.debug(f'Media: {instance.source} / {instance} has no published date '
+                if max_cap_age:
-                          f'set but is already marked to be skipped')
+                    if published <= max_cap_age:            
-        else:
+                        log.info(f'Media: {instance.source} / {instance} is too old for '
-            if max_cap_age:
+                                f'the download cap date, marking to be skipped')
-                if published > max_cap_age and instance.skip:
+                        instance.skip = True
-                    # Media was published after the cap date but is set to be skipped
+                        cap_changed = True
-                    log.info(f'Media: {instance.source} / {instance} has a valid '
+                if filter_text:
-                            f'publishing date, marking to be unskipped')
+                    if not re.search(filter_text,instance.title):
-                    instance.skip = False
+                        #media doesn't match the filter text but is not marked to be skipped
-                    cap_changed = True
+                        log.info(f'Media: {instance.source} / {instance} does not match the filter text')
-                elif published <= max_cap_age and not instance.skip:
+                        instance.skip = True
-                    log.info(f'Media: {instance.source} / {instance} is too old for '
+                        cap_changed = True
-                            f'the download cap date, marking to be skipped')
+      
                    instance.skip = True
                    cap_changed = True
            else:
                if instance.skip:
                    # Media marked to be skipped but source download cap removed
                    log.info(f'Media: {instance.source} / {instance} has a valid '
                            f'publishing date, marking to be unskipped')
                    instance.skip = False
                    cap_changed = True
    # Recalculate the "can_download" flag, this may
    # need to change if the source specifications have been changed
    if instance.metadata:
--- a/tubesync/sync/tasks.py
+++ b/tubesync/sync/tasks.py
@@ -254,6 +254,11 @@ def download_media_metadata(media_id):
            log.warn(f'Media: {source} / {media} is older than cap age '
                     f'{max_cap_age}, skipping')
            media.skip = True
    # If the source has a search filter, check the video title matches the filter
    if source.filter_text and not source.is_regex_match(media.title):
        # The title does not match the source's filter regex. A blank filter never reaches this branch, so it skips nothing
        log.warn(f'Media: {source} / {media} does not match {source.filter_text}, skipping')
        media.skip = True
    # If the source has a cut-off check the upload date is within the allowed delta
    if source.delete_old_media and source.days_to_keep > 0:
        if not isinstance(media.published, datetime):
--- a/tubesync/sync/templates/sync/media.html
+++ b/tubesync/sync/templates/sync/media.html
@@ -64,5 +64,5 @@
  </div>
  {% endfor %}
 </div>
-{% include 'pagination.html' with pagination=sources.paginator filter=source.pk show_skipped=show_skipped %}
+{% include 'pagination.html' with pagination=sources.paginator filter=source.pk show_skipped=show_skipped only_skipped=only_skipped%}
 {% endblock %}
--- a/tubesync/sync/templates/sync/source.html
+++ b/tubesync/sync/templates/sync/source.html
@@ -43,6 +43,10 @@
        <td class="hide-on-small-only">Directory</td>
        <td><span class="hide-on-med-and-up">Directory<br></span><strong>{{ source.directory }}</strong></td>
      </tr>
      <tr title="Filter text">
        <td class="hide-on-small-only">Filter text</td>
        <td><span class="hide-on-med-and-up">Filter text<br></span><strong>{{ source.filter_text }}</strong></td>
      </tr>
      <tr title="Media file name format to use for saving files">
        <td class="hide-on-small-only">Media format</td>
        <td><span class="hide-on-med-and-up">Media format<br></span><strong>{{ source.media_format }}</strong></td>
--- a/tubesync/sync/tests.py
+++ b/tubesync/sync/tests.py
@@ -175,6 +175,7 @@ class FrontEndTestCase(TestCase):
            'directory': 'testdirectory',
            'media_format': settings.MEDIA_FORMATSTR_DEFAULT,
            'download_cap': 0,
            'filter_text':'.*',
            'index_schedule': 3600,
            'delete_old_media': False,
            'days_to_keep': 14,
@@ -217,6 +218,7 @@ class FrontEndTestCase(TestCase):
            'directory': 'testdirectory',
            'media_format': settings.MEDIA_FORMATSTR_DEFAULT,
            'download_cap': 0,
            'filter_text':'.*',
            'index_schedule': Source.IndexSchedule.EVERY_HOUR,
            'delete_old_media': False,
            'days_to_keep': 14,
@@ -247,6 +249,7 @@ class FrontEndTestCase(TestCase):
            'directory': 'testdirectory',
            'media_format': settings.MEDIA_FORMATSTR_DEFAULT,
            'download_cap': 0,
            'filter_text':'.*',
            'index_schedule': Source.IndexSchedule.EVERY_2_HOURS,  # changed
            'delete_old_media': False,
            'days_to_keep': 14,
@@ -1468,6 +1471,29 @@ class FormatMatchingTestCase(TestCase):
            self.media.get_best_video_format()
            self.media.get_best_audio_format()
    def test_is_regex_match(self):
        '''
            Source.is_regex_match() must report whether the source's
            filter_text regex is found in the media title. Each pattern is
            run as its own sub-test so a failure names the offending filter
            and the remaining patterns are still checked.
        '''
        self.media.metadata = all_test_metadata['boring']
        # Maps a filter_text regex to whether it is expected to match the
        # title of the 'boring' test metadata.
        expected_matches = {
            '.*': True,
            'no fancy stuff': True,
            'No fancy stuff': False,
            '(?i)No fancy stuff': True,  # set case insensitive flag
            'no': True,
            'Foo': False,
            '^(?!.*fancy).*$': False,
            '^(?!.*funny).*$': True,
            '(?=.*f.*)(?=.{0,2}|.{4,})': True,
            'f{4,}': False,
            '^[^A-Z]*$': True,
            '^[^a-z]*$': False,
            '^[^\\s]*$': False,
        }
        for filter_text, expected in expected_matches.items():
            with self.subTest(filter_text=filter_text):
                self.source.filter_text = filter_text
                self.assertEqual(
                    self.source.is_regex_match(self.media.title),
                    expected,
                )
 class TasksTestCase(TestCase):
    def setUp(self):
--- a/tubesync/sync/views.py
+++ b/tubesync/sync/views.py
@@ -294,7 +294,7 @@ class ValidateSourceView(FormView):
 class EditSourceMixin:
    model = Source
-    fields = ('source_type', 'key', 'name', 'directory', 'media_format',
+    fields = ('source_type', 'key', 'name', 'directory', 'filter_text', 'media_format',
              'index_schedule', 'download_media', 'download_cap', 'delete_old_media',
              'delete_removed_media', 'days_to_keep', 'source_resolution', 'source_vcodec',
              'source_acodec', 'prefer_60fps', 'prefer_hdr', 'fallback', 'copy_thumbnails',