migrate regex_replace functions to AutomationEngine

Create the TITLE_REPLACE, UNIT_REPLACE and FOOD_REPLACE automation types, and create a migration for the new types.
2023-08-31 12:46:34 -05:00 · 2023-08-31 12:46:34 -05:00 · 30c891abfc
commit 30c891abfc
parent b8317c2c29
6 changed files with 143 additions and 66 deletions
--- a/cookbook/helper/automation_helper.py
+++ b/cookbook/helper/automation_helper.py
@ -8,18 +8,28 @@ from cookbook.models import Automation

 class AutomationEngine:
    request = None
+    source = None
    use_cache = None
    food_aliases = None
    keyword_aliases = None
    unit_aliases = None
    never_unit = None
    transpose_words = None
-    description_replace = None
-    instruction_replace = None
+    regex_replace = {
+        Automation.DESCRIPTION_REPLACE: None,
+        Automation.INSTRUCTION_REPLACE: None,
+        Automation.FOOD_REPLACE: None,
+        Automation.UNIT_REPLACE: None,
+        Automation.TITLE_REPLACE: None,
+    }

-    def __init__(self, request, use_cache=True):
+    def __init__(self, request, use_cache=True, source=None):
        self.request = request
        self.use_cache = use_cache
+        if not source:
+            self.source = "default_string_to_avoid_false_regex_match"
+        else:
+            self.source = source

    def apply_keyword_automation(self, keyword):
        keyword = keyword.strip()
@ -92,7 +102,7 @@ class AutomationEngine:
        else:
            if automation := Automation.objects.filter(space=self.request.space, type=Automation.FOOD_ALIAS, param_1__iexact=food, disabled=False).order_by('order').first():
                return automation.param_2
-        return food
+        return self.apply_regex_replace_automation(food)

    def apply_never_unit_automation(self, tokens):
        """
@ -151,7 +161,8 @@ class AutomationEngine:
                caches['default'].touch(TRANSPOSE_WORDS_CACHE_KEY, 30)
            else:
                i = 0
-                for a in Automation.objects.filter(space=self.request.space, disabled=False, type=Automation.TRANSPOSE_WORDS).only('param_1', 'param_2').order_by('order').all():
+                for a in Automation.objects.filter(space=self.request.space, disabled=False, type=Automation.TRANSPOSE_WORDS).only(
+                        'param_1', 'param_2').order_by('order').all()[:512]:
                    self.transpose_words[i] = [a.param_1.lower(), a.param_2.lower()]
                    i += 1
                caches['default'].set(TRANSPOSE_WORDS_CACHE_KEY, self.transpose_words, 30)
@ -166,10 +177,52 @@ class AutomationEngine:
        else:
            for rule in Automation.objects.filter(space=self.request.space, type=Automation.TRANSPOSE_WORDS, disabled=False) \
                    .annotate(param_1_lower=Lower('param_1'), param_2_lower=Lower('param_2')) \
-                    .filter(param_1_lower__in=tokens, param_2_lower__in=tokens).order_by('order'):
+                    .filter(param_1_lower__in=tokens, param_2_lower__in=tokens).order_by('order')[:512]:
                if rule.param_1 in tokens and rule.param_2 in tokens:
                    string = re.sub(rf"\b({rule.param_1})\W*({rule.param_2})\b", r"\2 \1", string, flags=re.IGNORECASE)
        return string

-    def apply_regex_replace_automation(self, string):
+    def apply_regex_replace_automation(self, string, automation_type):
+        # TODO add warning - maybe on SPACE page? when a max of 512 automations of a specific type is exceeded (ALIAS types excluded?)
+        """
+        Replaces strings in a recipe field that are from a matched source
+        field_type are Automation.type that apply regex replacements
+        Automation.DESCRIPTION_REPLACE
+        Automation.INSTRUCTION_REPLACE
+        # TODO implement these
+        Automation.FOOD_REPLACE
+        Automation.UNIT_REPLACE
+        Automation.TITLE_REPLACE
+
+        regex replacement utilizes the following fields from the Automation model
+        :param 1: source that should apply the automation in regex format ('.*' for all)
+        :param 2: regex pattern to match ()
+        :param 3: replacement string (leave blank to delete)
+        return: new string
+        """
+        if self.use_cache and self.regex_replace[automation_type] is None:
+            self.regex_replace[automation_type] = {}
+            REGEX_REPLACE_CACHE_KEY = f'automation_regex_replace_{self.request.space.pk}'
+            if c := caches['default'].get(REGEX_REPLACE_CACHE_KEY, None):
+                self.regex_replace[automation_type] = c[automation_type]
+                caches['default'].touch(REGEX_REPLACE_CACHE_KEY, 30)
+            else:
+                i = 0
+                for a in Automation.objects.filter(space=self.request.space, disabled=False, type=automation_type).only(
+                        'param_1', 'param_2', 'param_3').order_by('order').all()[:512]:
+                    self.regex_replace[automation_type][i] = [a.param_1, a.param_2, a.param_3]
+                    i += 1
+                caches['default'].set(REGEX_REPLACE_CACHE_KEY, self.regex_replace, 30)
+        else:
+            self.regex_replace[automation_type] = {}
+
+        if self.regex_replace[automation_type]:
+            for rule in self.regex_replace[automation_type].values():
+                if re.match(rule[0], (self.source)[:512]):
+                    string = re.sub(rule[1], rule[2], string)
+        else:
+            for rule in Automation.objects.filter(space=self.request.space, disabled=False, type=automation_type).only(
+                    'param_1', 'param_2', 'param_3').order_by('order').all()[:512]:
+                if re.match(rule.param_1, (self.source)[:512]):
+                    string = re.sub(rule.param_2, rule.param_3, string)
        return string
--- a/cookbook/helper/ingredient_parser.py
+++ b/cookbook/helper/ingredient_parser.py
@ -44,7 +44,6 @@ class IngredientParser:
        #         self.unit_aliases[a.param_1.lower()] = a.param_2
        #     caches['default'].set(UNIT_CACHE_KEY, self.unit_aliases, 30)

-        # TODO migrated to automation engine
        # NEVER_UNIT_CACHE_KEY = f'automation_never_unit_{self.request.space.pk}'
        # if c := caches['default'].get(NEVER_UNIT_CACHE_KEY, None):
        #     self.never_unit = c
@ -54,7 +53,6 @@ class IngredientParser:
        #         self.never_unit[a.param_1.lower()] = a.param_2
        #     caches['default'].set(NEVER_UNIT_CACHE_KEY, self.never_unit, 30)

-        # TODO migrated to automation engine
        # TRANSPOSE_WORDS_CACHE_KEY = f'automation_transpose_words_{self.request.space.pk}'
        # if c := caches['default'].get(TRANSPOSE_WORDS_CACHE_KEY, None):
        #     self.transpose_words = c
--- a/cookbook/helper/recipe_url_import.py
+++ b/cookbook/helper/recipe_url_import.py
@ -9,13 +9,37 @@ from isodate.isoerror import ISO8601Error
 from pytube import YouTube
 from recipe_scrapers._utils import get_host_name, get_minutes

+from cookbook.helper.automation_helper import AutomationEngine
 from cookbook.helper.ingredient_parser import IngredientParser
 from cookbook.models import Automation, Keyword, PropertyType


 def get_from_scraper(scrape, request):
    # converting the scrape_me object to the existing json format based on ld+json
-    recipe_json = {}
+
+    recipe_json = {
+        'steps': [],
+        'internal': True
+    }
+    keywords = []
+
+    # assign source URL
+    try:
+        source_url = scrape.canonical_url()
+    except Exception:
+        try:
+            source_url = scrape.url
+        except Exception:
+            pass
+    if source_url:
+        recipe_json['source_url'] = source_url
+        try:
+            keywords.append(source_url.replace('http://', '').replace('https://', '').split('/')[0])
+        except Exception:
+            recipe_json['source_url'] = ''
+
+    automation_engine = AutomationEngine(request, source=recipe_json['source_url'])
+    # assign recipe name
    try:
        recipe_json['name'] = parse_name(scrape.title()[:128] or None)
    except Exception:
@ -29,6 +53,8 @@ def get_from_scraper(scrape, request):
    if isinstance(recipe_json['name'], list) and len(recipe_json['name']) > 0:
        recipe_json['name'] = recipe_json['name'][0]

+    # assign recipe description
+    # TODO notify user about limit if reached - >256 description will be truncated
    try:
        description = scrape.description() or None
    except Exception:
@ -39,8 +65,21 @@ def get_from_scraper(scrape, request):
        except Exception:
            description = ''

-    recipe_json['internal'] = True
+    recipe_json['description'] = parse_description(description)

+    # automations = Automation.objects.filter(
+    #     type=Automation.DESCRIPTION_REPLACE,
+    #     space=request.space,
+    #     disabled=False).only(
+    #     'param_1',
+    #     'param_2',
+    #     'param_3').all().order_by('order')[
+    #         :512]
+    # for a in automations:
+    #     if re.match(a.param_1, (recipe_json['source_url'])[:512]):
+    recipe_json['description'] = automation_engine.apply_regex_replace_automation(recipe_json['description'], Automation.DESCRIPTION_REPLACE)
+
+    # assign servings attributes
    try:
        # dont use scrape.yields() as this will always return "x servings" or "x items", should be improved in scrapers directly
        servings = scrape.schema.data.get('recipeYield') or 1
@ -50,6 +89,7 @@ def get_from_scraper(scrape, request):
    recipe_json['servings'] = parse_servings(servings)
    recipe_json['servings_text'] = parse_servings_text(servings)

+    # assign time attributes
    try:
        recipe_json['working_time'] = get_minutes(scrape.prep_time()) or 0
    except Exception:
@ -74,6 +114,7 @@ def get_from_scraper(scrape, request):
            except Exception:
                pass

+    # assign image
    try:
        recipe_json['image'] = parse_image(scrape.image()) or None
    except Exception:
@ -84,7 +125,7 @@ def get_from_scraper(scrape, request):
        except Exception:
            recipe_json['image'] = ''

-    keywords = []
+    # assign keywords
    try:
        if scrape.schema.data.get("keywords"):
            keywords += listify_keywords(scrape.schema.data.get("keywords"))
@ -109,20 +150,6 @@ def get_from_scraper(scrape, request):
        except Exception:
            pass

-    try:
-        source_url = scrape.canonical_url()
-    except Exception:
-        try:
-            source_url = scrape.url
-        except Exception:
-            pass
-    if source_url:
-        recipe_json['source_url'] = source_url
-        try:
-            keywords.append(source_url.replace('http://', '').replace('https://', '').split('/')[0])
-        except Exception:
-            recipe_json['source_url'] = ''
-
    try:
        if scrape.author():
            keywords.append(scrape.author())
@ -136,7 +163,7 @@ def get_from_scraper(scrape, request):

    ingredient_parser = IngredientParser(request, True)

-    recipe_json['steps'] = []
+    # assign steps
    try:
        for i in parse_instructions(scrape.instructions()):
            recipe_json['steps'].append({'instruction': i, 'ingredients': [], 'show_ingredients_table': request.user.userpreference.show_step_ingredients, })
@ -145,26 +172,10 @@ def get_from_scraper(scrape, request):
    if len(recipe_json['steps']) == 0:
        recipe_json['steps'].append({'instruction': '', 'ingredients': [], })

-    parsed_description = parse_description(description)
-    # TODO notify user about limit if reached
-    # limits exist to limit the attack surface for dos style attacks
-    # TODO migrate to AutomationEngine
-    automations = Automation.objects.filter(
-        type=Automation.DESCRIPTION_REPLACE,
-        space=request.space,
-        disabled=False).only(
-        'param_1',
-        'param_2',
-        'param_3').all().order_by('order')[
-            :512]
-    for a in automations:
-        if re.match(a.param_1, (recipe_json['source_url'])[:512]):
-            parsed_description = re.sub(a.param_2, a.param_3, parsed_description, count=1)
-
-    if len(parsed_description) > 256:  # split at 256 as long descriptions don't look good on recipe cards
-        recipe_json['steps'][0]['instruction'] = f'*{parsed_description}*  \n\n' + recipe_json['steps'][0]['instruction']
+    if len(recipe_json['description']) > 256:  # split at 256 as long descriptions don't look good on recipe cards
+        recipe_json['steps'][0]['instruction'] = f"*{recipe_json['description']}*  \n\n" + recipe_json['steps'][0]['instruction']
    else:
-        recipe_json['description'] = parsed_description[:512]
+        recipe_json['description'] = recipe_json['description'][:512]

    try:
        for x in scrape.ingredients():
@ -205,20 +216,20 @@ def get_from_scraper(scrape, request):
        traceback.print_exc()
        pass

-    if 'source_url' in recipe_json and recipe_json['source_url']:
-        # TODO migrate to AutomationEngine
-        automations = Automation.objects.filter(
-            type=Automation.INSTRUCTION_REPLACE,
-            space=request.space,
-            disabled=False).only(
-            'param_1',
-            'param_2',
-            'param_3').order_by('order').all()[
-            :512]
-        for a in automations:
-            if re.match(a.param_1, (recipe_json['source_url'])[:512]):
+    # if 'source_url' in recipe_json and recipe_json['source_url']:
+        # automations = Automation.objects.filter(
+        #     type=Automation.INSTRUCTION_REPLACE,
+        #     space=request.space,
+        #     disabled=False).only(
+        #     'param_1',
+        #     'param_2',
+        #     'param_3').order_by('order').all()[
+        #     :512]
+        # for a in automations:
+        #     if re.match(a.param_1, (recipe_json['source_url'])[:512]):
    for s in recipe_json['steps']:
-                    s['instruction'] = re.sub(a.param_2, a.param_3, s['instruction'])
+        s['instruction'] = automation_engine.apply_regex_replace_automation(s['instruction'], Automation.INSTRUCTION_REPLACE)
+        # re.sub(a.param_2, a.param_3, s['instruction'])

    return recipe_json

@ -268,6 +279,7 @@ def get_from_youtube_scraper(url, request):
        ]
    }

+    # TODO add automation here
    try:
        video = YouTube(url=url)
        default_recipe_json['name'] = video.title
@ -416,7 +428,6 @@ def parse_keywords(keyword_json, request):
    automation_engine = AutomationEngine(request)
    # keyword_aliases = {}
    # retrieve keyword automation cache if it exists, otherwise build from database
-    # TODO migrate to AutomationEngine
    # KEYWORD_CACHE_KEY = f'automation_keyword_alias_{space.pk}'
    # if c := caches['default'].get(KEYWORD_CACHE_KEY, None):
    #     keyword_aliases = c
--- a/cookbook/migrations/0199_alter_propertytype_options_alter_automation_type_and_more.py
+++ b/cookbook/migrations/0199_alter_propertytype_options_alter_automation_type_and_more.py
--- a/cookbook/models.py
+++ b/cookbook/models.py
@ -1314,11 +1314,23 @@ class Automation(ExportModelOperationsMixin('automations'), models.Model, Permis
    INSTRUCTION_REPLACE = 'INSTRUCTION_REPLACE'
    NEVER_UNIT = 'NEVER_UNIT'
    TRANSPOSE_WORDS = 'TRANSPOSE_WORDS'
+    FOOD_REPLACE = 'FOOD_REPLACE'
+    UNIT_REPLACE = 'UNIT_REPLACE'
+    TITLE_REPLACE = 'TITLE_REPLACE'

    type = models.CharField(max_length=128,
-                            choices=((FOOD_ALIAS, _('Food Alias')), (UNIT_ALIAS, _('Unit Alias')), (KEYWORD_ALIAS, _('Keyword Alias')),
-                                     (DESCRIPTION_REPLACE, _('Description Replace')), (INSTRUCTION_REPLACE, _('Instruction Replace')),
-                                     (NEVER_UNIT, _('Never Unit')), (TRANSPOSE_WORDS, _('Transpose Words')),))
+                            choices=(
+                                (FOOD_ALIAS, _('Food Alias')),
+                                (UNIT_ALIAS, _('Unit Alias')),
+                                (KEYWORD_ALIAS, _('Keyword Alias')),
+                                (DESCRIPTION_REPLACE, _('Description Replace')),
+                                (INSTRUCTION_REPLACE, _('Instruction Replace')),
+                                (NEVER_UNIT, _('Never Unit')),
+                                (TRANSPOSE_WORDS, _('Transpose Words')),
+                                (FOOD_REPLACE, _('Food Replace')),
+                                (UNIT_REPLACE, _('Unit Replace')),
+                                (TITLE_REPLACE, _('Title Replace')),
+                            ))
    name = models.CharField(max_length=128, default='')
    description = models.TextField(blank=True, null=True)

--- a/docs/features/automation.md
+++ b/docs/features/automation.md
@ -35,12 +35,15 @@ and what to replace it with.

 -   **Parameter 1**: pattern of which sites to match (e.g. `.*.chefkoch.de.*`, `.*`)
 -   **Parameter 2**: pattern of what to replace (e.g. `.*`)
-   **Parameter 3**: value to replace matched occurrence of parameter 2 with. Only one occurrence of the pattern is replaced.
+-   **Parameter 3**: value to replace matched occurrence of parameter 2 with. Only the first occurrence of the pattern is replaced.

 To replace the description, the Python [re.sub](https://docs.python.org/3/library/re.html#re.sub) function is used
-like this `re.sub(<parameter 2>, <parameter 2>, <descriotion>, count=1)`
+like this `re.sub(<parameter 2>, <parameter 3>, <description>, count=1)`

 To test out your patterns and learn about RegEx you can use [regexr.com](https://regexr.com/)
+ChatGPT and similar LLMs are also useful for creating RegEx patterns:
+`ChatGPT please create a Regex expression in the format of re.sub(<parameter 2>, <parameter 3>, <description>, count=1)
+that will change the string <example string here> into the string <desired result here>`

 !!! info
 In order to prevent denial of service attacks on the RegEx engine the number of replace automations