migrate regex_replace functions to AutomationEngine

create TITLE_REPLACE, UNIT_REPLACE and FOOD REPLACE automation types
create migration for new types
This commit is contained in:
smilerz 2023-08-31 12:46:34 -05:00
parent b8317c2c29
commit 30c891abfc
No known key found for this signature in database
GPG Key ID: 39444C7606D47126
6 changed files with 143 additions and 66 deletions

View File

@ -8,18 +8,28 @@ from cookbook.models import Automation
class AutomationEngine:
request = None
source = None
use_cache = None
food_aliases = None
keyword_aliases = None
unit_aliases = None
never_unit = None
transpose_words = None
description_replace = None
instruction_replace = None
regex_replace = {
Automation.DESCRIPTION_REPLACE: None,
Automation.INSTRUCTION_REPLACE: None,
Automation.FOOD_REPLACE: None,
Automation.UNIT_REPLACE: None,
Automation.TITLE_REPLACE: None,
}
def __init__(self, request, use_cache=True):
def __init__(self, request, use_cache=True, source=None):
self.request = request
self.use_cache = use_cache
if not source:
self.source = "default_string_to_avoid_false_regex_match"
else:
self.source = source
def apply_keyword_automation(self, keyword):
keyword = keyword.strip()
@ -92,7 +102,7 @@ class AutomationEngine:
else:
if automation := Automation.objects.filter(space=self.request.space, type=Automation.FOOD_ALIAS, param_1__iexact=food, disabled=False).order_by('order').first():
return automation.param_2
return food
return self.apply_regex_replace_automation(food)
def apply_never_unit_automation(self, tokens):
"""
@ -151,7 +161,8 @@ class AutomationEngine:
caches['default'].touch(TRANSPOSE_WORDS_CACHE_KEY, 30)
else:
i = 0
for a in Automation.objects.filter(space=self.request.space, disabled=False, type=Automation.TRANSPOSE_WORDS).only('param_1', 'param_2').order_by('order').all():
for a in Automation.objects.filter(space=self.request.space, disabled=False, type=Automation.TRANSPOSE_WORDS).only(
'param_1', 'param_2').order_by('order').all()[:512]:
self.transpose_words[i] = [a.param_1.lower(), a.param_2.lower()]
i += 1
caches['default'].set(TRANSPOSE_WORDS_CACHE_KEY, self.transpose_words, 30)
@ -166,10 +177,52 @@ class AutomationEngine:
else:
for rule in Automation.objects.filter(space=self.request.space, type=Automation.TRANSPOSE_WORDS, disabled=False) \
.annotate(param_1_lower=Lower('param_1'), param_2_lower=Lower('param_2')) \
.filter(param_1_lower__in=tokens, param_2_lower__in=tokens).order_by('order'):
.filter(param_1_lower__in=tokens, param_2_lower__in=tokens).order_by('order')[:512]:
if rule.param_1 in tokens and rule.param_2 in tokens:
string = re.sub(rf"\b({rule.param_1})\W*({rule.param_2})\b", r"\2 \1", string, flags=re.IGNORECASE)
return string
def apply_regex_replace_automation(self, string):
def apply_regex_replace_automation(self, string, automation_type):
# TODO add warning - maybe on SPACE page? when a max of 512 automations of a specific type is exceeded (ALIAS types excluded?)
"""
Replaces strings in a recipe field that are from a matched source
field_type are Automation.type that apply regex replacements
Automation.DESCRIPTION_REPLACE
Automation.INSTRUCTION_REPLACE
# TODO implement these
Automation.FOOD_REPLACE
Automation.UNIT_REPLACE
Automation.TITLE_REPLACE
regex replacment utilized the following fields from the Automation model
:param 1: source that should apply the automation in regex format ('.*' for all)
:param 2: regex pattern to match ()
:param 3: replacement string (leave blank to delete)
return: new string
"""
if self.use_cache and self.regex_replace[automation_type] is None:
self.regex_replace[automation_type] = {}
REGEX_REPLACE_CACHE_KEY = f'automation_regex_replace_{self.request.space.pk}'
if c := caches['default'].get(REGEX_REPLACE_CACHE_KEY, None):
self.regex_replace[automation_type] = c[automation_type]
caches['default'].touch(REGEX_REPLACE_CACHE_KEY, 30)
else:
i = 0
for a in Automation.objects.filter(space=self.request.space, disabled=False, type=automation_type).only(
'param_1', 'param_2', 'param_3').order_by('order').all()[:512]:
self.regex_replace[automation_type][i] = [a.param_1, a.param_2, a.param_3]
i += 1
caches['default'].set(REGEX_REPLACE_CACHE_KEY, self.regex_replace, 30)
else:
self.regex_replace[automation_type] = {}
if self.regex_replace[automation_type]:
for rule in self.regex_replace[automation_type].values():
if re.match(rule[0], (self.source)[:512]):
string = re.sub(rule[1], rule[2], string)
else:
for rule in Automation.objects.filter(space=self.request.space, disabled=False, type=automation_type).only(
'param_1', 'param_2', 'param_3').order_by('order').all()[:512]:
if re.match(rule.param_1, (self.source)[:512]):
string = re.sub(rule.param_2, rule.param_3, string)
return string

View File

@ -44,7 +44,6 @@ class IngredientParser:
# self.unit_aliases[a.param_1.lower()] = a.param_2
# caches['default'].set(UNIT_CACHE_KEY, self.unit_aliases, 30)
# TODO migrated to automation engine
# NEVER_UNIT_CACHE_KEY = f'automation_never_unit_{self.request.space.pk}'
# if c := caches['default'].get(NEVER_UNIT_CACHE_KEY, None):
# self.never_unit = c
@ -54,7 +53,6 @@ class IngredientParser:
# self.never_unit[a.param_1.lower()] = a.param_2
# caches['default'].set(NEVER_UNIT_CACHE_KEY, self.never_unit, 30)
# TODO migrated to automation engine
# TRANSPOSE_WORDS_CACHE_KEY = f'automation_transpose_words_{self.request.space.pk}'
# if c := caches['default'].get(TRANSPOSE_WORDS_CACHE_KEY, None):
# self.transpose_words = c

View File

@ -9,13 +9,37 @@ from isodate.isoerror import ISO8601Error
from pytube import YouTube
from recipe_scrapers._utils import get_host_name, get_minutes
from cookbook.helper.automation_helper import AutomationEngine
from cookbook.helper.ingredient_parser import IngredientParser
from cookbook.models import Automation, Keyword, PropertyType
def get_from_scraper(scrape, request):
# converting the scrape_me object to the existing json format based on ld+json
recipe_json = {}
recipe_json = {
'steps': [],
'internal': True
}
keywords = []
# assign source URL
try:
source_url = scrape.canonical_url()
except Exception:
try:
source_url = scrape.url
except Exception:
pass
if source_url:
recipe_json['source_url'] = source_url
try:
keywords.append(source_url.replace('http://', '').replace('https://', '').split('/')[0])
except Exception:
recipe_json['source_url'] = ''
automation_engine = AutomationEngine(request, source=recipe_json['source_url'])
# assign recipe name
try:
recipe_json['name'] = parse_name(scrape.title()[:128] or None)
except Exception:
@ -29,6 +53,8 @@ def get_from_scraper(scrape, request):
if isinstance(recipe_json['name'], list) and len(recipe_json['name']) > 0:
recipe_json['name'] = recipe_json['name'][0]
# assign recipe description
# TODO notify user about limit if reached - >256 description will be truncated
try:
description = scrape.description() or None
except Exception:
@ -39,8 +65,21 @@ def get_from_scraper(scrape, request):
except Exception:
description = ''
recipe_json['internal'] = True
recipe_json['description'] = parse_description(description)
# automations = Automation.objects.filter(
# type=Automation.DESCRIPTION_REPLACE,
# space=request.space,
# disabled=False).only(
# 'param_1',
# 'param_2',
# 'param_3').all().order_by('order')[
# :512]
# for a in automations:
# if re.match(a.param_1, (recipe_json['source_url'])[:512]):
recipe_json['description'] = automation_engine.apply_regex_replace_automation(recipe_json['description'], Automation.DESCRIPTION_REPLACE)
# assign servings attributes
try:
# dont use scrape.yields() as this will always return "x servings" or "x items", should be improved in scrapers directly
servings = scrape.schema.data.get('recipeYield') or 1
@ -50,6 +89,7 @@ def get_from_scraper(scrape, request):
recipe_json['servings'] = parse_servings(servings)
recipe_json['servings_text'] = parse_servings_text(servings)
# assign time attributes
try:
recipe_json['working_time'] = get_minutes(scrape.prep_time()) or 0
except Exception:
@ -74,6 +114,7 @@ def get_from_scraper(scrape, request):
except Exception:
pass
# assign image
try:
recipe_json['image'] = parse_image(scrape.image()) or None
except Exception:
@ -84,7 +125,7 @@ def get_from_scraper(scrape, request):
except Exception:
recipe_json['image'] = ''
keywords = []
# assign keywords
try:
if scrape.schema.data.get("keywords"):
keywords += listify_keywords(scrape.schema.data.get("keywords"))
@ -109,20 +150,6 @@ def get_from_scraper(scrape, request):
except Exception:
pass
try:
source_url = scrape.canonical_url()
except Exception:
try:
source_url = scrape.url
except Exception:
pass
if source_url:
recipe_json['source_url'] = source_url
try:
keywords.append(source_url.replace('http://', '').replace('https://', '').split('/')[0])
except Exception:
recipe_json['source_url'] = ''
try:
if scrape.author():
keywords.append(scrape.author())
@ -136,7 +163,7 @@ def get_from_scraper(scrape, request):
ingredient_parser = IngredientParser(request, True)
recipe_json['steps'] = []
# assign steps
try:
for i in parse_instructions(scrape.instructions()):
recipe_json['steps'].append({'instruction': i, 'ingredients': [], 'show_ingredients_table': request.user.userpreference.show_step_ingredients, })
@ -145,26 +172,10 @@ def get_from_scraper(scrape, request):
if len(recipe_json['steps']) == 0:
recipe_json['steps'].append({'instruction': '', 'ingredients': [], })
parsed_description = parse_description(description)
# TODO notify user about limit if reached
# limits exist to limit the attack surface for dos style attacks
# TODO migrate to AutomationEngine
automations = Automation.objects.filter(
type=Automation.DESCRIPTION_REPLACE,
space=request.space,
disabled=False).only(
'param_1',
'param_2',
'param_3').all().order_by('order')[
:512]
for a in automations:
if re.match(a.param_1, (recipe_json['source_url'])[:512]):
parsed_description = re.sub(a.param_2, a.param_3, parsed_description, count=1)
if len(parsed_description) > 256: # split at 256 as long descriptions don't look good on recipe cards
recipe_json['steps'][0]['instruction'] = f'*{parsed_description}* \n\n' + recipe_json['steps'][0]['instruction']
if len(recipe_json['description']) > 256: # split at 256 as long descriptions don't look good on recipe cards
recipe_json['steps'][0]['instruction'] = f"*{recipe_json['description']}* \n\n" + recipe_json['steps'][0]['instruction']
else:
recipe_json['description'] = parsed_description[:512]
recipe_json['description'] = recipe_json['description'][:512]
try:
for x in scrape.ingredients():
@ -205,20 +216,20 @@ def get_from_scraper(scrape, request):
traceback.print_exc()
pass
if 'source_url' in recipe_json and recipe_json['source_url']:
# TODO migrate to AutomationEngine
automations = Automation.objects.filter(
type=Automation.INSTRUCTION_REPLACE,
space=request.space,
disabled=False).only(
'param_1',
'param_2',
'param_3').order_by('order').all()[
:512]
for a in automations:
if re.match(a.param_1, (recipe_json['source_url'])[:512]):
# if 'source_url' in recipe_json and recipe_json['source_url']:
# automations = Automation.objects.filter(
# type=Automation.INSTRUCTION_REPLACE,
# space=request.space,
# disabled=False).only(
# 'param_1',
# 'param_2',
# 'param_3').order_by('order').all()[
# :512]
# for a in automations:
# if re.match(a.param_1, (recipe_json['source_url'])[:512]):
for s in recipe_json['steps']:
s['instruction'] = re.sub(a.param_2, a.param_3, s['instruction'])
s['instruction'] = automation_engine.apply_regex_replace_automation(s['instruction'], Automation.INSTRUCTION_REPLACE)
# re.sub(a.param_2, a.param_3, s['instruction'])
return recipe_json
@ -268,6 +279,7 @@ def get_from_youtube_scraper(url, request):
]
}
# TODO add automation here
try:
video = YouTube(url=url)
default_recipe_json['name'] = video.title
@ -416,7 +428,6 @@ def parse_keywords(keyword_json, request):
automation_engine = AutomationEngine(request)
# keyword_aliases = {}
# retrieve keyword automation cache if it exists, otherwise build from database
# TODO migrate to AutomationEngine
# KEYWORD_CACHE_KEY = f'automation_keyword_alias_{space.pk}'
# if c := caches['default'].get(KEYWORD_CACHE_KEY, None):
# keyword_aliases = c

View File

@ -1314,11 +1314,23 @@ class Automation(ExportModelOperationsMixin('automations'), models.Model, Permis
INSTRUCTION_REPLACE = 'INSTRUCTION_REPLACE'
NEVER_UNIT = 'NEVER_UNIT'
TRANSPOSE_WORDS = 'TRANSPOSE_WORDS'
FOOD_REPLACE = 'FOOD_REPLACE'
UNIT_REPLACE = 'UNIT_REPLACE'
TITLE_REPLACE = 'TITLE_REPLACE'
type = models.CharField(max_length=128,
choices=((FOOD_ALIAS, _('Food Alias')), (UNIT_ALIAS, _('Unit Alias')), (KEYWORD_ALIAS, _('Keyword Alias')),
(DESCRIPTION_REPLACE, _('Description Replace')), (INSTRUCTION_REPLACE, _('Instruction Replace')),
(NEVER_UNIT, _('Never Unit')), (TRANSPOSE_WORDS, _('Transpose Words')),))
choices=(
(FOOD_ALIAS, _('Food Alias')),
(UNIT_ALIAS, _('Unit Alias')),
(KEYWORD_ALIAS, _('Keyword Alias')),
(DESCRIPTION_REPLACE, _('Description Replace')),
(INSTRUCTION_REPLACE, _('Instruction Replace')),
(NEVER_UNIT, _('Never Unit')),
(TRANSPOSE_WORDS, _('Transpose Words')),
(FOOD_REPLACE, _('Food Replace')),
(UNIT_REPLACE, _('Unit Replace')),
(TITLE_REPLACE, _('Title Replace')),
))
name = models.CharField(max_length=128, default='')
description = models.TextField(blank=True, null=True)

View File

@ -35,12 +35,15 @@ and what to replace it with.
- **Parameter 1**: pattern of which sites to match (e.g. `.*.chefkoch.de.*`, `.*`)
- **Parameter 2**: pattern of what to replace (e.g. `.*`)
- **Parameter 3**: value to replace matched occurrence of parameter 2 with. Only one occurrence of the pattern is replaced.
- **Parameter 3**: value to replace matched occurrence of parameter 2 with. Only the first occurrence of the pattern is replaced.
To replace the description the python [re.sub](https://docs.python.org/2/library/re.html#re.sub) function is used
like this `re.sub(<parameter 2>, <parameter 2>, <descriotion>, count=1)`
like this `re.sub(<parameter 2>, <parameter 3>, <description>, count=1)`
To test out your patterns and learn about RegEx you can use [regexr.com](https://regexr.com/)
ChatGPT and similiar LLMs are also useful for creating RegEx patterns:
`ChatGPT please create a Regex expression in the format of re.sub(<parameter 2>, <parameter 3>, <description>, count=1)
that will change the string <example string here> into the string <desired result here>`
!!! info
In order to prevent denial of service attacks on the RegEx engine the number of replace automations