migrate regex_replace functions to AutomationEngine

create TITLE_REPLACE, UNIT_REPLACE and FOOD_REPLACE automation types
create migration for new types
This commit is contained in:
smilerz 2023-08-31 12:46:34 -05:00
parent b8317c2c29
commit 30c891abfc
No known key found for this signature in database
GPG Key ID: 39444C7606D47126
6 changed files with 143 additions and 66 deletions

View File

@ -8,18 +8,28 @@ from cookbook.models import Automation
class AutomationEngine: class AutomationEngine:
request = None request = None
source = None
use_cache = None use_cache = None
food_aliases = None food_aliases = None
keyword_aliases = None keyword_aliases = None
unit_aliases = None unit_aliases = None
never_unit = None never_unit = None
transpose_words = None transpose_words = None
description_replace = None regex_replace = {
instruction_replace = None Automation.DESCRIPTION_REPLACE: None,
Automation.INSTRUCTION_REPLACE: None,
Automation.FOOD_REPLACE: None,
Automation.UNIT_REPLACE: None,
Automation.TITLE_REPLACE: None,
}
def __init__(self, request, use_cache=True, source=None):
    """
    Set up an automation engine for one request.

    :param request: request whose ``space`` scopes the automations that are looked up
    :param use_cache: stored on the instance; automation methods check it before using cached rules
    :param source: string that regex replace automations match their source pattern against;
                   a placeholder is used when it is empty or None
    """
    self.request = request
    self.use_cache = use_cache
    # placeholder intended to keep source-specific patterns from matching when no source is known
    self.source = source or "default_string_to_avoid_false_regex_match"
    # Give every engine its own rule memo: the class-level dict is shared by all
    # instances, so filling it in place would leak loaded rules between requests/spaces.
    self.regex_replace = dict.fromkeys(type(self).regex_replace)
def apply_keyword_automation(self, keyword): def apply_keyword_automation(self, keyword):
keyword = keyword.strip() keyword = keyword.strip()
@ -92,7 +102,7 @@ class AutomationEngine:
else: else:
if automation := Automation.objects.filter(space=self.request.space, type=Automation.FOOD_ALIAS, param_1__iexact=food, disabled=False).order_by('order').first(): if automation := Automation.objects.filter(space=self.request.space, type=Automation.FOOD_ALIAS, param_1__iexact=food, disabled=False).order_by('order').first():
return automation.param_2 return automation.param_2
return food return self.apply_regex_replace_automation(food)
def apply_never_unit_automation(self, tokens): def apply_never_unit_automation(self, tokens):
""" """
@ -151,7 +161,8 @@ class AutomationEngine:
caches['default'].touch(TRANSPOSE_WORDS_CACHE_KEY, 30) caches['default'].touch(TRANSPOSE_WORDS_CACHE_KEY, 30)
else: else:
i = 0 i = 0
for a in Automation.objects.filter(space=self.request.space, disabled=False, type=Automation.TRANSPOSE_WORDS).only('param_1', 'param_2').order_by('order').all(): for a in Automation.objects.filter(space=self.request.space, disabled=False, type=Automation.TRANSPOSE_WORDS).only(
'param_1', 'param_2').order_by('order').all()[:512]:
self.transpose_words[i] = [a.param_1.lower(), a.param_2.lower()] self.transpose_words[i] = [a.param_1.lower(), a.param_2.lower()]
i += 1 i += 1
caches['default'].set(TRANSPOSE_WORDS_CACHE_KEY, self.transpose_words, 30) caches['default'].set(TRANSPOSE_WORDS_CACHE_KEY, self.transpose_words, 30)
@ -166,10 +177,52 @@ class AutomationEngine:
else: else:
for rule in Automation.objects.filter(space=self.request.space, type=Automation.TRANSPOSE_WORDS, disabled=False) \ for rule in Automation.objects.filter(space=self.request.space, type=Automation.TRANSPOSE_WORDS, disabled=False) \
.annotate(param_1_lower=Lower('param_1'), param_2_lower=Lower('param_2')) \ .annotate(param_1_lower=Lower('param_1'), param_2_lower=Lower('param_2')) \
.filter(param_1_lower__in=tokens, param_2_lower__in=tokens).order_by('order'): .filter(param_1_lower__in=tokens, param_2_lower__in=tokens).order_by('order')[:512]:
if rule.param_1 in tokens and rule.param_2 in tokens: if rule.param_1 in tokens and rule.param_2 in tokens:
string = re.sub(rf"\b({rule.param_1})\W*({rule.param_2})\b", r"\2 \1", string, flags=re.IGNORECASE) string = re.sub(rf"\b({rule.param_1})\W*({rule.param_2})\b", r"\2 \1", string, flags=re.IGNORECASE)
return string return string
def apply_regex_replace_automation(self, string, automation_type):
    """
    Apply the regex replace automations of one type to a string.

    Rules are applied in their ``order``; a rule is only applied when its source
    pattern matches the beginning of ``self.source`` (first 512 characters).

    automation_type is an Automation.type that performs regex replacement:
        Automation.DESCRIPTION_REPLACE
        Automation.INSTRUCTION_REPLACE
        Automation.FOOD_REPLACE
        Automation.UNIT_REPLACE
        Automation.TITLE_REPLACE

    Regex replacement uses the following fields of the Automation model:
    :param 1: source that should apply the automation, in regex format ('.*' for all)
    :param 2: regex pattern to match
    :param 3: replacement string (leave blank to delete)

    :param string: text to transform
    :param automation_type: Automation.type value selecting which rules to apply
    :return: the transformed string
    """
    # TODO add warning - maybe on SPACE page? when a max of 512 automations of a specific type is exceeded (ALIAS types excluded?)

    def load_rules():
        # at most 512 rules per type are read from the database
        queryset = Automation.objects.filter(space=self.request.space, disabled=False, type=automation_type).only(
            'param_1', 'param_2', 'param_3').order_by('order').all()[:512]
        return {i: [a.param_1, a.param_2, a.param_3] for i, a in enumerate(queryset)}

    if self.use_cache:
        # None means "not loaded yet"; an empty dict is a valid, already-loaded result
        rules = self.regex_replace.get(automation_type)
        if rules is None:
            REGEX_REPLACE_CACHE_KEY = f'automation_regex_replace_{self.request.space.pk}'
            cached = caches['default'].get(REGEX_REPLACE_CACHE_KEY, None) or {}
            # the cached entry holds all types; this type may not have been loaded when it was written
            rules = cached.get(automation_type)
            if rules is None:
                rules = load_rules()
                caches['default'].set(REGEX_REPLACE_CACHE_KEY, {**cached, automation_type: rules}, 30)
            else:
                caches['default'].touch(REGEX_REPLACE_CACHE_KEY, 30)
            # rebind instead of mutating in place so a dict shared at class level is never modified
            self.regex_replace = {**self.regex_replace, automation_type: rules}
    else:
        rules = load_rules()

    source = self.source[:512]
    # NOTE(review): every occurrence is replaced (no count=1); the project docs describe
    # first-occurrence-only for description replace - confirm which behaviour is intended
    for source_pattern, pattern, replacement in rules.values():
        if re.match(source_pattern, source):
            # presumably param_3 may be stored as NULL when left blank; treat that as "delete the match"
            string = re.sub(pattern, replacement or '', string)
    return string

View File

@ -44,7 +44,6 @@ class IngredientParser:
# self.unit_aliases[a.param_1.lower()] = a.param_2 # self.unit_aliases[a.param_1.lower()] = a.param_2
# caches['default'].set(UNIT_CACHE_KEY, self.unit_aliases, 30) # caches['default'].set(UNIT_CACHE_KEY, self.unit_aliases, 30)
# TODO migrated to automation engine
# NEVER_UNIT_CACHE_KEY = f'automation_never_unit_{self.request.space.pk}' # NEVER_UNIT_CACHE_KEY = f'automation_never_unit_{self.request.space.pk}'
# if c := caches['default'].get(NEVER_UNIT_CACHE_KEY, None): # if c := caches['default'].get(NEVER_UNIT_CACHE_KEY, None):
# self.never_unit = c # self.never_unit = c
@ -54,7 +53,6 @@ class IngredientParser:
# self.never_unit[a.param_1.lower()] = a.param_2 # self.never_unit[a.param_1.lower()] = a.param_2
# caches['default'].set(NEVER_UNIT_CACHE_KEY, self.never_unit, 30) # caches['default'].set(NEVER_UNIT_CACHE_KEY, self.never_unit, 30)
# TODO migrated to automation engine
# TRANSPOSE_WORDS_CACHE_KEY = f'automation_transpose_words_{self.request.space.pk}' # TRANSPOSE_WORDS_CACHE_KEY = f'automation_transpose_words_{self.request.space.pk}'
# if c := caches['default'].get(TRANSPOSE_WORDS_CACHE_KEY, None): # if c := caches['default'].get(TRANSPOSE_WORDS_CACHE_KEY, None):
# self.transpose_words = c # self.transpose_words = c

View File

@ -9,13 +9,37 @@ from isodate.isoerror import ISO8601Error
from pytube import YouTube from pytube import YouTube
from recipe_scrapers._utils import get_host_name, get_minutes from recipe_scrapers._utils import get_host_name, get_minutes
from cookbook.helper.automation_helper import AutomationEngine
from cookbook.helper.ingredient_parser import IngredientParser from cookbook.helper.ingredient_parser import IngredientParser
from cookbook.models import Automation, Keyword, PropertyType from cookbook.models import Automation, Keyword, PropertyType
def get_from_scraper(scrape, request): def get_from_scraper(scrape, request):
# converting the scrape_me object to the existing json format based on ld+json # converting the scrape_me object to the existing json format based on ld+json
recipe_json = {}
recipe_json = {
'steps': [],
'internal': True
}
keywords = []
# assign source URL
try:
source_url = scrape.canonical_url()
except Exception:
try:
source_url = scrape.url
except Exception:
pass
if source_url:
recipe_json['source_url'] = source_url
try:
keywords.append(source_url.replace('http://', '').replace('https://', '').split('/')[0])
except Exception:
recipe_json['source_url'] = ''
automation_engine = AutomationEngine(request, source=recipe_json['source_url'])
# assign recipe name
try: try:
recipe_json['name'] = parse_name(scrape.title()[:128] or None) recipe_json['name'] = parse_name(scrape.title()[:128] or None)
except Exception: except Exception:
@ -29,6 +53,8 @@ def get_from_scraper(scrape, request):
if isinstance(recipe_json['name'], list) and len(recipe_json['name']) > 0: if isinstance(recipe_json['name'], list) and len(recipe_json['name']) > 0:
recipe_json['name'] = recipe_json['name'][0] recipe_json['name'] = recipe_json['name'][0]
# assign recipe description
# TODO notify user about limit if reached - >256 description will be truncated
try: try:
description = scrape.description() or None description = scrape.description() or None
except Exception: except Exception:
@ -39,8 +65,21 @@ def get_from_scraper(scrape, request):
except Exception: except Exception:
description = '' description = ''
recipe_json['internal'] = True recipe_json['description'] = parse_description(description)
# automations = Automation.objects.filter(
# type=Automation.DESCRIPTION_REPLACE,
# space=request.space,
# disabled=False).only(
# 'param_1',
# 'param_2',
# 'param_3').all().order_by('order')[
# :512]
# for a in automations:
# if re.match(a.param_1, (recipe_json['source_url'])[:512]):
recipe_json['description'] = automation_engine.apply_regex_replace_automation(recipe_json['description'], Automation.DESCRIPTION_REPLACE)
# assign servings attributes
try: try:
# dont use scrape.yields() as this will always return "x servings" or "x items", should be improved in scrapers directly # dont use scrape.yields() as this will always return "x servings" or "x items", should be improved in scrapers directly
servings = scrape.schema.data.get('recipeYield') or 1 servings = scrape.schema.data.get('recipeYield') or 1
@ -50,6 +89,7 @@ def get_from_scraper(scrape, request):
recipe_json['servings'] = parse_servings(servings) recipe_json['servings'] = parse_servings(servings)
recipe_json['servings_text'] = parse_servings_text(servings) recipe_json['servings_text'] = parse_servings_text(servings)
# assign time attributes
try: try:
recipe_json['working_time'] = get_minutes(scrape.prep_time()) or 0 recipe_json['working_time'] = get_minutes(scrape.prep_time()) or 0
except Exception: except Exception:
@ -74,6 +114,7 @@ def get_from_scraper(scrape, request):
except Exception: except Exception:
pass pass
# assign image
try: try:
recipe_json['image'] = parse_image(scrape.image()) or None recipe_json['image'] = parse_image(scrape.image()) or None
except Exception: except Exception:
@ -84,7 +125,7 @@ def get_from_scraper(scrape, request):
except Exception: except Exception:
recipe_json['image'] = '' recipe_json['image'] = ''
keywords = [] # assign keywords
try: try:
if scrape.schema.data.get("keywords"): if scrape.schema.data.get("keywords"):
keywords += listify_keywords(scrape.schema.data.get("keywords")) keywords += listify_keywords(scrape.schema.data.get("keywords"))
@ -109,20 +150,6 @@ def get_from_scraper(scrape, request):
except Exception: except Exception:
pass pass
try:
source_url = scrape.canonical_url()
except Exception:
try:
source_url = scrape.url
except Exception:
pass
if source_url:
recipe_json['source_url'] = source_url
try:
keywords.append(source_url.replace('http://', '').replace('https://', '').split('/')[0])
except Exception:
recipe_json['source_url'] = ''
try: try:
if scrape.author(): if scrape.author():
keywords.append(scrape.author()) keywords.append(scrape.author())
@ -136,7 +163,7 @@ def get_from_scraper(scrape, request):
ingredient_parser = IngredientParser(request, True) ingredient_parser = IngredientParser(request, True)
recipe_json['steps'] = [] # assign steps
try: try:
for i in parse_instructions(scrape.instructions()): for i in parse_instructions(scrape.instructions()):
recipe_json['steps'].append({'instruction': i, 'ingredients': [], 'show_ingredients_table': request.user.userpreference.show_step_ingredients, }) recipe_json['steps'].append({'instruction': i, 'ingredients': [], 'show_ingredients_table': request.user.userpreference.show_step_ingredients, })
@ -145,26 +172,10 @@ def get_from_scraper(scrape, request):
if len(recipe_json['steps']) == 0: if len(recipe_json['steps']) == 0:
recipe_json['steps'].append({'instruction': '', 'ingredients': [], }) recipe_json['steps'].append({'instruction': '', 'ingredients': [], })
parsed_description = parse_description(description) if len(recipe_json['description']) > 256: # split at 256 as long descriptions don't look good on recipe cards
# TODO notify user about limit if reached recipe_json['steps'][0]['instruction'] = f"*{recipe_json['description']}* \n\n" + recipe_json['steps'][0]['instruction']
# limits exist to limit the attack surface for dos style attacks
# TODO migrate to AutomationEngine
automations = Automation.objects.filter(
type=Automation.DESCRIPTION_REPLACE,
space=request.space,
disabled=False).only(
'param_1',
'param_2',
'param_3').all().order_by('order')[
:512]
for a in automations:
if re.match(a.param_1, (recipe_json['source_url'])[:512]):
parsed_description = re.sub(a.param_2, a.param_3, parsed_description, count=1)
if len(parsed_description) > 256: # split at 256 as long descriptions don't look good on recipe cards
recipe_json['steps'][0]['instruction'] = f'*{parsed_description}* \n\n' + recipe_json['steps'][0]['instruction']
else: else:
recipe_json['description'] = parsed_description[:512] recipe_json['description'] = recipe_json['description'][:512]
try: try:
for x in scrape.ingredients(): for x in scrape.ingredients():
@ -205,20 +216,20 @@ def get_from_scraper(scrape, request):
traceback.print_exc() traceback.print_exc()
pass pass
if 'source_url' in recipe_json and recipe_json['source_url']: # if 'source_url' in recipe_json and recipe_json['source_url']:
# TODO migrate to AutomationEngine # automations = Automation.objects.filter(
automations = Automation.objects.filter( # type=Automation.INSTRUCTION_REPLACE,
type=Automation.INSTRUCTION_REPLACE, # space=request.space,
space=request.space, # disabled=False).only(
disabled=False).only( # 'param_1',
'param_1', # 'param_2',
'param_2', # 'param_3').order_by('order').all()[
'param_3').order_by('order').all()[ # :512]
:512] # for a in automations:
for a in automations: # if re.match(a.param_1, (recipe_json['source_url'])[:512]):
if re.match(a.param_1, (recipe_json['source_url'])[:512]): for s in recipe_json['steps']:
for s in recipe_json['steps']: s['instruction'] = automation_engine.apply_regex_replace_automation(s['instruction'], Automation.INSTRUCTION_REPLACE)
s['instruction'] = re.sub(a.param_2, a.param_3, s['instruction']) # re.sub(a.param_2, a.param_3, s['instruction'])
return recipe_json return recipe_json
@ -268,6 +279,7 @@ def get_from_youtube_scraper(url, request):
] ]
} }
# TODO add automation here
try: try:
video = YouTube(url=url) video = YouTube(url=url)
default_recipe_json['name'] = video.title default_recipe_json['name'] = video.title
@ -416,7 +428,6 @@ def parse_keywords(keyword_json, request):
automation_engine = AutomationEngine(request) automation_engine = AutomationEngine(request)
# keyword_aliases = {} # keyword_aliases = {}
# retrieve keyword automation cache if it exists, otherwise build from database # retrieve keyword automation cache if it exists, otherwise build from database
# TODO migrate to AutomationEngine
# KEYWORD_CACHE_KEY = f'automation_keyword_alias_{space.pk}' # KEYWORD_CACHE_KEY = f'automation_keyword_alias_{space.pk}'
# if c := caches['default'].get(KEYWORD_CACHE_KEY, None): # if c := caches['default'].get(KEYWORD_CACHE_KEY, None):
# keyword_aliases = c # keyword_aliases = c

View File

@ -1314,11 +1314,23 @@ class Automation(ExportModelOperationsMixin('automations'), models.Model, Permis
INSTRUCTION_REPLACE = 'INSTRUCTION_REPLACE' INSTRUCTION_REPLACE = 'INSTRUCTION_REPLACE'
NEVER_UNIT = 'NEVER_UNIT' NEVER_UNIT = 'NEVER_UNIT'
TRANSPOSE_WORDS = 'TRANSPOSE_WORDS' TRANSPOSE_WORDS = 'TRANSPOSE_WORDS'
FOOD_REPLACE = 'FOOD_REPLACE'
UNIT_REPLACE = 'UNIT_REPLACE'
TITLE_REPLACE = 'TITLE_REPLACE'
type = models.CharField(max_length=128, type = models.CharField(max_length=128,
choices=((FOOD_ALIAS, _('Food Alias')), (UNIT_ALIAS, _('Unit Alias')), (KEYWORD_ALIAS, _('Keyword Alias')), choices=(
(DESCRIPTION_REPLACE, _('Description Replace')), (INSTRUCTION_REPLACE, _('Instruction Replace')), (FOOD_ALIAS, _('Food Alias')),
(NEVER_UNIT, _('Never Unit')), (TRANSPOSE_WORDS, _('Transpose Words')),)) (UNIT_ALIAS, _('Unit Alias')),
(KEYWORD_ALIAS, _('Keyword Alias')),
(DESCRIPTION_REPLACE, _('Description Replace')),
(INSTRUCTION_REPLACE, _('Instruction Replace')),
(NEVER_UNIT, _('Never Unit')),
(TRANSPOSE_WORDS, _('Transpose Words')),
(FOOD_REPLACE, _('Food Replace')),
(UNIT_REPLACE, _('Unit Replace')),
(TITLE_REPLACE, _('Title Replace')),
))
name = models.CharField(max_length=128, default='') name = models.CharField(max_length=128, default='')
description = models.TextField(blank=True, null=True) description = models.TextField(blank=True, null=True)

View File

@ -35,12 +35,15 @@ and what to replace it with.
- **Parameter 1**: pattern of which sites to match (e.g. `.*.chefkoch.de.*`, `.*`) - **Parameter 1**: pattern of which sites to match (e.g. `.*.chefkoch.de.*`, `.*`)
- **Parameter 2**: pattern of what to replace (e.g. `.*`) - **Parameter 2**: pattern of what to replace (e.g. `.*`)
- **Parameter 3**: value to replace matched occurrence of parameter 2 with. Only one occurrence of the pattern is replaced. - **Parameter 3**: value to replace matched occurrence of parameter 2 with. Only the first occurrence of the pattern is replaced.
To replace the description the python [re.sub](https://docs.python.org/2/library/re.html#re.sub) function is used To replace the description the python [re.sub](https://docs.python.org/2/library/re.html#re.sub) function is used
like this `re.sub(<parameter 2>, <parameter 2>, <descriotion>, count=1)` like this `re.sub(<parameter 2>, <parameter 3>, <description>, count=1)`
To test out your patterns and learn about RegEx you can use [regexr.com](https://regexr.com/) To test out your patterns and learn about RegEx you can use [regexr.com](https://regexr.com/)
ChatGPT and similar LLMs are also useful for creating RegEx patterns:
`ChatGPT please create a Regex expression in the format of re.sub(<parameter 2>, <parameter 3>, <description>, count=1)
that will change the string <example string here> into the string <desired result here>`
!!! info !!! info
In order to prevent denial of service attacks on the RegEx engine the number of replace automations In order to prevent denial of service attacks on the RegEx engine the number of replace automations