migrate regex_replace functions to AutomationEngine
create TITLE_REPLACE, UNIT_REPLACE and FOOD_REPLACE automation types; create migration for new types
This commit is contained in:
parent
b8317c2c29
commit
30c891abfc
@ -8,18 +8,28 @@ from cookbook.models import Automation
|
||||
|
||||
class AutomationEngine:
|
||||
request = None
|
||||
source = None
|
||||
use_cache = None
|
||||
food_aliases = None
|
||||
keyword_aliases = None
|
||||
unit_aliases = None
|
||||
never_unit = None
|
||||
transpose_words = None
|
||||
description_replace = None
|
||||
instruction_replace = None
|
||||
regex_replace = {
|
||||
Automation.DESCRIPTION_REPLACE: None,
|
||||
Automation.INSTRUCTION_REPLACE: None,
|
||||
Automation.FOOD_REPLACE: None,
|
||||
Automation.UNIT_REPLACE: None,
|
||||
Automation.TITLE_REPLACE: None,
|
||||
}
|
||||
|
||||
def __init__(self, request, use_cache=True, source=None):
    """
    Set up an automation engine bound to one request.

    :param request: request whose ``space`` is used to select the automations
    :param use_cache: when True, automation rules may be served from the cache
    :param source: source of the data being processed (e.g. the recipe URL);
        regex replace automations only run when their source pattern matches it
    """
    self.request = request
    self.use_cache = use_cache
    # regex_replace is declared as a dict on the class. Without a per-instance
    # copy every engine would write loaded rules into the same shared object,
    # so rules loaded for one request would be reused by all later ones.
    self.regex_replace = dict.fromkeys(type(self).regex_replace)
    # A falsy source falls back to a fixed placeholder so that source patterns
    # are always matched against a string.
    self.source = source or "default_string_to_avoid_false_regex_match"
|
||||
|
||||
def apply_keyword_automation(self, keyword):
|
||||
keyword = keyword.strip()
|
||||
@ -92,7 +102,7 @@ class AutomationEngine:
|
||||
else:
|
||||
if automation := Automation.objects.filter(space=self.request.space, type=Automation.FOOD_ALIAS, param_1__iexact=food, disabled=False).order_by('order').first():
|
||||
return automation.param_2
|
||||
return food
|
||||
return self.apply_regex_replace_automation(food)
|
||||
|
||||
def apply_never_unit_automation(self, tokens):
|
||||
"""
|
||||
@ -151,7 +161,8 @@ class AutomationEngine:
|
||||
caches['default'].touch(TRANSPOSE_WORDS_CACHE_KEY, 30)
|
||||
else:
|
||||
i = 0
|
||||
for a in Automation.objects.filter(space=self.request.space, disabled=False, type=Automation.TRANSPOSE_WORDS).only('param_1', 'param_2').order_by('order').all():
|
||||
for a in Automation.objects.filter(space=self.request.space, disabled=False, type=Automation.TRANSPOSE_WORDS).only(
|
||||
'param_1', 'param_2').order_by('order').all()[:512]:
|
||||
self.transpose_words[i] = [a.param_1.lower(), a.param_2.lower()]
|
||||
i += 1
|
||||
caches['default'].set(TRANSPOSE_WORDS_CACHE_KEY, self.transpose_words, 30)
|
||||
@ -166,10 +177,52 @@ class AutomationEngine:
|
||||
else:
|
||||
for rule in Automation.objects.filter(space=self.request.space, type=Automation.TRANSPOSE_WORDS, disabled=False) \
|
||||
.annotate(param_1_lower=Lower('param_1'), param_2_lower=Lower('param_2')) \
|
||||
.filter(param_1_lower__in=tokens, param_2_lower__in=tokens).order_by('order'):
|
||||
.filter(param_1_lower__in=tokens, param_2_lower__in=tokens).order_by('order')[:512]:
|
||||
if rule.param_1 in tokens and rule.param_2 in tokens:
|
||||
string = re.sub(rf"\b({rule.param_1})\W*({rule.param_2})\b", r"\2 \1", string, flags=re.IGNORECASE)
|
||||
return string
|
||||
|
||||
def apply_regex_replace_automation(self, string, automation_type):
    """
    Apply the regex replace automations of one type to a string.

    Rules are taken from the enabled automations of ``automation_type`` in the
    request's space, in ``order``, limited to the first 512. A rule is applied
    only when its source pattern matches the start of ``self.source`` (first
    512 characters, via ``re.match``). Every occurrence of the rule's pattern
    is replaced; no ``count`` limit is passed to ``re.sub``.

    automation_type is one of the Automation types that apply regex replacements
        Automation.DESCRIPTION_REPLACE
        Automation.INSTRUCTION_REPLACE
        Automation.FOOD_REPLACE   (TODO not yet used by callers)
        Automation.UNIT_REPLACE   (TODO not yet used by callers)
        Automation.TITLE_REPLACE  (TODO not yet used by callers)

    regex replacement uses the following fields from the Automation model
    :param 1: source that should apply the automation in regex format ('.*' for all)
    :param 2: regex pattern to match
    :param 3: replacement string (leave blank to delete)

    :param string: text to run the replacements on
    :param automation_type: Automation type whose rules should be applied
    :return: new string
    """
    # TODO add warning - maybe on SPACE page? when a max of 512 automations of a specific type is exceeded (ALIAS types excluded?)

    # regex_replace is declared on the class; writing into it through self would
    # share loaded rules between all engine instances (and therefore between
    # spaces). Switch to a private copy the first time this instance needs it.
    if 'regex_replace' not in vars(self):
        self.regex_replace = dict.fromkeys(type(self).regex_replace)

    def load_from_db():
        # Same shape as before: position -> [source pattern, pattern, replacement].
        queryset = Automation.objects.filter(space=self.request.space, disabled=False, type=automation_type).only(
            'param_1', 'param_2', 'param_3').order_by('order').all()[:512]
        return {i: [a.param_1, a.param_2, a.param_3] for i, a in enumerate(queryset)}

    if not self.use_cache:
        rules = load_from_db()
    else:
        rules = self.regex_replace.get(automation_type)
        if rules is None:
            # One cache entry per space AND type: a shared key would hand back
            # another type's entry (or None) for this type.
            cache_key = f'automation_regex_replace_{self.request.space.pk}_{automation_type}'
            rules = caches['default'].get(cache_key, None)
            if rules is None:
                rules = load_from_db()
                caches['default'].set(cache_key, rules, 30)
            else:
                # An empty rule set is a valid cached value, hence the
                # ``is None`` test above instead of a truthiness test.
                caches['default'].touch(cache_key, 30)
            self.regex_replace[automation_type] = rules

    source = self.source[:512]
    for source_pattern, pattern, replacement in rules.values():
        if re.match(source_pattern, source):
            string = re.sub(pattern, replacement, string)
    return string
|
||||
|
@ -44,7 +44,6 @@ class IngredientParser:
|
||||
# self.unit_aliases[a.param_1.lower()] = a.param_2
|
||||
# caches['default'].set(UNIT_CACHE_KEY, self.unit_aliases, 30)
|
||||
|
||||
# TODO migrated to automation engine
|
||||
# NEVER_UNIT_CACHE_KEY = f'automation_never_unit_{self.request.space.pk}'
|
||||
# if c := caches['default'].get(NEVER_UNIT_CACHE_KEY, None):
|
||||
# self.never_unit = c
|
||||
@ -54,7 +53,6 @@ class IngredientParser:
|
||||
# self.never_unit[a.param_1.lower()] = a.param_2
|
||||
# caches['default'].set(NEVER_UNIT_CACHE_KEY, self.never_unit, 30)
|
||||
|
||||
# TODO migrated to automation engine
|
||||
# TRANSPOSE_WORDS_CACHE_KEY = f'automation_transpose_words_{self.request.space.pk}'
|
||||
# if c := caches['default'].get(TRANSPOSE_WORDS_CACHE_KEY, None):
|
||||
# self.transpose_words = c
|
||||
|
@ -9,13 +9,37 @@ from isodate.isoerror import ISO8601Error
|
||||
from pytube import YouTube
|
||||
from recipe_scrapers._utils import get_host_name, get_minutes
|
||||
|
||||
from cookbook.helper.automation_helper import AutomationEngine
|
||||
from cookbook.helper.ingredient_parser import IngredientParser
|
||||
from cookbook.models import Automation, Keyword, PropertyType
|
||||
|
||||
|
||||
def get_from_scraper(scrape, request):
|
||||
# converting the scrape_me object to the existing json format based on ld+json
|
||||
recipe_json = {}
|
||||
|
||||
recipe_json = {
|
||||
'steps': [],
|
||||
'internal': True
|
||||
}
|
||||
keywords = []
|
||||
|
||||
# assign source URL
|
||||
try:
|
||||
source_url = scrape.canonical_url()
|
||||
except Exception:
|
||||
try:
|
||||
source_url = scrape.url
|
||||
except Exception:
|
||||
pass
|
||||
if source_url:
|
||||
recipe_json['source_url'] = source_url
|
||||
try:
|
||||
keywords.append(source_url.replace('http://', '').replace('https://', '').split('/')[0])
|
||||
except Exception:
|
||||
recipe_json['source_url'] = ''
|
||||
|
||||
automation_engine = AutomationEngine(request, source=recipe_json['source_url'])
|
||||
# assign recipe name
|
||||
try:
|
||||
recipe_json['name'] = parse_name(scrape.title()[:128] or None)
|
||||
except Exception:
|
||||
@ -29,6 +53,8 @@ def get_from_scraper(scrape, request):
|
||||
if isinstance(recipe_json['name'], list) and len(recipe_json['name']) > 0:
|
||||
recipe_json['name'] = recipe_json['name'][0]
|
||||
|
||||
# assign recipe description
|
||||
# TODO notify user about limit if reached - >256 description will be truncated
|
||||
try:
|
||||
description = scrape.description() or None
|
||||
except Exception:
|
||||
@ -39,8 +65,21 @@ def get_from_scraper(scrape, request):
|
||||
except Exception:
|
||||
description = ''
|
||||
|
||||
recipe_json['internal'] = True
|
||||
recipe_json['description'] = parse_description(description)
|
||||
|
||||
# automations = Automation.objects.filter(
|
||||
# type=Automation.DESCRIPTION_REPLACE,
|
||||
# space=request.space,
|
||||
# disabled=False).only(
|
||||
# 'param_1',
|
||||
# 'param_2',
|
||||
# 'param_3').all().order_by('order')[
|
||||
# :512]
|
||||
# for a in automations:
|
||||
# if re.match(a.param_1, (recipe_json['source_url'])[:512]):
|
||||
recipe_json['description'] = automation_engine.apply_regex_replace_automation(recipe_json['description'], Automation.DESCRIPTION_REPLACE)
|
||||
|
||||
# assign servings attributes
|
||||
try:
|
||||
# dont use scrape.yields() as this will always return "x servings" or "x items", should be improved in scrapers directly
|
||||
servings = scrape.schema.data.get('recipeYield') or 1
|
||||
@ -50,6 +89,7 @@ def get_from_scraper(scrape, request):
|
||||
recipe_json['servings'] = parse_servings(servings)
|
||||
recipe_json['servings_text'] = parse_servings_text(servings)
|
||||
|
||||
# assign time attributes
|
||||
try:
|
||||
recipe_json['working_time'] = get_minutes(scrape.prep_time()) or 0
|
||||
except Exception:
|
||||
@ -74,6 +114,7 @@ def get_from_scraper(scrape, request):
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# assign image
|
||||
try:
|
||||
recipe_json['image'] = parse_image(scrape.image()) or None
|
||||
except Exception:
|
||||
@ -84,7 +125,7 @@ def get_from_scraper(scrape, request):
|
||||
except Exception:
|
||||
recipe_json['image'] = ''
|
||||
|
||||
keywords = []
|
||||
# assign keywords
|
||||
try:
|
||||
if scrape.schema.data.get("keywords"):
|
||||
keywords += listify_keywords(scrape.schema.data.get("keywords"))
|
||||
@ -109,20 +150,6 @@ def get_from_scraper(scrape, request):
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
source_url = scrape.canonical_url()
|
||||
except Exception:
|
||||
try:
|
||||
source_url = scrape.url
|
||||
except Exception:
|
||||
pass
|
||||
if source_url:
|
||||
recipe_json['source_url'] = source_url
|
||||
try:
|
||||
keywords.append(source_url.replace('http://', '').replace('https://', '').split('/')[0])
|
||||
except Exception:
|
||||
recipe_json['source_url'] = ''
|
||||
|
||||
try:
|
||||
if scrape.author():
|
||||
keywords.append(scrape.author())
|
||||
@ -136,7 +163,7 @@ def get_from_scraper(scrape, request):
|
||||
|
||||
ingredient_parser = IngredientParser(request, True)
|
||||
|
||||
recipe_json['steps'] = []
|
||||
# assign steps
|
||||
try:
|
||||
for i in parse_instructions(scrape.instructions()):
|
||||
recipe_json['steps'].append({'instruction': i, 'ingredients': [], 'show_ingredients_table': request.user.userpreference.show_step_ingredients, })
|
||||
@ -145,26 +172,10 @@ def get_from_scraper(scrape, request):
|
||||
if len(recipe_json['steps']) == 0:
|
||||
recipe_json['steps'].append({'instruction': '', 'ingredients': [], })
|
||||
|
||||
parsed_description = parse_description(description)
|
||||
# TODO notify user about limit if reached
|
||||
# limits exist to limit the attack surface for dos style attacks
|
||||
# TODO migrate to AutomationEngine
|
||||
automations = Automation.objects.filter(
|
||||
type=Automation.DESCRIPTION_REPLACE,
|
||||
space=request.space,
|
||||
disabled=False).only(
|
||||
'param_1',
|
||||
'param_2',
|
||||
'param_3').all().order_by('order')[
|
||||
:512]
|
||||
for a in automations:
|
||||
if re.match(a.param_1, (recipe_json['source_url'])[:512]):
|
||||
parsed_description = re.sub(a.param_2, a.param_3, parsed_description, count=1)
|
||||
|
||||
if len(parsed_description) > 256: # split at 256 as long descriptions don't look good on recipe cards
|
||||
recipe_json['steps'][0]['instruction'] = f'*{parsed_description}* \n\n' + recipe_json['steps'][0]['instruction']
|
||||
if len(recipe_json['description']) > 256: # split at 256 as long descriptions don't look good on recipe cards
|
||||
recipe_json['steps'][0]['instruction'] = f"*{recipe_json['description']}* \n\n" + recipe_json['steps'][0]['instruction']
|
||||
else:
|
||||
recipe_json['description'] = parsed_description[:512]
|
||||
recipe_json['description'] = recipe_json['description'][:512]
|
||||
|
||||
try:
|
||||
for x in scrape.ingredients():
|
||||
@ -205,20 +216,20 @@ def get_from_scraper(scrape, request):
|
||||
traceback.print_exc()
|
||||
pass
|
||||
|
||||
if 'source_url' in recipe_json and recipe_json['source_url']:
|
||||
# TODO migrate to AutomationEngine
|
||||
automations = Automation.objects.filter(
|
||||
type=Automation.INSTRUCTION_REPLACE,
|
||||
space=request.space,
|
||||
disabled=False).only(
|
||||
'param_1',
|
||||
'param_2',
|
||||
'param_3').order_by('order').all()[
|
||||
:512]
|
||||
for a in automations:
|
||||
if re.match(a.param_1, (recipe_json['source_url'])[:512]):
|
||||
# if 'source_url' in recipe_json and recipe_json['source_url']:
|
||||
# automations = Automation.objects.filter(
|
||||
# type=Automation.INSTRUCTION_REPLACE,
|
||||
# space=request.space,
|
||||
# disabled=False).only(
|
||||
# 'param_1',
|
||||
# 'param_2',
|
||||
# 'param_3').order_by('order').all()[
|
||||
# :512]
|
||||
# for a in automations:
|
||||
# if re.match(a.param_1, (recipe_json['source_url'])[:512]):
|
||||
for s in recipe_json['steps']:
|
||||
s['instruction'] = re.sub(a.param_2, a.param_3, s['instruction'])
|
||||
s['instruction'] = automation_engine.apply_regex_replace_automation(s['instruction'], Automation.INSTRUCTION_REPLACE)
|
||||
# re.sub(a.param_2, a.param_3, s['instruction'])
|
||||
|
||||
return recipe_json
|
||||
|
||||
@ -268,6 +279,7 @@ def get_from_youtube_scraper(url, request):
|
||||
]
|
||||
}
|
||||
|
||||
# TODO add automation here
|
||||
try:
|
||||
video = YouTube(url=url)
|
||||
default_recipe_json['name'] = video.title
|
||||
@ -416,7 +428,6 @@ def parse_keywords(keyword_json, request):
|
||||
automation_engine = AutomationEngine(request)
|
||||
# keyword_aliases = {}
|
||||
# retrieve keyword automation cache if it exists, otherwise build from database
|
||||
# TODO migrate to AutomationEngine
|
||||
# KEYWORD_CACHE_KEY = f'automation_keyword_alias_{space.pk}'
|
||||
# if c := caches['default'].get(KEYWORD_CACHE_KEY, None):
|
||||
# keyword_aliases = c
|
||||
|
@ -1314,11 +1314,23 @@ class Automation(ExportModelOperationsMixin('automations'), models.Model, Permis
|
||||
INSTRUCTION_REPLACE = 'INSTRUCTION_REPLACE'
|
||||
NEVER_UNIT = 'NEVER_UNIT'
|
||||
TRANSPOSE_WORDS = 'TRANSPOSE_WORDS'
|
||||
FOOD_REPLACE = 'FOOD_REPLACE'
|
||||
UNIT_REPLACE = 'UNIT_REPLACE'
|
||||
TITLE_REPLACE = 'TITLE_REPLACE'
|
||||
|
||||
type = models.CharField(max_length=128,
|
||||
choices=((FOOD_ALIAS, _('Food Alias')), (UNIT_ALIAS, _('Unit Alias')), (KEYWORD_ALIAS, _('Keyword Alias')),
|
||||
(DESCRIPTION_REPLACE, _('Description Replace')), (INSTRUCTION_REPLACE, _('Instruction Replace')),
|
||||
(NEVER_UNIT, _('Never Unit')), (TRANSPOSE_WORDS, _('Transpose Words')),))
|
||||
choices=(
|
||||
(FOOD_ALIAS, _('Food Alias')),
|
||||
(UNIT_ALIAS, _('Unit Alias')),
|
||||
(KEYWORD_ALIAS, _('Keyword Alias')),
|
||||
(DESCRIPTION_REPLACE, _('Description Replace')),
|
||||
(INSTRUCTION_REPLACE, _('Instruction Replace')),
|
||||
(NEVER_UNIT, _('Never Unit')),
|
||||
(TRANSPOSE_WORDS, _('Transpose Words')),
|
||||
(FOOD_REPLACE, _('Food Replace')),
|
||||
(UNIT_REPLACE, _('Unit Replace')),
|
||||
(TITLE_REPLACE, _('Title Replace')),
|
||||
))
|
||||
name = models.CharField(max_length=128, default='')
|
||||
description = models.TextField(blank=True, null=True)
|
||||
|
||||
|
@ -35,12 +35,15 @@ and what to replace it with.
|
||||
|
||||
- **Parameter 1**: pattern of which sites to match (e.g. `.*.chefkoch.de.*`, `.*`)
|
||||
- **Parameter 2**: pattern of what to replace (e.g. `.*`)
|
||||
- **Parameter 3**: value to replace matched occurrence of parameter 2 with. Only one occurrence of the pattern is replaced.
|
||||
- **Parameter 3**: value to replace matched occurrence of parameter 2 with. Only the first occurrence of the pattern is replaced.
|
||||
|
||||
To replace the description the python [re.sub](https://docs.python.org/3/library/re.html#re.sub) function is used
|
||||
like this `re.sub(<parameter 2>, <parameter 2>, <descriotion>, count=1)`
|
||||
like this `re.sub(<parameter 2>, <parameter 3>, <description>, count=1)`
|
||||
|
||||
To test out your patterns and learn about RegEx you can use [regexr.com](https://regexr.com/)
|
||||
ChatGPT and similar LLMs are also useful for creating RegEx patterns:
|
||||
`ChatGPT please create a Regex expression in the format of re.sub(<parameter 2>, <parameter 3>, <description>, count=1)
|
||||
that will change the string <example string here> into the string <desired result here>`
|
||||
|
||||
!!! info
|
||||
In order to prevent denial of service attacks on the RegEx engine the number of replace automations
|
||||
|
Loading…
Reference in New Issue
Block a user