Squashed commit of the following:

commit 36403ecbae
Author: smilerz <smilerz@gmail.com>
Date:   Fri Sep 1 12:04:04 2023 -0500

    update migration for new Automation Types

commit 4620ebaf30
Author: smilerz <smilerz@gmail.com>
Date:   Fri Sep 1 07:49:10 2023 -0500

    add Name and Instruction automation to YouTube importer

commit c907da84c1
Author: smilerz <smilerz@gmail.com>
Date:   Fri Sep 1 07:45:32 2023 -0500

    remove old commented automation code

commit 9b5e39415e
Author: smilerz <smilerz@gmail.com>
Date:   Fri Sep 1 07:37:36 2023 -0500

    test for automations applied during url import
    renamed TITLE_REPLACE to NAME_REPLACE

commit 2679a22464
Author: smilerz <smilerz@gmail.com>
Date:   Thu Aug 31 15:29:59 2023 -0500

    added tests for regex_replace

commit 8bae21025b
Author: smilerz <smilerz@gmail.com>
Date:   Thu Aug 31 13:51:46 2023 -0500

    updated Automation Modal and translations

commit 4120adc546
Author: smilerz <smilerz@gmail.com>
Date:   Thu Aug 31 13:12:41 2023 -0500

    applied regex_replace automation to food and unit automations
    updated automation documentation

commit 30c891abfc
Author: smilerz <smilerz@gmail.com>
Date:   Thu Aug 31 12:46:34 2023 -0500

    migrate regex_replace functions to AutomationEngine
    create TITLE_REPLACE, UNIT_REPLACE and FOOD_REPLACE automation types
    create migration for new types

commit b8317c2c29
Author: smilerz <smilerz@gmail.com>
Date:   Wed Aug 30 20:44:40 2023 -0500

    move transpose words to AutomationEngine
    create tests for transpose words

commit 39253cfd02
Author: smilerz <smilerz@gmail.com>
Date:   Wed Aug 30 17:03:29 2023 -0500

    refactor never_unit automation to AutomationEngine
    create tests for never_unit

commit 7c0b8b151c
Author: smilerz <smilerz@gmail.com>
Date:   Wed Aug 30 11:21:06 2023 -0500

    update ingredient parser to use AutomationEngine for unit, keyword, food
    update test_ingredient_parser tests to accommodate changes

commit 8e1b8923af
Author: smilerz <smilerz@gmail.com>
Date:   Mon Aug 28 16:44:35 2023 -0500

    keyword and unit Automations refactored to AutomationEngine
    keyword and unit automation tests added

commit 52eb876a08
Author: smilerz <smilerz@gmail.com>
Date:   Mon Aug 28 15:03:19 2023 -0500

    food_alias tests added

commit a820b9c09e
Author: smilerz <smilerz@gmail.com>
Date:   Sat Aug 26 12:37:16 2023 -0500

    create AutomationEngine class
    create food_automation method
    refactor food automations to use AutomationEngine
Committed by smilerz on 2023-09-12 09:46:08 -05:00
commit 768a5ea237 (parent c72bf57ccb)
15 changed files with 625 additions and 546 deletions

cookbook/helper/recipe_url_import.py

@@ -2,7 +2,6 @@ import re
 import traceback
 from html import unescape
 
-from django.core.cache import caches
 from django.utils.dateparse import parse_duration
 from django.utils.translation import gettext as _
 from isodate import parse_duration as iso_parse_duration
@@ -10,13 +9,37 @@ from isodate.isoerror import ISO8601Error
 from pytube import YouTube
 from recipe_scrapers._utils import get_host_name, get_minutes
 
+from cookbook.helper.automation_helper import AutomationEngine
 from cookbook.helper.ingredient_parser import IngredientParser
 from cookbook.models import Automation, Keyword, PropertyType
 
 
 def get_from_scraper(scrape, request):
     # converting the scrape_me object to the existing json format based on ld+json
-    recipe_json = {}
+    recipe_json = {
+        'steps': [],
+        'internal': True
+    }
+    keywords = []
+
+    # assign source URL
+    try:
+        source_url = scrape.canonical_url()
+    except Exception:
+        try:
+            source_url = scrape.url
+        except Exception:
+            pass
+    if source_url:
+        recipe_json['source_url'] = source_url
+        try:
+            keywords.append(source_url.replace('http://', '').replace('https://', '').split('/')[0])
+        except Exception:
+            recipe_json['source_url'] = ''
+    automation_engine = AutomationEngine(request, source=recipe_json.get('source_url'))
+
+    # assign recipe name
     try:
         recipe_json['name'] = parse_name(scrape.title()[:128] or None)
     except Exception:
@@ -30,6 +53,10 @@ def get_from_scraper(scrape, request):
     if isinstance(recipe_json['name'], list) and len(recipe_json['name']) > 0:
         recipe_json['name'] = recipe_json['name'][0]
+    recipe_json['name'] = automation_engine.apply_regex_replace_automation(recipe_json['name'], Automation.NAME_REPLACE)
 
+    # assign recipe description
+    # TODO notify user about limit if reached - >256 description will be truncated
     try:
         description = scrape.description() or None
     except Exception:
@@ -40,8 +67,10 @@ def get_from_scraper(scrape, request):
     except Exception:
         description = ''
 
-    recipe_json['internal'] = True
+    recipe_json['description'] = parse_description(description)
+    recipe_json['description'] = automation_engine.apply_regex_replace_automation(recipe_json['description'], Automation.DESCRIPTION_REPLACE)
 
+    # assign servings attributes
     try:
         # dont use scrape.yields() as this will always return "x servings" or "x items", should be improved in scrapers directly
         servings = scrape.schema.data.get('recipeYield') or 1
@@ -51,6 +80,7 @@ def get_from_scraper(scrape, request):
     recipe_json['servings'] = parse_servings(servings)
     recipe_json['servings_text'] = parse_servings_text(servings)
 
+    # assign time attributes
     try:
         recipe_json['working_time'] = get_minutes(scrape.prep_time()) or 0
     except Exception:
@@ -75,6 +105,7 @@ def get_from_scraper(scrape, request):
     except Exception:
         pass
 
+    # assign image
     try:
         recipe_json['image'] = parse_image(scrape.image()) or None
     except Exception:
@@ -85,7 +116,7 @@ def get_from_scraper(scrape, request):
     except Exception:
         recipe_json['image'] = ''
 
-    keywords = []
+    # assign keywords
     try:
         if scrape.schema.data.get("keywords"):
             keywords += listify_keywords(scrape.schema.data.get("keywords"))
@@ -110,20 +141,6 @@ def get_from_scraper(scrape, request):
     except Exception:
         pass
 
-    try:
-        source_url = scrape.canonical_url()
-    except Exception:
-        try:
-            source_url = scrape.url
-        except Exception:
-            pass
-    if source_url:
-        recipe_json['source_url'] = source_url
-        try:
-            keywords.append(source_url.replace('http://', '').replace('https://', '').split('/')[0])
-        except Exception:
-            recipe_json['source_url'] = ''
-
     try:
         if scrape.author():
             keywords.append(scrape.author())
@@ -131,13 +148,13 @@ def get_from_scraper(scrape, request):
         pass
 
     try:
-        recipe_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords))), request.space)
+        recipe_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords))), request)
     except AttributeError:
         recipe_json['keywords'] = keywords
 
     ingredient_parser = IngredientParser(request, True)
-    recipe_json['steps'] = []
 
+    # assign steps
     try:
         for i in parse_instructions(scrape.instructions()):
             recipe_json['steps'].append({'instruction': i, 'ingredients': [], 'show_ingredients_table': request.user.userpreference.show_step_ingredients, })
@@ -146,25 +163,10 @@ def get_from_scraper(scrape, request):
     if len(recipe_json['steps']) == 0:
         recipe_json['steps'].append({'instruction': '', 'ingredients': [], })
 
-    parsed_description = parse_description(description)
-    # TODO notify user about limit if reached
-    # limits exist to limit the attack surface for dos style attacks
-    automations = Automation.objects.filter(
-        type=Automation.DESCRIPTION_REPLACE,
-        space=request.space,
-        disabled=False).only(
-        'param_1',
-        'param_2',
-        'param_3').all().order_by('order')[
-        :512]
-    for a in automations:
-        if re.match(a.param_1, (recipe_json['source_url'])[:512]):
-            parsed_description = re.sub(a.param_2, a.param_3, parsed_description, count=1)
-
-    if len(parsed_description) > 256:  # split at 256 as long descriptions don't look good on recipe cards
-        recipe_json['steps'][0]['instruction'] = f'*{parsed_description}* \n\n' + recipe_json['steps'][0]['instruction']
+    if len(recipe_json['description']) > 256:  # split at 256 as long descriptions don't look good on recipe cards
+        recipe_json['steps'][0]['instruction'] = f"*{recipe_json['description']}* \n\n" + recipe_json['steps'][0]['instruction']
     else:
-        recipe_json['description'] = parsed_description[:512]
+        recipe_json['description'] = recipe_json['description'][:512]
 
     try:
         for x in scrape.ingredients():
@@ -205,19 +207,9 @@ def get_from_scraper(scrape, request):
             traceback.print_exc()
             pass
 
-    if 'source_url' in recipe_json and recipe_json['source_url']:
-        automations = Automation.objects.filter(
-            type=Automation.INSTRUCTION_REPLACE,
-            space=request.space,
-            disabled=False).only(
-            'param_1',
-            'param_2',
-            'param_3').order_by('order').all()[
-            :512]
-        for a in automations:
-            if re.match(a.param_1, (recipe_json['source_url'])[:512]):
-                for s in recipe_json['steps']:
-                    s['instruction'] = re.sub(a.param_2, a.param_3, s['instruction'])
+    for s in recipe_json['steps']:
+        s['instruction'] = automation_engine.apply_regex_replace_automation(s['instruction'], Automation.INSTRUCTION_REPLACE)
+        # re.sub(a.param_2, a.param_3, s['instruction'])
 
     return recipe_json
@@ -267,11 +259,14 @@ def get_from_youtube_scraper(url, request):
         ]
     }
 
-    # TODO add automation here
     try:
+        automation_engine = AutomationEngine(request, source=url)
         video = YouTube(url=url)
-        default_recipe_json['name'] = video.title
+        default_recipe_json['name'] = automation_engine.apply_regex_replace_automation(video.title, Automation.NAME_REPLACE)
         default_recipe_json['image'] = video.thumbnail_url
-        default_recipe_json['steps'][0]['instruction'] = video.description
+        default_recipe_json['steps'][0]['instruction'] = automation_engine.apply_regex_replace_automation(video.description, Automation.INSTRUCTION_REPLACE)
     except Exception:
         pass
@@ -410,18 +405,19 @@ def parse_time(recipe_time):
     return recipe_time
 
 
-def parse_keywords(keyword_json, space):
+def parse_keywords(keyword_json, request):
     keywords = []
-    keyword_aliases = {}
+    automation_engine = AutomationEngine(request)
+    # keyword_aliases = {}
     # retrieve keyword automation cache if it exists, otherwise build from database
-    KEYWORD_CACHE_KEY = f'automation_keyword_alias_{space.pk}'
-    if c := caches['default'].get(KEYWORD_CACHE_KEY, None):
-        keyword_aliases = c
-        caches['default'].touch(KEYWORD_CACHE_KEY, 30)
-    else:
-        for a in Automation.objects.filter(space=space, disabled=False, type=Automation.KEYWORD_ALIAS).only('param_1', 'param_2').order_by('order').all():
-            keyword_aliases[a.param_1.lower()] = a.param_2
-        caches['default'].set(KEYWORD_CACHE_KEY, keyword_aliases, 30)
+    # KEYWORD_CACHE_KEY = f'automation_keyword_alias_{space.pk}'
+    # if c := caches['default'].get(KEYWORD_CACHE_KEY, None):
+    #     keyword_aliases = c
+    #     caches['default'].touch(KEYWORD_CACHE_KEY, 30)
+    # else:
+    #     for a in Automation.objects.filter(space=space, disabled=False, type=Automation.KEYWORD_ALIAS).only('param_1', 'param_2').order_by('order').all():
+    #         keyword_aliases[a.param_1.lower()] = a.param_2
+    #     caches['default'].set(KEYWORD_CACHE_KEY, keyword_aliases, 30)
 
     # keywords as list
     for kw in keyword_json:
@@ -429,12 +425,13 @@ def parse_keywords(keyword_json, space):
         # if alias exists use that instead
         if len(kw) != 0:
-            if keyword_aliases:
-                try:
-                    kw = keyword_aliases[kw.lower()]
-                except KeyError:
-                    pass
-            if k := Keyword.objects.filter(name=kw, space=space).first():
+            # if keyword_aliases:
+            #     try:
+            #         kw = keyword_aliases[kw.lower()]
+            #     except KeyError:
+            #         pass
+            automation_engine.apply_keyword_automation(kw)
+            if k := Keyword.objects.filter(name=kw, space=request.space).first():
                 keywords.append({'label': str(k), 'name': k.name, 'id': k.id})
             else:
                 keywords.append({'label': kw, 'name': kw})