import re
import traceback
from html import unescape
from django.utils.dateparse import parse_duration
from django.utils.translation import gettext as _
from isodate import parse_duration as iso_parse_duration
from isodate.isoerror import ISO8601Error
from pytube import YouTube
from recipe_scrapers._utils import get_host_name, get_minutes
from cookbook.helper.automation_helper import AutomationEngine
from cookbook.helper.ingredient_parser import IngredientParser
from cookbook.models import Automation, Keyword, PropertyType
def get_from_scraper(scrape, request):
# converting the scrape_me object to the existing json format based on ld+json
recipe_json = {
'steps': [],
'internal': True
}
keywords = []
# assign source URL
try:
source_url = scrape.canonical_url()
except Exception:
try:
source_url = scrape.url
except Exception:
pass
if source_url:
recipe_json['source_url'] = source_url
try:
keywords.append(source_url.replace('http://', '').replace('https://', '').split('/')[0])
except Exception:
recipe_json['source_url'] = ''
automation_engine = AutomationEngine(request, source=recipe_json.get('source_url'))
# assign recipe name
try:
recipe_json['name'] = parse_name(scrape.title()[:128] or None)
except Exception:
recipe_json['name'] = None
if not recipe_json['name']:
try:
recipe_json['name'] = scrape.schema.data.get('name') or ''
except Exception:
recipe_json['name'] = ''
if isinstance(recipe_json['name'], list) and len(recipe_json['name']) > 0:
recipe_json['name'] = recipe_json['name'][0]
recipe_json['name'] = automation_engine.apply_regex_replace_automation(recipe_json['name'], Automation.NAME_REPLACE)
# assign recipe description
# TODO notify user about limit if reached - >256 description will be truncated
try:
description = scrape.description() or None
except Exception:
description = None
if not description:
try:
description = scrape.schema.data.get("description") or ''
except Exception:
description = ''
recipe_json['description'] = parse_description(description)
recipe_json['description'] = automation_engine.apply_regex_replace_automation(recipe_json['description'], Automation.DESCRIPTION_REPLACE)
# assign servings attributes
try:
# dont use scrape.yields() as this will always return "x servings" or "x items", should be improved in scrapers directly
servings = scrape.schema.data.get('recipeYield') or 1
except Exception:
servings = 1
recipe_json['servings'] = parse_servings(servings)
recipe_json['servings_text'] = parse_servings_text(servings)
# assign time attributes
try:
recipe_json['working_time'] = get_minutes(scrape.prep_time()) or 0
except Exception:
try:
recipe_json['working_time'] = get_minutes(scrape.schema.data.get("prepTime")) or 0
except Exception:
recipe_json['working_time'] = 0
try:
recipe_json['waiting_time'] = get_minutes(scrape.cook_time()) or 0
except Exception:
try:
recipe_json['waiting_time'] = get_minutes(scrape.schema.data.get("cookTime")) or 0
except Exception:
recipe_json['waiting_time'] = 0
if recipe_json['working_time'] + recipe_json['waiting_time'] == 0:
try:
recipe_json['working_time'] = get_minutes(scrape.total_time()) or 0
except Exception:
try:
recipe_json['working_time'] = get_minutes(scrape.schema.data.get("totalTime")) or 0
except Exception:
pass
# assign image
try:
recipe_json['image'] = parse_image(scrape.image()) or None
except Exception:
recipe_json['image'] = None
if not recipe_json['image']:
try:
recipe_json['image'] = parse_image(scrape.schema.data.get('image')) or ''
except Exception:
recipe_json['image'] = ''
# assign keywords
try:
if scrape.schema.data.get("keywords"):
keywords += listify_keywords(scrape.schema.data.get("keywords"))
except Exception:
pass
try:
if scrape.category():
keywords += listify_keywords(scrape.category())
except Exception:
try:
if scrape.schema.data.get('recipeCategory'):
keywords += listify_keywords(scrape.schema.data.get("recipeCategory"))
except Exception:
pass
try:
if scrape.cuisine():
keywords += listify_keywords(scrape.cuisine())
except Exception:
try:
if scrape.schema.data.get('recipeCuisine'):
keywords += listify_keywords(scrape.schema.data.get("recipeCuisine"))
except Exception:
pass
try:
if scrape.author():
keywords.append(scrape.author())
except Exception:
pass
try:
recipe_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords))), request)
except AttributeError:
recipe_json['keywords'] = keywords
ingredient_parser = IngredientParser(request, True)
# assign steps
try:
for i in parse_instructions(scrape.instructions()):
recipe_json['steps'].append({'instruction': i, 'ingredients': [], 'show_ingredients_table': request.user.userpreference.show_step_ingredients, })
except Exception:
pass
if len(recipe_json['steps']) == 0:
recipe_json['steps'].append({'instruction': '', 'ingredients': [], })
recipe_json['description'] = recipe_json['description'][:512]
if len(recipe_json['description']) > 256: # split at 256 as long descriptions don't look good on recipe cards
recipe_json['steps'][0]['instruction'] = f"*{recipe_json['description']}* \n\n" + recipe_json['steps'][0]['instruction']
try:
for x in scrape.ingredients():
if x.strip() != '':
try:
amount, unit, ingredient, note = ingredient_parser.parse(x)
ingredient = {
'amount': amount,
'food': {
'name': ingredient,
},
'unit': None,
'note': note,
'original_text': x
}
if unit:
ingredient['unit'] = {'name': unit, }
recipe_json['steps'][0]['ingredients'].append(ingredient)
except Exception:
recipe_json['steps'][0]['ingredients'].append(
{
'amount': 0,
'unit': None,
'food': {
'name': x,
},
'note': '',
'original_text': x
}
)
except Exception:
pass
try:
recipe_json['properties'] = get_recipe_properties(request.space, scrape.schema.nutrients())
print(recipe_json['properties'])
except Exception:
traceback.print_exc()
pass
for s in recipe_json['steps']:
s['instruction'] = automation_engine.apply_regex_replace_automation(s['instruction'], Automation.INSTRUCTION_REPLACE)
# re.sub(a.param_2, a.param_3, s['instruction'])
return recipe_json
def get_recipe_properties(space, property_data):
# {'servingSize': '1', 'calories': '302 kcal', 'proteinContent': '7,66g', 'fatContent': '11,56g', 'carbohydrateContent': '41,33g'}
properties = {
"property-calories": "calories",
"property-carbohydrates": "carbohydrateContent",
"property-proteins": "proteinContent",
"property-fats": "fatContent",
}
recipe_properties = []
for pt in PropertyType.objects.filter(space=space, open_data_slug__in=list(properties.keys())).all():
for p in list(properties.keys()):
if pt.open_data_slug == p:
if properties[p] in property_data:
recipe_properties.append({
'property_type': {
'id': pt.id,
'name': pt.name,
},
'property_amount': parse_servings(property_data[properties[p]]) / parse_servings(property_data['servingSize']),
})
return recipe_properties
def get_from_youtube_scraper(url, request):
"""A YouTube Information Scraper."""
kw, created = Keyword.objects.get_or_create(name='YouTube', space=request.space)
default_recipe_json = {
'name': '',
'internal': True,
'description': '',
'servings': 1,
'working_time': 0,
'waiting_time': 0,
'image': "",
'keywords': [{'name': kw.name, 'label': kw.name, 'id': kw.pk}],
'source_url': url,
'steps': [
{
'ingredients': [],
'instruction': ''
}
]
}
try:
automation_engine = AutomationEngine(request, source=url)
video = YouTube(url)
video.streams.first() # this is required to execute some kind of generator/web request that fetches the description
default_recipe_json['name'] = automation_engine.apply_regex_replace_automation(video.title, Automation.NAME_REPLACE)
default_recipe_json['image'] = video.thumbnail_url
if video.description:
default_recipe_json['steps'][0]['instruction'] = automation_engine.apply_regex_replace_automation(video.description, Automation.INSTRUCTION_REPLACE)
except Exception:
pass
return default_recipe_json
def parse_name(name):
if isinstance(name, list):
try:
name = name[0]
except Exception:
name = 'ERROR'
return normalize_string(name)
def parse_description(description):
return normalize_string(description)
def clean_instruction_string(instruction):
# handle HTML tags that can be converted to markup
normalized_string = instruction \
.replace("