refactored json parser to create functions for each sub parser
This commit is contained in:
@ -3,6 +3,8 @@ import re
|
||||
from isodate import parse_duration as iso_parse_duration
|
||||
from isodate.isoerror import ISO8601Error
|
||||
|
||||
import microdata
|
||||
from bs4 import BeautifulSoup
|
||||
from cookbook.helper.ingredient_parser import parse as parse_single_ingredient
|
||||
from cookbook.models import Keyword
|
||||
from django.utils.dateparse import parse_duration
|
||||
@ -14,18 +16,53 @@ from recipe_scrapers._utils import get_minutes
|
||||
def get_from_scraper(scrape, space):
|
||||
# converting the scrape_me object to the existing json format based on ld+json
|
||||
|
||||
recipe_json = {}
|
||||
try:
|
||||
recipe_json['name'] = parse_name(scrape.title() or scrape.schema.data.get('name') or '')
|
||||
except (TypeError, AttributeError):
|
||||
recipe_json['name'] = ''
|
||||
# first try finding ld+json as its most common
|
||||
for ld in soup.find_all('script', type='application/ld+json'):
|
||||
try:
|
||||
ld_json = json.loads(ld.string.replace('\n', ''))
|
||||
if type(ld_json) != list:
|
||||
ld_json = [ld_json]
|
||||
|
||||
for ld_json_item in ld_json:
|
||||
# recipes type might be wrapped in @graph type
|
||||
if '@graph' in ld_json_item:
|
||||
for x in ld_json_item['@graph']:
|
||||
if '@type' in x and x['@type'] == 'Recipe':
|
||||
ld_json_item = x
|
||||
|
||||
if ('@type' in ld_json_item and ld_json_item['@type'] == 'Recipe'):
|
||||
return JsonResponse(find_recipe_json(ld_json_item, url))
|
||||
except JSONDecodeError:
|
||||
return JsonResponse(
|
||||
{
|
||||
'error': True,
|
||||
'msg': _('The requested site provided malformed data and cannot be read.') # noqa: E501
|
||||
},
|
||||
status=400)
|
||||
|
||||
# now try to find microdata
|
||||
items = microdata.get_items(html_text)
|
||||
for i in items:
|
||||
md_json = json.loads(i.json())
|
||||
if 'schema.org/Recipe' in str(md_json['type']):
|
||||
return JsonResponse(find_recipe_json(md_json['properties'], url))
|
||||
|
||||
return JsonResponse(
|
||||
{
|
||||
'error': True,
|
||||
'msg': _('The requested site does not provide any recognized data format to import the recipe from.') # noqa: E501
|
||||
},
|
||||
status=400)
|
||||
|
||||
|
||||
def find_recipe_json(ld_json, url):
|
||||
ld_json['name'] = parse_name(ld_json['name'])
|
||||
|
||||
try:
|
||||
description = scrape.schema.data.get("description") or ''
|
||||
except AttributeError:
|
||||
description = ''
|
||||
|
||||
recipe_json['description'] = parse_description(description)
|
||||
if 'recipeIngredient' in ld_json:
|
||||
ld_json['recipeIngredient'] = parse_ingredients(ld_json['recipeIngredient'])
|
||||
|
||||
try:
|
||||
servings = scrape.yields()
|
||||
@ -47,78 +84,45 @@ def get_from_scraper(scrape, space):
|
||||
recipe_json['prepTime'] = get_minutes(scrape.total_time()) or 0
|
||||
except AttributeError:
|
||||
pass
|
||||
keywords = []
|
||||
if 'keywords' in ld_json:
|
||||
keywords += listify_keywords(ld_json['keywords'])
|
||||
if 'recipeCategory' in ld_json:
|
||||
keywords += listify_keywords(ld_json['recipeCategory'])
|
||||
if 'recipeCuisine' in ld_json:
|
||||
keywords += listify_keywords(ld_json['keywords'])
|
||||
ld_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords))))
|
||||
|
||||
if 'recipeInstructions' in ld_json:
|
||||
ld_json['recipeInstructions'] = parse_instructions(ld_json['recipeInstructions'])
|
||||
|
||||
if 'image' in ld_json:
|
||||
ld_json['image'] = parse_image(ld_json['image'])
|
||||
|
||||
if 'cookTime' in ld_json:
|
||||
ld_json['cookTime'] = parse_cooktime(ld_json['cookTime'])
|
||||
|
||||
if 'prepTime' in ld_json:
|
||||
ld_json['prepTime'] = parse_cooktime(ld_json['prepTime'])
|
||||
|
||||
ld_json['servings'] = 1
|
||||
try:
|
||||
recipe_json['image'] = parse_image(scrape.image()) or ''
|
||||
except (AttributeError, TypeError, SchemaOrgException):
|
||||
recipe_json['image'] = ''
|
||||
if 'recipeYield' in ld_json:
|
||||
if type(ld_json['recipeYield']) == str:
|
||||
ld_json['servings'] = int(re.findall(r'\b\d+\b', ld_json['recipeYield'])[0])
|
||||
elif type(ld_json['recipeYield']) == list:
|
||||
ld_json['servings'] = int(re.findall(r'\b\d+\b', ld_json['recipeYield'][0])[0])
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
for x in ld_json['recipeIngredient']:
|
||||
if x.replace(' ', '') != '':
|
||||
x = x.replace('½', "0.5").replace('¼', "0.25").replace('¾', "0.75")
|
||||
try:
|
||||
amount, unit, ingredient, note = parse_ingredient(x)
|
||||
if ingredient:
|
||||
ingredients.append(
|
||||
{
|
||||
'amount': amount,
|
||||
'unit': {
|
||||
'text': unit,
|
||||
'id': random.randrange(10000, 99999)
|
||||
},
|
||||
'ingredient': {
|
||||
'text': ingredient,
|
||||
'id': random.randrange(10000, 99999)
|
||||
},
|
||||
'note': note,
|
||||
'original': x
|
||||
}
|
||||
)
|
||||
except Exception:
|
||||
ingredients.append(
|
||||
{
|
||||
'amount': amount,
|
||||
'unit': {
|
||||
'text': unit,
|
||||
'id': random.randrange(10000, 99999)
|
||||
},
|
||||
'ingredient': {
|
||||
'text': ingredient,
|
||||
'id': random.randrange(10000, 99999)
|
||||
},
|
||||
'note': note,
|
||||
'original': x
|
||||
}
|
||||
)
|
||||
except Exception:
|
||||
ingredients.append(
|
||||
{
|
||||
'amount': 0,
|
||||
'unit': {
|
||||
'text': '',
|
||||
'id': random.randrange(10000, 99999)
|
||||
},
|
||||
'ingredient': {
|
||||
'text': x,
|
||||
'id': random.randrange(10000, 99999)
|
||||
},
|
||||
'note': '',
|
||||
'original': x
|
||||
}
|
||||
)
|
||||
recipe_json['recipeIngredient'] = ingredients
|
||||
except AttributeError:
|
||||
recipe_json['recipeIngredient'] = ingredients
|
||||
for key in list(ld_json):
|
||||
if key not in [
|
||||
'prepTime', 'cookTime', 'image', 'recipeInstructions',
|
||||
'keywords', 'name', 'recipeIngredient', 'servings'
|
||||
]:
|
||||
ld_json.pop(key, None)
|
||||
|
||||
try:
|
||||
recipe_json['recipeInstructions'] = parse_instructions(scrape.instructions())
|
||||
except AttributeError:
|
||||
recipe_json['recipeInstructions'] = ""
|
||||
|
||||
if scrape.url:
|
||||
recipe_json['url'] = scrape.url
|
||||
recipe_json['recipeInstructions'] += "\n\nImported from " + scrape.url
|
||||
return recipe_json
|
||||
return ld_json
|
||||
|
||||
|
||||
def parse_name(name):
|
||||
@ -127,17 +131,11 @@ def parse_name(name):
|
||||
name = name[0]
|
||||
except Exception:
|
||||
name = 'ERROR'
|
||||
return normalize_string(name)
|
||||
return name
|
||||
|
||||
|
||||
def parse_ingredients(ingredients):
|
||||
# some pages have comma separated ingredients in a single array entry
|
||||
try:
|
||||
if type(ingredients[0]) == dict:
|
||||
return ingredients
|
||||
except (KeyError, IndexError):
|
||||
pass
|
||||
|
||||
if (len(ingredients) == 1 and type(ingredients) == list):
|
||||
ingredients = ingredients[0].split(',')
|
||||
elif type(ingredients) == str:
|
||||
@ -153,7 +151,6 @@ def parse_ingredients(ingredients):
|
||||
|
||||
for x in ingredients:
|
||||
if x.replace(' ', '') != '':
|
||||
x = x.replace('½', "0.5").replace('¼', "0.25").replace('¾', "0.75")
|
||||
try:
|
||||
amount, unit, ingredient, note = parse_single_ingredient(x)
|
||||
if ingredient:
|
||||
@ -195,10 +192,6 @@ def parse_ingredients(ingredients):
|
||||
return ingredients
|
||||
|
||||
|
||||
def parse_description(description):
|
||||
return normalize_string(description)
|
||||
|
||||
|
||||
def parse_instructions(instructions):
|
||||
instruction_text = ''
|
||||
|
||||
@ -220,98 +213,69 @@ def parse_instructions(instructions):
|
||||
instruction_text += str(i)
|
||||
instructions = instruction_text
|
||||
|
||||
return normalize_string(instructions)
|
||||
instructions = re.sub(r'\n\s*\n', '\n\n', instructions)
|
||||
instructions = re.sub(' +', ' ', instructions)
|
||||
instructions = instructions.replace('<p>', '')
|
||||
instructions = instructions.replace('</p>', '')
|
||||
return instruction_text
|
||||
|
||||
|
||||
def parse_image(image):
|
||||
# check if list of images is returned, take first if so
|
||||
if type(image) == list:
|
||||
for pic in image:
|
||||
if (type(pic) == str) and (pic[:4] == 'http'):
|
||||
image = pic
|
||||
elif 'url' in pic:
|
||||
image = pic['url']
|
||||
elif type(image) == dict:
|
||||
if 'url' in image:
|
||||
image = image['url']
|
||||
if (type(image)) == list:
|
||||
if type(image[0]) == str:
|
||||
image = image[0]
|
||||
elif 'url' in image[0]:
|
||||
image = image[0]['url']
|
||||
|
||||
# ignore relative image paths
|
||||
if image[:4] != 'http':
|
||||
if 'http' not in image:
|
||||
image = ''
|
||||
return image
|
||||
|
||||
|
||||
def parse_servings(servings):
|
||||
if type(servings) == str:
|
||||
try:
|
||||
servings = int(re.search(r'\d+', servings).group())
|
||||
except AttributeError:
|
||||
servings = 1
|
||||
elif type(servings) == list:
|
||||
try:
|
||||
servings = int(re.findall(r'\b\d+\b', servings[0])[0])
|
||||
except KeyError:
|
||||
servings = 1
|
||||
return servings
|
||||
|
||||
|
||||
def parse_cooktime(cooktime):
|
||||
if type(cooktime) not in [int, float]:
|
||||
try:
|
||||
cooktime = float(re.search(r'\d+', cooktime).group())
|
||||
except (ValueError, AttributeError):
|
||||
try:
|
||||
cooktime = round(iso_parse_duration(cooktime).seconds / 60)
|
||||
except ISO8601Error:
|
||||
try:
|
||||
if (type(cooktime) == list and len(cooktime) > 0):
|
||||
cooktime = cooktime[0]
|
||||
cooktime = round(parse_duration(cooktime).seconds / 60)
|
||||
except AttributeError:
|
||||
cooktime = 0
|
||||
|
||||
try:
|
||||
if (type(cooktime) == list and len(cooktime) > 0):
|
||||
cooktime = cooktime[0]
|
||||
cooktime = round(parse_duration(cooktime).seconds / 60)
|
||||
except TypeError:
|
||||
cooktime = 0
|
||||
if type(cooktime) != int or float:
|
||||
cooktime = 0
|
||||
return cooktime
|
||||
|
||||
|
||||
def parse_preptime(preptime):
|
||||
if type(preptime) not in [int, float]:
|
||||
try:
|
||||
preptime = float(re.search(r'\d+', preptime).group())
|
||||
except ValueError:
|
||||
try:
|
||||
preptime = round(iso_parse_duration(preptime).seconds / 60)
|
||||
except ISO8601Error:
|
||||
try:
|
||||
if (type(preptime) == list and len(preptime) > 0):
|
||||
preptime = preptime[0]
|
||||
preptime = round(parse_duration(preptime).seconds / 60)
|
||||
except AttributeError:
|
||||
preptime = 0
|
||||
|
||||
try:
|
||||
if (type(preptime) == list and len(preptime) > 0):
|
||||
preptime = preptime[0]
|
||||
preptime = round(
|
||||
parse_duration(
|
||||
preptime
|
||||
).seconds / 60
|
||||
)
|
||||
except TypeError:
|
||||
preptime = 0
|
||||
if type(preptime) != int or float:
|
||||
preptime = 0
|
||||
return preptime
|
||||
|
||||
|
||||
def parse_keywords(keyword_json, space):
|
||||
def parse_keywords(keyword_json):
|
||||
keywords = []
|
||||
# keywords as list
|
||||
for kw in keyword_json:
|
||||
kw = normalize_string(kw)
|
||||
if len(kw) != 0:
|
||||
if k := Keyword.objects.filter(name=kw, space=space).first():
|
||||
keywords.append({'id': str(k.id), 'text': str(k)})
|
||||
else:
|
||||
keywords.append({'id': random.randrange(1111111, 9999999, 1), 'text': kw})
|
||||
if k := Keyword.objects.filter(name=kw).first():
|
||||
keywords.append({'id': str(k.id), 'text': str(k)})
|
||||
else:
|
||||
keywords.append({'id': random.randrange(1111111, 9999999, 1), 'text': kw})
|
||||
|
||||
return keywords
|
||||
|
||||
|
||||
def listify_keywords(keyword_list):
|
||||
# keywords as string
|
||||
try:
|
||||
if type(keyword_list[0]) == dict:
|
||||
return keyword_list
|
||||
except (KeyError, IndexError):
|
||||
pass
|
||||
if type(keyword_list) == str:
|
||||
keyword_list = keyword_list.split(',')
|
||||
|
||||
|
Reference in New Issue
Block a user