From 44dee16e0a8b81f39c5a3bb761782f25d3f39c6e Mon Sep 17 00:00:00 2001 From: smilerz Date: Mon, 15 Mar 2021 15:56:44 -0500 Subject: [PATCH] manually parse json --- cookbook/helper/recipe_html_import.py | 123 +++++++ cookbook/helper/recipe_url_import.py | 106 ++++-- cookbook/templates/import_json.html | 16 +- cookbook/templates/import_json_working.html | 7 +- cookbook/templates/url_import.html | 349 +++++++++++++++++++- cookbook/views/api.py | 21 +- 6 files changed, 566 insertions(+), 56 deletions(-) create mode 100644 cookbook/helper/recipe_html_import.py diff --git a/cookbook/helper/recipe_html_import.py b/cookbook/helper/recipe_html_import.py new file mode 100644 index 00000000..19ef4014 --- /dev/null +++ b/cookbook/helper/recipe_html_import.py @@ -0,0 +1,123 @@ +import json +from json.decoder import JSONDecodeError + +from bs4 import BeautifulSoup +from bs4.element import Tag +# from cookbook.helper.ingredient_parser import parse as parse_ingredient +from cookbook.helper import recipe_url_import as helper + + +# %% + +# %% +def get_from_raw(text): + def build_node(k, v): + if isinstance(v, dict): + node = { + 'name': k, + 'value': k, + 'children': get_children_dict(v) + } + elif isinstance(v, list): + node = { + 'name': k, + 'value': k, + 'children': get_children_list(v) + } + else: + node = { + 'name': k + ": " + str(v), + 'value': str(v) + } + return node + + def get_children_dict(children): + kid_list = [] + for k, v in children.items(): + kid_list.append(build_node(k, v)) + return kid_list + + def get_children_list(children): + kid_list = [] + for kid in children: + if type(kid) == list: + node = { + 'name': "unknown list", + 'value': "unknown list", + 'children': get_children_list(kid) + } + kid_list.append(node) + elif type(kid) == dict: + for k, v in kid.items(): + kid_list.append(build_node(k, v)) + else: + kid_list.append({ + 'name': kid, + 'value': kid + }) + return kid_list + + recipe_json = { + 'name': '', + 'description': '', + 'image': '', + 'keywords': [], + 'recipeIngredient': [], + 'recipeInstructions': '', + 'servings': '', + 'prepTime': '', + 'cookTime': '' + } + recipe_tree = [] + temp_tree = [] + parse_list = [] + + try: + parse_list.append(json.loads(text)) + except JSONDecodeError: + soup = BeautifulSoup(text, "html.parser") + for el in soup.find_all('script', type='application/ld+json'): + parse_list.append(el) + for el in soup.find_all(type='application/json'): + parse_list.append(el) + + # first try finding ld+json as its most common + for el in parse_list: + + if isinstance(el, Tag): + el = json.loads(el.string) + + for k, v in el.items(): + if isinstance(v, dict): + node = { + 'name': k, + 'value': k, + 'children': get_children_dict(v) + } + elif isinstance(v, list): + node = { + 'name': k, + 'value': k, + 'children': get_children_list(v) + } + else: + node = { + 'name': k + ": " + str(v), + 'value': str(v) + } + temp_tree.append(node) + if ('@type' in el and el['@type'] == 'Recipe'): + recipe_json = helper.find_recipe_json(el, None) + recipe_tree += [{'name': 'ld+json', 'children': temp_tree}] + else: + recipe_tree += [{'name': 'json', 'children': temp_tree}] + + temp_tree = [] + + # overide keyword structure from dict to list + kws = [] + for kw in recipe_json['keywords']: + kws.append(kw['text']) + recipe_json['keywords'] = kws + + return recipe_json, recipe_tree diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py index ffb4a370..99e5d161 100644 --- a/cookbook/helper/recipe_url_import.py +++ b/cookbook/helper/recipe_url_import.py @@ -2,6 +2,8 @@ import json import random import re from json import JSONDecodeError +from isodate import parse_duration as iso_parse_duration +from isodate.isoerror import ISO8601Error import microdata from bs4 import BeautifulSoup @@ -64,6 +66,8 @@ def find_recipe_json(ld_json, url): if 'recipeIngredient' in ld_json: ld_json['recipeIngredient'] = parse_ingredients(ld_json['recipeIngredient']) + else: + ld_json['recipeIngredient'] = "" keywords = [] if 'keywords' in ld_json: @@ -71,22 +75,40 @@ def find_recipe_json(ld_json, url): if 'recipeCategory' in ld_json: keywords += listify_keywords(ld_json['recipeCategory']) if 'recipeCuisine' in ld_json: - keywords += listify_keywords(ld_json['keywords']) - ld_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords)))) + keywords += listify_keywords(ld_json['recipeCuisine']) + try: + ld_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords)))) + except TypeError: + pass if 'recipeInstructions' in ld_json: ld_json['recipeInstructions'] = parse_instructions(ld_json['recipeInstructions']) + else: + ld_json['recipeInstructions'] = "" if 'image' in ld_json: ld_json['image'] = parse_image(ld_json['image']) + else: + ld_json['image'] = "" + + if 'description' not in ld_json: + ld_json['description'] = "" if 'cookTime' in ld_json: ld_json['cookTime'] = parse_cooktime(ld_json['cookTime']) + else: + ld_json['cookTime'] = 0 if 'prepTime' in ld_json: ld_json['prepTime'] = parse_cooktime(ld_json['prepTime']) + else: + ld_json['prepTime'] = 0 - ld_json['servings'] = 1 + if 'servings' in ld_json: + if type(ld_json['servings']) == str: + ld_json['servings'] = int(re.search(r'\d+', ld_json['servings']).group()) + else: + ld_json['servings'] = 1 try: if 'recipeYield' in ld_json: if type(ld_json['recipeYield']) == str: @@ -117,6 +139,12 @@ def parse_name(name): def parse_ingredients(ingredients): # some pages have comma separated ingredients in a single array entry + try: + if type(ingredients[0]) == dict: + return ingredients + except (KeyError, IndexError): + pass + if (len(ingredients) == 1 and type(ingredients) == list): ingredients = ingredients[0].split(',') elif type(ingredients) == str: @@ -197,50 +225,59 @@ def parse_instructions(instructions): instructions = re.sub(r'\n\s*\n', '\n\n', instructions) instructions = re.sub(' +', ' ', instructions) - instructions = instructions.replace('

', '') - instructions = instructions.replace('

', '') - return instruction_text + instructions = re.sub('

', '\n', instructions) + instructions = re.sub('<[^<]+?>', '', instructions) + return instructions def parse_image(image): # check if list of images is returned, take first if so - if (type(image)) == list: - if type(image[0]) == str: - image = image[0] - elif 'url' in image[0]: - image = image[0]['url'] + if type(image) == list: + for pic in image: + if (type(pic) == str) and (pic[:4] == 'http'): + image = pic + elif 'url' in pic: + image = pic['url'] # ignore relative image paths - if 'http' not in image: + if image[:4] != 'http': image = '' return image def parse_cooktime(cooktime): - try: - if (type(cooktime) == list and len(cooktime) > 0): - cooktime = cooktime[0] - cooktime = round(parse_duration(cooktime).seconds / 60) - except TypeError: - cooktime = 0 - if type(cooktime) != int or float: - cooktime = 0 + if type(cooktime) not in [int, float]: + try: + cooktime = float(re.search(r'\d+', cooktime).group()) + except (ValueError, AttributeError): + try: + cooktime = round(iso_parse_duration(cooktime).seconds / 60) + except ISO8601Error: + try: + if (type(cooktime) == list and len(cooktime) > 0): + cooktime = cooktime[0] + cooktime = round(parse_duration(cooktime).seconds / 60) + except AttributeError: + cooktime = 0 + return cooktime def parse_preptime(preptime): - try: - if (type(preptime) == list and len(preptime) > 0): - preptime = preptime[0] - preptime = round( - parse_duration( - preptime - ).seconds / 60 - ) - except TypeError: - preptime = 0 - if type(preptime) != int or float: - preptime = 0 + if type(preptime) not in [int, float]: + try: + preptime = float(re.search(r'\d+', preptime).group()) + except ValueError: + try: + preptime = round(iso_parse_duration(preptime).seconds / 60) + except ISO8601Error: + try: + if (type(preptime) == list and len(preptime) > 0): + preptime = preptime[0] + preptime = round(parse_duration(preptime).seconds / 60) + except AttributeError: + preptime = 0 + return preptime @@ -258,6 +295,11 @@ def parse_keywords(keyword_json): def listify_keywords(keyword_list): # keywords as string + try: + if type(keyword_list[0]) == dict: + return keyword_list + except KeyError: + pass if type(keyword_list) == str: keyword_list = keyword_list.split(',') diff --git a/cookbook/templates/import_json.html b/cookbook/templates/import_json.html index e71e77d5..ce778056 100644 --- a/cookbook/templates/import_json.html +++ b/cookbook/templates/import_json.html @@ -1,3 +1,4 @@ + {% extends "base.html" %} {% load crispy_forms_filters %} {% load i18n %} @@ -24,7 +25,7 @@

{% trans 'Import From Source' %}

- +
Simply paste a web page source or JSON document into this textarea and click import.
@@ -51,18 +52,17 @@