diff --git a/cookbook/helper/recipe_html_import.py b/cookbook/helper/recipe_html_import.py index 1ebe1d27..bc47d318 100644 --- a/cookbook/helper/recipe_html_import.py +++ b/cookbook/helper/recipe_html_import.py @@ -1,13 +1,14 @@ import json +import re from json.decoder import JSONDecodeError from bs4 import BeautifulSoup from bs4.element import Tag -# from cookbook.helper.ingredient_parser import parse as parse_ingredient from cookbook.helper import recipe_url_import as helper +from recipe_scrapers._utils import get_host_name, normalize_string -def get_recipe_from_source(text, space): +def get_recipe_from_source(text, url, space): def build_node(k, v): if isinstance(v, dict): node = { @@ -23,8 +24,8 @@ def get_recipe_from_source(text, space): } else: node = { - 'name': k + ": " + str(v), - 'value': str(v) + 'name': k + ": " + normalize_string(str(v)), + 'value': normalize_string(str(v)) } return node @@ -49,8 +50,8 @@ def get_recipe_from_source(text, space): kid_list.append(build_node(k, v)) else: kid_list.append({ - 'name': kid, - 'value': kid + 'name': normalize_string(str(kid)), + 'value': normalize_string(str(kid)) }) return kid_list @@ -68,15 +69,25 @@ def get_recipe_from_source(text, space): recipe_tree = [] temp_tree = [] parse_list = [] + html_data = [] + images = [] try: - parse_list.append(json.loads(text)) + parse_list.append(remove_graph(json.loads(text))) except JSONDecodeError: soup = BeautifulSoup(text, "html.parser") + html_data = get_from_html(soup) + images += get_images_from_source(soup, url) for el in soup.find_all('script', type='application/ld+json'): - parse_list.append(el) + parse_list.append(remove_graph(el)) for el in soup.find_all(type='application/json'): - parse_list.append(el) + parse_list.append(remove_graph(el)) + + # if a url was not provided, try to find one in the first document + if not url: + if parse_list and 'url' in parse_list[0]: + url = parse_list[0]['url'] + # first try finding ld+json as its most common for el in parse_list: @@ -102,28
+113,67 @@ def get_recipe_from_source(text, space): } else: node = { - 'name': k + ": " + str(v), - 'value': str(v) + 'name': k + ": " + normalize_string(str(v)), + 'value': normalize_string(str(v)) } temp_tree.append(node) - # recipes type might be wrapped in @graph type - if '@graph' in el: - for x in el['@graph']: - if '@type' in x and x['@type'] == 'Recipe': - el = x if '@type' in el and el['@type'] == 'Recipe': recipe_json = helper.find_recipe_json(el, None, space) recipe_tree += [{'name': 'ld+json', 'children': temp_tree}] else: recipe_tree += [{'name': 'json', 'children': temp_tree}] - temp_tree = [] - return recipe_json, recipe_tree + return recipe_json, recipe_tree, html_data, images -def get_from_html(text, space): +def get_from_html(soup): + INVISIBLE_ELEMS = ('style', 'script', 'head', 'title') + html = [] for s in soup.strings: if ((s.parent.name not in INVISIBLE_ELEMS) and (len(s.strip()) > 0)): - print(s.parent.name, s, len(s)) \ No newline at end of file + html.append(s) + return html + +# todo - look for site info in the soup +def get_images_from_source(soup, url): + sources = ['src', 'srcset', 'data-src'] + images = [] + img_tags = soup.find_all('img') + if url: + site = get_host_name(url) + prot = url.split(':')[0] + + urls = [] + for img in img_tags: + for src in sources: + try: + urls.append(img[src]) + except KeyError: + pass + + for u in urls: + u = u.split('?')[0] + filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u) + if filename: + if (('http' not in u) and (url)): + # sometimes an image source can be relative + # if it is provide the base url + u = '{}://{}{}'.format(prot, site, u) + if 'http' in u: + images.append(u) + return images + +def remove_graph(el): + # recipes type might be wrapped in @graph type + if isinstance(el, Tag): + try: + el = json.loads(el.string) + except TypeError: + pass + if '@graph' in el: + for x in el['@graph']: + if '@type' in x and x['@type'] == 'Recipe': + el = x + return el \ No newline at end of 
file diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py index 8cf9aaef..00eb6e7a 100644 --- a/cookbook/helper/recipe_url_import.py +++ b/cookbook/helper/recipe_url_import.py @@ -12,7 +12,7 @@ from cookbook.models import Keyword from django.http import JsonResponse from django.utils.dateparse import parse_duration from django.utils.translation import gettext as _ -from recipe_scrapers import _utils +from recipe_scrapers._utils import get_minutes, normalize_string def get_from_html_old(html_text, url, space): @@ -91,7 +91,9 @@ def find_recipe_json(ld_json, url, space): else: ld_json['image'] = "" - if 'description' not in ld_json: + if 'description' in ld_json: + ld_json['description'] = normalize_string(ld_json['description'] ) + else: ld_json['description'] = "" if 'cookTime' in ld_json: @@ -105,18 +107,11 @@ def find_recipe_json(ld_json, url, space): ld_json['prepTime'] = 0 if 'servings' in ld_json: - if type(ld_json['servings']) == str: - ld_json['servings'] = int(re.search(r'\d+', ld_json['servings']).group()) + ld_json['servings'] = parse_servings(ld_json['servings']) + elif 'recipeYield' in ld_json: + ld_json['servings'] = parse_servings(ld_json['recipeYield']) else: ld_json['servings'] = 1 - try: - if 'recipeYield' in ld_json: - if type(ld_json['recipeYield']) == str: - ld_json['servings'] = int(re.findall(r'\b\d+\b', ld_json['recipeYield'])[0]) - elif type(ld_json['recipeYield']) == list: - ld_json['servings'] = int(re.findall(r'\b\d+\b', ld_json['recipeYield'][0])[0]) - except Exception as e: - print(e) for key in list(ld_json): if key not in [ @@ -136,14 +131,14 @@ def get_from_scraper(scrape, space): try: description = scrape.schema.data.get("description") or '' - recipe_json['prepTime'] = _utils.get_minutes(scrape.schema.data.get("prepTime")) or 0 - recipe_json['cookTime'] = _utils.get_minutes(scrape.schema.data.get("cookTime")) or 0 + recipe_json['prepTime'] = get_minutes(scrape.schema.data.get("prepTime")) or 0 + 
recipe_json['cookTime'] = get_minutes(scrape.schema.data.get("cookTime")) or 0 except AttributeError: description = '' recipe_json['prepTime'] = 0 recipe_json['cookTime'] = 0 - recipe_json['description'] = description + recipe_json['description'] = normalize_string(description) try: servings = scrape.yields() @@ -231,7 +226,7 @@ def parse_name(name): name = name[0] except Exception: name = 'ERROR' - return name + return normalize_string(name) def parse_ingredients(ingredients): @@ -324,7 +319,7 @@ def parse_instructions(instructions): instructions = re.sub(' +', ' ', instructions) instructions = re.sub('
', '\n', instructions) instructions = re.sub('<[^<]+?>', '', instructions) - return instructions + return normalize_string(instructions) def parse_image(image): @@ -342,6 +337,19 @@ return image +def parse_servings(servings): + if type(servings) == str: + try: + servings = int(re.search(r'\d+', servings).group()) + except AttributeError: + servings = 1 + elif type(servings) == list: + try: + servings = int(re.findall(r'\b\d+\b', servings[0])[0]) + except (KeyError, IndexError): + servings = 1 + return servings + def parse_cooktime(cooktime): if type(cooktime) not in [int, float]: try: @@ -382,6 +390,7 @@ keywords = [] # keywords as list for kw in keyword_json: + kw = normalize_string(kw) if k := Keyword.objects.filter(name=kw, space=space).first(): keywords.append({'id': str(k.id), 'text': str(k)}) else: @@ -395,7 +404,7 @@ try: if type(keyword_list[0]) == dict: return keyword_list - except KeyError: + except (KeyError, IndexError): pass if type(keyword_list) == str: keyword_list = keyword_list.split(',') diff --git a/cookbook/templates/url_import.html b/cookbook/templates/url_import.html index c915338a..eb02fbf9 100644 --- a/cookbook/templates/url_import.html +++ b/cookbook/templates/url_import.html @@ -37,11 +37,11 @@