refactored json parser to create functions for each sub parser

2021-03-07 14:28:29 -06:00
parent 1690abaf47
commit 8f3f1c230c
1 changed files with 117 additions and 153 deletions
--- a/cookbook/helper/recipe_url_import.py
+++ b/cookbook/helper/recipe_url_import.py
@ -3,6 +3,8 @@ import re
 from isodate import parse_duration as iso_parse_duration
 from isodate.isoerror import ISO8601Error

+import microdata
+from bs4 import BeautifulSoup
 from cookbook.helper.ingredient_parser import parse as parse_single_ingredient
 from cookbook.models import Keyword
 from django.utils.dateparse import parse_duration
@ -14,18 +16,53 @@ from recipe_scrapers._utils import get_minutes
 def get_from_scraper(scrape, space):
    # converting the scrape_me object to the existing json format based on ld+json

-    recipe_json = {}
-    try:
-        recipe_json['name'] = parse_name(scrape.title() or scrape.schema.data.get('name') or '')
-    except (TypeError, AttributeError):
-        recipe_json['name'] = ''
+    # first try finding ld+json as its most common
+    for ld in soup.find_all('script', type='application/ld+json'):
+        try:
+            ld_json = json.loads(ld.string.replace('\n', ''))
+            if type(ld_json) != list:
+                ld_json = [ld_json]
+
+            for ld_json_item in ld_json:
+                # recipes type might be wrapped in @graph type
+                if '@graph' in ld_json_item:
+                    for x in ld_json_item['@graph']:
+                        if '@type' in x and x['@type'] == 'Recipe':
+                            ld_json_item = x
+
+                if ('@type' in ld_json_item and ld_json_item['@type'] == 'Recipe'):
+                    return JsonResponse(find_recipe_json(ld_json_item, url))
+        except JSONDecodeError:
+            return JsonResponse(
+                {
+                    'error': True,
+                    'msg': _('The requested site provided malformed data and cannot be read.')  # noqa: E501
+                },
+                status=400)
+
+    # now try to find microdata
+    items = microdata.get_items(html_text)
+    for i in items:
+        md_json = json.loads(i.json())
+        if 'schema.org/Recipe' in str(md_json['type']):
+            return JsonResponse(find_recipe_json(md_json['properties'], url))
+
+    return JsonResponse(
+        {
+            'error': True,
+            'msg': _('The requested site does not provide any recognized data format to import the recipe from.')  # noqa: E501
+        },
+        status=400)
+
+
+def find_recipe_json(ld_json, url):
+    ld_json['name'] = parse_name(ld_json['name'])

-    try:
-        description = scrape.schema.data.get("description") or ''
    except AttributeError:
        description = ''

-    recipe_json['description'] = parse_description(description)
+    if 'recipeIngredient' in ld_json:
+        ld_json['recipeIngredient'] = parse_ingredients(ld_json['recipeIngredient'])

    try:
        servings = scrape.yields()
@ -47,78 +84,45 @@ def get_from_scraper(scrape, space):
            recipe_json['prepTime'] = get_minutes(scrape.total_time()) or 0
        except AttributeError:
            pass
+    keywords = []
+    if 'keywords' in ld_json:
+        keywords += listify_keywords(ld_json['keywords'])
+    if 'recipeCategory' in ld_json:
+        keywords += listify_keywords(ld_json['recipeCategory'])
+    if 'recipeCuisine' in ld_json:
+        keywords += listify_keywords(ld_json['keywords'])
+    ld_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords))))

+    if 'recipeInstructions' in ld_json:
+        ld_json['recipeInstructions'] = parse_instructions(ld_json['recipeInstructions'])
+
+    if 'image' in ld_json:
+        ld_json['image'] = parse_image(ld_json['image'])
+
+    if 'cookTime' in ld_json:
+        ld_json['cookTime'] = parse_cooktime(ld_json['cookTime'])
+
+    if 'prepTime' in ld_json:
+        ld_json['prepTime'] = parse_cooktime(ld_json['prepTime'])
+
+    ld_json['servings'] = 1
    try:
-        recipe_json['image'] = parse_image(scrape.image()) or ''
-    except (AttributeError, TypeError, SchemaOrgException):
-        recipe_json['image'] = ''
+        if 'recipeYield' in ld_json:
+            if type(ld_json['recipeYield']) == str:
+                ld_json['servings'] = int(re.findall(r'\b\d+\b', ld_json['recipeYield'])[0])
+            elif type(ld_json['recipeYield']) == list:
+                ld_json['servings'] = int(re.findall(r'\b\d+\b', ld_json['recipeYield'][0])[0])
+    except Exception as e:
+        print(e)

-        for x in ld_json['recipeIngredient']:
-            if x.replace(' ', '') != '':
-                x = x.replace('&frac12;', "0.5").replace('&frac14;', "0.25").replace('&frac34;', "0.75")
-                try:
-                    amount, unit, ingredient, note = parse_ingredient(x)
-                    if ingredient:
-                        ingredients.append(
-                            {
-                                'amount': amount,
-                                'unit': {
-                                    'text': unit,
-                                    'id': random.randrange(10000, 99999)
-                                },
-                                'ingredient': {
-                                    'text': ingredient,
-                                    'id': random.randrange(10000, 99999)
-                                },
-                                'note': note,
-                                'original': x
-                            }
-                        )
-                except Exception:
-                    ingredients.append(
-                        {
-                            'amount': amount,
-                            'unit': {
-                                'text': unit,
-                                'id': random.randrange(10000, 99999)
-                            },
-                            'ingredient': {
-                                'text': ingredient,
-                                'id': random.randrange(10000, 99999)
-                            },
-                            'note': note,
-                            'original': x
-                        }
-                    )
-            except Exception:
-                ingredients.append(
-                    {
-                        'amount': 0,
-                        'unit': {
-                            'text': '',
-                            'id': random.randrange(10000, 99999)
-                        },
-                        'ingredient': {
-                            'text': x,
-                            'id': random.randrange(10000, 99999)
-                        },
-                        'note': '',
-                        'original': x
-                    }
-                )
-        recipe_json['recipeIngredient'] = ingredients
-    except AttributeError:
-        recipe_json['recipeIngredient'] = ingredients
+    for key in list(ld_json):
+        if key not in [
+            'prepTime', 'cookTime', 'image', 'recipeInstructions',
+            'keywords', 'name', 'recipeIngredient', 'servings'
+        ]:
+            ld_json.pop(key, None)

-    try:
-        recipe_json['recipeInstructions'] = parse_instructions(scrape.instructions())
-    except AttributeError:
-        recipe_json['recipeInstructions'] = ""
-
-    if scrape.url:
-        recipe_json['url'] = scrape.url
-        recipe_json['recipeInstructions'] += "\n\nImported from " + scrape.url
-    return recipe_json
+    return ld_json


 def parse_name(name):
@ -127,17 +131,11 @@ def parse_name(name):
            name = name[0]
        except Exception:
            name = 'ERROR'
-    return normalize_string(name)
+    return name


 def parse_ingredients(ingredients):
    # some pages have comma separated ingredients in a single array entry
-    try:
-        if type(ingredients[0]) == dict:
-            return ingredients
-    except (KeyError, IndexError):
-        pass
-
    if (len(ingredients) == 1 and type(ingredients) == list):
        ingredients = ingredients[0].split(',')
    elif type(ingredients) == str:
@ -153,7 +151,6 @@ def parse_ingredients(ingredients):

    for x in ingredients:
        if x.replace(' ', '') != '':
-            x = x.replace('&frac12;', "0.5").replace('&frac14;', "0.25").replace('&frac34;', "0.75")
            try:
                amount, unit, ingredient, note = parse_single_ingredient(x)
                if ingredient:
@ -195,10 +192,6 @@ def parse_ingredients(ingredients):
    return ingredients


-def parse_description(description):
-    return normalize_string(description)
-
-
 def parse_instructions(instructions):
    instruction_text = ''

@ -220,98 +213,69 @@ def parse_instructions(instructions):
                    instruction_text += str(i)
        instructions = instruction_text

-    return normalize_string(instructions)
+    instructions = re.sub(r'\n\s*\n', '\n\n', instructions)
+    instructions = re.sub(' +', ' ', instructions)
+    instructions = instructions.replace('<p>', '')
+    instructions = instructions.replace('</p>', '')
+    return instruction_text


 def parse_image(image):
    # check if list of images is returned, take first if so
-    if type(image) == list:
-        for pic in image:
-            if (type(pic) == str) and (pic[:4] == 'http'):
-                image = pic
-            elif 'url' in pic:
-                image = pic['url']
-    elif type(image) == dict:
-        if 'url' in image:
-            image = image['url']
+    if (type(image)) == list:
+        if type(image[0]) == str:
+            image = image[0]
+        elif 'url' in image[0]:
+            image = image[0]['url']

    # ignore relative image paths
-    if image[:4] != 'http':
+    if 'http' not in image:
        image = ''
    return image


-def parse_servings(servings):
-    if type(servings) == str:
-        try:
-            servings = int(re.search(r'\d+', servings).group())
-        except AttributeError:
-            servings = 1
-    elif type(servings) == list:
-        try:
-            servings = int(re.findall(r'\b\d+\b', servings[0])[0])
-        except KeyError:
-            servings = 1
-    return servings
-
-
 def parse_cooktime(cooktime):
-    if type(cooktime) not in [int, float]:
-        try:
-            cooktime = float(re.search(r'\d+', cooktime).group())
-        except (ValueError, AttributeError):
-            try:
-                cooktime = round(iso_parse_duration(cooktime).seconds / 60)
-            except ISO8601Error:
-                try:
-                    if (type(cooktime) == list and len(cooktime) > 0):
-                        cooktime = cooktime[0]
-                    cooktime = round(parse_duration(cooktime).seconds / 60)
-                except AttributeError:
-                    cooktime = 0
-
+    try:
+        if (type(cooktime) == list and len(cooktime) > 0):
+            cooktime = cooktime[0]
+        cooktime = round(parse_duration(cooktime).seconds / 60)
+    except TypeError:
+        cooktime = 0
+    if type(cooktime) != int or float:
+        cooktime = 0
    return cooktime


 def parse_preptime(preptime):
-    if type(preptime) not in [int, float]:
-        try:
-            preptime = float(re.search(r'\d+', preptime).group())
-        except ValueError:
-            try:
-                preptime = round(iso_parse_duration(preptime).seconds / 60)
-            except ISO8601Error:
-                try:
-                    if (type(preptime) == list and len(preptime) > 0):
-                        preptime = preptime[0]
-                    preptime = round(parse_duration(preptime).seconds / 60)
-                except AttributeError:
-                    preptime = 0
-
+    try:
+        if (type(preptime) == list and len(preptime) > 0):
+            preptime = preptime[0]
+        preptime = round(
+            parse_duration(
+                preptime
+            ).seconds / 60
+        )
+    except TypeError:
+        preptime = 0
+    if type(preptime) != int or float:
+        preptime = 0
    return preptime


-def parse_keywords(keyword_json, space):
+def parse_keywords(keyword_json):
    keywords = []
    # keywords as list
    for kw in keyword_json:
-        kw = normalize_string(kw)
-        if len(kw) != 0:
-            if k := Keyword.objects.filter(name=kw, space=space).first():
-                keywords.append({'id': str(k.id), 'text': str(k)})
-            else:
-                keywords.append({'id': random.randrange(1111111, 9999999, 1), 'text': kw})
+        if k := Keyword.objects.filter(name=kw).first():
+            keywords.append({'id': str(k.id), 'text': str(k)})
+        else:
+            keywords.append({'id': random.randrange(1111111, 9999999, 1), 'text': kw})

    return keywords


 def listify_keywords(keyword_list):
    # keywords as string
-    try:
-        if type(keyword_list[0]) == dict:
-            return keyword_list
-    except (KeyError, IndexError):
-        pass
    if type(keyword_list) == str:
        keyword_list = keyword_list.split(',')