From ec6a10ca0a24c2b0f0bb6741c4d1e359d6944483 Mon Sep 17 00:00:00 2001 From: smilerz Date: Sat, 3 Apr 2021 11:12:01 -0500 Subject: [PATCH] ensure time is always a number --- cookbook/helper/recipe_html_import.py | 16 +++--- cookbook/helper/recipe_url_import.py | 81 ++------------------------- cookbook/helper/scrapers/scrapers.py | 10 ++-- 3 files changed, 16 insertions(+), 91 deletions(-) diff --git a/cookbook/helper/recipe_html_import.py b/cookbook/helper/recipe_html_import.py index d2264721..6df0ce1a 100644 --- a/cookbook/helper/recipe_html_import.py +++ b/cookbook/helper/recipe_html_import.py @@ -5,13 +5,10 @@ from bs4 import BeautifulSoup from bs4.element import Tag from cookbook.helper import recipe_url_import as helper from cookbook.helper.scrapers.scrapers import text_scraper +from json import JSONDecodeError from recipe_scrapers._utils import get_host_name, normalize_string -from bs4 import BeautifulSoup -from json import JSONDecodeError -from json.decoder import JSONDecodeError - def get_recipe_from_source(text, url, space): def build_node(k, v): if isinstance(v, dict): @@ -78,11 +75,9 @@ def get_recipe_from_source(text, url, space): text = normalize_string(text) try: parse_list.append(remove_graph(json.loads(text))) - scrape = text_scraper("") - + except JSONDecodeError: soup = BeautifulSoup(text, "html.parser") - scrape = text_scraper(text) html_data = get_from_html(soup) images += get_images_from_source(soup, url) for el in soup.find_all('script', type='application/ld+json'): @@ -94,7 +89,11 @@ def get_recipe_from_source(text, url, space): if not url and len(parse_list) > 0: if 'url' in parse_list[0]: url = parse_list[0]['url'] - + + if type(text) == dict: + scrape = text_scraper("", url=url) + elif type(text) == str: + scrape = text_scraper(text, url=url) recipe_json = helper.get_from_scraper(scrape, space) @@ -130,7 +129,6 @@ def get_recipe_from_source(text, url, space): recipe_tree += [{'name': 'ld+json', 'children': temp_tree}] else: recipe_tree += [{'name': 'json', 'children': temp_tree}] - return recipe_json, recipe_tree, html_data, images diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py index f8b25e07..7e0eade9 100644 --- a/cookbook/helper/recipe_url_import.py +++ b/cookbook/helper/recipe_url_import.py @@ -1,86 +1,14 @@ -import json import random import re -from json import JSONDecodeError from isodate import parse_duration as iso_parse_duration from isodate.isoerror import ISO8601Error -import microdata -from bs4 import BeautifulSoup from cookbook.helper.ingredient_parser import parse as parse_single_ingredient from cookbook.models import Keyword -from django.http import JsonResponse from django.utils.dateparse import parse_duration -from django.utils.translation import gettext as _ from recipe_scrapers._utils import get_minutes, normalize_string -# def find_recipe_json(ld_json, url, space): -# ld_json['name'] = parse_name(ld_json['name']) - -# # some sites use ingredients instead of recipeIngredients -# if 'recipeIngredient' not in ld_json and 'ingredients' in ld_json: -# ld_json['recipeIngredient'] = ld_json['ingredients'] - -# if 'recipeIngredient' in ld_json: -# ld_json['recipeIngredient'] = parse_ingredients(ld_json['recipeIngredient']) -# else: -# ld_json['recipeIngredient'] = "" - -# keywords = [] -# if 'keywords' in ld_json: -# keywords += listify_keywords(ld_json['keywords']) -# if 'recipeCategory' in ld_json: -# keywords += listify_keywords(ld_json['recipeCategory']) -# if 'recipeCuisine' in ld_json: -# keywords += listify_keywords(ld_json['recipeCuisine']) -# try: -# ld_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords))), space) -# except TypeError: -# pass - -# if 'recipeInstructions' in ld_json: -# ld_json['recipeInstructions'] = parse_instructions(ld_json['recipeInstructions']) -# else: -# ld_json['recipeInstructions'] = "" - -# if 'image' in ld_json: -# ld_json['image'] = parse_image(ld_json['image']) -# else: -# ld_json['image'] = "" - -# if 'description' in ld_json: -# ld_json['description'] = normalize_string(ld_json['description']) -# else: -# ld_json['description'] = "" - -# if 'cookTime' in ld_json: -# ld_json['cookTime'] = parse_cooktime(ld_json['cookTime']) -# else: -# ld_json['cookTime'] = 0 - -# if 'prepTime' in ld_json: -# ld_json['prepTime'] = parse_cooktime(ld_json['prepTime']) -# else: -# ld_json['prepTime'] = 0 - -# if 'servings' in ld_json: -# ld_json['servings'] = parse_servings(ld_json['servings']) -# elif 'recipeYield' in ld_json: -# ld_json['servings'] = parse_servings(ld_json['recipeYield']) -# else: -# ld_json['servings'] = 1 - -# for key in list(ld_json): -# if key not in [ -# 'prepTime', 'cookTime', 'image', 'recipeInstructions', -# 'keywords', 'name', 'recipeIngredient', 'servings', 'description' -# ]: -# ld_json.pop(key, None) - -# return ld_json - - def get_from_scraper(scrape, space): # converting the scrape_me object to the existing json format based on ld+json @@ -89,12 +17,9 @@ def get_from_scraper(scrape, space): try: description = scrape.schema.data.get("description") or '' - recipe_json['prepTime'] = get_minutes(scrape.schema.data.get("prepTime")) or 0 - recipe_json['cookTime'] = get_minutes(scrape.schema.data.get("cookTime")) or 0 + except AttributeError: description = '' - recipe_json['prepTime'] = 0 - recipe_json['cookTime'] = 0 recipe_json['description'] = normalize_string(description) @@ -105,9 +30,11 @@ def get_from_scraper(scrape, space): servings = 1 recipe_json['servings'] = servings + recipe_json['prepTime'] = get_minutes(scrape.schema.data.get("prepTime")) or 0 + recipe_json['cookTime'] = get_minutes(scrape.schema.data.get("cookTime")) or 0 if recipe_json['cookTime'] + recipe_json['prepTime'] == 0: try: - recipe_json['prepTime'] = scrape.total_time() + recipe_json['prepTime'] = get_minutes(scrape.total_time()) or 0 except AttributeError: pass diff --git a/cookbook/helper/scrapers/scrapers.py b/cookbook/helper/scrapers/scrapers.py index baf4bf9d..93e35b23 100644 --- a/cookbook/helper/scrapers/scrapers.py +++ b/cookbook/helper/scrapers/scrapers.py @@ -1,4 +1,5 @@ from bs4 import BeautifulSoup +from json import JSONDecodeError from recipe_scrapers import SCRAPERS, get_domain, _exception_handling from recipe_scrapers._factory import SchemaScraperFactory from recipe_scrapers._schemaorg import SchemaOrg @@ -8,9 +9,9 @@ from .cooksillustrated import CooksIllustrated CUSTOM_SCRAPERS = { CooksIllustrated.host(): CooksIllustrated, } +SCRAPERS.update(CUSTOM_SCRAPERS) + -SCRAPERS = SCRAPERS.update(CUSTOM_SCRAPERS) -#%% def text_scraper(text, url=None): domain = None if url: @@ -19,7 +20,7 @@ def text_scraper(text, url=None): scraper_class = SCRAPERS[domain] else: scraper_class = SchemaScraperFactory.SchemaScraper - + class TextScraper(scraper_class): def __init__( self, @@ -31,11 +32,10 @@ def text_scraper(text, url=None): self.meta_http_equiv = False self.soup = BeautifulSoup(page_data, "html.parser") self.url = url + self.recipe = None try: self.schema = SchemaOrg(page_data) except JSONDecodeError: pass return TextScraper(text, url) - -# %%