ensure time is always a number

parent 3cf949bf8d
commit ec6a10ca0a
@@ -5,13 +5,10 @@ from bs4 import BeautifulSoup
 from bs4.element import Tag
 from cookbook.helper import recipe_url_import as helper
 from cookbook.helper.scrapers.scrapers import text_scraper
+from json import JSONDecodeError
 from recipe_scrapers._utils import get_host_name, normalize_string
 
 
-from bs4 import BeautifulSoup
-from json import JSONDecodeError
-from json.decoder import JSONDecodeError
-
 def get_recipe_from_source(text, url, space):
     def build_node(k, v):
         if isinstance(v, dict):
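
Aside on the dropped imports: json.JSONDecodeError and json.decoder.JSONDecodeError are the same class (the json package re-exports it from json.decoder), so the one surviving import covers both deleted ones:

    import json.decoder
    from json import JSONDecodeError

    # the package-level name is a re-export of the decoder's class
    assert JSONDecodeError is json.decoder.JSONDecodeError
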
@@ -78,11 +75,9 @@ def get_recipe_from_source(text, url, space):
     text = normalize_string(text)
     try:
         parse_list.append(remove_graph(json.loads(text)))
-        scrape = text_scraper("<script type='application/ld+json'>"+text+"</script>")
 
     except JSONDecodeError:
         soup = BeautifulSoup(text, "html.parser")
-        scrape = text_scraper(text)
         html_data = get_from_html(soup)
         images += get_images_from_source(soup, url)
         for el in soup.find_all('script', type='application/ld+json'):
@@ -95,6 +90,10 @@ def get_recipe_from_source(text, url, space):
     if 'url' in parse_list[0]:
         url = parse_list[0]['url']
 
+    if type(text) == dict:
+        scrape = text_scraper("<script type='application/ld+json'>" + text + "</script>", url=url)
+    elif type(text) == str:
+        scrape = text_scraper(text, url=url)
 
     recipe_json = helper.get_from_scraper(scrape, space)
 
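
Constructing the scraper only after the ld+json pass lets the url recovered from parse_list[0]['url'] reach text_scraper, which uses it to pick a host-specific scraper class. One caveat: in the dict branch, "..." + text + "..." raises TypeError if text is still a dict at that point, so that branch presumably only sees values that are already strings. A defensive sketch of the embedding step (build_ld_json_script is a hypothetical helper, not code from this commit):

    import json

    def build_ld_json_script(text):
        # Assumption: a parsed payload must be re-serialized before being
        # embedded, because str + dict concatenation raises TypeError.
        if isinstance(text, dict):
            text = json.dumps(text)
        return "<script type='application/ld+json'>" + text + "</script>"
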
@@ -131,7 +130,6 @@ def get_recipe_from_source(text, url, space):
         else:
             recipe_tree += [{'name': 'json', 'children': temp_tree}]
 
-
     return recipe_json, recipe_tree, html_data, images
 
 
@@ -1,86 +1,14 @@
-import json
 import random
 import re
-from json import JSONDecodeError
 from isodate import parse_duration as iso_parse_duration
 from isodate.isoerror import ISO8601Error
 
-import microdata
-from bs4 import BeautifulSoup
 from cookbook.helper.ingredient_parser import parse as parse_single_ingredient
 from cookbook.models import Keyword
-from django.http import JsonResponse
 from django.utils.dateparse import parse_duration
-from django.utils.translation import gettext as _
 from recipe_scrapers._utils import get_minutes, normalize_string
 
 
-# def find_recipe_json(ld_json, url, space):
-#     ld_json['name'] = parse_name(ld_json['name'])
-
-#     # some sites use ingredients instead of recipeIngredients
-#     if 'recipeIngredient' not in ld_json and 'ingredients' in ld_json:
-#         ld_json['recipeIngredient'] = ld_json['ingredients']
-
-#     if 'recipeIngredient' in ld_json:
-#         ld_json['recipeIngredient'] = parse_ingredients(ld_json['recipeIngredient'])
-#     else:
-#         ld_json['recipeIngredient'] = ""
-
-#     keywords = []
-#     if 'keywords' in ld_json:
-#         keywords += listify_keywords(ld_json['keywords'])
-#     if 'recipeCategory' in ld_json:
-#         keywords += listify_keywords(ld_json['recipeCategory'])
-#     if 'recipeCuisine' in ld_json:
-#         keywords += listify_keywords(ld_json['recipeCuisine'])
-#     try:
-#         ld_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords))), space)
-#     except TypeError:
-#         pass
-
-#     if 'recipeInstructions' in ld_json:
-#         ld_json['recipeInstructions'] = parse_instructions(ld_json['recipeInstructions'])
-#     else:
-#         ld_json['recipeInstructions'] = ""
-
-#     if 'image' in ld_json:
-#         ld_json['image'] = parse_image(ld_json['image'])
-#     else:
-#         ld_json['image'] = ""
-
-#     if 'description' in ld_json:
-#         ld_json['description'] = normalize_string(ld_json['description'])
-#     else:
-#         ld_json['description'] = ""
-
-#     if 'cookTime' in ld_json:
-#         ld_json['cookTime'] = parse_cooktime(ld_json['cookTime'])
-#     else:
-#         ld_json['cookTime'] = 0
-
-#     if 'prepTime' in ld_json:
-#         ld_json['prepTime'] = parse_cooktime(ld_json['prepTime'])
-#     else:
-#         ld_json['prepTime'] = 0
-
-#     if 'servings' in ld_json:
-#         ld_json['servings'] = parse_servings(ld_json['servings'])
-#     elif 'recipeYield' in ld_json:
-#         ld_json['servings'] = parse_servings(ld_json['recipeYield'])
-#     else:
-#         ld_json['servings'] = 1
-
-#     for key in list(ld_json):
-#         if key not in [
-#             'prepTime', 'cookTime', 'image', 'recipeInstructions',
-#             'keywords', 'name', 'recipeIngredient', 'servings', 'description'
-#         ]:
-#             ld_json.pop(key, None)
-
-#     return ld_json
 
 
 def get_from_scraper(scrape, space):
     # converting the scrape_me object to the existing json format based on ld+json
 
@@ -89,12 +17,9 @@ def get_from_scraper(scrape, space):
 
     try:
         description = scrape.schema.data.get("description") or ''
-        recipe_json['prepTime'] = get_minutes(scrape.schema.data.get("prepTime")) or 0
-        recipe_json['cookTime'] = get_minutes(scrape.schema.data.get("cookTime")) or 0
     except AttributeError:
         description = ''
-        recipe_json['prepTime'] = 0
-        recipe_json['cookTime'] = 0
 
     recipe_json['description'] = normalize_string(description)
 
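
Moving the time fields out of this try block also decouples them from description: before, an AttributeError raised while converting prepTime or cookTime jumped to the except branch and overwrote an already-extracted description with ''. A toy reproduction of that coupling (fake objects, not the real scraper):

    class FakeSchema:
        data = {"description": "a perfectly good description"}

    class FakeScrape:
        schema = FakeSchema()

    scrape = FakeScrape()
    try:
        description = scrape.schema.data.get("description") or ''
        minutes = scrape.schema.data.get("prepTime").total_seconds()  # None.total_seconds() -> AttributeError
    except AttributeError:
        description = ''  # the value extracted two lines up is silently discarded
    assert description == ''
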
@@ -105,9 +30,11 @@ def get_from_scraper(scrape, space):
         servings = 1
     recipe_json['servings'] = servings
 
+    recipe_json['prepTime'] = get_minutes(scrape.schema.data.get("prepTime")) or 0
+    recipe_json['cookTime'] = get_minutes(scrape.schema.data.get("cookTime")) or 0
     if recipe_json['cookTime'] + recipe_json['prepTime'] == 0:
         try:
-            recipe_json['prepTime'] = scrape.total_time()
+            recipe_json['prepTime'] = get_minutes(scrape.total_time()) or 0
         except AttributeError:
             pass
 
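
This hunk is the commit message in action: every path that writes prepTime or cookTime now goes through get_minutes(...) or 0, so the stored values are always numbers and the cookTime + prepTime == 0 check cannot trip over a None or an ISO 8601 string returned by a scraper. A self-contained illustration of the pattern (coerce_minutes is a simplified stand-in for recipe_scrapers' get_minutes, written here only to show the coercion):

    import re

    def coerce_minutes(value):
        # Simplified stand-in: accept an int, a '45 min' style string,
        # or an ISO 8601 duration such as 'PT1H30M'; otherwise None.
        if value is None:
            return None
        if isinstance(value, (int, float)):
            return int(value)
        match = re.search(r'(?:PT)?(?:(\d+)H)?(\d+)(?:M|\s*min)?', str(value), re.IGNORECASE)
        if not match:
            return None
        return int(match.group(1) or 0) * 60 + int(match.group(2))

    # the commit's pattern: whatever a scraper returns, store a number
    for raw in (None, "PT1H30M", "45 min", 20):
        prep_time = coerce_minutes(raw) or 0
        assert isinstance(prep_time, int)
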
@@ -1,4 +1,5 @@
 from bs4 import BeautifulSoup
+from json import JSONDecodeError
 from recipe_scrapers import SCRAPERS, get_domain, _exception_handling
 from recipe_scrapers._factory import SchemaScraperFactory
 from recipe_scrapers._schemaorg import SchemaOrg
@@ -8,9 +9,9 @@ from .cooksillustrated import CooksIllustrated
 CUSTOM_SCRAPERS = {
     CooksIllustrated.host(): CooksIllustrated,
 }
+SCRAPERS.update(CUSTOM_SCRAPERS)
+
 
-SCRAPERS = SCRAPERS.update(CUSTOM_SCRAPERS)
-#%%
 def text_scraper(text, url=None):
     domain = None
     if url:
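
The deleted reassignment was a real bug, not a style fix: dict.update() mutates in place and returns None, so SCRAPERS = SCRAPERS.update(CUSTOM_SCRAPERS) left the registry bound to None, and any later domain in SCRAPERS lookup in text_scraper would raise TypeError. Two lines demonstrate it:

    registry = {'example.com': object()}
    registry = registry.update({'other.example': object()})  # update() returns None
    assert registry is None  # the whole registry is gone
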
@@ -31,11 +32,10 @@ def text_scraper(text, url=None):
             self.meta_http_equiv = False
             self.soup = BeautifulSoup(page_data, "html.parser")
             self.url = url
+            self.recipe = None
             try:
                 self.schema = SchemaOrg(page_data)
             except JSONDecodeError:
                 pass
 
     return TextScraper(text, url)
-
-# %%
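
Setting self.recipe = None gives the scraper a predictable attribute even when SchemaOrg(page_data) raises JSONDecodeError and the except: pass leaves construction half-finished. Note that self.schema is still unset on that path, so callers should probe for it; a hedged usage sketch (the getattr guard is an assumption about how to consume this, not code from the commit):

    from cookbook.helper.scrapers.scrapers import text_scraper

    scraper = text_scraper("<html><body>no ld+json payload here</body></html>")
    # schema is never assigned when SchemaOrg raised JSONDecodeError,
    # so probe for it instead of assuming the attribute exists.
    schema = getattr(scraper, 'schema', None)
    if schema is not None:
        print(schema.data)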