refactored url_import to use recipe-scrapers

smilerz 2021-03-03 21:08:34 -06:00
parent 47090ce863
commit bfaed434cc
4 changed files with 157 additions and 26 deletions
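
The conversion helper added below leans on a handful of recipe-scrapers accessors. As a rough sketch of what those calls return (the URL is a placeholder; recipe-scrapers==12.2.0 as pinned in the requirements change is assumed):

    from recipe_scrapers import scrape_me

    scrape = scrape_me('https://example.com/a-recipe')  # placeholder URL
    scrape.title()         # recipe name
    scrape.yields()        # e.g. "4 servings"
    scrape.ingredients()   # list of raw ingredient strings
    scrape.instructions()  # instruction text
    scrape.image()         # image URL
    scrape.schema.data     # underlying schema.org dict (keywords, prepTime, ...)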

View File

@@ -10,6 +10,7 @@ from cookbook.models import Keyword
from django.http import JsonResponse
from django.utils.dateparse import parse_duration
from django.utils.translation import gettext as _
+from recipe_scrapers import _utils


def get_from_html(html_text, url):
@@ -69,8 +70,10 @@ def find_recipe_json(ld_json, url):
    if 'recipeIngredient' in ld_json:
        # some pages have comma separated ingredients in a single array entry
        if (len(ld_json['recipeIngredient']) == 1
-                and len(ld_json['recipeIngredient'][0]) > 30):
+                and type(ld_json['recipeIngredient']) == list):
            ld_json['recipeIngredient'] = ld_json['recipeIngredient'][0].split(',')  # noqa: E501
+        elif type(ld_json['recipeIngredient']) == str:
+            ld_json['recipeIngredient'] = ld_json['recipeIngredient'].split(',')

        for x in ld_json['recipeIngredient']:
            if '\n' in x:
@@ -122,28 +125,7 @@ def find_recipe_json(ld_json, url):
        ld_json['recipeIngredient'] = []

    if 'keywords' in ld_json:
-        keywords = []
-        # keywords as string
-        if type(ld_json['keywords']) == str:
-            ld_json['keywords'] = ld_json['keywords'].split(',')
-        # keywords as string in list
-        if (type(ld_json['keywords']) == list
-                and len(ld_json['keywords']) == 1
-                and ',' in ld_json['keywords'][0]):
-            ld_json['keywords'] = ld_json['keywords'][0].split(',')
-        # keywords as list
-        for kw in ld_json['keywords']:
-            if k := Keyword.objects.filter(name=kw).first():
-                keywords.append({'id': str(k.id), 'text': str(k).strip()})
-            else:
-                keywords.append({'id': random.randrange(1111111, 9999999, 1), 'text': kw.strip()})
-        ld_json['keywords'] = keywords
-    else:
-        ld_json['keywords'] = []
+        ld_json['keywords'] = parse_keywords(listify_keywords(ld_json['keywords']))

    if 'recipeInstructions' in ld_json:
        instructions = ''
@@ -218,6 +200,7 @@ def find_recipe_json(ld_json, url):
    else:
        ld_json['prepTime'] = 0

+    ld_json['servings'] = 1
    try:
        if 'recipeYield' in ld_json:
            if type(ld_json['recipeYield']) == str:
@@ -226,7 +209,6 @@ def find_recipe_json(ld_json, url):
                ld_json['servings'] = int(re.findall(r'\b\d+\b', ld_json['recipeYield'][0])[0])
    except Exception as e:
        print(e)
-        ld_json['servings'] = 1

    for key in list(ld_json):
        if key not in [
@@ -236,3 +218,117 @@ def find_recipe_json(ld_json, url):
            ld_json.pop(key, None)

    return ld_json
+
+
+def get_from_scraper(scrape):
+    # converting the scrape_me object to the existing json format based on ld+json
+    recipe_json = {}
+    recipe_json['name'] = scrape.title()
+
+    # guard against sites that omit a schema.org description
+    description = scrape.schema.data.get("description") or ''
+    description += "\n\nImported from " + scrape.url
+    recipe_json['description'] = description
+
+    try:
+        servings = scrape.yields()
+        servings = int(re.findall(r'\b\d+\b', servings)[0])
+    except (AttributeError, ValueError, IndexError):
+        # yields() is missing or contains no digits
+        servings = 1
+    recipe_json['servings'] = servings
+
+    recipe_json['prepTime'] = _utils.get_minutes(scrape.schema.data.get("prepTime")) or 0
+    recipe_json['cookTime'] = _utils.get_minutes(scrape.schema.data.get("cookTime")) or 0
+    if recipe_json['cookTime'] + recipe_json['prepTime'] == 0:
+        # fall back to the scraper's total time when the schema gives no timings
+        try:
+            recipe_json['prepTime'] = scrape.total_time()
+        except AttributeError:
+            pass
+
+    try:
+        recipe_json['image'] = scrape.image()
+    except AttributeError:
+        pass
+
+    # merge keywords, category and cuisine into one de-duplicated keyword list
+    keywords = []
+    if scrape.schema.data.get("keywords"):
+        keywords += listify_keywords(scrape.schema.data.get("keywords"))
+    if scrape.schema.data.get('recipeCategory'):
+        keywords += listify_keywords(scrape.schema.data.get("recipeCategory"))
+    if scrape.schema.data.get('recipeCuisine'):
+        keywords += listify_keywords(scrape.schema.data.get("recipeCuisine"))
+    recipe_json['keywords'] = parse_keywords(list(set(map(str.casefold, keywords))))
+
+    try:
+        ingredients = []
+        for x in scrape.ingredients():
+            try:
+                amount, unit, ingredient, note = parse_ingredient(x)
+                if ingredient:
+                    ingredients.append(
+                        {
+                            'amount': amount,
+                            'unit': {
+                                'text': unit,
+                                'id': random.randrange(10000, 99999)
+                            },
+                            'ingredient': {
+                                'text': ingredient,
+                                'id': random.randrange(10000, 99999)
+                            },
+                            'note': note,
+                            'original': x
+                        }
+                    )
+            except Exception:
+                # keep the raw string if the ingredient parser cannot handle it
+                ingredients.append(
+                    {
+                        'amount': 0,
+                        'unit': {
+                            'text': '',
+                            'id': random.randrange(10000, 99999)
+                        },
+                        'ingredient': {
+                            'text': x,
+                            'id': random.randrange(10000, 99999)
+                        },
+                        'note': '',
+                        'original': x
+                    }
+                )
+        recipe_json['recipeIngredient'] = ingredients
+    except AttributeError:
+        recipe_json['recipeIngredient'] = ingredients
+
+    try:
+        recipe_json['recipeInstructions'] = scrape.instructions()
+    except AttributeError:
+        recipe_json['recipeInstructions'] = ""
+
+    return recipe_json
+
+
+def parse_keywords(keyword_json):
+    keywords = []
+    # keywords as list
+    for kw in keyword_json:
+        if k := Keyword.objects.filter(name=kw).first():
+            keywords.append({'id': str(k.id), 'text': str(k)})
+        else:
+            keywords.append({'id': random.randrange(1111111, 9999999, 1), 'text': kw})
+
+    return keywords
+
+
+def listify_keywords(keyword_list):
+    # keywords as string
+    if type(keyword_list) == str:
+        keyword_list = keyword_list.split(',')
+    # keywords as string in list
+    if (type(keyword_list) == list
+            and len(keyword_list) == 1
+            and ',' in keyword_list[0]):
+        keyword_list = keyword_list[0].split(',')
+    return [x.strip() for x in keyword_list]
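
A rough usage sketch of the two new keyword helpers (the input string is made up; the numeric ids are random placeholders generated when no matching Keyword row exists in the database):

    from cookbook.helper.recipe_url_import import listify_keywords, parse_keywords

    raw = "vegan, dinner,quick"
    listify_keywords(raw)
    # -> ['vegan', 'dinner', 'quick']
    parse_keywords(listify_keywords(raw))
    # -> e.g. [{'id': 1234567, 'text': 'vegan'}, {'id': 2345678, 'text': 'dinner'}, {'id': 3456789, 'text': 'quick'}]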

View File

@@ -45,6 +45,12 @@
            <input id="id_name" class="form-control" v-model="recipe_data.name">
        </div>
+        <div class="form-group">
+            <label for="id_description">{% trans 'Recipe Description' %}</label>
+            <textarea id="id_description" class="form-control" v-model="recipe_data.description"
+                      rows="4"></textarea>
+        </div>
        <div class="row">
            <div class="col col-md-6" v-if="recipe_data.image !== ''">
                <img v-bind:src="recipe_data.image" alt="{% trans 'Recipe Image' %}"

View File

@@ -9,7 +9,7 @@ from annoying.functions import get_object_or_None
from django.contrib import messages
from django.contrib.auth.models import User
from django.core import management
-from django.core.exceptions import FieldError
+from django.core.exceptions import FieldError, ValidationError
from django.core.files import File
from django.db.models import Q
from django.http import FileResponse, HttpResponse, JsonResponse
@@ -32,7 +32,7 @@ from cookbook.helper.permission_helper import (CustomIsAdmin, CustomIsGuest,
                                               CustomIsOwner, CustomIsShare,
                                               CustomIsShared, CustomIsUser,
                                               group_required)
-from cookbook.helper.recipe_url_import import get_from_html
+from cookbook.helper.recipe_url_import import get_from_html, get_from_scraper
from cookbook.models import (CookLog, Food, Ingredient, Keyword, MealPlan,
                             MealType, Recipe, RecipeBook, ShoppingList,
                             ShoppingListEntry, ShoppingListRecipe, Step,
@@ -54,6 +54,7 @@ from cookbook.serializer import (FoodSerializer, IngredientSerializer,
                                 UserNameSerializer, UserPreferenceSerializer,
                                 ViewLogSerializer, CookLogSerializer, RecipeBookEntrySerializer, RecipeOverviewSerializer, SupermarketSerializer)
from recipes.settings import DEMO
+from recipe_scrapers import scrape_me, WebsiteNotImplementedError, NoSchemaFoundInWildMode


class StandardFilterMixin(ViewSetMixin):
@@ -498,6 +499,33 @@ def get_plan_ical(request, from_date, to_date):
def recipe_from_url(request):
    url = request.POST['url']
+    try:
+        scrape = scrape_me(url)
+    except WebsiteNotImplementedError:
+        try:
+            scrape = scrape_me(url, wild_mode=True)
+        except NoSchemaFoundInWildMode:
+            return JsonResponse(
+                {
+                    'error': True,
+                    'msg': _('The requested site provided malformed data and cannot be read.')  # noqa: E501
+                },
+                status=400)
+    except ConnectionError:
+        return JsonResponse(
+            {
+                'error': True,
+                'msg': _('The requested page could not be found.')
+            },
+            status=400
+        )
+    return JsonResponse(get_from_scraper(scrape))
+
+
+@group_required('user')
+def recipe_from_url_old(request):
+    url = request.POST['url']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'  # noqa: E501
    }
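
Outside the view, the new import flow can be sketched roughly like this (the URL is a placeholder; wild_mode mirrors the fallback used in recipe_from_url above):

    from recipe_scrapers import scrape_me, WebsiteNotImplementedError
    from cookbook.helper.recipe_url_import import get_from_scraper

    url = 'https://example.com/a-recipe'  # placeholder
    try:
        scrape = scrape_me(url)
    except WebsiteNotImplementedError:
        # no dedicated scraper for the site; fall back to schema.org wild mode
        scrape = scrape_me(url, wild_mode=True)

    recipe_json = get_from_scraper(scrape)
    print(recipe_json['name'], recipe_json['servings'])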

View File

@@ -31,3 +31,4 @@ Jinja2==2.11.3
django-webpack-loader==0.7.0
django-js-reverse==0.9.1
django-allauth==0.44.0
+recipe-scrapers==12.2.0