improved recipe parser, added tests, cleaned up resources

Author: vabene1111
Date: 2020-06-24 21:22:23 +02:00
Parent: b6d98397b5
Commit: 2c5e44d73c

15 changed files with 80 additions and 22895 deletions


@@ -1,13 +1,10 @@
 import io
 import json
 import re
-from json import JSONDecodeError
 
-import microdata
 import requests
 from annoying.decorators import ajax_request
 from annoying.functions import get_object_or_None
-from bs4 import BeautifulSoup
 from django.contrib import messages
 from django.contrib.auth.models import User
 from django.db.models import Q
@@ -18,11 +15,10 @@ from icalendar import Calendar, Event
 from rest_framework import viewsets, permissions
 from rest_framework.exceptions import APIException
 from rest_framework.mixins import RetrieveModelMixin, UpdateModelMixin, ListModelMixin
-from urllib3.exceptions import NewConnectionError
 
 from cookbook.helper.permission_helper import group_required, CustomIsOwner, CustomIsAdmin, CustomIsUser
-from cookbook.helper.recipe_url_import import find_recipe_json
-from cookbook.models import Recipe, Sync, Storage, CookLog, MealPlan, MealType, ViewLog, UserPreference, RecipeBook, Keyword, RecipeIngredient, Ingredient
+from cookbook.helper.recipe_url_import import get_from_html
+from cookbook.models import Recipe, Sync, Storage, CookLog, MealPlan, MealType, ViewLog, UserPreference, RecipeBook, RecipeIngredient, Ingredient
 from cookbook.provider.dropbox import Dropbox
 from cookbook.provider.nextcloud import Nextcloud
 from cookbook.serializer import MealPlanSerializer, MealTypeSerializer, RecipeSerializer, ViewLogSerializer, UserNameSerializer, UserPreferenceSerializer, RecipeBookSerializer, RecipeIngredientSerializer, IngredientSerializer
@@ -265,7 +261,6 @@ def get_plan_ical(request, html_week):
 @group_required('user')
 def recipe_from_url(request, url):
     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'}
-
     try:
         response = requests.get(url, headers=headers)
     except requests.exceptions.ConnectionError:
@@ -273,33 +268,4 @@ def recipe_from_url(request, url):
     if response.status_code == 403:
         return JsonResponse({'error': True, 'msg': _('The requested page refused to provide any information (Status Code 403).')}, status=400)
-    soup = BeautifulSoup(response.text, "html.parser")
-    # first try finding ld+json, as it's the most common format
-    for ld in soup.find_all('script', type='application/ld+json'):
-        try:
-            ld_json = json.loads(ld.string)
-            if type(ld_json) != list:
-                ld_json = [ld_json]
-            for ld_json_item in ld_json:
-                # the recipe type might be wrapped in an @graph type
-                if '@graph' in ld_json_item:
-                    for x in ld_json_item['@graph']:
-                        if '@type' in x and x['@type'] == 'Recipe':
-                            ld_json_item = x
-                if '@type' in ld_json_item and ld_json_item['@type'] == 'Recipe':
-                    return find_recipe_json(ld_json_item, url)
-        except JSONDecodeError:
-            return JsonResponse({'error': True, 'msg': _('The requested site provided malformed data and cannot be read.')}, status=400)
-    # now try to find microdata
-    items = microdata.get_items(response.text)
-    for i in items:
-        md_json = json.loads(i.json())
-        if 'schema.org/Recipe' in str(md_json['type']):
-            return find_recipe_json(md_json['properties'], url)
-    return JsonResponse({'error': True, 'msg': _('The requested site does not provide any recognized data format to import the recipe from.')}, status=400)
+    return get_from_html(response.text, url)
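
The body of get_from_html is not part of this diff. Below is a minimal sketch of what the consolidated helper could look like, assuming it simply transplants the ld+json/microdata logic removed above into cookbook/helper/recipe_url_import.py, and that find_recipe_json (which hunk 2 shows was previously imported from that same module) is defined alongside it; the actual implementation in the commit may differ in detail.

import json
from json import JSONDecodeError

import microdata
from bs4 import BeautifulSoup
from django.http import JsonResponse
from django.utils.translation import gettext as _


def get_from_html(html_text, url):
    # hypothetical reconstruction of the helper introduced by this commit
    soup = BeautifulSoup(html_text, 'html.parser')
    # try ld+json first, as it is the most common embedding of schema.org recipes
    for ld in soup.find_all('script', type='application/ld+json'):
        try:
            ld_json = json.loads(ld.string)
            if not isinstance(ld_json, list):
                ld_json = [ld_json]
            for ld_json_item in ld_json:
                # a recipe can be nested inside an @graph container
                if '@graph' in ld_json_item:
                    for node in ld_json_item['@graph']:
                        if '@type' in node and node['@type'] == 'Recipe':
                            ld_json_item = node
                if '@type' in ld_json_item and ld_json_item['@type'] == 'Recipe':
                    return find_recipe_json(ld_json_item, url)
        except JSONDecodeError:
            return JsonResponse({'error': True, 'msg': _('The requested site provided malformed data and cannot be read.')}, status=400)
    # fall back to microdata markup
    for item in microdata.get_items(html_text):
        md_json = json.loads(item.json())
        if 'schema.org/Recipe' in str(md_json['type']):
            return find_recipe_json(md_json['properties'], url)
    return JsonResponse({'error': True, 'msg': _('The requested site does not provide any recognized data format to import the recipe from.')}, status=400)

With a helper along these lines, recipe_from_url shrinks to the single return get_from_html(response.text, url) call shown in the last hunk, and the parsing logic can be exercised in tests without constructing a Django request, which matches the "added tests" part of the commit message. The trade-off of this particular sketch is that the helper builds JsonResponse objects itself, coupling it to the HTTP layer.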