improved recipe parser, added tests, cleaned up resources

Author: vabene1111
Date: 2020-06-24 21:22:23 +02:00
Parent: b6d98397b5
Commit: 2c5e44d73c

15 changed files with 80 additions and 22895 deletions


@@ -1,13 +1,10 @@
 import io
 import json
 import re
-from json import JSONDecodeError
 
-import microdata
 import requests
 from annoying.decorators import ajax_request
 from annoying.functions import get_object_or_None
-from bs4 import BeautifulSoup
 from django.contrib import messages
 from django.contrib.auth.models import User
 from django.db.models import Q
@@ -18,11 +15,10 @@ from icalendar import Calendar, Event
 from rest_framework import viewsets, permissions
 from rest_framework.exceptions import APIException
 from rest_framework.mixins import RetrieveModelMixin, UpdateModelMixin, ListModelMixin
-from urllib3.exceptions import NewConnectionError
 
 from cookbook.helper.permission_helper import group_required, CustomIsOwner, CustomIsAdmin, CustomIsUser
-from cookbook.helper.recipe_url_import import find_recipe_json
-from cookbook.models import Recipe, Sync, Storage, CookLog, MealPlan, MealType, ViewLog, UserPreference, RecipeBook, Keyword, RecipeIngredient, Ingredient
+from cookbook.helper.recipe_url_import import get_from_html
+from cookbook.models import Recipe, Sync, Storage, CookLog, MealPlan, MealType, ViewLog, UserPreference, RecipeBook, RecipeIngredient, Ingredient
 from cookbook.provider.dropbox import Dropbox
 from cookbook.provider.nextcloud import Nextcloud
 from cookbook.serializer import MealPlanSerializer, MealTypeSerializer, RecipeSerializer, ViewLogSerializer, UserNameSerializer, UserPreferenceSerializer, RecipeBookSerializer, RecipeIngredientSerializer, IngredientSerializer
@@ -265,7 +261,6 @@ def get_plan_ical(request, html_week):
 @group_required('user')
 def recipe_from_url(request, url):
     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'}
-
     try:
         response = requests.get(url, headers=headers)
     except requests.exceptions.ConnectionError:
@@ -273,33 +268,4 @@ def recipe_from_url(request, url):
     if response.status_code == 403:
         return JsonResponse({'error': True, 'msg': _('The requested page refused to provide any information (Status Code 403).')}, status=400)
-    soup = BeautifulSoup(response.text, "html.parser")
-    # first try finding ld+json, as it's the most common format
-    for ld in soup.find_all('script', type='application/ld+json'):
-        try:
-            ld_json = json.loads(ld.string)
-            if type(ld_json) != list:
-                ld_json = [ld_json]
-            for ld_json_item in ld_json:
-                # the recipe type might be wrapped in an @graph type
-                if '@graph' in ld_json_item:
-                    for x in ld_json_item['@graph']:
-                        if '@type' in x and x['@type'] == 'Recipe':
-                            ld_json_item = x
-                if '@type' in ld_json_item and ld_json_item['@type'] == 'Recipe':
-                    return find_recipe_json(ld_json_item, url)
-        except JSONDecodeError:
-            return JsonResponse({'error': True, 'msg': _('The requested site provided malformed data and cannot be read.')}, status=400)
-    # now try to find microdata
-    items = microdata.get_items(response.text)
-    for i in items:
-        md_json = json.loads(i.json())
-        if 'schema.org/Recipe' in str(md_json['type']):
-            return find_recipe_json(md_json['properties'], url)
-    return JsonResponse({'error': True, 'msg': _('The requested site does not provide any recognized data format to import the recipe from.')}, status=400)
+    return get_from_html(response.text, url)
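
The body of get_from_html is not part of this diff. Below is a minimal sketch of what the consolidated helper could look like, assuming it simply transplants the ld+json/microdata logic removed above into cookbook/helper/recipe_url_import.py, and that find_recipe_json (which hunk 2 shows was previously imported from that same module) is defined alongside it; the actual implementation in the commit may differ in detail.

import json
from json import JSONDecodeError

import microdata
from bs4 import BeautifulSoup
from django.http import JsonResponse
from django.utils.translation import gettext as _


def get_from_html(html_text, url):
    # hypothetical reconstruction of the helper introduced by this commit
    soup = BeautifulSoup(html_text, 'html.parser')
    # try ld+json first, as it is the most common embedding of schema.org recipes
    for ld in soup.find_all('script', type='application/ld+json'):
        try:
            ld_json = json.loads(ld.string)
            if not isinstance(ld_json, list):
                ld_json = [ld_json]
            for ld_json_item in ld_json:
                # a recipe can be nested inside an @graph container
                if '@graph' in ld_json_item:
                    for node in ld_json_item['@graph']:
                        if '@type' in node and node['@type'] == 'Recipe':
                            ld_json_item = node
                if '@type' in ld_json_item and ld_json_item['@type'] == 'Recipe':
                    return find_recipe_json(ld_json_item, url)
        except JSONDecodeError:
            return JsonResponse({'error': True, 'msg': _('The requested site provided malformed data and cannot be read.')}, status=400)
    # fall back to microdata markup
    for item in microdata.get_items(html_text):
        md_json = json.loads(item.json())
        if 'schema.org/Recipe' in str(md_json['type']):
            return find_recipe_json(md_json['properties'], url)
    return JsonResponse({'error': True, 'msg': _('The requested site does not provide any recognized data format to import the recipe from.')}, status=400)

With a helper along these lines, recipe_from_url shrinks to the single return get_from_html(response.text, url) call shown in the last hunk, and the parsing logic can be exercised in tests without constructing a Django request, which matches the "added tests" part of the commit message. The trade-off of this particular sketch is that the helper builds JsonResponse objects itself, coupling it to the HTTP layer.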