microdata import

This commit is contained in:
vabene1111 2020-06-22 22:47:33 +02:00
parent 71b41a9ca2
commit f066b7097c
3 changed files with 96 additions and 60 deletions

View File

@ -1,59 +1,82 @@
import re
from django.http import JsonResponse from django.http import JsonResponse
from cookbook.models import Keyword from cookbook.models import Keyword
def find_ld_json(ld_json): def find_recipe_json(ld_json):
# recipes type might be wrapped in @graph type ld_json['org'] = str(ld_json)
if '@graph' in ld_json:
for x in ld_json['@graph']:
if '@type' in x and x['@type'] == 'Recipe':
ld_json = x
if '@type' in ld_json and ld_json['@type'] == 'Recipe': # some sites use ingredients instead of recipeIngredients
if 'recipeIngredient' not in ld_json and 'ingredients' in ld_json:
ld_json['recipeIngredient'] = ld_json['ingredients']
if 'recipeIngredient' in ld_json: if 'recipeIngredient' in ld_json:
ingredients = [] # some pages have comma separated ingredients in a single array entry
if len(ld_json['recipeIngredient']) == 1 and len(ld_json['recipeIngredient'][0]) > 30:
ld_json['recipeIngredient'] = ld_json['recipeIngredient'][0].split(',')
for x in ld_json['recipeIngredient']: ingredients = []
ingredient_split = x.split()
if len(ingredient_split) > 2:
ingredients.append({'amount': ingredient_split[0], 'unit': ingredient_split[1], 'ingredient': " ".join(ingredient_split[2:])})
if len(ingredient_split) == 2:
ingredients.append({'amount': ingredient_split[0], 'unit': '', 'ingredient': " ".join(ingredient_split[1:])})
if len(ingredient_split) == 1:
ingredients.append({'amount': 0, 'unit': '', 'ingredient': " ".join(ingredient_split)})
ld_json['recipeIngredient'] = ingredients for x in ld_json['recipeIngredient']:
ingredient_split = x.split()
if len(ingredient_split) > 2:
ingredients.append({'amount': ingredient_split[0], 'unit': ingredient_split[1], 'ingredient': " ".join(ingredient_split[2:])})
if len(ingredient_split) == 2:
ingredients.append({'amount': ingredient_split[0], 'unit': '', 'ingredient': " ".join(ingredient_split[1:])})
if len(ingredient_split) == 1:
ingredients.append({'amount': 0, 'unit': '', 'ingredient': " ".join(ingredient_split)})
if 'keywords' in ld_json: ld_json['recipeIngredient'] = ingredients
keywords = []
if type(ld_json['keywords']) == str:
ld_json['keywords'] = ld_json['keywords'].split(',')
for kw in ld_json['keywords']: if 'keywords' in ld_json:
if k := Keyword.objects.filter(name=kw).first(): keywords = []
keywords.append({'id': str(k.id), 'text': str(k).strip()})
# keywords as string
if type(ld_json['keywords']) == str:
ld_json['keywords'] = ld_json['keywords'].split(',')
# keywords as string in list
if type(ld_json['keywords']) == list and len(ld_json['keywords']) == 1 and ',' in ld_json['keywords'][0]:
ld_json['keywords'] = ld_json['keywords'][0].split(',')
# keywords as list
for kw in ld_json['keywords']:
if k := Keyword.objects.filter(name=kw).first():
keywords.append({'id': str(k.id), 'text': str(k).strip()})
else:
keywords.append({'id': "null", 'text': kw.strip()})
ld_json['keywords'] = keywords
if 'recipeInstructions' in ld_json:
instructions = ''
# flatten instructions if they are in a list
if type(ld_json['recipeInstructions']) == list:
for i in ld_json['recipeInstructions']:
if type(i) == str:
instructions += i
else: else:
keywords.append({'id': "null", 'text': kw.strip()}) instructions += i['text'] + '\n\n'
ld_json['recipeInstructions'] = instructions
ld_json['keywords'] = keywords ld_json['recipeInstructions'] = re.sub(r'\n\s*\n', '\n\n', ld_json['recipeInstructions'])
ld_json['recipeInstructions'] = re.sub(' +', ' ', ld_json['recipeInstructions'])
ld_json['recipeInstructions'] = ld_json['recipeInstructions'].replace('<p>', '')
ld_json['recipeInstructions'] = ld_json['recipeInstructions'].replace('</p>', '')
if 'recipeInstructions' in ld_json: if 'image' in ld_json:
instructions = '' # check if list of images is returned, take first if so
if type(ld_json['recipeInstructions']) == list: if (type(ld_json['image'])) == list:
for i in ld_json['recipeInstructions']: if type(ld_json['image'][0]) == str:
if type(i) == str: ld_json['image'] = ld_json['image'][0]
instructions += i elif 'url' in ld_json['image'][0]:
else: ld_json['image'] = ld_json['image'][0]['url']
instructions += i['text'] + '\n\n'
ld_json['recipeInstructions'] = instructions
if 'image' in ld_json: # ignore relative image paths
if (type(ld_json['image'])) == list: if 'http' not in ld_json['image']:
if type(ld_json['image'][0]) == str: ld_json['image'] = ''
ld_json['image'] = ld_json['image'][0]
elif 'url' in ld_json['image'][0]:
ld_json['image'] = ld_json['image'][0]['url']
return JsonResponse(ld_json) return JsonResponse(ld_json)

View File

@ -17,6 +17,13 @@
<div id="app"> <div id="app">
https://www.inspirationforall.de/pudding-selber-machen-vanillepudding-schokopudding-rezept/<br/>
https://www.ichkoche.at/schokopudding-rezept-218012<br/>
https://www.gutekueche.de/mamis-feiner-schokopudding-rezept-4274<br/>
https://www.maizena.at/rezepte/schokopudding/13534<br/>
https://kochkino.de/schokoladen-pudding/2159<br/>
https://www.oetker.de/rezepte/r/schokopudding-mit-vanille-herzen<br/>
<div class="row"> <div class="row">
<div class="col-md-12"> <div class="col-md-12">
<div class="input-group mb-3"> <div class="input-group mb-3">
@ -42,11 +49,11 @@
<div class="row"> <div class="row">
<div class="col col-md-6"> <div class="col col-md-6" v-if="recipe_data.image !== ''">
<img v-bind:src="recipe_data.image" alt="{% trans 'Recipe Image' %}" <img v-bind:src="recipe_data.image" alt="{% trans 'Recipe Image' %}"
class="img-fluid img-responsive img-rounded"> class="img-fluid img-responsive img-rounded">
</div> </div>
<div> <div class="col col-md-6">
<div class="form-group"> <div class="form-group">
<label for="id_prep_time">{% trans 'Preparation time ca.' %}</label> <label for="id_prep_time">{% trans 'Preparation time ca.' %}</label>
<input id="id_prep_time" class="form-control" v-model="recipe_data.prepTime"> <input id="id_prep_time" class="form-control" v-model="recipe_data.prepTime">
@ -109,8 +116,10 @@
</form> </form>
[[recipe_data]]
</template> </template>
[[recipe_data]]
</div> </div>
@ -131,14 +140,6 @@
Vue.component('vue-multiselect', window.VueMultiselect.default) Vue.component('vue-multiselect', window.VueMultiselect.default)
// micro data examples
// https://www.inspirationforall.de/pudding-selber-machen-vanillepudding-schokopudding-rezept/
// https://www.ichkoche.at/schokopudding-rezept-218012
// https://www.gutekueche.de/mamis-feiner-schokopudding-rezept-4274
// https://www.maizena.at/rezepte/schokopudding/13534
// https://kochkino.de/schokoladen-pudding/2159
// https://www.oetker.de/rezepte/r/schokopudding-mit-vanille-herzen
let app = new Vue({ let app = new Vue({
components: { components: {
Multiselect: window.VueMultiselect.default Multiselect: window.VueMultiselect.default

View File

@ -19,7 +19,7 @@ from rest_framework.exceptions import APIException
from rest_framework.mixins import RetrieveModelMixin, UpdateModelMixin, ListModelMixin from rest_framework.mixins import RetrieveModelMixin, UpdateModelMixin, ListModelMixin
from cookbook.helper.permission_helper import group_required, CustomIsOwner, CustomIsAdmin from cookbook.helper.permission_helper import group_required, CustomIsOwner, CustomIsAdmin
from cookbook.helper.recipe_url_import import find_ld_json from cookbook.helper.recipe_url_import import find_recipe_json
from cookbook.models import Recipe, Sync, Storage, CookLog, MealPlan, MealType, ViewLog, UserPreference, RecipeBook, Keyword from cookbook.models import Recipe, Sync, Storage, CookLog, MealPlan, MealType, ViewLog, UserPreference, RecipeBook, Keyword
from cookbook.provider.dropbox import Dropbox from cookbook.provider.dropbox import Dropbox
from cookbook.provider.nextcloud import Nextcloud from cookbook.provider.nextcloud import Nextcloud
@ -260,13 +260,25 @@ def recipe_from_url(request, url):
# first try finding ld+json as its most common # first try finding ld+json as its most common
for ld in soup.find_all('script', type='application/ld+json'): for ld in soup.find_all('script', type='application/ld+json'):
if (r := find_ld_json(json.loads(ld.string))) is not None: ld_json = json.loads(ld.string)
return r if type(ld_json) != list:
ld_json = [ld_json]
for ld_json_item in ld_json:
# recipes type might be wrapped in @graph type
if '@graph' in ld_json_item:
for x in ld_json_item['@graph']:
if '@type' in x and x['@type'] == 'Recipe':
ld_json_item = x
if '@type' in ld_json_item and ld_json_item['@type'] == 'Recipe':
return find_recipe_json(ld_json_item)
# now try to find microdata # now try to find microdata
items = microdata.get_items(response) items = microdata.get_items(response.text)
for i in items: for i in items:
js = i.json() md_json = json.loads(i.json())
print('hi') if 'schema.org/Recipe' in str(md_json['type']):
return find_recipe_json(md_json['properties'])
return JsonResponse({'error': _('The requested site does not provide any recognized data format to import the recipe from.')}) return JsonResponse({'error': _('The requested site does not provide any recognized data format to import the recipe from.')})