updated import from source to use text scraper

smilerz 2021-04-01 16:19:18 -05:00
parent bf3c30a8fb
commit a54f4e1367


@@ -1,3 +1,4 @@
+#%%
 import json
 import re
@@ -72,7 +73,6 @@ def get_recipe_from_source(text, url, space):
         'cookTime': ''
     }
     recipe_tree = []
-    temp_tree = []
     parse_list = []
     html_data = []
     images = []
@@ -80,8 +80,11 @@ def get_recipe_from_source(text, url, space):
     text = normalize_string(text)
     try:
         parse_list.append(remove_graph(json.loads(text)))
+        scrape = text_scraper("<script type='application/ld+json'>"+text+"</script>")
     except JSONDecodeError:
         soup = BeautifulSoup(text, "html.parser")
+        scrape = text_scraper(text)
         html_data = get_from_html(soup)
         images += get_images_from_source(soup, url)
         for el in soup.find_all('script', type='application/ld+json'):
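Note on the hunk above: both branches now produce a scrape object, and the JSON branch does so by wrapping the raw JSON-LD text in a script tag so it can be fed through the same text-based scraper as pasted HTML. The following is only a minimal sketch of that idea (extract_ld_json is a hypothetical name, not the project's text_scraper), showing why a bare JSON document becomes scrapeable once wrapped:

import json

from bs4 import BeautifulSoup


def extract_ld_json(page_data):
    # Collect every application/ld+json payload found in an HTML string.
    soup = BeautifulSoup(page_data, "html.parser")
    payloads = []
    for script in soup.find_all('script', type='application/ld+json'):
        try:
            payloads.append(json.loads(script.string))
        except (TypeError, json.JSONDecodeError):
            continue  # skip empty or malformed blocks
    return payloads


# Raw JSON-LD behaves like a normal page once wrapped in a script tag,
# which is what the try-branch above relies on.
wrapped = "<script type='application/ld+json'>" + '{"@type": "Recipe", "name": "Soup"}' + "</script>"
print(extract_ld_json(wrapped))  # [{'@type': 'Recipe', 'name': 'Soup'}]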
@@ -89,13 +92,14 @@ def get_recipe_from_source(text, url, space):
         for el in soup.find_all(type='application/json'):
             parse_list.append(remove_graph(el))
-    # first try finding ld+json as its most common
-    for el in parse_list:
-        # if a url was not provided, try to find one in the first document
-        if not url:
-            if 'url' in el:
-                url = el['url']
+    # if a url was not provided, try to find one in the first document
+    if not url:
+        if 'url' in parse_list[0]:
+            url = parse_list[0]['url']
+    recipe_json = helper.get_from_scraper(scrape, url, space)
+    for el in parse_list:
+        temp_tree = []
         if isinstance(el, Tag):
             try:
                 el = json.loads(el.string)
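In the reordered flow above, recipe extraction is delegated to helper.get_from_scraper(scrape, url, space) before the ld+json walk, and the URL fallback only inspects the first parsed document. A hedged sketch of that guard follows; find_url is a hypothetical name, and the emptiness/type checks are illustration only (the commit itself indexes parse_list[0] directly):

def find_url(parse_list, url=None):
    # Prefer the caller-supplied url, otherwise try the first parsed document.
    if url:
        return url
    if parse_list and isinstance(parse_list[0], dict) and 'url' in parse_list[0]:
        return parse_list[0]['url']
    return None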
@@ -123,11 +127,10 @@ def get_recipe_from_source(text, url, space):
            temp_tree.append(node)
        if '@type' in el and el['@type'] == 'Recipe':
-            recipe_json = helper.find_recipe_json(el, url, space)
            recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
        else:
            recipe_tree += [{'name': 'json', 'children': temp_tree}]
-        temp_tree = []
    return recipe_json, recipe_tree, html_data, images
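The two removals of temp_tree = [] (from the initialisation block and from the end of the loop body) pair with the temp_tree = [] added at the top of the for el in parse_list: loop. Resetting the accumulator at the start of each iteration means an early continue can no longer leak one document's children into the next tree entry. A small sketch of that pattern, with documents and make_nodes as hypothetical stand-ins for parse_list and the node-building code:

def build_tree(documents, make_nodes):
    tree = []
    for doc in documents:
        children = []  # reset at the top of every iteration
        if not isinstance(doc, dict):
            continue   # skipping a document no longer carries stale children forward
        children += make_nodes(doc)
        name = 'ld+json' if doc.get('@type') == 'Recipe' else 'json'
        tree.append({'name': name, 'children': children})
    return tree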
@@ -195,7 +198,7 @@ def text_scraper(text, url=None):
         @classmethod
         def generate(cls, page_data, url, **options):
-            return cls.TextScraper(page_data, url, **options)
+            return cls(page_data, url, **options)
     return TextScraper.generate(text, url)
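The last hunk fixes the nested factory method: inside a classmethod, cls is already bound to the class being defined, so cls(page_data, url, **options) constructs an instance directly, while the old cls.TextScraper(...) looked up an attribute that does not exist on the class. A standalone illustration of the corrected pattern (the constructor body here is a sketch, not the project's actual TextScraper):

class TextScraper:
    def __init__(self, page_data, url=None, **options):
        self.page_data = page_data
        self.url = url
        self.options = options

    @classmethod
    def generate(cls, page_data, url, **options):
        # cls is TextScraper itself, so calling it builds an instance;
        # cls.TextScraper would raise AttributeError.
        return cls(page_data, url, **options)


scraper = TextScraper.generate("<html></html>", "https://example.org")
print(type(scraper).__name__)  # TextScraper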