updated import from source to use text scraper
commit a54f4e1367
parent bf3c30a8fb
@@ -1,3 +1,4 @@
+#%%
 import json
 import re
 
@@ -72,7 +73,6 @@ def get_recipe_from_source(text, url, space):
         'cookTime': ''
     }
     recipe_tree = []
-    temp_tree = []
     parse_list = []
     html_data = []
     images = []
@@ -80,8 +80,11 @@ def get_recipe_from_source(text, url, space):
     text = normalize_string(text)
     try:
         parse_list.append(remove_graph(json.loads(text)))
+        scrape = text_scraper("<script type='application/ld+json'>"+text+"</script>")
+
     except JSONDecodeError:
         soup = BeautifulSoup(text, "html.parser")
+        scrape = text_scraper(text)
         html_data = get_from_html(soup)
         images += get_images_from_source(soup, url)
         for el in soup.find_all('script', type='application/ld+json'):
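The try branch covers input that is already raw JSON rather than an HTML page: wrapping the text in a script tag gives it the shape that ld+json extractors expect to find inside a document, so the same text_scraper path serves both kinds of input. A minimal sketch of that round trip (the recipe literal and the BeautifulSoup calls here are illustrative, not part of this commit):

import json
from bs4 import BeautifulSoup

# Raw JSON pasted by a user, with no surrounding HTML.
raw = '{"@type": "Recipe", "name": "Pancakes", "url": "https://example.org/pancakes"}'

# Wrapping it in a script tag produces the markup that
# schema.org/ld+json extractors scan a page for.
wrapped = "<script type='application/ld+json'>" + raw + "</script>"

soup = BeautifulSoup(wrapped, "html.parser")
el = soup.find('script', type='application/ld+json')
print(json.loads(el.string)['name'])  # -> Pancakes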
@@ -89,13 +92,14 @@ def get_recipe_from_source(text, url, space):
         for el in soup.find_all(type='application/json'):
             parse_list.append(remove_graph(el))
 
-    # first try finding ld+json as its most common
-    for el in parse_list:
-        # if a url was not provided, try to find one in the first document
-        if not url:
-            if 'url' in el:
-                url = el['url']
+    # if a url was not provided, try to find one in the first document
+    if not url:
+        if 'url' in parse_list[0]:
+            url = parse_list[0]['url']
+    recipe_json = helper.get_from_scraper(scrape, url, space)
 
+    for el in parse_list:
+        temp_tree = []
         if isinstance(el, Tag):
             try:
                 el = json.loads(el.string)
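With extraction now delegated to helper.get_from_scraper, the URL fallback runs once against the first parsed document instead of on every loop iteration. A sketch of that fallback in isolation; the extra truthiness guard on parse_list is an assumption added here for safety, not something the diff contains (it indexes parse_list[0] directly):

parse_list = [{'@type': 'Recipe', 'url': 'https://example.org/pancakes'}]
url = None

if not url:
    # Guard against an empty parse_list (illustrative assumption).
    if parse_list and 'url' in parse_list[0]:
        url = parse_list[0]['url']

print(url)  # -> https://example.org/pancakes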
@@ -123,11 +127,10 @@ def get_recipe_from_source(text, url, space):
             temp_tree.append(node)
 
         if '@type' in el and el['@type'] == 'Recipe':
-            recipe_json = helper.find_recipe_json(el, url, space)
             recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
         else:
             recipe_tree += [{'name': 'json', 'children': temp_tree}]
-        temp_tree = []
+
 
     return recipe_json, recipe_tree, html_data, images
 
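Resetting temp_tree at the top of the loop (previous hunk) rather than at the bottom of each iteration means no early exit or new branch can leak one document's children into the next group. A tiny sketch of the pattern with made-up data:

docs = [['a', 'b'], ['c']]
recipe_tree = []
for doc in docs:
    temp_tree = []  # fresh per iteration; nothing carries over
    for node in doc:
        temp_tree.append(node)
    recipe_tree += [{'name': 'json', 'children': temp_tree}]

print(recipe_tree)
# -> [{'name': 'json', 'children': ['a', 'b']}, {'name': 'json', 'children': ['c']}]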
@@ -195,7 +198,7 @@ def text_scraper(text, url=None):
 
         @classmethod
         def generate(cls, page_data, url, **options):
-            return cls.TextScraper(page_data, url, **options)
+            return cls(page_data, url, **options)
 
     return TextScraper.generate(text, url)
 
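The generate change is a genuine bug fix: inside a classmethod, cls is already the TextScraper class, so cls(...) constructs an instance, whereas cls.TextScraper would look up a nonexistent attribute on the class and raise AttributeError. A minimal sketch with the class body reduced to the relevant parts (the __init__ signature here is assumed):

class TextScraper:
    def __init__(self, page_data, url):
        self.page_data = page_data
        self.url = url

    @classmethod
    def generate(cls, page_data, url, **options):
        # cls is the class itself here, so this builds an instance;
        # cls.TextScraper would raise AttributeError.
        return cls(page_data, url, **options)

scraper = TextScraper.generate("<html></html>", "https://example.org")
print(type(scraper).__name__)  # -> TextScraper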