Call scrape_me first when scraping from url
parent 33a7fee1cc
commit 2a7475c435
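Two scraping helpers change here. get_recipe_from_source now tries recipe-scrapers' scrape_me (in wild mode) whenever a source URL is available, and only falls back to parsing the submitted text; text_scraper stops dispatching on the URL's host name, since host-specific scrapers are now scrape_me's job. get_from_scraper additionally prefers the scraper's canonical URL and tolerates scrapers whose instructions() raises.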
@@ -9,6 +9,8 @@ from recipe_scrapers._utils import get_host_name, normalize_string
 
 from cookbook.helper import recipe_url_import as helper
 from cookbook.helper.scrapers.scrapers import text_scraper
+from recipe_scrapers import scrape_me
+from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
 
 
 def get_recipe_from_source(text, url, request):
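The two new imports are recipe-scrapers' top-level scrape_me entry point and the exception it raises when wild mode finds no schema.org recipe on a page. A minimal sketch of the fast path they enable, assuming a recipe-scrapers version that still accepts wild_mode (the helper name try_url_first is ours, not Tandoor's):

from recipe_scrapers import scrape_me
from recipe_scrapers._exceptions import NoSchemaFoundInWildMode


def try_url_first(url):
    # Prefer a real HTTP scrape of the URL; wild mode lets recipe-scrapers
    # fall back to generic schema.org parsing for unknown hosts.
    try:
        return scrape_me(url_path=url, wild_mode=True)
    except NoSchemaFoundInWildMode:
        # No schema.org recipe found; signal the caller to parse raw text.
        return None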
@@ -63,34 +65,41 @@ def get_recipe_from_source(text, url, request):
     html_data = []
     images = []
     text = unquote(text)
+    scrape = None
 
-    try:
-        parse_list.append(remove_graph(json.loads(text)))
-        if not url and 'url' in parse_list[0]:
-            url = parse_list[0]['url']
-        scrape = text_scraper("<script type='application/ld+json'>" + text + "</script>", url=url)
+    if url:
+        try:
+            scrape = scrape_me(url_path=url, wild_mode=True)
+        except(NoSchemaFoundInWildMode):
+            pass
+    if not scrape:
+        try:
+            parse_list.append(remove_graph(json.loads(text)))
+            if not url and 'url' in parse_list[0]:
+                url = parse_list[0]['url']
+            scrape = text_scraper("<script type='application/ld+json'>" + text + "</script>", url=url)
 
-    except JSONDecodeError:
-        soup = BeautifulSoup(text, "html.parser")
-        html_data = get_from_html(soup)
-        images += get_images_from_source(soup, url)
-        for el in soup.find_all('script', type='application/ld+json'):
-            el = remove_graph(el)
-            if not url and 'url' in el:
-                url = el['url']
-            if type(el) == list:
-                for le in el:
-                    parse_list.append(le)
-            elif type(el) == dict:
-                parse_list.append(el)
-        for el in soup.find_all(type='application/json'):
-            el = remove_graph(el)
-            if type(el) == list:
-                for le in el:
-                    parse_list.append(le)
-            elif type(el) == dict:
-                parse_list.append(el)
-        scrape = text_scraper(text, url=url)
+        except JSONDecodeError:
+            soup = BeautifulSoup(text, "html.parser")
+            html_data = get_from_html(soup)
+            images += get_images_from_source(soup, url)
+            for el in soup.find_all('script', type='application/ld+json'):
+                el = remove_graph(el)
+                if not url and 'url' in el:
+                    url = el['url']
+                if type(el) == list:
+                    for le in el:
+                        parse_list.append(le)
+                elif type(el) == dict:
+                    parse_list.append(el)
+            for el in soup.find_all(type='application/json'):
+                el = remove_graph(el)
+                if type(el) == list:
+                    for le in el:
+                        parse_list.append(le)
+                elif type(el) == dict:
+                    parse_list.append(el)
+            scrape = text_scraper(text, url=url)
 
     recipe_json = helper.get_from_scraper(scrape, request)
 
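The fallback branch is unchanged in substance: it digs JSON-LD (and plain application/json) script blocks out of the pasted HTML. A self-contained illustration of that extraction, reduced to its BeautifulSoup core (the sample HTML is ours):

import json

from bs4 import BeautifulSoup

html = """<html><head><script type="application/ld+json">
{"@type": "Recipe", "name": "Pancakes", "url": "https://example.org/pancakes"}
</script></head></html>"""

soup = BeautifulSoup(html, "html.parser")
for el in soup.find_all('script', type='application/ld+json'):
    data = json.loads(el.string)  # one JSON-LD payload per script tag
    print(data['name'])           # -> Pancakes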
@@ -114,7 +114,14 @@ def get_from_scraper(scrape, request):
         except Exception:
             pass
 
-    if source_url := scrape.url:
+    try:
+        source_url = scrape.canonical_url()
+    except Exception:
+        try:
+            source_url = scrape.url
+        except Exception:
+            pass
+    if source_url:
         recipe_json['source_url'] = source_url
         try:
             keywords.append(source_url.replace('http://', '').replace('https://', '').split('/')[0])
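Resolving the source URL now prefers canonical_url(), which HTTP-backed recipe-scrapers objects derive from the page's canonical link and which text-built scrapers may not support, before falling back to the raw url attribute. A hedged sketch of the same chain as a standalone helper; unlike the committed hunk, it pre-initializes source_url so the final check can never hit an unbound name if both lookups fail:

def resolve_source_url(scrape):
    # Hypothetical helper mirroring the new fallback chain.
    source_url = None
    try:
        # canonical_url() exists on HTTP-backed scrapers; text-only
        # scrapers may raise here.
        source_url = scrape.canonical_url()
    except Exception:
        try:
            source_url = scrape.url
        except Exception:
            pass
    return source_url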
@@ -129,9 +136,11 @@ def get_from_scraper(scrape, request):
     ingredient_parser = IngredientParser(request, True)
 
     recipe_json['steps'] = []
-    for i in parse_instructions(scrape.instructions()):
-        recipe_json['steps'].append({'instruction': i, 'ingredients': [], })
-
+    try:
+        for i in parse_instructions(scrape.instructions()):
+            recipe_json['steps'].append({'instruction': i, 'ingredients': [], })
+    except Exception:
+        pass
     if len(recipe_json['steps']) == 0:
         recipe_json['steps'].append({'instruction': '', 'ingredients': [], })
 
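Wrapping the instruction parse in try/except keeps a scraper whose instructions() raises (easy to hit with wild-mode scrapes of pages missing recipeInstructions) from aborting the whole import; the existing empty-step fallback then supplies one blank step. A toy reproduction (BrokenScrape and the split-based stand-in for parse_instructions are ours):

class BrokenScrape:
    def instructions(self):
        raise KeyError('recipeInstructions')


recipe_json = {'steps': []}
try:
    for i in BrokenScrape().instructions().split('\n'):
        recipe_json['steps'].append({'instruction': i, 'ingredients': []})
except Exception:
    pass  # swallow scraper errors; the import continues
if len(recipe_json['steps']) == 0:
    recipe_json['steps'].append({'instruction': '', 'ingredients': []})
print(recipe_json)  # {'steps': [{'instruction': '', 'ingredients': []}]}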
@@ -1,6 +1,6 @@
 from bs4 import BeautifulSoup
 from json import JSONDecodeError
-from recipe_scrapers import SCRAPERS, get_host_name
+from recipe_scrapers import SCRAPERS
 from recipe_scrapers._factory import SchemaScraperFactory
 from recipe_scrapers._schemaorg import SchemaOrg
 
@@ -15,13 +15,7 @@ SCRAPERS.update(CUSTOM_SCRAPERS)
 
 
 def text_scraper(text, url=None):
-    domain = None
-    if url:
-        domain = get_host_name(url)
-    if domain in SCRAPERS:
-        scraper_class = SCRAPERS[domain]
-    else:
-        scraper_class = SchemaScraperFactory.SchemaScraper
+    scraper_class = SchemaScraperFactory.SchemaScraper
 
     class TextScraper(scraper_class):
         def __init__(
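text_scraper no longer dispatches on the URL's host: with scrape_me handling known sites upstream, every text scrape subclasses the generic schema.org scraper. A condensed before/after of the class selection (helper names old_pick/new_pick are ours; get_host_name is imported from recipe_scrapers._utils, as the other file's imports show):

from recipe_scrapers import SCRAPERS
from recipe_scrapers._factory import SchemaScraperFactory
from recipe_scrapers._utils import get_host_name


def old_pick(url=None):
    # Pre-commit behavior: host-specific scraper when the domain is known.
    domain = get_host_name(url) if url else None
    if domain in SCRAPERS:
        return SCRAPERS[domain]
    return SchemaScraperFactory.SchemaScraper


def new_pick(url=None):
    # Post-commit behavior: one generic class for all pasted/raw text.
    return SchemaScraperFactory.SchemaScraper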