diff --git a/cookbook/helper/recipe_html_import.py b/cookbook/helper/recipe_html_import.py
index 7fa7beaf..f78ccb79 100644
--- a/cookbook/helper/recipe_html_import.py
+++ b/cookbook/helper/recipe_html_import.py
@@ -9,6 +9,8 @@ from recipe_scrapers._utils import get_host_name, normalize_string
 
 from cookbook.helper import recipe_url_import as helper
 from cookbook.helper.scrapers.scrapers import text_scraper
+from recipe_scrapers import scrape_me
+from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
 
 
 def get_recipe_from_source(text, url, request):
@@ -63,34 +65,41 @@ def get_recipe_from_source(text, url, request):
     html_data = []
     images = []
     text = unquote(text)
+    scrape = None
 
-    try:
-        parse_list.append(remove_graph(json.loads(text)))
-        if not url and 'url' in parse_list[0]:
-            url = parse_list[0]['url']
-        scrape = text_scraper("", url=url)
+    if url:
+        try:
+            scrape = scrape_me(url_path=url, wild_mode=True)
+        except(NoSchemaFoundInWildMode):
+            pass
+    if not scrape:
+        try:
+            parse_list.append(remove_graph(json.loads(text)))
+            if not url and 'url' in parse_list[0]:
+                url = parse_list[0]['url']
+            scrape = text_scraper("", url=url)
 
-    except JSONDecodeError:
-        soup = BeautifulSoup(text, "html.parser")
-        html_data = get_from_html(soup)
-        images += get_images_from_source(soup, url)
-        for el in soup.find_all('script', type='application/ld+json'):
-            el = remove_graph(el)
-            if not url and 'url' in el:
-                url = el['url']
-            if type(el) == list:
-                for le in el:
-                    parse_list.append(le)
-            elif type(el) == dict:
-                parse_list.append(el)
-        for el in soup.find_all(type='application/json'):
-            el = remove_graph(el)
-            if type(el) == list:
-                for le in el:
-                    parse_list.append(le)
-            elif type(el) == dict:
-                parse_list.append(el)
-        scrape = text_scraper(text, url=url)
+        except JSONDecodeError:
+            soup = BeautifulSoup(text, "html.parser")
+            html_data = get_from_html(soup)
+            images += get_images_from_source(soup, url)
+            for el in soup.find_all('script', type='application/ld+json'):
+                el = remove_graph(el)
+                if not url and 'url' in el:
+                    url = el['url']
+                if type(el) == list:
+                    for le in el:
+                        parse_list.append(le)
+                elif type(el) == dict:
+                    parse_list.append(el)
+            for el in soup.find_all(type='application/json'):
+                el = remove_graph(el)
+                if type(el) == list:
+                    for le in el:
+                        parse_list.append(le)
+                elif type(el) == dict:
+                    parse_list.append(el)
+            scrape = text_scraper(text, url=url)
 
     recipe_json = helper.get_from_scraper(scrape, request)
diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py
index 9ad34528..0d8688b2 100644
--- a/cookbook/helper/recipe_url_import.py
+++ b/cookbook/helper/recipe_url_import.py
@@ -114,7 +114,14 @@ def get_from_scraper(scrape, request):
     except Exception:
         pass
 
-    if source_url := scrape.url:
+    try:
+        source_url = scrape.canonical_url()
+    except Exception:
+        try:
+            source_url = scrape.url
+        except Exception:
+            pass
+    if source_url:
         recipe_json['source_url'] = source_url
         try:
             keywords.append(source_url.replace('http://', '').replace('https://', '').split('/')[0])
@@ -129,9 +136,11 @@
     ingredient_parser = IngredientParser(request, True)
 
     recipe_json['steps'] = []
-
-    for i in parse_instructions(scrape.instructions()):
-        recipe_json['steps'].append({'instruction': i, 'ingredients': [], })
+    try:
+        for i in parse_instructions(scrape.instructions()):
+            recipe_json['steps'].append({'instruction': i, 'ingredients': [], })
+    except Exception:
+        pass
 
     if len(recipe_json['steps']) == 0:
         recipe_json['steps'].append({'instruction': '', 'ingredients': [], })
diff --git a/cookbook/helper/scrapers/scrapers.py b/cookbook/helper/scrapers/scrapers.py
index 6d785a5e..eb93cc2c 100644
--- a/cookbook/helper/scrapers/scrapers.py
+++ b/cookbook/helper/scrapers/scrapers.py
@@ -1,6 +1,6 @@
 from bs4 import BeautifulSoup
 from json import JSONDecodeError
-from recipe_scrapers import SCRAPERS, get_host_name
+from recipe_scrapers import SCRAPERS
 from recipe_scrapers._factory import SchemaScraperFactory
 from recipe_scrapers._schemaorg import SchemaOrg
 
@@ -15,13 +15,7 @@ SCRAPERS.update(CUSTOM_SCRAPERS)
 
 
 def text_scraper(text, url=None):
-    domain = None
-    if url:
-        domain = get_host_name(url)
-    if domain in SCRAPERS:
-        scraper_class = SCRAPERS[domain]
-    else:
-        scraper_class = SchemaScraperFactory.SchemaScraper
+    scraper_class = SchemaScraperFactory.SchemaScraper
 
     class TextScraper(scraper_class):
        def __init__(
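
Reviewer note: the net effect of these changes is a two-step lookup in `get_recipe_from_source`: when a URL is available, try `recipe_scrapers` in wild mode first, and only fall back to the project's own `text_scraper` over the pasted JSON/HTML when no schema is found. A minimal sketch of that order is below, using only the calls that appear in the diff; `pick_scraper` is a hypothetical name for illustration, not a function added by this PR.

```python
from recipe_scrapers import scrape_me
from recipe_scrapers._exceptions import NoSchemaFoundInWildMode

from cookbook.helper.scrapers.scrapers import text_scraper


def pick_scraper(text, url=None):
    # Hypothetical helper mirroring the new flow in get_recipe_from_source.
    scrape = None
    if url:
        try:
            # wild_mode builds a schema.org-based scraper even for hosts that
            # have no dedicated scraper class in recipe_scrapers
            scrape = scrape_me(url_path=url, wild_mode=True)
        except NoSchemaFoundInWildMode:
            scrape = None
    if not scrape:
        # fall back to the project's text_scraper over the submitted text/HTML
        scrape = text_scraper(text, url=url)
    return scrape
```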