diff --git a/cookbook/helper/recipe_html_import.py b/cookbook/helper/recipe_html_import.py index e4d818d4..d2264721 100644 --- a/cookbook/helper/recipe_html_import.py +++ b/cookbook/helper/recipe_html_import.py @@ -1,14 +1,12 @@ -#%% import json import re from bs4 import BeautifulSoup from bs4.element import Tag from cookbook.helper import recipe_url_import as helper -from recipe_scrapers import SCRAPERS, get_domain, _exception_handling +from cookbook.helper.scrapers.scrapers import text_scraper from recipe_scrapers._utils import get_host_name, normalize_string -from recipe_scrapers._factory import SchemaScraperFactory -from recipe_scrapers._schemaorg import SchemaOrg + from bs4 import BeautifulSoup from json import JSONDecodeError @@ -93,10 +91,12 @@ def get_recipe_from_source(text, url, space): parse_list.append(remove_graph(el)) # if a url was not provided, try to find one in the first document - if not url: + if not url and len(parse_list) > 0: if 'url' in parse_list[0]: url = parse_list[0]['url'] - recipe_json = helper.get_from_scraper(scrape, url, space) + + + recipe_json = helper.get_from_scraper(scrape, space) for el in parse_list: temp_tree = [] @@ -173,36 +173,6 @@ def get_images_from_source(soup, url): return images -def text_scraper(text, url=None): - domain = get_domain(url) - if domain in SCRAPERS: - scraper_class = SCRAPERS[domain] - else: - scraper_class = SchemaScraperFactory - - class TextScraper(scraper_class): - def __init__( - self, - page_data, - url=None - ): - self.wild_mode = False - self.exception_handling = _exception_handling - self.meta_http_equiv = False - self.soup = BeautifulSoup(page_data, "html.parser") - self.url = url - try: - self.schema = SchemaOrg(page_data) - except JSONDecodeError: - pass - - @classmethod - def generate(cls, page_data, url, **options): - return cls(page_data, url, **options) - - return TextScraper.generate(text, url) - - def remove_graph(el): # recipes type might be wrapped in @graph type if isinstance(el, 
Tag): diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py index 9333ff67..f8b25e07 100644 --- a/cookbook/helper/recipe_url_import.py +++ b/cookbook/helper/recipe_url_import.py @@ -174,7 +174,8 @@ def get_from_scraper(scrape, space): except AttributeError: recipe_json['recipeInstructions'] = "" - recipe_json['recipeInstructions'] += "\n\nImported from " + scrape.url + if scrape.url: + recipe_json['recipeInstructions'] += "\n\nImported from " + scrape.url return recipe_json diff --git a/cookbook/helper/scrapers/scrapers.py b/cookbook/helper/scrapers/scrapers.py new file mode 100644 index 00000000..baf4bf9d --- /dev/null +++ b/cookbook/helper/scrapers/scrapers.py @@ -0,0 +1,42 @@ +from json import JSONDecodeError +from bs4 import BeautifulSoup +from recipe_scrapers import SCRAPERS, get_domain, _exception_handling +from recipe_scrapers._factory import SchemaScraperFactory +from recipe_scrapers._schemaorg import SchemaOrg + +from .cooksillustrated import CooksIllustrated + +CUSTOM_SCRAPERS = { + CooksIllustrated.host(): CooksIllustrated, +} + +SCRAPERS = {**SCRAPERS, **CUSTOM_SCRAPERS} +#%% +def text_scraper(text, url=None): + domain = None + if url: + domain = get_domain(url) + if domain in SCRAPERS: + scraper_class = SCRAPERS[domain] + else: + scraper_class = SchemaScraperFactory.SchemaScraper + + class TextScraper(scraper_class): + def __init__( + self, + page_data, + url=None + ): + self.wild_mode = False + self.exception_handling = _exception_handling + self.meta_http_equiv = False + self.soup = BeautifulSoup(page_data, "html.parser") + self.url = url + try: + self.schema = SchemaOrg(page_data) + except JSONDecodeError: + pass + + return TextScraper(text, url) + +# %%