diff --git a/cookbook/helper/recipe_html_import.py b/cookbook/helper/recipe_html_import.py
index e4d818d4..d2264721 100644
--- a/cookbook/helper/recipe_html_import.py
+++ b/cookbook/helper/recipe_html_import.py
@@ -1,14 +1,12 @@
-#%%
import json
import re
from bs4 import BeautifulSoup
from bs4.element import Tag
from cookbook.helper import recipe_url_import as helper
-from recipe_scrapers import SCRAPERS, get_domain, _exception_handling
+from cookbook.helper.scrapers.scrapers import text_scraper
from recipe_scrapers._utils import get_host_name, normalize_string
-from recipe_scrapers._factory import SchemaScraperFactory
-from recipe_scrapers._schemaorg import SchemaOrg
+
from bs4 import BeautifulSoup
from json import JSONDecodeError
@@ -93,10 +91,12 @@ def get_recipe_from_source(text, url, space):
parse_list.append(remove_graph(el))
# if a url was not provided, try to find one in the first document
- if not url:
+ if not url and len(parse_list) > 0:
if 'url' in parse_list[0]:
url = parse_list[0]['url']
- recipe_json = helper.get_from_scraper(scrape, url, space)
+
+
+ recipe_json = helper.get_from_scraper(scrape, space)
for el in parse_list:
temp_tree = []
@@ -173,36 +173,6 @@ def get_images_from_source(soup, url):
return images
-def text_scraper(text, url=None):
- domain = get_domain(url)
- if domain in SCRAPERS:
- scraper_class = SCRAPERS[domain]
- else:
- scraper_class = SchemaScraperFactory
-
- class TextScraper(scraper_class):
- def __init__(
- self,
- page_data,
- url=None
- ):
- self.wild_mode = False
- self.exception_handling = _exception_handling
- self.meta_http_equiv = False
- self.soup = BeautifulSoup(page_data, "html.parser")
- self.url = url
- try:
- self.schema = SchemaOrg(page_data)
- except JSONDecodeError:
- pass
-
- @classmethod
- def generate(cls, page_data, url, **options):
- return cls(page_data, url, **options)
-
- return TextScraper.generate(text, url)
-
-
def remove_graph(el):
# recipes type might be wrapped in @graph type
if isinstance(el, Tag):
diff --git a/cookbook/helper/recipe_url_import.py b/cookbook/helper/recipe_url_import.py
index 9333ff67..f8b25e07 100644
--- a/cookbook/helper/recipe_url_import.py
+++ b/cookbook/helper/recipe_url_import.py
@@ -174,7 +174,8 @@ def get_from_scraper(scrape, space):
except AttributeError:
recipe_json['recipeInstructions'] = ""
- recipe_json['recipeInstructions'] += "\n\nImported from " + scrape.url
+ if scrape.url:
+ recipe_json['recipeInstructions'] += "\n\nImported from " + scrape.url
return recipe_json
diff --git a/cookbook/helper/scrapers/scrapers.py b/cookbook/helper/scrapers/scrapers.py
new file mode 100644
index 00000000..baf4bf9d
--- /dev/null
+++ b/cookbook/helper/scrapers/scrapers.py
@@ -0,0 +1,42 @@
+from json import JSONDecodeError
+from bs4 import BeautifulSoup
+from recipe_scrapers import SCRAPERS, get_domain, _exception_handling
+from recipe_scrapers._factory import SchemaScraperFactory
+from recipe_scrapers._schemaorg import SchemaOrg
+
+from .cooksillustrated import CooksIllustrated
+
+CUSTOM_SCRAPERS = {
+ CooksIllustrated.host(): CooksIllustrated,
+}
+
+SCRAPERS.update(CUSTOM_SCRAPERS)
+#%%
+def text_scraper(text, url=None):
+ domain = None
+ if url:
+ domain = get_domain(url)
+ if domain in SCRAPERS:
+ scraper_class = SCRAPERS[domain]
+ else:
+ scraper_class = SchemaScraperFactory.SchemaScraper
+
+ class TextScraper(scraper_class):
+ def __init__(
+ self,
+ page_data,
+ url=None
+ ):
+ self.wild_mode = False
+ self.exception_handling = _exception_handling
+ self.meta_http_equiv = False
+ self.soup = BeautifulSoup(page_data, "html.parser")
+ self.url = url
+ try:
+ self.schema = SchemaOrg(page_data)
+ except JSONDecodeError:
+ pass
+
+ return TextScraper(text, url)
+
+# %%