Call scrape_me first when scraping from url

Daniel Schulz 2022-05-10 00:08:37 +02:00
parent 33a7fee1cc
commit 2a7475c435
3 changed files with 50 additions and 38 deletions

File 1 of 3

@@ -9,6 +9,8 @@ from recipe_scrapers._utils import get_host_name, normalize_string
 from cookbook.helper import recipe_url_import as helper
 from cookbook.helper.scrapers.scrapers import text_scraper
+from recipe_scrapers import scrape_me
+from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
 
 def get_recipe_from_source(text, url, request):
@@ -63,7 +65,14 @@ def get_recipe_from_source(text, url, request):
     html_data = []
     images = []
     text = unquote(text)
 
+    scrape = None
+    if url:
+        try:
+            scrape = scrape_me(url_path=url, wild_mode=True)
+        except(NoSchemaFoundInWildMode):
+            pass
+    if not scrape:
         try:
             parse_list.append(remove_graph(json.loads(text)))
             if not url and 'url' in parse_list[0]:
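
Taken together, the hunk above changes the import flow: when a URL is supplied, Tandoor now asks recipe-scrapers to fetch and parse the page first, and only falls back to parsing the pasted text when no schema is found. A minimal sketch of that control flow, where scrape_with_fallback is a hypothetical helper name (scrape_me and NoSchemaFoundInWildMode are the real recipe-scrapers imports from the diff; the rest is illustration):

from recipe_scrapers import scrape_me
from recipe_scrapers._exceptions import NoSchemaFoundInWildMode

def scrape_with_fallback(url, text):
    # Try the wild-mode scraper first: it fetches the page itself and
    # raises NoSchemaFoundInWildMode when no schema.org data is present.
    scrape = None
    if url:
        try:
            scrape = scrape_me(url_path=url, wild_mode=True)
        except NoSchemaFoundInWildMode:
            pass
    if not scrape:
        # Fall back to whatever the user pasted (JSON or raw HTML),
        # mirroring the json.loads() branch in the diff above.
        pass
    return scrape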

File 2 of 3

@@ -114,7 +114,14 @@ def get_from_scraper(scrape, request):
     except Exception:
         pass
 
-    if source_url := scrape.url:
+    try:
+        source_url = scrape.canonical_url()
+    except Exception:
+        try:
+            source_url = scrape.url
+        except Exception:
+            pass
+    if source_url:
         recipe_json['source_url'] = source_url
         try:
             keywords.append(source_url.replace('http://', '').replace('https://', '').split('/')[0])
@@ -129,9 +136,11 @@ def get_from_scraper(scrape, request):
     ingredient_parser = IngredientParser(request, True)
 
     recipe_json['steps'] = []
+    try:
         for i in parse_instructions(scrape.instructions()):
             recipe_json['steps'].append({'instruction': i, 'ingredients': [], })
+    except Exception:
+        pass
 
     if len(recipe_json['steps']) == 0:
         recipe_json['steps'].append({'instruction': '', 'ingredients': [], })
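
These two hunks harden get_from_scraper against scrapers with missing data: the source URL now prefers the page's canonical link and falls back to the fetched URL, and a failing instructions() call can no longer abort the whole import. A hedged sketch of the URL resolution, where resolve_source_url is a hypothetical name (canonical_url() and .url are the scraper's own API, as used in the diff):

def resolve_source_url(scrape):
    source_url = None
    try:
        # Preferred: the canonical URL the page itself declares.
        source_url = scrape.canonical_url()
    except Exception:
        try:
            # Fallback: the URL the scraper actually fetched.
            source_url = scrape.url
        except Exception:
            pass
    return source_url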

File 3 of 3

@@ -1,6 +1,6 @@
 from bs4 import BeautifulSoup
 from json import JSONDecodeError
-from recipe_scrapers import SCRAPERS, get_host_name
+from recipe_scrapers import SCRAPERS
 from recipe_scrapers._factory import SchemaScraperFactory
 from recipe_scrapers._schemaorg import SchemaOrg
@@ -15,12 +15,6 @@ SCRAPERS.update(CUSTOM_SCRAPERS)
 def text_scraper(text, url=None):
-    domain = None
-    if url:
-        domain = get_host_name(url)
-    if domain in SCRAPERS:
-        scraper_class = SCRAPERS[domain]
-    else:
-        scraper_class = SchemaScraperFactory.SchemaScraper
+    scraper_class = SchemaScraperFactory.SchemaScraper
 
     class TextScraper(scraper_class):
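
With URL-based dispatch now handled by scrape_me, text_scraper no longer needs the per-domain lookup and always builds on the generic schema.org scraper. A hypothetical usage sketch, assuming text_scraper returns an instance of the TextScraper it defines (the example URL and the hasattr probe are illustrative; the import path comes from the first file's diff):

import requests
from cookbook.helper.scrapers.scrapers import text_scraper

# text_scraper wraps HTML we already downloaded, without refetching it.
html = requests.get('https://example.com/some-recipe').text
scrape = text_scraper(html, url='https://example.com/some-recipe')
if hasattr(scrape, 'schema'):
    print(scrape.schema.data)  # raw schema.org dict, if one was found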