From a54f4e1367524ac35d2221e1c34e506443ffcf75 Mon Sep 17 00:00:00 2001
From: smilerz <smilerz@gmail.com>
Date: Thu, 1 Apr 2021 16:19:18 -0500
Subject: [PATCH] updated import from source to use text scraper

---
 cookbook/helper/recipe_html_import.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)
diff --git a/cookbook/helper/recipe_html_import.py b/cookbook/helper/recipe_html_import.py
index ab34fdad..e4d818d4 100644
--- a/cookbook/helper/recipe_html_import.py
+++ b/cookbook/helper/recipe_html_import.py
@@ -1,3 +1,4 @@
+#%%
 import json
 import re
 
@@ -72,7 +73,6 @@ def get_recipe_from_source(text, url, space):
                 'cookTime': ''
                 }
     recipe_tree = []
-    temp_tree = []
     parse_list = []
     html_data = []
     images = []
@@ -80,8 +80,11 @@ def get_recipe_from_source(text, url, space):
     text = normalize_string(text)
     try:
         parse_list.append(remove_graph(json.loads(text)))
+        scrape = text_scraper("<script type='application/ld+json'>"+text+"</script>")
+        
     except JSONDecodeError:
         soup = BeautifulSoup(text, "html.parser")
+        scrape = text_scraper(text)
         html_data = get_from_html(soup)
         images += get_images_from_source(soup, url)
         for el in soup.find_all('script', type='application/ld+json'):
@@ -89,13 +92,14 @@ def get_recipe_from_source(text, url, space):
         for el in soup.find_all(type='application/json'):
             parse_list.append(remove_graph(el))
 
-    # first try finding ld+json as its most common
-    for el in parse_list:
-        # if a url was not provided, try to find one in the first document
-        if not url:
-            if 'url' in el:
-                url = el['url']
+    # if no url was given, take it from the first parsed document (parse_list may be empty)
+    if not url and parse_list:
+        if 'url' in parse_list[0]:
+            url = parse_list[0]['url']
+    recipe_json = helper.get_from_scraper(scrape, url, space)
 
+    for el in parse_list:
+        temp_tree = []
         if isinstance(el, Tag):
             try:
                 el = json.loads(el.string)
@@ -123,11 +127,10 @@ def get_recipe_from_source(text, url, space):
             temp_tree.append(node)
 
         if '@type' in el and el['@type'] == 'Recipe':
-            recipe_json = helper.find_recipe_json(el, url, space)
             recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
         else:
             recipe_tree += [{'name': 'json', 'children': temp_tree}]
-        temp_tree = []
+        
 
     return recipe_json, recipe_tree, html_data, images
 
@@ -195,7 +198,7 @@ def text_scraper(text, url=None):
         
         @classmethod
         def generate(cls, page_data, url, **options):
-            return cls.TextScraper(page_data, url, **options)
+            return cls(page_data, url, **options)
 
     return TextScraper.generate(text, url)