From f811f5996e0dd038efb3274a89b82d5a61a305d7 Mon Sep 17 00:00:00 2001
From: smilerz
Date: Thu, 1 Apr 2021 15:07:51 -0500
Subject: [PATCH] wrapper for recipe_scrapers to parse text input

---
Review notes (ignored by git am): text_scraper() was amended during review,
so the post-image blob id on the "index" line below no longer matches the
patched file. Plain `git apply` / `git am` do not depend on it.

 cookbook/helper/recipe_html_import.py | 47 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/cookbook/helper/recipe_html_import.py b/cookbook/helper/recipe_html_import.py
index 9140d076..ab34fdad 100644
--- a/cookbook/helper/recipe_html_import.py
+++ b/cookbook/helper/recipe_html_import.py
@@ -1,12 +1,17 @@
 import json
 import re
-from json.decoder import JSONDecodeError
 
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from cookbook.helper import recipe_url_import as helper
+from recipe_scrapers import SCRAPERS, get_domain, _exception_handling
 from recipe_scrapers._utils import get_host_name, normalize_string
+from recipe_scrapers._factory import SchemaScraperFactory
+from recipe_scrapers._schemaorg import SchemaOrg
+from bs4 import BeautifulSoup
+from json import JSONDecodeError
+from json.decoder import JSONDecodeError
 
 
 def get_recipe_from_source(text, url, space):
     def build_node(k, v):
@@ -165,6 +170,46 @@ def get_images_from_source(soup, url):
     return images
 
 
+def text_scraper(text, url=None):
+    """Build a recipe_scrapers scraper from HTML text that is already in memory.
+
+    The scraper class registered in SCRAPERS for the domain of ``url`` is used
+    when there is one; otherwise the generic schema.org scraper is used.
+    """
+    # url is optional, so only look up a domain when one was supplied
+    domain = get_domain(url) if url else None
+    if domain in SCRAPERS:
+        scraper_class = SCRAPERS[domain]
+    else:
+        # SchemaScraperFactory is only a factory; its nested class is the scraper
+        scraper_class = SchemaScraperFactory.SchemaScraper
+
+    class TextScraper(scraper_class):
+        def __init__(
+            self,
+            page_data,
+            url=None
+        ):
+            # the parent __init__ is not called; state is built from page_data
+            self.wild_mode = False
+            self.exception_handling = _exception_handling
+            self.meta_http_equiv = False
+            self.soup = BeautifulSoup(page_data, "html.parser")
+            self.url = url
+            try:
+                self.schema = SchemaOrg(page_data)
+            except JSONDecodeError:
+                # malformed embedded JSON: self.schema is left unset
+                pass
+
+        @classmethod
+        def generate(cls, page_data, url, **options):
+            # cls is TextScraper itself, so instantiate it directly
+            return cls(page_data, url, **options)
+
+    return TextScraper.generate(text, url)
+
+
 def remove_graph(el):
     # recipes type might be wrapped in @graph type
     if isinstance(el, Tag):