wrapper for recipe_scrapers to parse text input

This commit is contained in:
smilerz 2021-04-01 15:07:51 -05:00
parent a3490240f4
commit f811f5996e

View File

@ -1,12 +1,17 @@
import json import json
import re import re
from json.decoder import JSONDecodeError
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from cookbook.helper import recipe_url_import as helper from cookbook.helper import recipe_url_import as helper
from recipe_scrapers import SCRAPERS, get_domain, _exception_handling
from recipe_scrapers._utils import get_host_name, normalize_string from recipe_scrapers._utils import get_host_name, normalize_string
from recipe_scrapers._factory import SchemaScraperFactory
from recipe_scrapers._schemaorg import SchemaOrg
from bs4 import BeautifulSoup
from json import JSONDecodeError
from json.decoder import JSONDecodeError
def get_recipe_from_source(text, url, space): def get_recipe_from_source(text, url, space):
def build_node(k, v): def build_node(k, v):
@ -165,6 +170,36 @@ def get_images_from_source(soup, url):
return images return images
def text_scraper(text, url=None):
    """Build a recipe scraper over page text that has already been obtained.

    The scraper class registered in ``SCRAPERS`` for the domain of ``url`` is
    used as the base class when there is one; otherwise
    ``SchemaScraperFactory`` is used. A local subclass replaces ``__init__``
    (without calling the base constructor) so the instance is populated from
    ``text`` directly.

    Args:
        text: Page markup; parsed with BeautifulSoup's ``html.parser`` and
            also handed to ``SchemaOrg``.
        url: Optional source URL. Used to select the base scraper class and
            stored on the returned instance.

    Returns:
        A ``TextScraper`` instance with ``soup`` and ``url`` set, and with
        ``schema`` set unless ``SchemaOrg(text)`` raised ``JSONDecodeError``.
    """
    # ``url`` defaults to None, so only ask for a domain when one was given.
    domain = get_domain(url) if url else None

    if domain in SCRAPERS:
        scraper_class = SCRAPERS[domain]
    else:
        # NOTE(review): SchemaScraperFactory looks like a factory rather than
        # a scraper class; its nested scraper class may be the intended base
        # here -- confirm against the recipe_scrapers version in use.
        scraper_class = SchemaScraperFactory

    class TextScraper(scraper_class):
        """Scraper initialised from in-memory page text."""

        def __init__(
            self,
            page_data,
            url=None
        ):
            # The base __init__ is deliberately not called; the attributes
            # below are assigned by hand from the supplied text instead.
            self.wild_mode = False
            self.exception_handling = _exception_handling
            self.meta_http_equiv = False
            self.soup = BeautifulSoup(page_data, "html.parser")
            self.url = url
            try:
                self.schema = SchemaOrg(page_data)
            except JSONDecodeError:
                # Best effort: ``schema`` is left unset when the structured
                # data cannot be decoded.
                pass

        @classmethod
        def generate(cls, page_data, url, **options):
            """Instantiate this scraper for ``page_data``.

            ``cls`` is already ``TextScraper``; the previous
            ``cls.TextScraper`` lookup failed because the class is a local
            of the enclosing function, not an attribute of itself.
            ``__init__`` accepts no extra keywords, so a non-empty
            ``options`` raises ``TypeError``.
            """
            return cls(page_data, url, **options)

    return TextScraper.generate(text, url)
def remove_graph(el): def remove_graph(el):
# recipes type might be wrapped in @graph type # recipes type might be wrapped in @graph type
if isinstance(el, Tag): if isinstance(el, Tag):