44 lines
1.3 KiB
Python
44 lines
1.3 KiB
Python
from bs4 import BeautifulSoup
|
|
from json import JSONDecodeError
|
|
from recipe_scrapers import SCRAPERS, get_domain, _exception_handling
|
|
from recipe_scrapers._factory import SchemaScraperFactory
|
|
from recipe_scrapers._schemaorg import SchemaOrg
|
|
|
|
from .cooksillustrated import CooksIllustrated
|
|
|
|
# Site identifiers that are all handled by the CooksIllustrated scraper.
# Each one is turned into a registry key via CooksIllustrated.host(site=...).
CUSTOM_SCRAPERS = {
    CooksIllustrated.host(site=site_name): CooksIllustrated
    for site_name in (
        "cooksillustrated",
        "americastestkitchen",
        "cookscountry",
    )
}

# Register the custom scrapers in the imported SCRAPERS mapping so that
# lookups by domain in this module can find them.
SCRAPERS.update(CUSTOM_SCRAPERS)
|
|
|
|
|
|
def text_scraper(text, url=None):
    """Build a scraper instance from already-available page text.

    The scraper base class is looked up in ``SCRAPERS`` by the domain of
    *url*; when *url* is falsy or its domain is not registered, the generic
    ``SchemaScraperFactory.SchemaScraper`` is used as the base instead.

    Args:
        text: Page markup to parse (passed to BeautifulSoup and SchemaOrg).
        url: Optional source URL, used to select the scraper class and
            stored on the returned instance.

    Returns:
        An instance of a locally defined ``TextScraper`` subclass of the
        selected scraper class.
    """
    # A falsy url leaves the domain as None, which then falls through to
    # the generic schema scraper below (unless None is a registered key).
    domain = get_domain(url) if url else None

    base_cls = (
        SCRAPERS[domain]
        if domain in SCRAPERS
        else SchemaScraperFactory.SchemaScraper
    )

    class TextScraper(base_cls):
        """Scraper whose state is populated from supplied text."""

        def __init__(self, page_data, url=None):
            # NOTE(review): the parent initializer is not invoked here; the
            # attributes are assigned directly instead. Presumably this
            # avoids the parent fetching the page itself -- confirm against
            # the recipe_scrapers version in use.
            self.wild_mode = False
            self.exception_handling = None
            self.meta_http_equiv = False
            self.soup = BeautifulSoup(page_data, "html.parser")
            self.url = url
            self.recipe = None

            # Best effort: if the schema data cannot be decoded as JSON,
            # ``self.schema`` is simply left unset.
            try:
                self.schema = SchemaOrg(page_data)
            except JSONDecodeError:
                pass

    return TextScraper(text, url)
|