TandoorRecipes/cookbook/helper/scrapers/scrapers.py
2021-04-16 09:02:03 -05:00

44 lines
1.3 KiB
Python

from bs4 import BeautifulSoup
from json import JSONDecodeError
from recipe_scrapers import SCRAPERS, get_domain, _exception_handling
from recipe_scrapers._factory import SchemaScraperFactory
from recipe_scrapers._schemaorg import SchemaOrg
from .cooksillustrated import CooksIllustrated
# Project-specific scrapers keyed by the host string that
# CooksIllustrated.host() returns for each `site` value; the same scraper
# class is registered under all three hosts.
CUSTOM_SCRAPERS = {
    CooksIllustrated.host(site=site_name): CooksIllustrated
    for site_name in (
        "cooksillustrated",
        "americastestkitchen",
        "cookscountry",
    )
}

# Merge the custom entries into the SCRAPERS mapping imported from
# recipe_scrapers so that lookups by domain can find them.
SCRAPERS.update(CUSTOM_SCRAPERS)
def text_scraper(text, url=None):
    """Build a scraper instance over page text that is already in hand.

    The scraper class is looked up in ``SCRAPERS`` by the domain of *url*.
    When *url* is falsy, or its domain is not registered,
    ``SchemaScraperFactory.SchemaScraper`` is used instead.

    Args:
        text: Page markup; passed to ``BeautifulSoup`` and ``SchemaOrg``.
        url: Optional source URL, used to select the scraper class and
            stored on the returned instance.

    Returns:
        An instance of a locally defined ``TextScraper`` subclass of the
        selected scraper class.
    """
    domain = get_domain(url) if url else None
    scraper_class = (
        SCRAPERS[domain]
        if domain in SCRAPERS
        else SchemaScraperFactory.SchemaScraper
    )

    class TextScraper(scraper_class):
        # The parent initializer is intentionally not invoked here; every
        # attribute is assigned directly from the supplied page text.
        # NOTE(review): presumably this avoids the parent fetching the page
        # itself -- confirm against the recipe_scrapers version in use.
        def __init__(self, page_data, url=None):
            self.wild_mode = False
            self.exception_handling = None
            self.meta_http_equiv = False
            self.soup = BeautifulSoup(page_data, "html.parser")
            self.url = url
            self.recipe = None
            try:
                self.schema = SchemaOrg(page_data)
            except JSONDecodeError:
                # Best effort: when the embedded JSON cannot be decoded the
                # `schema` attribute is simply left unset on the instance.
                pass

    return TextScraper(text, url)