TandoorRecipes/cookbook/helper/scrapers/scrapers.py
smilerz faf458e8ef Squashed commit of the following:
commit 707d862e01a7497a1f22879d314b865a35e0e85b
Author: smilerz <smilerz@gmail.com>
Date:   Wed Apr 14 10:35:00 2021 -0500

    works now

commit 3942a445ed4f2ccec57de25eacd86ea4e4dd6bdb
Author: smilerz <smilerz@gmail.com>
Date:   Wed Apr 14 10:25:24 2021 -0500

    updated serializer and api

commit 10dc746eb175c7f805a8a8ffa7ce49977a7ce97e
Author: smilerz <smilerz@gmail.com>
Date:   Wed Apr 14 10:20:19 2021 -0500

    fixed bookmarklet

commit 9779104902d3be0258c95cd2eeebcba0d5d48892
Merge: bb8262c 0cb3928
Author: smilerz <smilerz@gmail.com>
Date:   Wed Apr 14 09:56:27 2021 -0500

    Merge branch 'bookmarklet' into json_import

commit 0cb39284bb835ffc6cfee3e4306aadc4a64a25be
Author: smilerz <smilerz@gmail.com>
Date:   Wed Apr 14 09:42:53 2021 -0500

    retrieve bookmarklet ID from get

commit e89e0218de684d40b2e2bfb6ba833891206c828e
Author: smilerz <smilerz@gmail.com>
Date:   Wed Apr 14 09:29:33 2021 -0500

    Revert "fixed broken tab"

    This reverts commit ca0a1aede3cc6cb3912bc1fe30c0aa22e3f481a6.

commit bb8262ccabb93c56fbc18c407d5a0653b8b3ca79
Merge: b1e73aa 35a7f62
Author: smilerz <smilerz@gmail.com>
Date:   Sun Apr 11 20:35:57 2021 -0500

    Merge branch 'main_fork' into json_import
2021-05-01 16:30:22 -05:00

44 lines
1.3 KiB
Python

from bs4 import BeautifulSoup
from json import JSONDecodeError
from recipe_scrapers import SCRAPERS, get_domain, _exception_handling
from recipe_scrapers._factory import SchemaScraperFactory
from recipe_scrapers._schemaorg import SchemaOrg
from .cooksillustrated import CooksIllustrated
CUSTOM_SCRAPERS = {
CooksIllustrated.host(site="cooksillustrated"): CooksIllustrated,
CooksIllustrated.host(site="americastestkitchen"): CooksIllustrated,
CooksIllustrated.host(site="cookscountry"): CooksIllustrated,
}
SCRAPERS.update(CUSTOM_SCRAPERS)
def text_scraper(text, url=None):
domain = None
if url:
domain = get_host_name(url)
if domain in SCRAPERS:
scraper_class = SCRAPERS[domain]
else:
scraper_class = SchemaScraperFactory.SchemaScraper
class TextScraper(scraper_class):
def __init__(
self,
page_data,
url=None
):
self.wild_mode = False
self.exception_handling = None
self.meta_http_equiv = False
self.soup = BeautifulSoup(page_data, "html.parser")
self.url = url
self.recipe = None
try:
self.schema = SchemaOrg(page_data)
except JSONDecodeError:
pass
return TextScraper(text, url)