added ability to create custom scrapers

parent 0a62225797
commit 3cf949bf8d
@@ -1,14 +1,12 @@
-#%%
 import json
 import re
 
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from cookbook.helper import recipe_url_import as helper
-from recipe_scrapers import SCRAPERS, get_domain, _exception_handling
+from cookbook.helper.scrapers.scrapers import text_scraper
 from recipe_scrapers._utils import get_host_name, normalize_string
-from recipe_scrapers._factory import SchemaScraperFactory
-from recipe_scrapers._schemaorg import SchemaOrg
+
 
 from bs4 import BeautifulSoup
 from json import JSONDecodeError
@@ -93,10 +91,12 @@ def get_recipe_from_source(text, url, space):
         parse_list.append(remove_graph(el))
 
     # if a url was not provided, try to find one in the first document
-    if not url:
+    if not url and len(parse_list) > 0:
         if 'url' in parse_list[0]:
             url = parse_list[0]['url']
-    recipe_json = helper.get_from_scraper(scrape, url, space)
+
+
+    recipe_json = helper.get_from_scraper(scrape, space)
 
     for el in parse_list:
         temp_tree = []
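
For orientation, a minimal sketch of the resulting call flow, assuming a page source that carries schema.org recipe data (the URL and HTML here are made up, and `space` stands for the workspace object the view passes through):

    from cookbook.helper import recipe_url_import as helper
    from cookbook.helper.scrapers.scrapers import text_scraper

    html = "<html>...</html>"  # hypothetical page source with ld+json recipe data
    scrape = text_scraper(html, url="https://example.com/recipe")
    recipe_json = helper.get_from_scraper(scrape, space)  # url now travels on scrape.url

The `url` argument is dropped from `get_from_scraper` because the scraper object built by `text_scraper` now carries its own `url`.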
@@ -173,36 +173,6 @@ def get_images_from_source(soup, url):
     return images
 
 
-def text_scraper(text, url=None):
-    domain = get_domain(url)
-    if domain in SCRAPERS:
-        scraper_class = SCRAPERS[domain]
-    else:
-        scraper_class = SchemaScraperFactory
-
-    class TextScraper(scraper_class):
-        def __init__(
-            self,
-            page_data,
-            url=None
-        ):
-            self.wild_mode = False
-            self.exception_handling = _exception_handling
-            self.meta_http_equiv = False
-            self.soup = BeautifulSoup(page_data, "html.parser")
-            self.url = url
-            try:
-                self.schema = SchemaOrg(page_data)
-            except JSONDecodeError:
-                pass
-
-        @classmethod
-        def generate(cls, page_data, url, **options):
-            return cls(page_data, url, **options)
-
-    return TextScraper.generate(text, url)
-
-
 def remove_graph(el):
     # recipes type might be wrapped in @graph type
     if isinstance(el, Tag):
@@ -174,7 +174,8 @@ def get_from_scraper(scrape, space):
     except AttributeError:
         recipe_json['recipeInstructions'] = ""
 
-    recipe_json['recipeInstructions'] += "\n\nImported from " + scrape.url
+    if scrape.url:
+        recipe_json['recipeInstructions'] += "\n\nImported from " + scrape.url
 
     return recipe_json
 
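
The new guard matters because `text_scraper` accepts `url=None`, so `scrape.url` can legitimately be `None` when a recipe is imported from pasted text rather than a fetched page; the old unconditional append would have raised a `TypeError` on `"..." + None`. A small sketch, assuming a scraper built without a url:

    scrape = text_scraper("<html>...</html>")  # no url: e.g. pasted page source
    assert scrape.url is None                  # so the "Imported from" note is skipped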

cookbook/helper/scrapers/scrapers.py (new file, 41 lines)
@@ -0,0 +1,41 @@
+from bs4 import BeautifulSoup
+from json import JSONDecodeError
+
+from recipe_scrapers import SCRAPERS, get_domain, _exception_handling
+from recipe_scrapers._factory import SchemaScraperFactory
+from recipe_scrapers._schemaorg import SchemaOrg
+
+from .cooksillustrated import CooksIllustrated
+
+CUSTOM_SCRAPERS = {
+    CooksIllustrated.host(): CooksIllustrated,
+}
+
+# dict.update() mutates in place and returns None, so do not reassign SCRAPERS
+SCRAPERS.update(CUSTOM_SCRAPERS)
+
+
+def text_scraper(text, url=None):
+    domain = None
+    if url:
+        domain = get_domain(url)
+    if domain in SCRAPERS:
+        scraper_class = SCRAPERS[domain]
+    else:
+        scraper_class = SchemaScraperFactory.SchemaScraper
+
+    # build a scraper on the fly that parses the given text instead of fetching the url
+    class TextScraper(scraper_class):
+        def __init__(self, page_data, url=None):
+            self.wild_mode = False
+            self.exception_handling = _exception_handling
+            self.meta_http_equiv = False
+            self.soup = BeautifulSoup(page_data, "html.parser")
+            self.url = url
+            try:
+                self.schema = SchemaOrg(page_data)
+            except JSONDecodeError:
+                pass
+
+    return TextScraper(text, url)
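
The `CUSTOM_SCRAPERS` dict is the extension point the commit title refers to: any class exposing the recipe_scrapers interface can be registered under its `host()` and will be picked up by `text_scraper`. The `cooksillustrated` module itself is not part of this diff, so the following is only a hypothetical sketch of what such a custom scraper might look like, assuming the usual recipe_scrapers conventions (a `host()` classmethod as the registry key, plus accessors that read from `self.soup`):

    # hypothetical custom scraper, e.g. cookbook/helper/scrapers/mysite.py
    from bs4 import BeautifulSoup


    class MySite:
        @classmethod
        def host(cls):
            # key under which text_scraper looks this class up in SCRAPERS
            return 'www.mysite.example'

        def __init__(self, page_data, url=None):
            self.soup = BeautifulSoup(page_data, 'html.parser')
            self.url = url

        def title(self):
            el = self.soup.find('h1')
            return el.get_text(strip=True) if el else ''

        def ingredients(self):
            return [li.get_text(strip=True) for li in self.soup.select('.ingredient')]

        def instructions(self):
            return '\n'.join(p.get_text(strip=True) for p in self.soup.select('.step'))

Registration then follows the same pattern as `CooksIllustrated`: add `MySite.host(): MySite` to `CUSTOM_SCRAPERS`. Because `text_scraper` subclasses the registered class and supplies `self.soup` and `self.url` itself, accessors like the ones above work the same whether the page was fetched or pasted in as text.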