# File metadata from the source listing (192 lines, 6.2 KiB, Python)
import json
import re
from json import JSONDecodeError
from urllib.parse import unquote

from bs4 import BeautifulSoup
from bs4.element import Tag
from recipe_scrapers import scrape_html, scrape_me
from recipe_scrapers._exceptions import NoSchemaFoundInWildMode
from recipe_scrapers._utils import get_host_name, normalize_string

from cookbook.helper import recipe_url_import as helper
from cookbook.helper.scrapers.scrapers import text_scraper
|
def get_recipe_from_source(text, url, request):
    """Parse a recipe from raw page text and/or a source URL.

    Tries, in order: a wild-mode scrape of ``url`` (only when no text was
    supplied), interpreting ``text`` as a raw ld+json document, and finally
    mining ld+json / JSON ``<script>`` blocks out of the HTML.

    :param text: raw page source or JSON pasted by the user (may be empty).
    :param url: source URL if known; used for image resolution and scraping.
    :param request: current request, forwarded to the scraper helper.
    :return: tuple ``(recipe_json, recipe_tree, html_data, images)``.
    """

    def build_node(k, v):
        # Render one key/value pair as a display node; containers recurse.
        if isinstance(v, dict):
            node = {
                'name': k,
                'value': k,
                'children': get_children_dict(v)
            }
        elif isinstance(v, list):
            node = {
                'name': k,
                'value': k,
                'children': get_children_list(v)
            }
        else:
            node = {
                'name': k + ": " + normalize_string(str(v)),
                'value': normalize_string(str(v))
            }
        return node

    def get_children_dict(children):
        # One node per key of a mapping.
        return [build_node(k, v) for k, v in children.items()]

    def get_children_list(children):
        # Flatten list items: nested lists recurse, dicts expand per key.
        kid_list = []
        for kid in children:
            if isinstance(kid, list):
                kid_list.append({
                    'name': "unknown list",
                    'value': "unknown list",
                    'children': get_children_list(kid)
                })
            elif isinstance(kid, dict):
                for k, v in kid.items():
                    kid_list.append(build_node(k, v))
            else:
                kid_list.append({
                    'name': normalize_string(str(kid)),
                    'value': normalize_string(str(kid))
                })
        return kid_list

    recipe_tree = []
    parse_list = []
    soup = BeautifulSoup(text, "html.parser")
    html_data = get_from_html(soup)
    images = get_images_from_source(soup, url)
    text = unquote(text)
    scrape = None

    if url and not text:
        try:
            scrape = scrape_me(url_path=url, wild_mode=True)
        except NoSchemaFoundInWildMode:
            # Best-effort: fall through to the text-based strategies below.
            pass

    if not scrape:
        try:
            # The pasted text may itself be a single ld+json document.
            parse_list.append(remove_graph(json.loads(text)))
            if not url and 'url' in parse_list[0]:
                url = parse_list[0]['url']
            scrape = text_scraper("<script type='application/ld+json'>" + text + "</script>", url=url)

        except JSONDecodeError:
            # Not plain JSON: mine the HTML for embedded JSON blocks instead.
            for el in soup.find_all('script', type='application/ld+json'):
                el = remove_graph(el)
                if not url and 'url' in el:
                    url = el['url']
                if isinstance(el, list):
                    parse_list.extend(el)
                elif isinstance(el, dict):
                    parse_list.append(el)
            for el in soup.find_all(type='application/json'):
                el = remove_graph(el)
                if isinstance(el, list):
                    parse_list.extend(el)
                elif isinstance(el, dict):
                    parse_list.append(el)
            scrape = text_scraper(text, url=url)

    recipe_json = helper.get_from_scraper(scrape, request)

    # TODO: DEPRECATE recipe_tree & html_data. first validate it isn't used anywhere
    for el in parse_list:
        temp_tree = []
        if isinstance(el, Tag):
            try:
                el = json.loads(el.string)
            except TypeError:
                continue

        # build_node implements exactly the dict/list/scalar branches that
        # were previously duplicated inline here; reuse it instead.
        for k, v in el.items():
            temp_tree.append(build_node(k, v))

        if '@type' in el and el['@type'] == 'Recipe':
            recipe_tree += [{'name': 'ld+json', 'children': temp_tree}]
        else:
            recipe_tree += [{'name': 'json', 'children': temp_tree}]

    return recipe_json, recipe_tree, html_data, images
|
def get_from_html(soup):
    """Return the document's visible, non-empty text fragments.

    Strings whose parent tag is non-rendering markup (style/script/head/
    title) are skipped, as are whitespace-only strings.
    """
    INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')
    return [
        fragment
        for fragment in soup.strings
        if fragment.parent.name not in INVISIBLE_ELEMS and len(fragment.strip()) > 0
    ]
|
def get_images_from_source(soup, url):
    """Collect absolute image URLs (jpg/jpeg/gif/png) from an HTML document.

    :param soup: parsed ``BeautifulSoup`` document.
    :param url: page URL, used to absolutize relative image sources; may be
        falsy, in which case relative sources are dropped.
    :return: list of absolute image URL strings.
    """
    sources = ('src', 'srcset', 'data-src')
    images = []
    # Pre-initialize so these names exist even when no url was given.
    site = prot = None
    if url:
        site = get_host_name(url)
        prot = url.split(':')[0]

    urls = []
    for img in soup.find_all('img'):
        for attr in sources:
            value = img.get(attr)
            if value is not None:
                urls.append(value)

    for u in urls:
        u = u.split('?')[0]  # drop query string before matching the filename
        filename = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png))$', u)
        if filename:
            if 'http' not in u and url:
                # sometimes an image source can be relative
                # if it is provide the base url
                u = '{}://{}{}'.format(prot, site, u)
            if 'http' in u:
                images.append(u)
    return images
|
def remove_graph(el):
    """Unwrap a recipe nested inside an ld+json ``@graph`` container.

    Accepts either a parsed JSON value or a ``<script>`` Tag; non-Tag input
    is returned unchanged. Parse failures are swallowed and the Tag is
    returned as-is (best effort).
    """
    if not isinstance(el, Tag):
        return el
    try:
        el = json.loads(el.string)
        if '@graph' in el:
            # Recipes are sometimes wrapped in a @graph list; pick the
            # Recipe entry (the last one, if several are present).
            for entry in el['@graph']:
                if '@type' in entry and entry['@type'] == 'Recipe':
                    el = entry
    except (TypeError, JSONDecodeError):
        pass
    return el