quick hack to allow scraper to work correctly
@@ -73,6 +73,7 @@ def get_recipe_from_source(text, url, request):
             scrape = scrape_me(url_path=url, wild_mode=True)
         except(NoSchemaFoundInWildMode):
             pass
+
     if not scrape:
         try:
             parse_list.append(remove_graph(json.loads(text)))
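
The context here shows the wild-mode fallback: recipe-scrapers is pointed at an arbitrary URL and the import silently degrades to raw-text parsing when no recipe schema is found. A minimal standalone sketch of that pattern, assuming the recipe-scrapers package as used in the diff (the exception's exact import path may vary between versions):

    from recipe_scrapers import scrape_me
    from recipe_scrapers._exceptions import NoSchemaFoundInWildMode

    def try_wild_scrape(url):
        # wild_mode asks recipe-scrapers to look for generic
        # schema.org/Recipe markup even on sites it has no dedicated
        # scraper class for
        try:
            return scrape_me(url_path=url, wild_mode=True)
        except NoSchemaFoundInWildMode:
            # no structured recipe data found; the caller falls back to
            # parsing the submitted text itself (the `if not scrape:` branch)
            return None
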
@@ -101,6 +102,7 @@ def get_recipe_from_source(text, url, request):
 
         recipe_json = helper.get_from_scraper(scrape, request)
 
+    # TODO: DEPRECATE recipe_tree & html_data. first validate it isn't used anywhere
     for el in parse_list:
         temp_tree = []
         if isinstance(el, Tag):
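
When scraping yields nothing, the helper instead treats the submitted text as JSON-LD; remove_graph is a Tandoor helper whose body is not part of this diff. A hedged sketch of what such an unwrapping step typically does, with an illustrative (not Tandoor's actual) implementation:

    import json

    def remove_graph(el):
        # illustrative only: JSON-LD frequently nests the Recipe object
        # inside an '@graph' list, so unwrap it before reading fields
        if isinstance(el, dict) and '@graph' in el:
            for node in el['@graph']:
                if isinstance(node, dict) and node.get('@type') == 'Recipe':
                    return node
        return el

    parse_list = []
    text = '{"@graph": [{"@type": "Recipe", "name": "Pancakes"}]}'
    parse_list.append(remove_graph(json.loads(text)))
    print(parse_list)  # [{'@type': 'Recipe', 'name': 'Pancakes'}]
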
@@ -1120,7 +1120,7 @@ def recipe_from_source(request):
     """
     serializer = RecipeFromSourceSerializer(data=request.data)
     if serializer.is_valid():
-        # headers to use for request to external sites
+        # headers to use for request to external sites - DEPRECATE
         external_request_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"}
 
         if (b_pk := serializer.validated_data.get('bookmarklet', None)) and (bookmarklet := BookmarkletImport.objects.filter(pk=b_pk).first()):
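
Besides the now-deprecated browser-style User-Agent, this hunk's context contains a double assignment expression: the bookmarklet pk is bound and the row is fetched inside one condition. The same walrus pattern reduced to plain runnable Python, with a dict standing in for the BookmarkletImport queryset:

    # stand-ins for serializer.validated_data and the Django queryset
    validated = {'bookmarklet': 3}
    records = {3: '<bookmarklet html>'}

    if (b_pk := validated.get('bookmarklet', None)) and (bookmarklet := records.get(b_pk)):
        # b_pk and bookmarklet are bound by the condition itself, so the
        # branch runs only when a pk was submitted AND a row exists
        print(b_pk, bookmarklet)
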
@@ -1144,9 +1144,11 @@ def recipe_from_source(request):
                 'recipe_html': '',
                 'recipe_images': [],
             }, status=status.HTTP_200_OK)
+        #######
+        # this section is redundant to scrape_me. REFACTOR to catch errors from scrape_me
         try:
             if validators.url(serializer.validated_data['url'], public=True):
-                serializer.validated_data['data'] = requests.get(serializer.validated_data['url'], headers=external_request_headers).content
+                requests.get(serializer.validated_data['url'], headers=external_request_headers).content
             else:
                 return Response({
                     'error': True,
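
The block between the new `#######` markers is the pre-existing manual fetch, now flagged as redundant: since scrape_me downloads the page itself, the commit stops assigning the response body to serializer.validated_data['data'] and leaves the bare requests.get(...) in place, apparently so URL and connection errors still surface in this try block. A sketch of the guard-then-fetch pattern, assuming the validators and requests packages (validators.url's public= flag exists in the releases this code targets):

    import requests
    import validators

    external_request_headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'}

    def fetch_public_url(url):
        # public=True rejects private/reserved addresses, a cheap guard
        # against pointing the server at internal hosts
        if not validators.url(url, public=True):
            return None
        return requests.get(url, headers=external_request_headers).content
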
@@ -1162,6 +1164,7 @@ def recipe_from_source(request):
                     'error': True,
                     'msg': _('Bad URL Schema.')
                 }, status=status.HTTP_400_BAD_REQUEST)
+        #######
 
         recipe_json, recipe_tree, recipe_html, recipe_images = get_recipe_from_source(serializer.validated_data['data'], serializer.validated_data['url'], request)
         if len(recipe_tree) == 0 and len(recipe_json) == 0:
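
The final context lines pin down the helper's contract: get_recipe_from_source returns a 4-tuple, and an import counts as failed only when both recipe_json and recipe_tree come back empty. A stub-based sketch of that check (the stub merely mimics the return shape, not the real parsing):

    # stub standing in for Tandoor's get_recipe_from_source
    def get_recipe_from_source(data, url, request=None):
        return {}, [], '', []

    recipe_json, recipe_tree, recipe_html, recipe_images = get_recipe_from_source('', 'https://example.com')
    if len(recipe_tree) == 0 and len(recipe_json) == 0:
        # nothing parseable was recovered; the view answers with an
        # error payload at this point
        print('import failed')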