custom scraper fails safe

smilerz 2021-04-15 13:17:15 -05:00
parent d0cedaf7a1
commit d814d13d54
3 changed files with 22 additions and 37 deletions


@@ -19,6 +19,7 @@ class StyleTreeprocessor(Treeprocessor):
 class MarkdownFormatExtension(markdown.Extension):
     # md_ globals deprecated - see here:
     def extendMarkdown(self, md):
         md.treeprocessors.register(
             StyleTreeprocessor(),

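The comment in this hunk points at Python-Markdown 3.x dropping the old extendMarkdown(self, md, md_globals) signature; extensions now override extendMarkdown(self, md) and register processors through md.treeprocessors.register(item, name, priority). A minimal, self-contained sketch of that registration pattern follows; the 'style_treeprocessor' name and the priority of 5 are illustrative choices, not values taken from this file.

import markdown
from markdown.extensions import Extension
from markdown.treeprocessors import Treeprocessor


class StyleTreeprocessor(Treeprocessor):
    def run(self, root):
        # Walk the parsed ElementTree; a real processor would add style/class attributes here.
        return root


class MarkdownFormatExtension(Extension):
    def extendMarkdown(self, md):
        # Markdown 3.x API: no md_globals argument; registry entries need a name and a priority.
        md.treeprocessors.register(StyleTreeprocessor(), 'style_treeprocessor', 5)


md = markdown.Markdown(extensions=[MarkdownFormatExtension()])
print(md.convert('**hello**'))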

@@ -44,11 +44,12 @@ class CooksIllustrated(AbstractScraper):
     def instructions(self):
         if not self.recipe:
             self.get_recipe()
-        return "\n".join([self.recipe['whyThisWorks']] +
-            [
-                instruction['fields']['content']
-                for instruction in self.recipe['instructions']
-            ]
+        return "\n".join(
+            [self.recipe['whyThisWorks']]
+            + [
+                instruction['fields']['content']
+                for instruction in self.recipe['instructions']
+            ]
         )

     def nutrients(self):

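The instructions() change above is a pure reformat: the method still prepends the recipe's whyThisWorks text to the step contents and joins everything with newlines. A small standalone sketch of that behavior, using an invented payload shaped like the fields referenced in the hunk (whyThisWorks, instructions[n].fields.content):

recipe = {
    'whyThisWorks': 'Blooming the spices in butter deepens their flavor.',
    'instructions': [
        {'fields': {'content': 'Melt the butter and bloom the spices.'}},
        {'fields': {'content': 'Add the rice and toast for two minutes.'}},
    ],
}

# Same expression as the reformatted return: the why-this-works note first, then one line per step.
text = "\n".join(
    [recipe['whyThisWorks']]
    + [instruction['fields']['content'] for instruction in recipe['instructions']]
)
print(text)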

@@ -228,8 +228,8 @@ class MealPlanViewSet(viewsets.ModelViewSet):
     def get_queryset(self):
         queryset = self.queryset.filter(
-            Q(created_by=self.request.user) |
-            Q(shared=self.request.user)
+            Q(created_by=self.request.user)
+            | Q(shared=self.request.user)
         ).filter(space=self.request.space).distinct().all()

         from_date = self.request.query_params.get('from_date', None)
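Moving the | operator in get_queryset() is cosmetic; the queryset still returns meal plans either created by or shared with the requesting user, narrowed to the current space. A rough equivalent as a plain helper, assuming a MealPlan model with the created_by, shared, and space fields implied by the hunk (its import path is not shown here):

from django.db.models import Q

def visible_meal_plans(user, space):
    # Plans the user created OR plans shared with them, restricted to the active space.
    # distinct() guards against duplicates when the shared relation matches more than once.
    return MealPlan.objects.filter(
        Q(created_by=user) | Q(shared=user)
    ).filter(space=space).distinct()
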
@@ -554,6 +554,10 @@ def recipe_from_source(request):
     mode = request.POST.get('mode', None)
     auto = request.POST.get('auto', 'true')

+    HEADERS = {
+        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"
+    }
+
     if (not url and not data) or (mode == 'url' and not url) or (mode == 'source' and not data):
         return JsonResponse(
             {
@@ -563,7 +567,7 @@ def recipe_from_source(request):
             status=400
         )

-    if mode == 'url':
+    if mode == 'url' and auto == 'true':
         if auto == 'true':
             try:
                 scrape = scrape_me(url)
@@ -594,31 +598,10 @@ def recipe_from_source(request):
                     status=400)
             else:
                 return JsonResponse({"recipe_json": get_from_scraper(scrape, request.space)})
-        else:
-            headers = {
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'  # noqa: E501
-            }
-            try:
-                response = requests.get(url, headers=headers)
-            except requests.exceptions.ConnectionError:
-                return JsonResponse(
-                    {
-                        'error': True,
-                        'msg': _('The requested page could not be found.')
-                    },
-                    status=400
-                )
-            if response.status_code == 403:
-                return JsonResponse(
-                    {
-                        'error': True,
-                        'msg': _('The requested page refused to provide any information (Status Code 403).')
-                    },
-                    status=400
-                )
-            data = response.text
+    if (mode == 'source') or (mode == 'url' and auto == 'false'):
+        if not data or data == 'undefined':
+            data = requests.get(url, headers=HEADERS).content

     recipe_json, recipe_tree, recipe_html, images = get_recipe_from_source(data, url, request.space)
     if len(recipe_tree) == 0 and len(recipe_json) == 0:
         return JsonResponse(
@@ -637,12 +620,12 @@ def recipe_from_source(request):
         })

-        return JsonResponse(
-            {
-                'error': True,
-                'msg': _('I couldn\'t find anything to do.')
-            },
-            status=400
-        )
+    return JsonResponse(
+        {
+            'error': True,
+            'msg': _('I couldn\'t find anything to do.')
+        },
+        status=400
+    )


 @group_required('admin')
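Taken together, the api.py hunks replace the manual fallback branch (custom headers, explicit ConnectionError and 403 handling) with a single fail-safe rule: when the auto scraper path is not taken and the client posted no usable source (or the JavaScript placeholder string 'undefined'), the view fetches the page itself with the shared HEADERS and hands the result to get_recipe_from_source(). A minimal sketch of just that fallback, with a hypothetical helper name (resolve_source) standing in for the inline code in the view:

import requests

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"
}


def resolve_source(url, data):
    # Fail-safe: prefer the source the client already posted; otherwise fetch the page directly.
    # 'undefined' is what a missing JavaScript value tends to serialize to in the POST body.
    if not data or data == 'undefined':
        data = requests.get(url, headers=HEADERS).content
    return data

In the view itself the resolved source then feeds get_recipe_from_source(data, url, request.space), exactly as in the trailing context of the hunk above.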