custom scraper fails safe

This commit is contained in:
smilerz 2021-04-15 13:17:15 -05:00
parent d0cedaf7a1
commit d814d13d54
3 changed files with 22 additions and 37 deletions

View File

@ -19,6 +19,7 @@ class StyleTreeprocessor(Treeprocessor):
class MarkdownFormatExtension(markdown.Extension): class MarkdownFormatExtension(markdown.Extension):
# md_ globals deprecated - see here:
def extendMarkdown(self, md): def extendMarkdown(self, md):
md.treeprocessors.register( md.treeprocessors.register(
StyleTreeprocessor(), StyleTreeprocessor(),

View File

@ -44,11 +44,12 @@ class CooksIllustrated(AbstractScraper):
def instructions(self): def instructions(self):
if not self.recipe: if not self.recipe:
self.get_recipe() self.get_recipe()
return "\n".join([self.recipe['whyThisWorks']] + return "\n".join(
[ [self.recipe['whyThisWorks']]
instruction['fields']['content'] + [
for instruction in self.recipe['instructions'] instruction['fields']['content']
] for instruction in self.recipe['instructions']
]
) )
def nutrients(self): def nutrients(self):

View File

@ -228,8 +228,8 @@ class MealPlanViewSet(viewsets.ModelViewSet):
def get_queryset(self): def get_queryset(self):
queryset = self.queryset.filter( queryset = self.queryset.filter(
Q(created_by=self.request.user) | Q(created_by=self.request.user)
Q(shared=self.request.user) | Q(shared=self.request.user)
).filter(space=self.request.space).distinct().all() ).filter(space=self.request.space).distinct().all()
from_date = self.request.query_params.get('from_date', None) from_date = self.request.query_params.get('from_date', None)
@ -554,6 +554,10 @@ def recipe_from_source(request):
mode = request.POST.get('mode', None) mode = request.POST.get('mode', None)
auto = request.POST.get('auto', 'true') auto = request.POST.get('auto', 'true')
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7"
}
if (not url and not data) or (mode == 'url' and not url) or (mode == 'source' and not data): if (not url and not data) or (mode == 'url' and not url) or (mode == 'source' and not data):
return JsonResponse( return JsonResponse(
{ {
@ -563,7 +567,7 @@ def recipe_from_source(request):
status=400 status=400
) )
if mode == 'url': if mode == 'url' and auto == 'true':
if auto == 'true': if auto == 'true':
try: try:
scrape = scrape_me(url) scrape = scrape_me(url)
@ -594,31 +598,10 @@ def recipe_from_source(request):
status=400) status=400)
else: else:
return JsonResponse({"recipe_json": get_from_scraper(scrape, request.space)}) return JsonResponse({"recipe_json": get_from_scraper(scrape, request.space)})
else:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36' # noqa: E501
}
try:
response = requests.get(url, headers=headers)
except requests.exceptions.ConnectionError:
return JsonResponse(
{
'error': True,
'msg': _('The requested page could not be found.')
},
status=400
)
if response.status_code == 403:
return JsonResponse(
{
'error': True,
'msg': _('The requested page refused to provide any information (Status Code 403).')
},
status=400
)
data = response.text
if (mode == 'source') or (mode == 'url' and auto == 'false'): if (mode == 'source') or (mode == 'url' and auto == 'false'):
if not data or data == 'undefined':
data = requests.get(url, headers=HEADERS).content
recipe_json, recipe_tree, recipe_html, images = get_recipe_from_source(data, url, request.space) recipe_json, recipe_tree, recipe_html, images = get_recipe_from_source(data, url, request.space)
if len(recipe_tree) == 0 and len(recipe_json) == 0: if len(recipe_tree) == 0 and len(recipe_json) == 0:
return JsonResponse( return JsonResponse(
@ -637,12 +620,12 @@ def recipe_from_source(request):
}) })
return JsonResponse( return JsonResponse(
{ {
'error': True, 'error': True,
'msg': _('I couldn\'t find anything to do.') 'msg': _('I couldn\'t find anything to do.')
}, },
status=400 status=400
) )
@group_required('admin') @group_required('admin')