fix: ingredient parsing for non-latin languages
Before this change the ingredient string for non-latin languages was not being parsed into the correct amount or units when the food is found at the start of the ingredient string. This was because the regex being used was restricted to latin characters. With this change the amount and units are correctly parsed from such a string. Fixes https://github.com/TandoorRecipes/recipes/issues/1983
This commit is contained in:
@ -221,8 +221,8 @@ class IngredientParser:
|
||||
|
||||
# some people/languages put amount and unit at the end of the ingredient string
|
||||
# if something like this is detected move it to the beginning so the parser can handle it
|
||||
if len(ingredient) < 1000 and re.search(r'^([A-z])+(.)*[1-9](\d)*\s([A-z])+', ingredient):
|
||||
match = re.search(r'[1-9](\d)*\s([A-z])+', ingredient)
|
||||
if len(ingredient) < 1000 and re.search(r'^([^\W\d_])+(.)*[1-9](\d)*\s*([^\W\d_])+', ingredient):
|
||||
match = re.search(r'[1-9](\d)*\s*([^\W\d_])+', ingredient)
|
||||
print(f'reording from {ingredient} to {ingredient[match.start():match.end()] + " " + ingredient.replace(ingredient[match.start():match.end()], "")}')
|
||||
ingredient = ingredient[match.start():match.end()] + ' ' + ingredient.replace(ingredient[match.start():match.end()], '')
|
||||
|
||||
|
@ -66,7 +66,9 @@ def test_ingredient_parser():
|
||||
1.0, 'Lorem', 'ipsum', 'dolor sit amet consetetur sadipscing elitr sed diam nonumy eirmod tempor invidunt ut l Lorem ipsum dolor sit amet consetetur sadipscing elitr sed diam nonumy eirmod tempor invidunt ut l'),
|
||||
"1 LoremipsumdolorsitametconsetetursadipscingelitrseddiamnonumyeirmodtemporinviduntutlLoremipsumdolorsitametconsetetursadipscingelitrseddiamnonumyeirmodtemporinviduntutl": (
|
||||
1.0, None, 'LoremipsumdolorsitametconsetetursadipscingelitrseddiamnonumyeirmodtemporinviduntutlLoremipsumdolorsitametconsetetursadipscingeli',
|
||||
'LoremipsumdolorsitametconsetetursadipscingelitrseddiamnonumyeirmodtemporinviduntutlLoremipsumdolorsitametconsetetursadipscingelitrseddiamnonumyeirmodtemporinviduntutl')
|
||||
'LoremipsumdolorsitametconsetetursadipscingelitrseddiamnonumyeirmodtemporinviduntutlLoremipsumdolorsitametconsetetursadipscingelitrseddiamnonumyeirmodtemporinviduntutl'),
|
||||
"砂糖 50g": (50, "g", "砂糖", ""),
|
||||
"卵 4個": (4, "個", "卵", "")
|
||||
|
||||
}
|
||||
# for German you could say that if an ingredient does not have
|
||||
|
Reference in New Issue
Block a user