improved ingredient parser

This commit is contained in:
vabene1111
2021-06-18 17:36:31 +02:00
parent 7aa71dc744
commit 256c1a7d41
2 changed files with 27 additions and 15 deletions

View File

@ -1,3 +1,4 @@
import re
import string import string
import unicodedata import unicodedata
@ -25,17 +26,12 @@ def parse_amount(x):
did_check_frac = False did_check_frac = False
end = 0 end = 0
while ( while (end < len(x) and (x[end] in string.digits
end < len(x)
and (
x[end] in string.digits
or ( or (
(x[end] == '.' or x[end] == ',' or x[end] == '/') (x[end] == '.' or x[end] == ',' or x[end] == '/')
and end + 1 < len(x) and end + 1 < len(x)
and x[end + 1] in string.digits and x[end + 1] in string.digits
) ))):
)
):
end += 1 end += 1
if end > 0: if end > 0:
if "/" in x[:end]: if "/" in x[:end]:
@ -55,6 +51,9 @@ def parse_amount(x):
unit = x[end + 1:] unit = x[end + 1:]
except ValueError: except ValueError:
unit = x[end:] unit = x[end:]
if unit.startswith('('): # i dont know any unit that starts with ( so its likely an alternative like 1L (500ml) Water
unit = ''
return amount, unit return amount, unit
@ -107,6 +106,12 @@ def parse(x):
ingredient = '' ingredient = ''
note = '' note = ''
# if the string contains parentheses early on, remove them and place them at the end
# because it's likely some kind of note
if re.match('(.){1,6}\s\((.[^\(\)])+\)\s', x):
match = re.search('\((.[^\(])+\)', x)
x = x[:match.start()] + x[match.end():] + ' ' + x[match.start():match.end()]
tokens = x.split() tokens = x.split()
if len(tokens) == 1: if len(tokens) == 1:
# there only is one argument, that must be the ingredient # there only is one argument, that must be the ingredient
@ -115,16 +120,17 @@ def parse(x):
try: try:
# try to parse first argument as amount # try to parse first argument as amount
amount, unit = parse_amount(tokens[0]) amount, unit = parse_amount(tokens[0])
print('test', unit)
# only try to parse second argument as amount if there are at least # only try to parse second argument as amount if there are at least
# three arguments if it already has a unit there can't be # three arguments if it already has a unit there can't be
# a fraction for the amount # a fraction for the amount
if len(tokens) > 2: if len(tokens) > 2:
try: try:
if not unit == '': if not unit == '':
# a unit is already found, no need to try the second argument for a fraction # noqa: E501 # a unit is already found, no need to try the second argument for a fraction
# probably not the best method to do it, but I didn't want to make an if check and paste the exact same thing in the else as already is in the except # noqa: E501 # probably not the best method to do it, but I didn't want to make an if check and paste the exact same thing in the else as already is in the except # noqa: E501
raise ValueError raise ValueError
# try to parse second argument as amount and add that, in case of '2 1/2' or '2 ½' # noqa: E501 # try to parse second argument as amount and add that, in case of '2 1/2' or '2 ½'
amount += parse_fraction(tokens[1]) amount += parse_fraction(tokens[1])
# assume that units can't end with a comma # assume that units can't end with a comma
if len(tokens) > 3 and not tokens[2].endswith(','): if len(tokens) > 3 and not tokens[2].endswith(','):
@ -142,7 +148,10 @@ def parse(x):
# try to use second argument as unit and everything else as ingredient, use everything as ingredient if it fails # noqa: E501 # try to use second argument as unit and everything else as ingredient, use everything as ingredient if it fails # noqa: E501
try: try:
ingredient, note = parse_ingredient(tokens[2:]) ingredient, note = parse_ingredient(tokens[2:])
if unit == '':
unit = tokens[1] unit = tokens[1]
else:
note = tokens[1]
except ValueError: except ValueError:
ingredient, note = parse_ingredient(tokens[1:]) ingredient, note = parse_ingredient(tokens[1:])
else: else:

View File

@ -53,7 +53,10 @@ def test_ingredient_parser():
"50 g smör eller margarin": (50, "g", "smör eller margarin", ""), "50 g smör eller margarin": (50, "g", "smör eller margarin", ""),
"3,5 l Wasser": (3.5, "l", "Wasser", ""), "3,5 l Wasser": (3.5, "l", "Wasser", ""),
"3.5 l Wasser": (3.5, "l", "Wasser", ""), "3.5 l Wasser": (3.5, "l", "Wasser", ""),
"400 g Karotte(n)": (400, "g", "Karotte(n)", "") "400 g Karotte(n)": (400, "g", "Karotte(n)", ""),
"400g unsalted butter": (400, "g", "butter", "unsalted"),
"2L Wasser": (2, "L", "Wasser", ""),
"1 (16 ounce) package dry lentils, rinsed": (1, "package", "dry lentils, rinsed", "16 ounce"),
} }
# for German you could say that if an ingredient does not have # for German you could say that if an ingredient does not have
# an amount # and it starts with a lowercase letter, then that # an amount # and it starts with a lowercase letter, then that