improved ingredient parser

This commit is contained in:
vabene1111
2021-06-18 17:36:31 +02:00
parent 7aa71dc744
commit 256c1a7d41
2 changed files with 27 additions and 15 deletions

View File

@ -1,3 +1,4 @@
import re
import string
import unicodedata
@ -25,17 +26,12 @@ def parse_amount(x):
did_check_frac = False
end = 0
while (
end < len(x)
and (
x[end] in string.digits
while (end < len(x) and (x[end] in string.digits
or (
(x[end] == '.' or x[end] == ',' or x[end] == '/')
and end + 1 < len(x)
and x[end + 1] in string.digits
)
)
):
))):
end += 1
if end > 0:
if "/" in x[:end]:
@ -55,6 +51,9 @@ def parse_amount(x):
unit = x[end + 1:]
except ValueError:
unit = x[end:]
if unit.startswith('('): # i dont know any unit that starts with ( so its likely an alternative like 1L (500ml) Water
unit = ''
return amount, unit
@ -107,6 +106,12 @@ def parse(x):
ingredient = ''
note = ''
# if the string contains parenthesis early on remove it and place it at the end
# because its likely some kind of note
if re.match('(.){1,6}\s\((.[^\(\)])+\)\s', x):
match = re.search('\((.[^\(])+\)', x)
x = x[:match.start()] + x[match.end():] + ' ' + x[match.start():match.end()]
tokens = x.split()
if len(tokens) == 1:
# there only is one argument, that must be the ingredient
@ -115,16 +120,17 @@ def parse(x):
try:
# try to parse first argument as amount
amount, unit = parse_amount(tokens[0])
print('test', unit)
# only try to parse second argument as amount if there are at least
# three arguments if it already has a unit there can't be
# a fraction for the amount
if len(tokens) > 2:
try:
if not unit == '':
# a unit is already found, no need to try the second argument for a fraction # noqa: E501
# a unit is already found, no need to try the second argument for a fraction
# probably not the best method to do it, but I didn't want to make an if check and paste the exact same thing in the else as already is in the except # noqa: E501
raise ValueError
# try to parse second argument as amount and add that, in case of '2 1/2' or '2 ½' # noqa: E501
# try to parse second argument as amount and add that, in case of '2 1/2' or '2 ½'
amount += parse_fraction(tokens[1])
# assume that units can't end with a comma
if len(tokens) > 3 and not tokens[2].endswith(','):
@ -142,7 +148,10 @@ def parse(x):
# try to use second argument as unit and everything else as ingredient, use everything as ingredient if it fails # noqa: E501
try:
ingredient, note = parse_ingredient(tokens[2:])
if unit == '':
unit = tokens[1]
else:
note = tokens[1]
except ValueError:
ingredient, note = parse_ingredient(tokens[1:])
else:

View File

@ -53,7 +53,10 @@ def test_ingredient_parser():
"50 g smör eller margarin": (50, "g", "smör eller margarin", ""),
"3,5 l Wasser": (3.5, "l", "Wasser", ""),
"3.5 l Wasser": (3.5, "l", "Wasser", ""),
"400 g Karotte(n)": (400, "g", "Karotte(n)", "")
"400 g Karotte(n)": (400, "g", "Karotte(n)", ""),
"400g unsalted butter": (400, "g", "butter", "unsalted"),
"2L Wasser": (2, "L", "Wasser", ""),
"1 (16 ounce) package dry lentils, rinsed": (1, "package", "dry lentils, rinsed", "16 ounce"),
}
# for German you could say that if an ingredient does not have
# an amount # and it starts with a lowercase letter, then that