improved ingredient parser
This commit is contained in:
@ -1,3 +1,4 @@
|
||||
import re
|
||||
import string
|
||||
import unicodedata
|
||||
|
||||
@ -25,17 +26,12 @@ def parse_amount(x):
|
||||
|
||||
did_check_frac = False
|
||||
end = 0
|
||||
while (
|
||||
end < len(x)
|
||||
and (
|
||||
x[end] in string.digits
|
||||
while (end < len(x) and (x[end] in string.digits
|
||||
or (
|
||||
(x[end] == '.' or x[end] == ',' or x[end] == '/')
|
||||
and end + 1 < len(x)
|
||||
and x[end + 1] in string.digits
|
||||
)
|
||||
)
|
||||
):
|
||||
))):
|
||||
end += 1
|
||||
if end > 0:
|
||||
if "/" in x[:end]:
|
||||
@ -55,6 +51,9 @@ def parse_amount(x):
|
||||
unit = x[end + 1:]
|
||||
except ValueError:
|
||||
unit = x[end:]
|
||||
|
||||
if unit.startswith('('): # i dont know any unit that starts with ( so its likely an alternative like 1L (500ml) Water
|
||||
unit = ''
|
||||
return amount, unit
|
||||
|
||||
|
||||
@ -107,6 +106,12 @@ def parse(x):
|
||||
ingredient = ''
|
||||
note = ''
|
||||
|
||||
# if the string contains parenthesis early on remove it and place it at the end
|
||||
# because its likely some kind of note
|
||||
if re.match('(.){1,6}\s\((.[^\(\)])+\)\s', x):
|
||||
match = re.search('\((.[^\(])+\)', x)
|
||||
x = x[:match.start()] + x[match.end():] + ' ' + x[match.start():match.end()]
|
||||
|
||||
tokens = x.split()
|
||||
if len(tokens) == 1:
|
||||
# there only is one argument, that must be the ingredient
|
||||
@ -115,16 +120,17 @@ def parse(x):
|
||||
try:
|
||||
# try to parse first argument as amount
|
||||
amount, unit = parse_amount(tokens[0])
|
||||
print('test', unit)
|
||||
# only try to parse second argument as amount if there are at least
|
||||
# three arguments if it already has a unit there can't be
|
||||
# a fraction for the amount
|
||||
if len(tokens) > 2:
|
||||
try:
|
||||
if not unit == '':
|
||||
# a unit is already found, no need to try the second argument for a fraction # noqa: E501
|
||||
# a unit is already found, no need to try the second argument for a fraction
|
||||
# probably not the best method to do it, but I didn't want to make an if check and paste the exact same thing in the else as already is in the except # noqa: E501
|
||||
raise ValueError
|
||||
# try to parse second argument as amount and add that, in case of '2 1/2' or '2 ½' # noqa: E501
|
||||
# try to parse second argument as amount and add that, in case of '2 1/2' or '2 ½'
|
||||
amount += parse_fraction(tokens[1])
|
||||
# assume that units can't end with a comma
|
||||
if len(tokens) > 3 and not tokens[2].endswith(','):
|
||||
@ -142,7 +148,10 @@ def parse(x):
|
||||
# try to use second argument as unit and everything else as ingredient, use everything as ingredient if it fails # noqa: E501
|
||||
try:
|
||||
ingredient, note = parse_ingredient(tokens[2:])
|
||||
if unit == '':
|
||||
unit = tokens[1]
|
||||
else:
|
||||
note = tokens[1]
|
||||
except ValueError:
|
||||
ingredient, note = parse_ingredient(tokens[1:])
|
||||
else:
|
||||
|
@ -53,7 +53,10 @@ def test_ingredient_parser():
|
||||
"50 g smör eller margarin": (50, "g", "smör eller margarin", ""),
|
||||
"3,5 l Wasser": (3.5, "l", "Wasser", ""),
|
||||
"3.5 l Wasser": (3.5, "l", "Wasser", ""),
|
||||
"400 g Karotte(n)": (400, "g", "Karotte(n)", "")
|
||||
"400 g Karotte(n)": (400, "g", "Karotte(n)", ""),
|
||||
"400g unsalted butter": (400, "g", "butter", "unsalted"),
|
||||
"2L Wasser": (2, "L", "Wasser", ""),
|
||||
"1 (16 ounce) package dry lentils, rinsed": (1, "package", "dry lentils, rinsed", "16 ounce"),
|
||||
}
|
||||
# for German you could say that if an ingredient does not have
|
||||
# an amount # and it starts with a lowercase letter, then that
|
||||
|
Reference in New Issue
Block a user