improved ingredient parser

2021-06-18 17:36:31 +02:00
parent 7aa71dc744
commit 256c1a7d41
2 changed files with 27 additions and 15 deletions
--- a/cookbook/helper/ingredient_parser.py
+++ b/cookbook/helper/ingredient_parser.py
@ -1,3 +1,4 @@
 import re
 import string
 import unicodedata
@ -25,17 +26,12 @@ def parse_amount(x):
    did_check_frac = False
    end = 0
-    while (
+    while (end < len(x) and (x[end] in string.digits
            end < len(x)
            and (
                    x[end] in string.digits
                             or (
                                     (x[end] == '.' or x[end] == ',' or x[end] == '/')
                                     and end + 1 < len(x)
                                     and x[end + 1] in string.digits
-                    )
+                             ))):
            )
    ):
        end += 1
    if end > 0:
        if "/" in x[:end]:
@ -55,6 +51,9 @@ def parse_amount(x):
                unit = x[end + 1:]
            except ValueError:
                unit = x[end:]
    if unit.startswith('('):  # i dont know any unit that starts with ( so its likely an alternative like 1L (500ml) Water
        unit = ''
    return amount, unit
@ -107,6 +106,12 @@ def parse(x):
    ingredient = ''
    note = ''
    # if the string contains parenthesis early on remove it and place it at the end
    # because its likely some kind of note
    if re.match('(.){1,6}\s\((.[^\(\)])+\)\s', x):
        match = re.search('\((.[^\(])+\)', x)
        x = x[:match.start()] + x[match.end():] + ' ' + x[match.start():match.end()]
    tokens = x.split()
    if len(tokens) == 1:
        # there only is one argument, that must be the ingredient
@ -115,16 +120,17 @@ def parse(x):
        try:
            # try to parse first argument as amount
            amount, unit = parse_amount(tokens[0])
            print('test', unit)
            # only try to parse second argument as amount if there are at least
            # three arguments if it already has a unit there can't be
            # a fraction for the amount
            if len(tokens) > 2:
                try:
                    if not unit == '':
-                        # a unit is already found, no need to try the second argument for a fraction  # noqa: E501
+                        # a unit is already found, no need to try the second argument for a fraction
                        # probably not the best method to do it, but I didn't want to make an if check and paste the exact same thing in the else as already is in the except  # noqa: E501
                        raise ValueError
-                    # try to parse second argument as amount and add that, in case of '2 1/2' or '2 ½'  # noqa: E501
+                    # try to parse second argument as amount and add that, in case of '2 1/2' or '2 ½'
                    amount += parse_fraction(tokens[1])
                    # assume that units can't end with a comma
                    if len(tokens) > 3 and not tokens[2].endswith(','):
@ -142,7 +148,10 @@ def parse(x):
                        # try to use second argument as unit and everything else as ingredient, use everything as ingredient if it fails  # noqa: E501
                        try:
                            ingredient, note = parse_ingredient(tokens[2:])
                            if unit == '':
                                unit = tokens[1]
                            else:
                                note = tokens[1]
                        except ValueError:
                            ingredient, note = parse_ingredient(tokens[1:])
                    else:
--- a/cookbook/tests/other/test_ingredient_parser.py
+++ b/cookbook/tests/other/test_ingredient_parser.py
@ -53,7 +53,10 @@ def test_ingredient_parser():
        "50 g smör eller margarin": (50, "g", "smör eller margarin", ""),
        "3,5 l Wasser": (3.5, "l", "Wasser", ""),
        "3.5 l Wasser": (3.5, "l", "Wasser", ""),
-        "400 g Karotte(n)": (400, "g", "Karotte(n)", "")
+        "400 g Karotte(n)": (400, "g", "Karotte(n)", ""),
        "400g unsalted butter": (400, "g", "butter", "unsalted"),
        "2L Wasser": (2, "L", "Wasser", ""),
        "1 (16 ounce) package dry lentils, rinsed": (1, "package", "dry lentils, rinsed", "16 ounce"),
    }
    # for German you could say that if an ingredient does not have
    # an amount # and it starts with a lowercase letter, then that