diff --git a/misaki/de.py b/misaki/de.py index 80ce437..2d50427 100644 --- a/misaki/de.py +++ b/misaki/de.py @@ -8,6 +8,7 @@ from typing import Tuple import re +from decimal import Decimal, InvalidOperation, ROUND_HALF_UP # ── cardinal numbers ───────────────────────────────────────────────────────── @@ -140,11 +141,11 @@ def _currency_repl(sym, num): word = _CURRENCY.get(sym, sym) cleaned = num.replace(".", "").replace(",", ".") try: - val = float(cleaned) - except ValueError: + val = Decimal(cleaned) + except InvalidOperation: return sym + num - euros = int(val) - cents = round((val - euros) * 100) + cents_total = int((val * 100).quantize(Decimal("1"), rounding=ROUND_HALF_UP)) + euros, cents = divmod(cents_total, 100) if cents == 0: return _int_to_de(euros) + " " + word return _int_to_de(euros) + " " + word + " und " + _int_to_de(cents) + " Cent" @@ -225,9 +226,11 @@ def normalize_text_de(text): # 5. Times (HH:MM) def _time_repl(m): h, mi = int(m.group(1)), int(m.group(2)) + if h > 23 or mi > 59: + return m.group(0) return _int_to_de(h) + " Uhr" + (" " + _int_to_de(mi) if mi else "") - text = re.sub(r"\b(\d{1,2}):(\d{2})(?:\s*Uhr)?", _time_repl, text) + text = re.sub(r"\b(\d{1,2}):(\d{2})(?:\s*Uhr\b)?", _time_repl, text) # 6. Full dates (DD.MM.YYYY) def _date_repl(m): @@ -277,8 +280,18 @@ def _decimal_repl(m): text = re.sub(r"\b(\d+),(\d+)\b", _decimal_repl, text) - # Plain integers - text = re.sub(r"\b(\d+)\b", lambda m: _int_to_de(int(m.group(1))), text) + # Plain integers. Keep any invalid HH:MM text that survived the time pass unchanged. + remaining_time_re = re.compile(r"\b\d{1,2}:\d{2}(?:\s*Uhr\b)?") + + def _plain_int_repl(m): + start = max(0, m.start() - 3) + end = min(len(text), m.end() + len(":00 Uhr")) + for time_match in remaining_time_re.finditer(text, start, end): + if time_match.start() <= m.start() and m.end() <= time_match.end(): + return m.group(0) + return _int_to_de(int(m.group(1))) + + text = re.sub(r"\b(\d+)\b", _plain_int_repl, text) # 10. Whitespace cleanup text = re.sub(r"[ \t]{2,}", " ", text) diff --git a/tests/test_de.py b/tests/test_de.py index 2cf5da2..1f7b47e 100644 --- a/tests/test_de.py +++ b/tests/test_de.py @@ -190,6 +190,9 @@ def test_decimal_comma(self): assert "Komma" in r assert "36,9" not in r + def test_colon_separated_non_time_numbers(self): + assert "eins:zwei" in normalize_text_de("Stand 1:2.") + class TestCurrency: def test_euro_before(self): @@ -207,6 +210,11 @@ def test_euro_with_cents(self): assert "Euro" in r assert "Cent" in r + def test_euro_fraction_rounds_to_next_unit(self): + r = normalize_text_de("€9,999 bitte") + assert "zehn Euro" in r + assert "Cent" not in r + def test_dollar(self): r = normalize_text_de("$100 Rabatt") assert "Dollar" in r @@ -234,6 +242,22 @@ def test_no_double_uhr(self): r = normalize_text_de("Um 14:30 Uhr") assert r.count("Uhr") == 1 + def test_uhr_word_boundary(self): + r = normalize_text_de("Um 14:30 Uhrzeit beginnt es.") + assert "vierzehn Uhr dreißigzeit" not in r + assert "Uhrzeit" in r + + def test_invalid_hour_is_unchanged(self): + assert "25:00 Uhr" in normalize_text_de("Um 25:00 Uhr.") + + def test_invalid_minute_is_unchanged(self): + assert "23:99 Uhr" in normalize_text_de("Um 23:99 Uhr.") + + def test_invalid_time_does_not_replace_literal_placeholder_text(self): + r = normalize_text_de("Token __MISAKI_DE_INVALID_TIME_0__ um 25:00 Uhr.") + assert "__MISAKI_DE_INVALID_TIME_0__" in r + assert "25:00 Uhr" in r + class TestDates: def test_christmas(self):