Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 20 additions & 7 deletions misaki/de.py
Comment thread
apples-kksk marked this conversation as resolved.
Comment thread
apples-kksk marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from typing import Tuple
import re
from decimal import Decimal, InvalidOperation, ROUND_HALF_UP

# ── cardinal numbers ─────────────────────────────────────────────────────────

Expand Down Expand Up @@ -140,11 +141,11 @@ def _currency_repl(sym, num):
word = _CURRENCY.get(sym, sym)
cleaned = num.replace(".", "").replace(",", ".")
try:
val = float(cleaned)
except ValueError:
val = Decimal(cleaned)
except InvalidOperation:
return sym + num
euros = int(val)
cents = round((val - euros) * 100)
cents_total = int((val * 100).quantize(Decimal("1"), rounding=ROUND_HALF_UP))
euros, cents = divmod(cents_total, 100)
if cents == 0:
return _int_to_de(euros) + " " + word
return _int_to_de(euros) + " " + word + " und " + _int_to_de(cents) + " Cent"
Expand Down Expand Up @@ -225,9 +226,11 @@ def normalize_text_de(text):
# 5. Times (HH:MM)
def _time_repl(m):
    """Spell out an HH:MM regex match as German clock time.

    Group 1 is read as the hour and group 2 as the minute. A match whose
    hour exceeds 23 or whose minute exceeds 59 is returned verbatim.
    """
    hour = int(m.group(1))
    minute = int(m.group(2))
    if not (hour <= 23 and minute <= 59):
        # Not a valid clock time: hand back the matched text untouched.
        return m.group(0)
    spoken = _int_to_de(hour) + " Uhr"
    if minute:
        # Minutes are appended only when they are non-zero.
        spoken += " " + _int_to_de(minute)
    return spoken

text = re.sub(r"\b(\d{1,2}):(\d{2})(?:\s*Uhr)?", _time_repl, text)
text = re.sub(r"\b(\d{1,2}):(\d{2})(?:\s*Uhr\b)?", _time_repl, text)

# 6. Full dates (DD.MM.YYYY)
def _date_repl(m):
Expand Down Expand Up @@ -277,8 +280,18 @@ def _decimal_repl(m):

text = re.sub(r"\b(\d+),(\d+)\b", _decimal_repl, text)

# Plain integers
text = re.sub(r"\b(\d+)\b", lambda m: _int_to_de(int(m.group(1))), text)
# Plain integers. Keep any invalid HH:MM text that survived the time pass unchanged.
remaining_time_re = re.compile(r"\b\d{1,2}:\d{2}(?:\s*Uhr\b)?")

def _plain_int_repl(m):
    """Spell out an integer match unless it sits inside leftover HH:MM text.

    ``text`` and ``remaining_time_re`` are taken from the enclosing scope.
    """
    # Search window: three characters back (room for a two-digit hour and
    # the colon) and the length of ":00 Uhr" forward, clamped to the text.
    window_lo = max(0, m.start() - 3)
    window_hi = min(len(text), m.end() + len(":00 Uhr"))
    inside_time = any(
        hit.start() <= m.start() and m.end() <= hit.end()
        for hit in remaining_time_re.finditer(text, window_lo, window_hi)
    )
    if inside_time:
        # The digits belong to a time-shaped span: keep them as written.
        return m.group(0)
    return _int_to_de(int(m.group(1)))

text = re.sub(r"\b(\d+)\b", _plain_int_repl, text)

# 10. Whitespace cleanup
text = re.sub(r"[ \t]{2,}", " ", text)
Expand Down
24 changes: 24 additions & 0 deletions tests/test_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,9 @@ def test_decimal_comma(self):
assert "Komma" in r
assert "36,9" not in r

def test_colon_separated_non_time_numbers(self):
    """"1:2" is not HH:MM, so each side is spelled out around the colon."""
    spoken = normalize_text_de("Stand 1:2.")
    assert "eins:zwei" in spoken


class TestCurrency:
def test_euro_before(self):
Expand All @@ -207,6 +210,11 @@ def test_euro_with_cents(self):
assert "Euro" in r
assert "Cent" in r

def test_euro_fraction_rounds_to_next_unit(self):
    """"€9,999" is read as ten euros with no cent part."""
    spoken = normalize_text_de("€9,999 bitte")
    assert "zehn Euro" in spoken
    assert "Cent" not in spoken

def test_dollar(self):
r = normalize_text_de("$100 Rabatt")
assert "Dollar" in r
Expand Down Expand Up @@ -234,6 +242,22 @@ def test_no_double_uhr(self):
r = normalize_text_de("Um 14:30 Uhr")
assert r.count("Uhr") == 1

def test_uhr_word_boundary(self):
    """A time followed by "Uhrzeit" must not swallow the "Uhr" prefix."""
    spoken = normalize_text_de("Um 14:30 Uhrzeit beginnt es.")
    assert "vierzehn Uhr dreißigzeit" not in spoken
    assert "Uhrzeit" in spoken

def test_invalid_hour_is_unchanged(self):
    """An hour of 25 leaves the time text as written."""
    spoken = normalize_text_de("Um 25:00 Uhr.")
    assert "25:00 Uhr" in spoken

def test_invalid_minute_is_unchanged(self):
    """A minute of 99 leaves the time text as written."""
    spoken = normalize_text_de("Um 23:99 Uhr.")
    assert "23:99 Uhr" in spoken

def test_invalid_time_does_not_replace_literal_placeholder_text(self):
    """A placeholder-looking token and an invalid time both pass through."""
    source = "Token __MISAKI_DE_INVALID_TIME_0__ um 25:00 Uhr."
    spoken = normalize_text_de(source)
    assert "__MISAKI_DE_INVALID_TIME_0__" in spoken
    assert "25:00 Uhr" in spoken


class TestDates:
def test_christmas(self):
Expand Down