chore: improve tokenizer vocabulary-size error message

le1nux · le1nux · commit eb747fd42c00 · 2026-01-23T10:34:43.000+01:00
diff --git a/.gitignore b/.gitignore
@@ -171,4 +171,8 @@ tutorials/instruction_tuning/prepared_data
 config_files/instruction_tuning
 data/lorem_ipsum_instruct.jsonl
 tutorials/scaling_up/logs*
-tutorials/scaling_up/experiments_old/*
+tutorials/scaling_up/experiments_old/*
+
+results/*
+tutorials/einsum_transformer/experiments/*
+tutorials/warmstart/experiments/*
diff --git a/src/modalities/tokenization/tokenizer_wrapper.py b/src/modalities/tokenization/tokenizer_wrapper.py
@@ -118,7 +118,8 @@ def __init__(
             if len(self.tokenizer.get_vocab()) > old_vocab_size:
                 raise NotImplementedError(
                     "Currently only tokens already known to the tokenizers vocabulary can be added,"
-                    + " as resizing the embedding matrix is not yet supported!"
+                    + " as resizing the embedding matrix is not yet supported! "
+                    f"Before: {old_vocab_size}, after: {len(self.tokenizer.get_vocab())}"
                 )
         self.max_length = max_length
         self.truncation = truncation