docling-project · ceberam · Jun 15, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,16 +1,18 @@
 fail_fast: true
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.5
+    rev: v0.15.17
     hooks:
+      - id: ruff-check
+        name: "Ruff linter"
+        args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
+        files: '^(docling_core|test|docs/examples|examples).*\.(py|ipynb)$'
+        pass_filenames: false
       - id: ruff-format
         name: "Ruff formatter"
         args: [--config=pyproject.toml]
-        files: '^(docling_core|tests|docs/examples).*\.(py|ipynb)$'
-      - id: ruff
-        name: "Ruff linter"
-        args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
-        files: '^(docling_core|tests|docs/examples).*\.(py|ipynb)$'
+        files: '^(docling_core|test|docs/examples|examples).*\.(py|ipynb)$'
+        pass_filenames: false
   - repo: local
     hooks:
       - id: mypy
@@ -36,6 +38,10 @@ repos:
         language: system
         files: '\.py$'
   - repo: https://github.com/astral-sh/uv-pre-commit
-    rev: 0.7.8
+    rev: 0.11.21
     hooks:
       - id: uv-lock
+        name: "uv lockfile"
+        entry: uv lock
+        pass_filenames: false
+        language: system
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -47,9 +47,7 @@ uv add [OPTIONS] <PACKAGES|--requirements <REQUIREMENTS>>
 
 We use the following tools to enforce code style:
 
-- Ruff, to format and lint code
-- Flake8, to lint code
-- autoflake, to remove unused variables and imports
+- [Ruff](https://docs.astral.sh/ruff/), to format and lint code
 - [MyPy](https://mypy.readthedocs.io), as static type checker
 
 A set of styling checks, as well as regression tests, are defined and managed through the [pre-commit](https://pre-commit.com/) framework. To ensure that those scripts run automatically before a commit is finalized, install `pre-commit` on your local repository:

diff --git a/docling_core/experimental/serializer/outline.py b/docling_core/experimental/serializer/outline.py
@@ -136,7 +136,7 @@ def _format_indented_text_line(item: OutlineItemData, indent_size: int = 2, max_
 
 
 def _extract_ref_from_markdown(text: str) -> tuple[str | None, int | None]:
-    """Extract reference and level from Markdown outline text.
+    r"""Extract reference and level from Markdown outline text.
 
     Args:
         text: Markdown text containing a reference like \\[ref=#/texts/1\\]

diff --git a/docling_core/transforms/chunker/chunk_expander.py b/docling_core/transforms/chunker/chunk_expander.py
@@ -35,7 +35,6 @@ def _get_top_containing_items(meta: DocMeta, doc: DoclingDocument) -> list[DocIt
         Returns:
             List of top-level DocItems in document order, or None if no items found.
         """
-
         items: dict[str, DocItem] = {}
         ref_items: list[str] = [item.self_ref for item in meta.doc_items]
         for item in ref_items:
@@ -138,7 +137,6 @@ def expand(self, chunk: BaseChunk, dl_doc: DoclingDocument, serializer: BaseDocS
         Note:
             It is recommended to use the same serializer as used for the original document.
         """
-
         if not isinstance(chunk, DocChunk):
             return chunk
 

diff --git a/docling_core/transforms/chunker/line_chunker.py b/docling_core/transforms/chunker/line_chunker.py
@@ -287,27 +287,19 @@ def split_by_token_limit(
         token_limit: int,
         prefer_word_boundary: bool = True,
     ) -> tuple[str, str]:
-        """
-        Split `text` into (head, tail) where `head` has at most `token_limit` tokens,
+        """Split `text` into (head, tail) where `head` has at most `token_limit` tokens,
         and `tail` is the remainder. Uses binary search on character indices to minimize
         calls to `count_tokens`.
 
-        Parameters
-        ----------
-        text : str
-            Input string to split.
-        token_limit: int
-            Maximum number of tokens allowed in the head.
-        prefer_word_boundary : bool
-            If True, try to end the head on a whitespace boundary (without violating
-            the token limit). If no boundary exists in range, fall back to the
-            exact max index found by search.
-
-        Returns
-        -------
-        (head, tail) : tuple[str, str]
-            `head` contains at most `token_limit` tokens, `tail` is the remaining suffix.
-            If `token_limit <= 0`, returns ("", text).
+        Args:
+            text: Input string to split.
+            token_limit: Maximum number of tokens allowed in the head.
+            prefer_word_boundary: If True, try to end the head on a whitespace boundary (without violating
+                the token limit). If no boundary exists in range, fall back to the
+                exact max index found by search.
+
+        Returns:
+            (head, tail) where `head` contains at most `token_limit` tokens, `tail` is the remaining suffix. If `token_limit <= 0`, returns ("", text).
         """
         if token_limit <= 0 or not text:
             return "", text

diff --git a/docling_core/transforms/chunker/tokenizer/huggingface.py b/docling_core/transforms/chunker/tokenizer/huggingface.py
@@ -72,5 +72,4 @@ def get_tokenizer(self):
 
 def get_default_tokenizer():
     """Get default tokenizer instance."""
-
     return HuggingFaceTokenizer.from_pretrained(model_name="sentence-transformers/all-MiniLM-L6-v2")
diff --git a/docling_core/transforms/deserializer/doclang.py b/docling_core/transforms/deserializer/doclang.py
@@ -1600,7 +1600,7 @@ def _inner_xml(self, el: Element, exclude_tags: Optional[set[str]] = None) -> st
         return "".join(parts)
 
     def _layer_from_nodes(self, nodes: Sequence[Node]) -> Optional[ContentLayer]:
-        """Extract content layer from ``<layer value=\"...\"/>`` in element head nodes."""
+        """Extract content layer from ``<layer value="..."/>`` in element head nodes."""
         for node in nodes:
             if isinstance(node, Element) and node.tagName == DocLangToken.LAYER.value:
                 if layer_value := node.getAttribute(DocLangAttributeKey.VALUE.value):
@@ -1611,7 +1611,7 @@ def _layer_from_nodes(self, nodes: Sequence[Node]) -> Optional[ContentLayer]:
         return None
 
     def _label_value_from_nodes(self, nodes: Sequence[Node]) -> Optional[str]:
-        """Extract ``<label value=\"...\"/>`` from element head nodes."""
+        """Extract ``<label value="..."/>`` from element head nodes."""
         for node in nodes:
             if isinstance(node, Element) and node.tagName == DocLangToken.LABEL.value:
                 if label_val := node.getAttribute(DocLangAttributeKey.VALUE.value):
@@ -1637,7 +1637,7 @@ def _label_value_from_nodes(self, nodes: Sequence[Node]) -> Optional[str]:
     def _bbox_from_location_text_fragments(
         self, *, doc: DoclingDocument, fragments: list[str]
     ) -> Optional[BoundingBox]:
-        """Build a TOPLEFT bbox from four ``<location value=\"...\"/>`` XML fragments."""
+        """Build a TOPLEFT bbox from four ``<location value="..."/>`` XML fragments."""
         if len(fragments) != 4:
             return None
         values: list[int] = []
@@ -1714,7 +1714,6 @@ def _otsl_extract_tokens_and_text(self, s: str) -> tuple[list[str], list[str]]:
         ``<location>`` tokens. Handles nested XML elements (like
         ``<text><italic>...</italic></text>``) by keeping them as single units.
         """
-
         tokens: list[str] = []
         parts: list[str] = []
 
@@ -2016,11 +2015,11 @@ def _extract_provenance(self, *, doc: DoclingDocument, el: Element) -> list[Prov
         return self._provenance_from_location_nodes(doc=doc, nodes=head_nodes)
 
     def _extract_layer(self, *, el: Element) -> Optional[ContentLayer]:
-        """Extract content layer from element-head ``<layer value=\"...\"/>``."""
+        """Extract content layer from element-head ``<layer value="..."/>``."""
         head_nodes, _ = self._split_element_children_head_body(el)
         return self._layer_from_nodes(head_nodes)
 
     def _extract_label_value(self, *, el: Element) -> Optional[str]:
-        """Extract ``<label value=\"...\"/>`` from element head."""
+        """Extract ``<label value="..."/>`` from element head."""
         head_nodes, _ = self._split_element_children_head_body(el)
         return self._label_value_from_nodes(head_nodes)
diff --git a/docling_core/transforms/serializer/doclang.py b/docling_core/transforms/serializer/doclang.py
@@ -1742,7 +1742,7 @@ def serialize_hyperlink(
         hyperlink: Union[AnyUrl, Path],
         **kwargs: Any,
     ) -> str:
-        """Hyperlinks are emitted as ``<href uri=\"...\"/>`` in element head, not inline."""
+        """Hyperlinks are emitted as ``<href uri="..."/>`` in element head, not inline."""
         return text
 
     text_serializer: BaseTextSerializer = DocLangTextSerializer()
@@ -1853,7 +1853,6 @@ def _filter_out_all_content(self, text: str) -> str:
 
     def _serialize_body(self, **kwargs) -> SerializationResult:
         """Serialize the document body."""
-
         self._suppressed_page_breaks = set()
         self._next_thread_id = 1
         self._thread_id_by_ref = {}

diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py
@@ -443,7 +443,6 @@ def get_header_and_body_lines(
             A tuple of (header_lines, body_lines) where header_lines contains
             the header row and separator row, and body_lines contains the data rows.
         """
-
         lines = [line for line in table_text.splitlines(True) if line.strip()]
 
         if len(lines) < 2:

diff --git a/docling_core/transforms/serializer/markdown_excel.py b/docling_core/transforms/serializer/markdown_excel.py
@@ -46,7 +46,7 @@ def serialize(
 
 
 class MsExcelMarkdownDocSerializer(MarkdownDocSerializer):
-    """``MarkdownDocSerializer`` variant for Excel-sourced ``DoclingDocument``\\s.
+    r"""``MarkdownDocSerializer`` variant for Excel-sourced ``DoclingDocument``\s.
 
     Swap in :class:`MsExcelMarkdownFallbackSerializer` so that worksheet
     groups (``GroupLabel.SHEET``) are rendered with their name as a Markdown

diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py
@@ -368,19 +368,16 @@ def requires_page_break(self) -> bool:
     @override
     def serialize_bold(self, text: str, **kwargs) -> str:
         """Apply WebVTT-specific bold serialization."""
-
         return self.serialize_cue_span(text=text, tag="b")
 
     @override
     def serialize_italic(self, text: str, **kwargs) -> str:
         """Apply WebVTT-specific italic serialization."""
-
         return self.serialize_cue_span(text=text, tag="i")
 
     @override
     def serialize_underline(self, text: str, **kwargs) -> str:
         """Apply WebVTT-specific underline serialization."""
-
         return self.serialize_cue_span(text=text, tag="u")
 
     def serialize_cue_span(

diff --git a/docling_core/types/doc/base.py b/docling_core/types/doc/base.py
@@ -63,12 +63,12 @@ class BoundingBox(BaseModel):
 
     @property
     def width(self):
-        """width."""
+        """Width."""
         return self.r - self.l
 
     @property
     def height(self):
-        """height."""
+        """Height."""
         return abs(self.t - self.b)
 
     @field_serializer("l", "t", "r", "b")
@@ -94,12 +94,12 @@ def scale_to_size(self, old_size: Size, new_size: Size):
 
     # same as before, but using the implementation above
     def scaled(self, scale: float):
-        """scaled."""
+        """Scaled."""
         return self.resize_by_scale(x_scale=scale, y_scale=scale)
 
     # same as before, but using the implementation above
     def normalized(self, page_size: Size):
-        """normalized."""
+        """Normalized."""
         return self.scale_to_size(old_size=page_size, new_size=Size(height=1.0, width=1.0))
 
     def expand_by_scale(self, x_scale: float, y_scale: float) -> "BoundingBox":
@@ -155,7 +155,7 @@ def from_tuple(cls, coord: tuple[float, ...], origin: CoordOrigin):
             return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
 
     def area(self) -> float:
-        """area."""
+        """Area."""
         return abs(self.r - self.l) * abs(self.b - self.t)
 
     def intersection_area_with(self, other: "BoundingBox") -> float:
@@ -260,7 +260,7 @@ def to_top_left_origin(self, page_height: float) -> "BoundingBox":
             )
 
     def overlaps(self, other: "BoundingBox") -> bool:
-        """overlaps."""
+        """Overlaps."""
         return self.overlaps_horizontally(other=other) and self.overlaps_vertically(other=other)
 
     def overlaps_horizontally(self, other: "BoundingBox") -> bool:

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -414,7 +414,7 @@ class TableData(BaseModel):  # TBD
     def grid(
         self,
     ) -> list[list[TableCell]]:
-        """grid."""
+        """Grid."""
         # Initialise empty table data grid (only empty cells)
         table_data = [
             [
@@ -2419,7 +2419,6 @@ def _migrate_annotations_to_meta(self) -> Self:
 
     def export_to_dataframe(self, doc: Optional["DoclingDocument"] = None) -> pd.DataFrame:
         """Export the table as a Pandas DataFrame."""
-
         return self._export_to_dataframe_with_options(doc=doc)
 
     def _export_to_dataframe_with_options(
@@ -2428,7 +2427,6 @@ def _export_to_dataframe_with_options(
         **kwargs: Any,
     ) -> pd.DataFrame:
         """Export the table as a Pandas DataFrame with contextual named arguments."""
-
         if doc is None:
             _logger.warning("Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated.")
 
@@ -6317,7 +6315,7 @@ def export_to_text(
         page_break_placeholder: Optional[str] = None,
         traverse_pictures: bool = False,
     ) -> str:
-        """Export to plain text.
+        r"""Export to plain text.
 
         Produces clean plain text without any Markdown decoration. Heading
         markers (``#``), bold/italic markers, and hyperlink syntax are all
@@ -6534,7 +6532,6 @@ def export_to_vtt(
         Returns:
             A string representation of the Docling document in WebVTT format.
         """
-
         from docling_core.transforms.serializer.webvtt import WebVTTDocSerializer, WebVTTParams
 
         my_layers = included_content_layers if included_content_layers is not None else DEFAULT_CONTENT_LAYERS
@@ -7497,7 +7494,6 @@ def _normalize_table_children_from_rich_cells(self):
     @model_validator(mode="after")
     def validate_document(self) -> Self:
         """validate_document."""
-
         with warnings.catch_warnings():
             # ignore warning from deprecated furniture
             warnings.filterwarnings("ignore", category=DeprecationWarning)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -72,5 +72,4 @@ def get_tokenizer(self):

		def get_default_tokenizer():
		"""Get default tokenizer instance."""

		return HuggingFaceTokenizer.from_pretrained(model_name="sentence-transformers/all-MiniLM-L6-v2")