diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e8bda5708..9f51b6918 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,16 +1,18 @@ fail_fast: true repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.5 + rev: v0.15.17 hooks: + - id: ruff-check + name: "Ruff linter" + args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml] + files: '^(docling_core|test|docs/examples|examples).*\.(py|ipynb)$' + pass_filenames: false - id: ruff-format name: "Ruff formatter" args: [--config=pyproject.toml] - files: '^(docling_core|tests|docs/examples).*\.(py|ipynb)$' - - id: ruff - name: "Ruff linter" - args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml] - files: '^(docling_core|tests|docs/examples).*\.(py|ipynb)$' + files: '^(docling_core|test|docs/examples|examples).*\.(py|ipynb)$' + pass_filenames: false - repo: local hooks: - id: mypy @@ -36,6 +38,10 @@ repos: language: system files: '\.py$' - repo: https://github.com/astral-sh/uv-pre-commit - rev: 0.7.8 + rev: 0.11.21 hooks: - id: uv-lock + name: "uv lockfile" + entry: uv lock + pass_filenames: false + language: system diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 601510df8..ba8f077b9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -47,9 +47,7 @@ uv add [OPTIONS] > We use the following tools to enforce code style: -- Ruff, to format and lint code -- Flake8, to lint code -- autoflake, to remove unused variables and imports +- [Ruff](https://docs.astral.sh/ruff/), to format and lint code - [MyPy](https://mypy.readthedocs.io), as static type checker A set of styling checks, as well as regression tests, are defined and managed through the [pre-commit](https://pre-commit.com/) framework. To ensure that those scripts run automatically before a commit is finalized, install `pre-commit` on your local repository: diff --git a/docling_core/experimental/serializer/outline.py b/docling_core/experimental/serializer/outline.py index b6124db00..9376ef8d7 100644 --- a/docling_core/experimental/serializer/outline.py +++ b/docling_core/experimental/serializer/outline.py @@ -136,7 +136,7 @@ def _format_indented_text_line(item: OutlineItemData, indent_size: int = 2, max_ def _extract_ref_from_markdown(text: str) -> tuple[str | None, int | None]: - """Extract reference and level from Markdown outline text. + r"""Extract reference and level from Markdown outline text. Args: text: Markdown text containing a reference like \\[ref=#/texts/1\\] diff --git a/docling_core/transforms/chunker/chunk_expander.py b/docling_core/transforms/chunker/chunk_expander.py index 0fe017bec..8e3cf1319 100644 --- a/docling_core/transforms/chunker/chunk_expander.py +++ b/docling_core/transforms/chunker/chunk_expander.py @@ -35,7 +35,6 @@ def _get_top_containing_items(meta: DocMeta, doc: DoclingDocument) -> list[DocIt Returns: List of top-level DocItems in document order, or None if no items found. """ - items: dict[str, DocItem] = {} ref_items: list[str] = [item.self_ref for item in meta.doc_items] for item in ref_items: @@ -138,7 +137,6 @@ def expand(self, chunk: BaseChunk, dl_doc: DoclingDocument, serializer: BaseDocS Note: It is recommended to use the same serializer as used for the original document. """ - if not isinstance(chunk, DocChunk): return chunk diff --git a/docling_core/transforms/chunker/line_chunker.py b/docling_core/transforms/chunker/line_chunker.py index 1fb3b8bf2..db955e240 100644 --- a/docling_core/transforms/chunker/line_chunker.py +++ b/docling_core/transforms/chunker/line_chunker.py @@ -287,27 +287,19 @@ def split_by_token_limit( token_limit: int, prefer_word_boundary: bool = True, ) -> tuple[str, str]: - """ - Split `text` into (head, tail) where `head` has at most `token_limit` tokens, + """Split `text` into (head, tail) where `head` has at most `token_limit` tokens, and `tail` is the remainder. Uses binary search on character indices to minimize calls to `count_tokens`. - Parameters - ---------- - text : str - Input string to split. - token_limit: int - Maximum number of tokens allowed in the head. - prefer_word_boundary : bool - If True, try to end the head on a whitespace boundary (without violating - the token limit). If no boundary exists in range, fall back to the - exact max index found by search. - - Returns - ------- - (head, tail) : tuple[str, str] - `head` contains at most `token_limit` tokens, `tail` is the remaining suffix. - If `token_limit <= 0`, returns ("", text). + Args: + text: Input string to split. + token_limit: Maximum number of tokens allowed in the head. + prefer_word_boundary: If True, try to end the head on a whitespace boundary (without violating + the token limit). If no boundary exists in range, fall back to the + exact max index found by search. + + Returns: + (head, tail) where `head` contains at most `token_limit` tokens, `tail` is the remaining suffix. If `token_limit <= 0`, returns ("", text). """ if token_limit <= 0 or not text: return "", text diff --git a/docling_core/transforms/chunker/tokenizer/huggingface.py b/docling_core/transforms/chunker/tokenizer/huggingface.py index f5f111665..3b079de3a 100644 --- a/docling_core/transforms/chunker/tokenizer/huggingface.py +++ b/docling_core/transforms/chunker/tokenizer/huggingface.py @@ -72,5 +72,4 @@ def get_tokenizer(self): def get_default_tokenizer(): """Get default tokenizer instance.""" - return HuggingFaceTokenizer.from_pretrained(model_name="sentence-transformers/all-MiniLM-L6-v2") diff --git a/docling_core/transforms/deserializer/doclang.py b/docling_core/transforms/deserializer/doclang.py index 59f8eb172..27173779d 100644 --- a/docling_core/transforms/deserializer/doclang.py +++ b/docling_core/transforms/deserializer/doclang.py @@ -1600,7 +1600,7 @@ def _inner_xml(self, el: Element, exclude_tags: Optional[set[str]] = None) -> st return "".join(parts) def _layer_from_nodes(self, nodes: Sequence[Node]) -> Optional[ContentLayer]: - """Extract content layer from ```` in element head nodes.""" + r"""Extract content layer from ```` in element head nodes.""" for node in nodes: if isinstance(node, Element) and node.tagName == DocLangToken.LAYER.value: if layer_value := node.getAttribute(DocLangAttributeKey.VALUE.value): @@ -1611,7 +1611,7 @@ def _layer_from_nodes(self, nodes: Sequence[Node]) -> Optional[ContentLayer]: return None def _label_value_from_nodes(self, nodes: Sequence[Node]) -> Optional[str]: - """Extract ``