Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
fail_fast: true
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.5
rev: v0.15.17
hooks:
- id: ruff-check
name: "Ruff linter"
args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
files: '^(docling_core|test|docs/examples|examples).*\.(py|ipynb)$'
pass_filenames: false
- id: ruff-format
name: "Ruff formatter"
args: [--config=pyproject.toml]
files: '^(docling_core|tests|docs/examples).*\.(py|ipynb)$'
- id: ruff
name: "Ruff linter"
args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
files: '^(docling_core|tests|docs/examples).*\.(py|ipynb)$'
files: '^(docling_core|test|docs/examples|examples).*\.(py|ipynb)$'
pass_filenames: false
- repo: local
hooks:
- id: mypy
Expand All @@ -36,6 +38,10 @@ repos:
language: system
files: '\.py$'
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.7.8
rev: 0.11.21
hooks:
- id: uv-lock
name: "uv lockfile"
entry: uv lock
pass_filenames: false
language: system
4 changes: 1 addition & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,7 @@ uv add [OPTIONS] <PACKAGES|--requirements <REQUIREMENTS>>

We use the following tools to enforce code style:

- Ruff, to format and lint code
- Flake8, to lint code
- autoflake, to remove unused variables and imports
- [Ruff](https://docs.astral.sh/ruff/), to format and lint code
- [MyPy](https://mypy.readthedocs.io), as static type checker

A set of styling checks, as well as regression tests, are defined and managed through the [pre-commit](https://pre-commit.com/) framework. To ensure that those scripts run automatically before a commit is finalized, install `pre-commit` on your local repository:
Expand Down
2 changes: 1 addition & 1 deletion docling_core/experimental/serializer/outline.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def _format_indented_text_line(item: OutlineItemData, indent_size: int = 2, max_


def _extract_ref_from_markdown(text: str) -> tuple[str | None, int | None]:
"""Extract reference and level from Markdown outline text.
r"""Extract reference and level from Markdown outline text.

Args:
text: Markdown text containing a reference like \[ref=#/texts/1\]
Expand Down
2 changes: 0 additions & 2 deletions docling_core/transforms/chunker/chunk_expander.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ def _get_top_containing_items(meta: DocMeta, doc: DoclingDocument) -> list[DocIt
Returns:
List of top-level DocItems in document order, or None if no items found.
"""

items: dict[str, DocItem] = {}
ref_items: list[str] = [item.self_ref for item in meta.doc_items]
for item in ref_items:
Expand Down Expand Up @@ -138,7 +137,6 @@ def expand(self, chunk: BaseChunk, dl_doc: DoclingDocument, serializer: BaseDocS
Note:
It is recommended to use the same serializer as used for the original document.
"""

if not isinstance(chunk, DocChunk):
return chunk

Expand Down
28 changes: 10 additions & 18 deletions docling_core/transforms/chunker/line_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,27 +287,19 @@ def split_by_token_limit(
token_limit: int,
prefer_word_boundary: bool = True,
) -> tuple[str, str]:
"""
Split `text` into (head, tail) where `head` has at most `token_limit` tokens,
"""Split `text` into (head, tail) where `head` has at most `token_limit` tokens,
and `tail` is the remainder. Uses binary search on character indices to minimize
calls to `count_tokens`.

Parameters
----------
text : str
Input string to split.
token_limit: int
Maximum number of tokens allowed in the head.
prefer_word_boundary : bool
If True, try to end the head on a whitespace boundary (without violating
the token limit). If no boundary exists in range, fall back to the
exact max index found by search.

Returns
-------
(head, tail) : tuple[str, str]
`head` contains at most `token_limit` tokens, `tail` is the remaining suffix.
If `token_limit <= 0`, returns ("", text).
Args:
text: Input string to split.
token_limit: Maximum number of tokens allowed in the head.
prefer_word_boundary: If True, try to end the head on a whitespace boundary (without violating
the token limit). If no boundary exists in range, fall back to the
exact max index found by search.

Returns:
(head, tail) where `head` contains at most `token_limit` tokens, `tail` is the remaining suffix. If `token_limit <= 0`, returns ("", text).
"""
if token_limit <= 0 or not text:
return "", text
Expand Down
1 change: 0 additions & 1 deletion docling_core/transforms/chunker/tokenizer/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,5 +72,4 @@ def get_tokenizer(self):

def get_default_tokenizer():
"""Get default tokenizer instance."""

return HuggingFaceTokenizer.from_pretrained(model_name="sentence-transformers/all-MiniLM-L6-v2")
11 changes: 5 additions & 6 deletions docling_core/transforms/deserializer/doclang.py
Original file line number Diff line number Diff line change
Expand Up @@ -1600,7 +1600,7 @@ def _inner_xml(self, el: Element, exclude_tags: Optional[set[str]] = None) -> st
return "".join(parts)

def _layer_from_nodes(self, nodes: Sequence[Node]) -> Optional[ContentLayer]:
"""Extract content layer from ``<layer value=\"...\"/>`` in element head nodes."""
"""Extract content layer from ``<layer value="..."/>`` in element head nodes."""
for node in nodes:
if isinstance(node, Element) and node.tagName == DocLangToken.LAYER.value:
if layer_value := node.getAttribute(DocLangAttributeKey.VALUE.value):
Expand All @@ -1611,7 +1611,7 @@ def _layer_from_nodes(self, nodes: Sequence[Node]) -> Optional[ContentLayer]:
return None

def _label_value_from_nodes(self, nodes: Sequence[Node]) -> Optional[str]:
"""Extract ``<label value=\"...\"/>`` from element head nodes."""
"""Extract ``<label value="..."/>`` from element head nodes."""
for node in nodes:
if isinstance(node, Element) and node.tagName == DocLangToken.LABEL.value:
if label_val := node.getAttribute(DocLangAttributeKey.VALUE.value):
Expand All @@ -1637,7 +1637,7 @@ def _label_value_from_nodes(self, nodes: Sequence[Node]) -> Optional[str]:
def _bbox_from_location_text_fragments(
self, *, doc: DoclingDocument, fragments: list[str]
) -> Optional[BoundingBox]:
"""Build a TOPLEFT bbox from four ``<location value=\"...\"/>`` XML fragments."""
"""Build a TOPLEFT bbox from four ``<location value="..."/>`` XML fragments."""
if len(fragments) != 4:
return None
values: list[int] = []
Expand Down Expand Up @@ -1714,7 +1714,6 @@ def _otsl_extract_tokens_and_text(self, s: str) -> tuple[list[str], list[str]]:
``<location>`` tokens. Handles nested XML elements (like
``<text><italic>...</italic></text>``) by keeping them as single units.
"""

tokens: list[str] = []
parts: list[str] = []

Expand Down Expand Up @@ -2016,11 +2015,11 @@ def _extract_provenance(self, *, doc: DoclingDocument, el: Element) -> list[Prov
return self._provenance_from_location_nodes(doc=doc, nodes=head_nodes)

def _extract_layer(self, *, el: Element) -> Optional[ContentLayer]:
"""Extract content layer from element-head ``<layer value=\"...\"/>``."""
"""Extract content layer from element-head ``<layer value="..."/>``."""
head_nodes, _ = self._split_element_children_head_body(el)
return self._layer_from_nodes(head_nodes)

def _extract_label_value(self, *, el: Element) -> Optional[str]:
"""Extract ``<label value=\"...\"/>`` from element head."""
"""Extract ``<label value="..."/>`` from element head."""
head_nodes, _ = self._split_element_children_head_body(el)
return self._label_value_from_nodes(head_nodes)
3 changes: 1 addition & 2 deletions docling_core/transforms/serializer/doclang.py
Original file line number Diff line number Diff line change
Expand Up @@ -1742,7 +1742,7 @@ def serialize_hyperlink(
hyperlink: Union[AnyUrl, Path],
**kwargs: Any,
) -> str:
"""Hyperlinks are emitted as ``<href uri=\"...\"/>`` in element head, not inline."""
"""Hyperlinks are emitted as ``<href uri="..."/>`` in element head, not inline."""
return text

text_serializer: BaseTextSerializer = DocLangTextSerializer()
Expand Down Expand Up @@ -1853,7 +1853,6 @@ def _filter_out_all_content(self, text: str) -> str:

def _serialize_body(self, **kwargs) -> SerializationResult:
"""Serialize the document body."""

self._suppressed_page_breaks = set()
self._next_thread_id = 1
self._thread_id_by_ref = {}
Expand Down
1 change: 0 additions & 1 deletion docling_core/transforms/serializer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,6 @@ def get_header_and_body_lines(
A tuple of (header_lines, body_lines) where header_lines contains
the header row and separator row, and body_lines contains the data rows.
"""

lines = [line for line in table_text.splitlines(True) if line.strip()]

if len(lines) < 2:
Expand Down
2 changes: 1 addition & 1 deletion docling_core/transforms/serializer/markdown_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def serialize(


class MsExcelMarkdownDocSerializer(MarkdownDocSerializer):
"""``MarkdownDocSerializer`` variant for Excel-sourced ``DoclingDocument``\\s.
r"""``MarkdownDocSerializer`` variant for Excel-sourced ``DoclingDocument``\s.

Swap in :class:`MsExcelMarkdownFallbackSerializer` so that worksheet
groups (``GroupLabel.SHEET``) are rendered with their name as a Markdown
Expand Down
3 changes: 0 additions & 3 deletions docling_core/transforms/serializer/webvtt.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,19 +368,16 @@ def requires_page_break(self) -> bool:
@override
def serialize_bold(self, text: str, **kwargs) -> str:
"""Apply WebVTT-specific bold serialization."""

return self.serialize_cue_span(text=text, tag="b")

@override
def serialize_italic(self, text: str, **kwargs) -> str:
"""Apply WebVTT-specific italic serialization."""

return self.serialize_cue_span(text=text, tag="i")

@override
def serialize_underline(self, text: str, **kwargs) -> str:
"""Apply WebVTT-specific underline serialization."""

return self.serialize_cue_span(text=text, tag="u")

def serialize_cue_span(
Expand Down
12 changes: 6 additions & 6 deletions docling_core/types/doc/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,12 @@ class BoundingBox(BaseModel):

@property
def width(self):
"""width."""
"""Return the box width, ``r - l``."""
return self.r - self.l

@property
def height(self):
"""height."""
"""Return the box height, ``abs(t - b)``."""
return abs(self.t - self.b)

@field_serializer("l", "t", "r", "b")
Expand All @@ -94,12 +94,12 @@ def scale_to_size(self, old_size: Size, new_size: Size):

# same as before, but using the implementation above
def scaled(self, scale: float):
"""scaled."""
"""Resize the box by the same ``scale`` factor on both axes."""
return self.resize_by_scale(x_scale=scale, y_scale=scale)

# same as before, but using the implementation above
def normalized(self, page_size: Size):
"""normalized."""
"""Rescale the box from ``page_size`` to a unit page (height 1.0, width 1.0)."""
return self.scale_to_size(old_size=page_size, new_size=Size(height=1.0, width=1.0))

def expand_by_scale(self, x_scale: float, y_scale: float) -> "BoundingBox":
Expand Down Expand Up @@ -155,7 +155,7 @@ def from_tuple(cls, coord: tuple[float, ...], origin: CoordOrigin):
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)

def area(self) -> float:
"""area."""
"""Return the box area, ``abs(r - l) * abs(b - t)``."""
return abs(self.r - self.l) * abs(self.b - self.t)

def intersection_area_with(self, other: "BoundingBox") -> float:
Expand Down Expand Up @@ -260,7 +260,7 @@ def to_top_left_origin(self, page_height: float) -> "BoundingBox":
)

def overlaps(self, other: "BoundingBox") -> bool:
"""overlaps."""
"""Return True if ``other`` overlaps this box both horizontally and vertically."""
return self.overlaps_horizontally(other=other) and self.overlaps_vertically(other=other)

def overlaps_horizontally(self, other: "BoundingBox") -> bool:
Expand Down
8 changes: 2 additions & 6 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ class TableData(BaseModel): # TBD
def grid(
self,
) -> list[list[TableCell]]:
"""grid."""
"""Grid."""
# Initialise empty table data grid (only empty cells)
table_data = [
[
Expand Down Expand Up @@ -2419,7 +2419,6 @@ def _migrate_annotations_to_meta(self) -> Self:

def export_to_dataframe(self, doc: Optional["DoclingDocument"] = None) -> pd.DataFrame:
"""Export the table as a Pandas DataFrame."""

return self._export_to_dataframe_with_options(doc=doc)

def _export_to_dataframe_with_options(
Expand All @@ -2428,7 +2427,6 @@ def _export_to_dataframe_with_options(
**kwargs: Any,
) -> pd.DataFrame:
"""Export the table as a Pandas DataFrame with contextual named arguments."""

if doc is None:
_logger.warning("Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated.")

Expand Down Expand Up @@ -6317,7 +6315,7 @@ def export_to_text(
page_break_placeholder: Optional[str] = None,
traverse_pictures: bool = False,
) -> str:
"""Export to plain text.
r"""Export to plain text.

Produces clean plain text without any Markdown decoration. Heading
markers (``#``), bold/italic markers, and hyperlink syntax are all
Expand Down Expand Up @@ -6534,7 +6532,6 @@ def export_to_vtt(
Returns:
A string representation of the Docling document in WebVTT format.
"""

from docling_core.transforms.serializer.webvtt import WebVTTDocSerializer, WebVTTParams

my_layers = included_content_layers if included_content_layers is not None else DEFAULT_CONTENT_LAYERS
Expand Down Expand Up @@ -7497,7 +7494,6 @@ def _normalize_table_children_from_rich_cells(self):
@model_validator(mode="after")
def validate_document(self) -> Self:
"""Validate the document."""

with warnings.catch_warnings():
# ignore warning from deprecated furniture
warnings.filterwarnings("ignore", category=DeprecationWarning)
Expand Down
Loading
Loading