Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions docling_core/transforms/serializer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,8 @@ class MarkdownParams(CommonParams):
enable_chart_tables: bool = True
indent: int = 4
wrap_width: Optional[PositiveInt] = None
page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
# e.g. "<!-- page break -->"; supports the tokens {prev_page} and {next_page}
# (page numbers before/after the break) and {page_no} (same value as {next_page}).
page_break_placeholder: Optional[str] = None
escape_underscores: bool = True
escape_html: bool = True
mark_meta: bool = Field(default=False, description="Mark meta sections.")
Expand Down Expand Up @@ -938,11 +939,31 @@ def serialize_doc(
text_res = "\n\n".join([p.text for p in parts if p.text])
if self.requires_page_break():
page_sep = self.params.page_break_placeholder or ""
for full_match, _, _ in self._get_page_breaks(text=text_res):
text_res = text_res.replace(full_match, page_sep)
for full_match, prev_page, next_page in self._get_page_breaks(text=text_res):
text_res = text_res.replace(
full_match,
self._format_page_break_placeholder(
page_sep,
prev_page=prev_page,
next_page=next_page,
),
)

return create_ser_result(text=text_res, span_source=parts)

@staticmethod
def _format_page_break_placeholder(
    placeholder: str,
    *,
    prev_page: int,
    next_page: int,
) -> str:
    """Fill the page-number tokens of a page break placeholder.

    Args:
        placeholder: Template text that may contain the literal tokens
            ``{page_no}``, ``{prev_page}`` and ``{next_page}``.
        prev_page: Number substituted for ``{prev_page}``.
        next_page: Number substituted for ``{next_page}`` and ``{page_no}``.

    Returns:
        The placeholder with every occurrence of each token replaced by the
        corresponding number; text without tokens is returned unchanged.
    """
    # Plain literal substitution (not str.format), so other braces in the
    # placeholder are left untouched.
    substitutions = (
        ("{page_no}", next_page),
        ("{prev_page}", prev_page),
        ("{next_page}", next_page),
    )
    rendered = placeholder
    for token, page in substitutions:
        rendered = rendered.replace(token, str(page))
    return rendered

@override
def requires_page_break(self) -> bool:
"""Whether to add page breaks."""
Expand Down
22 changes: 22 additions & 0 deletions test/test_serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,28 @@ def test_md_cross_page_list_page_break_non_empty():
verify(exp_file=src.parent / f"{src.stem}_pb_non_empty.gt.md", actual=actual)


def test_md_cross_page_list_page_break_page_number_placeholder():
    """Page-number tokens in the page break placeholder are substituted."""
    json_path = Path("./test/data/doc/activities.json")
    document = DoclingDocument.load_from_json(json_path)

    md_params = MarkdownParams(
        image_mode=ImageRefMode.PLACEHOLDER,
        image_placeholder="<!-- image -->",
        page_break_placeholder="<!-- page {page_no} from {prev_page} to {next_page} -->",
        labels=_DEFAULT_LABELS - {DocItemLabel.PICTURE},
    )
    serializer = MarkdownDocSerializer(doc=document, params=md_params)
    markdown = serializer.serialize().text

    # Each page transition must be rendered with concrete page numbers.
    for rendered_break in (
        "<!-- page 2 from 1 to 2 -->",
        "<!-- page 3 from 2 to 3 -->",
    ):
        assert rendered_break in markdown

    # No raw token may survive in the serialized output.
    for raw_token in ("{page_no}", "{prev_page}", "{next_page}"):
        assert raw_token not in markdown


def test_md_cross_page_list_page_break_p2():
src = Path("./test/data/doc/activities.json")
doc = DoclingDocument.load_from_json(src)
Expand Down
Loading