From 8d675d41474444d9394e342fb0fe5a5ad7f3d8e0 Mon Sep 17 00:00:00 2001
From: gyx09212214-prog <243787584+gyx09212214-prog@users.noreply.github.com>
Date: Sat, 13 Jun 2026 00:48:36 +0800
Subject: [PATCH 1/2] Support page numbers in markdown page breaks

---
 .../transforms/serializer/markdown.py         | 27 ++++++++++++++++---
 test/test_serialization.py                    | 22 +++++++++++++++
 2 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py
index bd3c8e22..092cec14 100644
--- a/docling_core/transforms/serializer/markdown.py
+++ b/docling_core/transforms/serializer/markdown.py
@@ -153,7 +153,8 @@ class MarkdownParams(CommonParams):
     enable_chart_tables: bool = True
     indent: int = 4
     wrap_width: Optional[PositiveInt] = None
-    page_break_placeholder: Optional[str] = None  # e.g. ""
+    # e.g. ""; supports {page_no}, {prev_page}, and {next_page}.
+    page_break_placeholder: Optional[str] = None
     escape_underscores: bool = True
     escape_html: bool = True
     mark_meta: bool = Field(default=False, description="Mark meta sections.")
@@ -938,11 +939,31 @@ def serialize_doc(
         text_res = "\n\n".join([p.text for p in parts if p.text])
         if self.requires_page_break():
             page_sep = self.params.page_break_placeholder or ""
-            for full_match, _, _ in self._get_page_breaks(text=text_res):
-                text_res = text_res.replace(full_match, page_sep)
+            for full_match, prev_page, next_page in self._get_page_breaks(text=text_res):
+                text_res = text_res.replace(
+                    full_match,
+                    self._format_page_break_placeholder(
+                        page_sep,
+                        prev_page=prev_page,
+                        next_page=next_page,
+                    ),
+                )
 
         return create_ser_result(text=text_res, span_source=parts)
 
+    @staticmethod
+    def _format_page_break_placeholder(
+        placeholder: str,
+        *,
+        prev_page: int,
+        next_page: int,
+    ) -> str:
+        return (
+            placeholder.replace("{page_no}", str(next_page))
+            .replace("{prev_page}", str(prev_page))
+            .replace("{next_page}", str(next_page))
+        )
+
     @override
     def requires_page_break(self) -> bool:
         """Whether to add page breaks."""
diff --git a/test/test_serialization.py b/test/test_serialization.py
index 8c3fae1c..e03e5be7 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -159,6 +159,28 @@ def test_md_cross_page_list_page_break_non_empty():
     verify(exp_file=src.parent / f"{src.stem}_pb_non_empty.gt.md", actual=actual)
 
 
+def test_md_cross_page_list_page_break_page_number_placeholder():
+    src = Path("./test/data/doc/activities.json")
+    doc = DoclingDocument.load_from_json(src)
+
+    ser = MarkdownDocSerializer(
+        doc=doc,
+        params=MarkdownParams(
+            image_mode=ImageRefMode.PLACEHOLDER,
+            image_placeholder="",
+            page_break_placeholder="",
+            labels=_DEFAULT_LABELS - {DocItemLabel.PICTURE},
+        ),
+    )
+    actual = ser.serialize().text
+
+    assert "" in actual
+    assert "" in actual
+    assert "{page_no}" not in actual
+    assert "{prev_page}" not in actual
+    assert "{next_page}" not in actual
+
+
 def test_md_cross_page_list_page_break_p2():
     src = Path("./test/data/doc/activities.json")
     doc = DoclingDocument.load_from_json(src)

From 897217d0c035dba54c0acf17e51d9b09f72895f7 Mon Sep 17 00:00:00 2001
From: gyx09212214-prog <243787584+gyx09212214-prog@users.noreply.github.com>
Date: Sat, 13 Jun 2026 01:29:21 +0800
Subject: [PATCH 2/2] DCO Remediation Commit for gyx09212214-prog
 <243787584+gyx09212214-prog@users.noreply.github.com>

I, gyx09212214-prog <243787584+gyx09212214-prog@users.noreply.github.com>, hereby add my Signed-off-by to this commit: 8d675d41474444d9394e342fb0fe5a5ad7f3d8e0

Signed-off-by: gyx09212214-prog <243787584+gyx09212214-prog@users.noreply.github.com>