From 05b154a7e87ba6c619ef08578a78d9f78079cefb Mon Sep 17 00:00:00 2001 From: Mirko Westermeier Date: Mon, 20 Apr 2026 17:35:49 +0200 Subject: [PATCH 1/9] Add image type instead of file name only (optional dimensions) --- pygexml/image.py | 9 +++++++ pygexml/page.py | 31 ++++++++++++++++++++---- pygexml/strategies.py | 12 ++++++++-- test/test_image.py | 29 +++++++++++++++++++++++ test/test_page.py | 55 ++++++++++++++++++++++++++++++++++++------- 5 files changed, 121 insertions(+), 15 deletions(-) create mode 100644 pygexml/image.py create mode 100644 test/test_image.py diff --git a/pygexml/image.py b/pygexml/image.py new file mode 100644 index 0000000..8fcd7e4 --- /dev/null +++ b/pygexml/image.py @@ -0,0 +1,9 @@ +from dataclasses import dataclass +from dataclasses_json import DataClassJsonMixin + + +@dataclass +class Image(DataClassJsonMixin): + filename: str + width: int | None + height: int | None diff --git a/pygexml/page.py b/pygexml/page.py index 116b1ce..d14aaf4 100644 --- a/pygexml/page.py +++ b/pygexml/page.py @@ -9,6 +9,7 @@ from lxml.etree import _Element as Element, QName from .geometry import Point, Box, Polygon, GeometryError +from .image import Image def find_child(element: Element, name: str) -> Element | None: @@ -225,7 +226,7 @@ def all_words(self) -> Iterable[str]: @dataclass class Page(DataClassJsonMixin): - image_filename: str + image: Image regions: dict[ID, TextRegion] @classmethod @@ -234,12 +235,20 @@ def from_xml(cls, element: Element) -> "Page": raise PageXMLError("Wrong element given") if "imageFilename" not in element.attrib: - raise PageXMLError("No filename found") + raise PageXMLError("No image filename found") + if "imageWidth" not in element.attrib: + raise PageXMLError("No image width found") + if "imageHeight" not in element.attrib: + raise PageXMLError("No image height found") regions = find_children(element, "TextRegion") return Page( - image_filename=str(element.attrib["imageFilename"]), + image=Image( + filename=str(element.attrib["imageFilename"]), + width=int(element.attrib["imageWidth"]), + height=int(element.attrib["imageHeight"]), + ), regions={ tr.id: tr for tr in (TextRegion.from_xml(region) for region in regions) }, @@ -289,8 +298,22 @@ def from_alto(cls, element: Element) -> "Page": text_blocks = find_children(printspace_element, "TextBlock") + # ALTO allows for float values, but we convert to int for consistency with PAGE XML + image_width = ( + int(float(page_element.attrib["WIDTH"])) + if "WIDTH" in page_element.attrib + else None + ) + image_height = ( + int(float(page_element.attrib["HEIGHT"])) + if "HEIGHT" in page_element.attrib + else None + ) + return Page( - image_filename=image_filename, + image=Image( + filename=image_filename, width=image_width, height=image_height + ), regions={ tb.id: tb for tb in (TextRegion.from_alto(tb) for tb in text_blocks) }, diff --git a/pygexml/strategies.py b/pygexml/strategies.py index d2964ce..df62d58 100644 --- a/pygexml/strategies.py +++ b/pygexml/strategies.py @@ -6,6 +6,7 @@ import hypothesis.strategies as st from pygexml.geometry import Point, Box, Polygon +from pygexml.image import Image from pygexml.page import Coords, Page, TextLine, TextRegion st_points = st.builds(Point, x=st.integers(min_value=0), y=st.integers(min_value=0)) @@ -60,10 +61,17 @@ def st_simple_text(**kwargs): ), ) +st_images = st.builds( + Image, + filename=st_simple_text(), + width=st.one_of(st.none(), st.integers(min_value=1)), + height=st.one_of(st.none(), st.integers(min_value=1)), +) + @st.composite def st_pages(draw): - image_filename = draw(st_simple_text()) + image = draw(st_images) regions = {tr.id: tr for tr in draw(st.lists(st_text_regions))} - page = Page(image_filename=image_filename, regions=regions) + page = Page(image=image, regions=regions) return page diff --git a/test/test_image.py b/test/test_image.py new file mode 100644 index 0000000..a2a0c9b --- /dev/null +++ b/test/test_image.py @@ -0,0 +1,29 @@ +from hypothesis import given +import hypothesis.strategies as st + +from pygexml.strategies import st_images +from pygexml.image import Image + + +def test_image_example() -> None: + image = Image(filename="a.jpg", width=800, height=600) + assert image.filename == "a.jpg" + assert image.width == 800 + assert image.height == 600 + + +@given( + st.text(), + st.one_of(st.none(), st.integers(min_value=1)), + st.one_of(st.none(), st.integers(min_value=1)), +) +def test_image_arbitrary(filename: str, width: int, height: int) -> None: + image = Image(filename=filename, width=width, height=height) + assert image.filename == filename + assert image.width == width + assert image.height == height + + +@given(st_images) +def test_image_serialization_roundtrip_arbitrary(image: Image) -> None: + assert Image.from_dict(image.to_dict()) == image diff --git a/test/test_page.py b/test/test_page.py index 23745e0..6beeb04 100644 --- a/test/test_page.py +++ b/test/test_page.py @@ -8,6 +8,7 @@ from pygexml.strategies import * from pygexml.geometry import Point, Box, Polygon +from pygexml.image import Image from pygexml.page import Coords, ID, TextLine, TextRegion, Page ############## Tests for Coords #################### @@ -410,7 +411,7 @@ def test_page_from_element_example() -> None: """)) - assert pa.image_filename == "7895328.jpg" + assert pa.image == Image(filename="7895328.jpg", width=4279, height=5315) assert pa.regions == { "tr-1": TextRegion( id="tr-1", @@ -454,7 +455,19 @@ def test_page_wrong_element() -> None: def test_page_no_filename() -> None: xml = "" - with pytest.raises(Exception, match="No filename found"): + with pytest.raises(Exception, match="No image filename found"): + Page.from_xml(etree.fromstring(xml)) + + +def test_page_no_image_width() -> None: + xml = """""" + with pytest.raises(Exception, match="No image width found"): + Page.from_xml(etree.fromstring(xml)) + + +def test_page_no_image_height() -> None: + xml = """""" + with pytest.raises(Exception, match="No image height found"): Page.from_xml(etree.fromstring(xml)) @@ -479,7 +492,7 @@ def test_page_from_string() -> None: """) # use default PageXML namespace - assert pa.image_filename == "a.jpg" + assert pa.image == Image(filename="a.jpg", width=4217, height=1742) assert pa.regions == { "b": TextRegion( id="b", @@ -515,7 +528,7 @@ def test_from_xml_file_example(tmp_path: Path) -> None: xml_filepath.write_text(content, encoding="utf-8") result = Page.from_xml_file(xml_filepath) - assert result.image_filename == "a.jpg" + assert result.image == Image(filename="a.jpg", width=4217, height=1742) assert result.regions == { "b": TextRegion( id="b", @@ -562,7 +575,7 @@ def test_page_from_alto_example() -> None: """)) - assert pa.image_filename == "a.jpg" + assert pa.image == Image(filename="a.jpg", width=None, height=None) assert pa.regions == { "tr-1": TextRegion( id="tr-1", @@ -589,6 +602,30 @@ def test_page_from_alto_example() -> None: } +def test_page_alto_with_dimensions() -> None: + pa = Page.from_alto(etree.fromstring(""" + + + + a.jpg + + + + + + + + + + + + + + + """)) + assert pa.image == Image(filename="a.jpg", width=800, height=600) + + def test_page_alto_wrong_element() -> None: with pytest.raises(Exception, match="Wrong element given"): Page.from_alto(etree.fromstring("!!!")) @@ -687,7 +724,7 @@ def test_page_alto_from_string() -> None: """ page = Page.from_alto_string(alto_string) - assert page.image_filename == "a.jpg" + assert page.image == Image(filename="a.jpg", width=None, height=None) assert page.regions == { "tr-1": TextRegion( id="tr-1", @@ -730,7 +767,7 @@ def test_page_alto_from_file_example(tmp_path: Path) -> None: filepath.write_text(alto_string, encoding="utf-8") result = Page.from_alto_file(filepath) - assert result.image_filename == "a.jpg" + assert result.image == Image(filename="a.jpg", width=None, height=None) assert result.regions == { "tr-1": TextRegion( id="tr-1", @@ -769,7 +806,7 @@ def test_page_region_lookup_not_found(id: str, page: Page) -> None: def test_page_all_text_and_words() -> None: pa = Page( - image_filename="a", + image=Image(filename="a", width=None, height=None), regions={ "a": TextRegion( id="a", @@ -807,7 +844,7 @@ def test_page_all_arbitrary_text_and_words(page: Page) -> None: def test_page_serialization_roundtrip() -> None: pa = Page( - image_filename="a.jpg", + image=Image(filename="a.jpg", width=1920, height=1080), regions={ "tr-1": TextRegion( id="tr-1", From 1252c7b1d8d350304e411f08b3430ce0886c3801 Mon Sep 17 00:00:00 2001 From: Mirko Westermeier Date: Tue, 28 Apr 2026 13:52:30 +0200 Subject: [PATCH 2/9] Add simple SVG functions to display outlines --- pygexml/strategies.py | 15 +++++ pygexml/svg.py | 85 +++++++++++++++++++++++++++ test/test_svg.py | 134 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 234 insertions(+) create mode 100644 pygexml/svg.py create mode 100644 test/test_svg.py diff --git a/pygexml/strategies.py b/pygexml/strategies.py index df62d58..851b8ee 100644 --- a/pygexml/strategies.py +++ b/pygexml/strategies.py @@ -68,6 +68,13 @@ def st_simple_text(**kwargs): height=st.one_of(st.none(), st.integers(min_value=1)), ) +st_images_with_dimensions = st.builds( + Image, + filename=st_simple_text(), + width=st.integers(min_value=1), + height=st.integers(min_value=1), +) + @st.composite def st_pages(draw): @@ -75,3 +82,11 @@ def st_pages(draw): regions = {tr.id: tr for tr in draw(st.lists(st_text_regions))} page = Page(image=image, regions=regions) return page + + +@st.composite +def st_pages_with_dimensions(draw): + image = draw(st_images_with_dimensions) + regions = {tr.id: tr for tr in draw(st.lists(st_text_regions))} + page = Page(image=image, regions=regions) + return page diff --git a/pygexml/svg.py b/pygexml/svg.py new file mode 100644 index 0000000..9fecf8a --- /dev/null +++ b/pygexml/svg.py @@ -0,0 +1,85 @@ +from lxml import etree +from lxml.etree import _Element as Element + +from .page import Page, TextRegion, TextLine + +SVG_NS = "http://www.w3.org/2000/svg" + + +class SVGError(Exception): + pass + + +def _coords_path(coords_str: str) -> str: + return f"M {coords_str}" + + +def _line_to_svg(line: TextLine) -> Element: + g = etree.Element(f"{{{SVG_NS}}}g", attrib={"id": line.id, "class": "TextLine"}) + etree.SubElement( + g, + f"{{{SVG_NS}}}path", + attrib={ + "d": _coords_path(str(line.coords)), + "class": "Coords", + }, + ) + return g + + +def _region_to_svg(region: TextRegion) -> Element: + g = etree.Element(f"{{{SVG_NS}}}g", attrib={"id": region.id, "class": "TextRegion"}) + etree.SubElement( + g, + f"{{{SVG_NS}}}path", + attrib={ + "d": _coords_path(str(region.coords)), + "class": "Coords", + }, + ) + for line in region.textlines.values(): + g.append(_line_to_svg(line)) + return g + + +def page_to_svg(page: Page) -> Element: + if page.image.width is None: + raise SVGError("Image width is required for SVG generation") + if page.image.height is None: + raise SVGError("Image height is required for SVG generation") + + width = page.image.width + height = page.image.height + + svg = etree.Element( + f"{{{SVG_NS}}}svg", + # the official way to do it although stubs are wrong: + nsmap={None: SVG_NS}, # type: ignore + attrib={ + "width": str(width), + "height": str(height), + "viewBox": f"0 0 {width} {height}", + }, + ) + + etree.SubElement( + svg, + f"{{{SVG_NS}}}image", + attrib={ + "x": "0", + "y": "0", + "width": str(width), + "height": str(height), + "href": page.image.filename, + "preserveAspectRatio": "none", + }, + ) + + for region in page.regions.values(): + svg.append(_region_to_svg(region)) + + return svg + + +def page_to_svg_string(page: Page) -> str: + return etree.tostring(page_to_svg(page), encoding="unicode", pretty_print=True) diff --git a/test/test_svg.py b/test/test_svg.py new file mode 100644 index 0000000..ca84f37 --- /dev/null +++ b/test/test_svg.py @@ -0,0 +1,134 @@ +import pytest +from hypothesis import given +from lxml import etree +from lxml.etree import _Element as Element + +from pygexml.strategies import st_pages_with_dimensions +from pygexml.image import Image +from pygexml.page import Coords, TextLine, TextRegion, Page +from pygexml.svg import SVGError, page_to_svg, page_to_svg_string + +SVG_NS = "http://www.w3.org/2000/svg" + + +def make_page(**kwargs) -> Page: + defaults: dict = dict( + image=Image(filename="a.jpg", width=800, height=600), + regions={}, + ) + return Page(**(defaults | kwargs)) + + +############## Tests for page_to_svg #################### + + +def test_page_to_svg_raises_without_image_width() -> None: + page = make_page(image=Image(filename="a.jpg", width=None, height=600)) + with pytest.raises(SVGError, match="width"): + page_to_svg(page) + + +def test_page_to_svg_raises_without_image_height() -> None: + page = make_page(image=Image(filename="a.jpg", width=800, height=None)) + with pytest.raises(SVGError, match="height"): + page_to_svg(page) + + +def test_page_to_svg_returns_svg_element() -> None: + svg = page_to_svg(make_page()) + assert isinstance(svg, Element) + assert svg.tag == f"{{{SVG_NS}}}svg" + + +def test_page_to_svg_dimensions() -> None: + svg = page_to_svg(make_page(image=Image(filename="a.jpg", width=800, height=600))) + assert svg.attrib["width"] == "800" + assert svg.attrib["height"] == "600" + assert svg.attrib["viewBox"] == "0 0 800 600" + + +def test_page_to_svg_image_element() -> None: + svg = page_to_svg(make_page(image=Image(filename="a.jpg", width=800, height=600))) + images = svg.findall(f"{{{SVG_NS}}}image") + assert len(images) == 1 + img = images[0] + assert img.attrib["href"] == "a.jpg" + assert img.attrib["width"] == "800" + assert img.attrib["height"] == "600" + + +def test_page_to_svg_text_regions() -> None: + page = make_page( + regions={ + "r1": TextRegion( + id="r1", + coords=Coords.parse("0,0 10,0 10,10 0,10"), + textlines={ + "l1": TextLine( + id="l1", coords=Coords.parse("1,1 9,1 9,9 1,9"), text="foo" + ), + }, + ), + } + ) + svg = page_to_svg(page) + groups = svg.findall(f"{{{SVG_NS}}}g") + assert len(groups) == 1 + region_g = groups[0] + assert region_g.attrib["id"] == "r1" + assert region_g.attrib["class"] == "TextRegion" + line_groups = region_g.findall(f"{{{SVG_NS}}}g") + assert len(line_groups) == 1 + assert line_groups[0].attrib["id"] == "l1" + assert line_groups[0].attrib["class"] == "TextLine" + + +def test_page_to_svg_coords_path() -> None: + page = make_page( + regions={ + "r1": TextRegion( + id="r1", + coords=Coords.parse("0,0 10,0 10,10 0,10"), + textlines={}, + ), + } + ) + svg = page_to_svg(page) + region_g = svg.find(f"{{{SVG_NS}}}g") + assert region_g is not None + path = region_g.find(f"{{{SVG_NS}}}path") + assert path is not None + assert path.attrib["d"] == "M 0,0 10,0 10,10 0,10" + assert path.attrib["class"] == "Coords" + + +############## Tests for page_to_svg_string #################### + + +def test_page_to_svg_string_example() -> None: + result = page_to_svg_string( + make_page(image=Image(filename="a.jpg", width=800, height=600)) + ) + assert isinstance(result, str) + assert 'xmlns="http://www.w3.org/2000/svg"' in result + assert 'href="a.jpg"' in result + assert 'viewBox="0 0 800 600"' in result + + +def test_page_to_svg_string_is_valid_xml() -> None: + result = page_to_svg_string(make_page()) + root = etree.fromstring(result.encode("utf-8")) + assert root.tag == f"{{{SVG_NS}}}svg" + + +def test_page_to_svg_string_raises_without_dimensions() -> None: + page = make_page(image=Image(filename="a.jpg", width=None, height=None)) + with pytest.raises(SVGError): + page_to_svg_string(page) + + +@given(st_pages_with_dimensions()) +def test_page_to_svg_string_arbitrary_with_dimensions(page: Page) -> None: + result = page_to_svg_string(page) + root = etree.fromstring(result.encode("utf-8")) + assert root.tag == f"{{{SVG_NS}}}svg" From 28045138a9de9f50185de524ecf911e0ce4e3820 Mon Sep 17 00:00:00 2001 From: Mirko Westermeier Date: Tue, 28 Apr 2026 14:02:12 +0200 Subject: [PATCH 3/9] Add simple text rendering (we have no baseline) --- pygexml/svg.py | 26 +++++++++++++++++++ test/test_svg.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 91 insertions(+), 2 deletions(-) diff --git a/pygexml/svg.py b/pygexml/svg.py index 9fecf8a..2239841 100644 --- a/pygexml/svg.py +++ b/pygexml/svg.py @@ -14,6 +14,12 @@ def _coords_path(coords_str: str) -> str: return f"M {coords_str}" +def _baseline_path_d(line: TextLine) -> str: + box = line.coords.polygon.bounding_box() + y_mid = (box.top_left.y + box.bottom_right.y) // 2 + return f"M {box.top_left.x},{y_mid} {box.bottom_right.x},{y_mid}" + + def _line_to_svg(line: TextLine) -> Element: g = etree.Element(f"{{{SVG_NS}}}g", attrib={"id": line.id, "class": "TextLine"}) etree.SubElement( @@ -24,6 +30,26 @@ def _line_to_svg(line: TextLine) -> Element: "class": "Coords", }, ) + etree.SubElement( + g, + f"{{{SVG_NS}}}path", + attrib={ + "id": f"bl-{line.id}", + "d": _baseline_path_d(line), + "class": "Baseline", + }, + ) + if line.text: + text = etree.SubElement(g, f"{{{SVG_NS}}}text") + text_path = etree.SubElement( + text, + f"{{{SVG_NS}}}textPath", + attrib={"href": f"#bl-{line.id}", "textLength": "100%"}, + ) + tspan = etree.SubElement( + text_path, f"{{{SVG_NS}}}tspan", attrib={"class": "Text"} + ) + tspan.text = line.text return g diff --git a/test/test_svg.py b/test/test_svg.py index ca84f37..32b1c29 100644 --- a/test/test_svg.py +++ b/test/test_svg.py @@ -1,5 +1,6 @@ import pytest from hypothesis import given +from typing import Any from lxml import etree from lxml.etree import _Element as Element @@ -11,8 +12,8 @@ SVG_NS = "http://www.w3.org/2000/svg" -def make_page(**kwargs) -> Page: - defaults: dict = dict( +def make_page(**kwargs: Any) -> Page: + defaults: dict[str, Any] = dict( image=Image(filename="a.jpg", width=800, height=600), regions={}, ) @@ -132,3 +133,65 @@ def test_page_to_svg_string_arbitrary_with_dimensions(page: Page) -> None: result = page_to_svg_string(page) root = etree.fromstring(result.encode("utf-8")) assert root.tag == f"{{{SVG_NS}}}svg" + + +############## Tests for text rendering #################### + + +def make_page_with_line(text: str = "foo") -> Page: + return make_page( + regions={ + "r1": TextRegion( + id="r1", + coords=Coords.parse("0,0 10,0 10,10 0,10"), + textlines={ + "l1": TextLine( + id="l1", coords=Coords.parse("1,1 9,1 9,9 1,9"), text=text + ), + }, + ), + } + ) + + +def get_line_g(page: Page) -> Element: + svg = page_to_svg(page) + region_g = svg.find(f"{{{SVG_NS}}}g") + assert region_g is not None + line_g = region_g.find(f"{{{SVG_NS}}}g") + assert line_g is not None + return line_g + + +def test_page_to_svg_line_has_baseline_path() -> None: + line_g = get_line_g(make_page_with_line()) + paths = line_g.findall(f"{{{SVG_NS}}}path") + assert len(paths) == 2 + baseline = next(p for p in paths if p.attrib.get("class") == "Baseline") + assert baseline.attrib["id"] == "bl-l1" + + +def test_page_to_svg_line_baseline_from_bounding_box() -> None: + # coords "1,1 9,1 9,9 1,9": x=[1,9], y=[1,9], y_mid=(1+9)//2=5 + line_g = get_line_g(make_page_with_line()) + paths = line_g.findall(f"{{{SVG_NS}}}path") + baseline = next(p for p in paths if p.attrib.get("class") == "Baseline") + assert baseline.attrib["d"] == "M 1,5 9,5" + + +def test_page_to_svg_line_text_content() -> None: + line_g = get_line_g(make_page_with_line("Hallo Welt")) + text = line_g.find(f"{{{SVG_NS}}}text") + assert text is not None + text_path = text.find(f"{{{SVG_NS}}}textPath") + assert text_path is not None + assert text_path.attrib["href"] == "#bl-l1" + tspan = text_path.find(f"{{{SVG_NS}}}tspan") + assert tspan is not None + assert tspan.text == "Hallo Welt" + assert tspan.attrib["class"] == "Text" + + +def test_page_to_svg_line_no_text_element_when_empty() -> None: + line_g = get_line_g(make_page_with_line("")) + assert line_g.find(f"{{{SVG_NS}}}text") is None From 4c0467aba3684301d31e67547dd48eae0b987d09 Mon Sep 17 00:00:00 2001 From: Mirko Westermeier Date: Tue, 28 Apr 2026 14:29:54 +0200 Subject: [PATCH 4/9] Correctly handle optional image dimensions in PAGE-XML --- pygexml/page.py | 16 ++++++++++------ test/test_page.py | 8 ++++---- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/pygexml/page.py b/pygexml/page.py index d14aaf4..ff05343 100644 --- a/pygexml/page.py +++ b/pygexml/page.py @@ -236,18 +236,22 @@ def from_xml(cls, element: Element) -> "Page": if "imageFilename" not in element.attrib: raise PageXMLError("No image filename found") - if "imageWidth" not in element.attrib: - raise PageXMLError("No image width found") - if "imageHeight" not in element.attrib: - raise PageXMLError("No image height found") regions = find_children(element, "TextRegion") return Page( image=Image( filename=str(element.attrib["imageFilename"]), - width=int(element.attrib["imageWidth"]), - height=int(element.attrib["imageHeight"]), + width=( + int(element.attrib["imageWidth"]) + if "imageWidth" in element.attrib + else None + ), + height=( + int(element.attrib["imageHeight"]) + if "imageHeight" in element.attrib + else None + ), ), regions={ tr.id: tr for tr in (TextRegion.from_xml(region) for region in regions) diff --git a/test/test_page.py b/test/test_page.py index 6beeb04..1c6df8e 100644 --- a/test/test_page.py +++ b/test/test_page.py @@ -461,14 +461,14 @@ def test_page_no_filename() -> None: def test_page_no_image_width() -> None: xml = """""" - with pytest.raises(Exception, match="No image width found"): - Page.from_xml(etree.fromstring(xml)) + pa = Page.from_xml(etree.fromstring(xml)) + assert pa.image == Image(filename="a.jpg", width=None, height=600) def test_page_no_image_height() -> None: xml = """""" - with pytest.raises(Exception, match="No image height found"): - Page.from_xml(etree.fromstring(xml)) + pa = Page.from_xml(etree.fromstring(xml)) + assert pa.image == Image(filename="a.jpg", width=800, height=None) def test_page_from_string() -> None: From 374942c84814eb144c246c1b049ce5d9898d8644 Mon Sep 17 00:00:00 2001 From: Mirko Westermeier Date: Tue, 28 Apr 2026 14:30:25 +0200 Subject: [PATCH 5/9] Correctly handle xlink:href attributes --- pygexml/svg.py | 7 ++++--- test/test_svg.py | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pygexml/svg.py b/pygexml/svg.py index 2239841..0ee5a91 100644 --- a/pygexml/svg.py +++ b/pygexml/svg.py @@ -4,6 +4,7 @@ from .page import Page, TextRegion, TextLine SVG_NS = "http://www.w3.org/2000/svg" +XLINK_NS = "http://www.w3.org/1999/xlink" class SVGError(Exception): @@ -44,7 +45,7 @@ def _line_to_svg(line: TextLine) -> Element: text_path = etree.SubElement( text, f"{{{SVG_NS}}}textPath", - attrib={"href": f"#bl-{line.id}", "textLength": "100%"}, + attrib={f"{{{XLINK_NS}}}href": f"#bl-{line.id}", "textLength": "100%"}, ) tspan = etree.SubElement( text_path, f"{{{SVG_NS}}}tspan", attrib={"class": "Text"} @@ -80,7 +81,7 @@ def page_to_svg(page: Page) -> Element: svg = etree.Element( f"{{{SVG_NS}}}svg", # the official way to do it although stubs are wrong: - nsmap={None: SVG_NS}, # type: ignore + nsmap={None: SVG_NS, "xlink": XLINK_NS}, # type: ignore attrib={ "width": str(width), "height": str(height), @@ -96,7 +97,7 @@ def page_to_svg(page: Page) -> Element: "y": "0", "width": str(width), "height": str(height), - "href": page.image.filename, + f"{{{XLINK_NS}}}href": page.image.filename, "preserveAspectRatio": "none", }, ) diff --git a/test/test_svg.py b/test/test_svg.py index 32b1c29..0cba72e 100644 --- a/test/test_svg.py +++ b/test/test_svg.py @@ -10,6 +10,7 @@ from pygexml.svg import SVGError, page_to_svg, page_to_svg_string SVG_NS = "http://www.w3.org/2000/svg" +XLINK_NS = "http://www.w3.org/1999/xlink" def make_page(**kwargs: Any) -> Page: @@ -53,7 +54,7 @@ def test_page_to_svg_image_element() -> None: images = svg.findall(f"{{{SVG_NS}}}image") assert len(images) == 1 img = images[0] - assert img.attrib["href"] == "a.jpg" + assert img.attrib[f"{{{XLINK_NS}}}href"] == "a.jpg" assert img.attrib["width"] == "800" assert img.attrib["height"] == "600" @@ -112,7 +113,7 @@ def test_page_to_svg_string_example() -> None: ) assert isinstance(result, str) assert 'xmlns="http://www.w3.org/2000/svg"' in result - assert 'href="a.jpg"' in result + assert 'xlink:href="a.jpg"' in result assert 'viewBox="0 0 800 600"' in result @@ -185,7 +186,7 @@ def test_page_to_svg_line_text_content() -> None: assert text is not None text_path = text.find(f"{{{SVG_NS}}}textPath") assert text_path is not None - assert text_path.attrib["href"] == "#bl-l1" + assert text_path.attrib[f"{{{XLINK_NS}}}href"] == "#bl-l1" tspan = text_path.find(f"{{{SVG_NS}}}tspan") assert tspan is not None assert tspan.text == "Hallo Welt" From 96a4e6750d010774775780e4be577b7e7366689c Mon Sep 17 00:00:00 2001 From: Mirko Westermeier Date: Tue, 28 Apr 2026 14:30:40 +0200 Subject: [PATCH 6/9] Minor: fix import ordering --- test/test_svg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_svg.py b/test/test_svg.py index 0cba72e..3c8a3ee 100644 --- a/test/test_svg.py +++ b/test/test_svg.py @@ -1,6 +1,7 @@ +from typing import Any + import pytest from hypothesis import given -from typing import Any from lxml import etree from lxml.etree import _Element as Element From ce373d2243d76106a567e9f11211eeec20e43373 Mon Sep 17 00:00:00 2001 From: Mirko Westermeier Date: Tue, 28 Apr 2026 15:05:45 +0200 Subject: [PATCH 7/9] Fix approximated baseline --- pygexml/svg.py | 4 ++-- test/test_svg.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pygexml/svg.py b/pygexml/svg.py index 0ee5a91..e570a39 100644 --- a/pygexml/svg.py +++ b/pygexml/svg.py @@ -17,8 +17,8 @@ def _coords_path(coords_str: str) -> str: def _baseline_path_d(line: TextLine) -> str: box = line.coords.polygon.bounding_box() - y_mid = (box.top_left.y + box.bottom_right.y) // 2 - return f"M {box.top_left.x},{y_mid} {box.bottom_right.x},{y_mid}" + y_baseline = box.top_left.y + (box.bottom_right.y - box.top_left.y) * 2 // 3 + return f"M {box.top_left.x},{y_baseline} {box.bottom_right.x},{y_baseline}" def _line_to_svg(line: TextLine) -> Element: diff --git a/test/test_svg.py b/test/test_svg.py index 3c8a3ee..5b43ada 100644 --- a/test/test_svg.py +++ b/test/test_svg.py @@ -174,11 +174,11 @@ def test_page_to_svg_line_has_baseline_path() -> None: def test_page_to_svg_line_baseline_from_bounding_box() -> None: - # coords "1,1 9,1 9,9 1,9": x=[1,9], y=[1,9], y_mid=(1+9)//2=5 + # coords "1,1 9,1 9,9 1,9": x=[1,9], y=[1,9], height=8, y_baseline=1+8*2//3=6 line_g = get_line_g(make_page_with_line()) paths = line_g.findall(f"{{{SVG_NS}}}path") baseline = next(p for p in paths if p.attrib.get("class") == "Baseline") - assert baseline.attrib["d"] == "M 1,5 9,5" + assert baseline.attrib["d"] == "M 1,6 9,6" def test_page_to_svg_line_text_content() -> None: From 753a8b90c3857ac3bb28d7b13266aa8694086963 Mon Sep 17 00:00:00 2001 From: Mirko Westermeier Date: Tue, 28 Apr 2026 15:09:43 +0200 Subject: [PATCH 8/9] Add default SVG styling --- pygexml/svg.py | 27 ++++++++++++++++++++++++--- test/test_svg.py | 22 ++++++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/pygexml/svg.py b/pygexml/svg.py index e570a39..0385458 100644 --- a/pygexml/svg.py +++ b/pygexml/svg.py @@ -69,7 +69,21 @@ def _region_to_svg(region: TextRegion) -> Element: return g -def page_to_svg(page: Page) -> Element: +def _default_style(width: int, height: int) -> Element: + font_size = max(width, height) // 60 + style = etree.Element(f"{{{SVG_NS}}}style") + style.text = ( + f"\n" + f" path.Coords {{ fill: rgba(100,160,255,0.12); stroke: steelblue; stroke-width: {max(width, height) // 1500}; }}\n" + f" path.Baseline {{ stroke: #e74c3c; stroke-width: {max(width, height) // 2000}; fill: none; }}\n" + f" .TextLine text {{ font-size: {font_size}px; font-family: serif; fill: #000; opacity: 0; transition: opacity 0.15s; }}\n" + f" .TextLine:hover text {{ opacity: 1; }}\n" + f" " + ) + return style + + +def page_to_svg(page: Page, include_style: bool = True) -> Element: if page.image.width is None: raise SVGError("Image width is required for SVG generation") if page.image.height is None: @@ -102,11 +116,18 @@ def page_to_svg(page: Page) -> Element: }, ) + if include_style: + svg.insert(0, _default_style(width, height)) + for region in page.regions.values(): svg.append(_region_to_svg(region)) return svg -def page_to_svg_string(page: Page) -> str: - return etree.tostring(page_to_svg(page), encoding="unicode", pretty_print=True) +def page_to_svg_string(page: Page, include_style: bool = True) -> str: + return etree.tostring( + page_to_svg(page, include_style=include_style), + encoding="unicode", + pretty_print=True, + ) diff --git a/test/test_svg.py b/test/test_svg.py index 5b43ada..5b7cdd7 100644 --- a/test/test_svg.py +++ b/test/test_svg.py @@ -137,6 +137,28 @@ def test_page_to_svg_string_arbitrary_with_dimensions(page: Page) -> None: assert root.tag == f"{{{SVG_NS}}}svg" +def test_page_to_svg_includes_style_by_default() -> None: + svg = page_to_svg(make_page()) + assert svg.find(f"{{{SVG_NS}}}style") is not None + + +def test_page_to_svg_style_contains_hover_rule() -> None: + svg = page_to_svg(make_page()) + style = svg.find(f"{{{SVG_NS}}}style") + assert style is not None + assert ".TextLine:hover" in (style.text or "") + + +def test_page_to_svg_no_style_when_disabled() -> None: + svg = page_to_svg(make_page(), include_style=False) + assert svg.find(f"{{{SVG_NS}}}style") is None + + +def test_page_to_svg_string_no_style_when_disabled() -> None: + result = page_to_svg_string(make_page(), include_style=False) + assert " Date: Tue, 28 Apr 2026 15:10:00 +0200 Subject: [PATCH 9/9] Close SVG polygons --- pygexml/svg.py | 2 +- test/test_svg.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pygexml/svg.py b/pygexml/svg.py index 0385458..2467d50 100644 --- a/pygexml/svg.py +++ b/pygexml/svg.py @@ -12,7 +12,7 @@ class SVGError(Exception): def _coords_path(coords_str: str) -> str: - return f"M {coords_str}" + return f"M {coords_str} Z" def _baseline_path_d(line: TextLine) -> str: diff --git a/test/test_svg.py b/test/test_svg.py index 5b7cdd7..6510e04 100644 --- a/test/test_svg.py +++ b/test/test_svg.py @@ -101,7 +101,7 @@ def test_page_to_svg_coords_path() -> None: assert region_g is not None path = region_g.find(f"{{{SVG_NS}}}path") assert path is not None - assert path.attrib["d"] == "M 0,0 10,0 10,10 0,10" + assert path.attrib["d"] == "M 0,0 10,0 10,10 0,10 Z" assert path.attrib["class"] == "Coords"