Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion pdf2docx/image/ImagesExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,31 @@ def _to_raw_dict(image: fitz.Pixmap, bbox: fitz.Rect):
Returns:
dict: Raw dict of the pixmap.
"""
if image.colorspace.n > 3: # must convert: we only support PNG
"""
if image.colorspace.n > 3: # must convert: we only support PNG image with alpha channel
image = fitz.Pixmap(fitz.csRGB, image)
"""
# added by maksym
# Get the colorspace object of the pixmap.
# NOTE: colorspace.n (number of channels) is NOT sufficient to decide
# whether MuPDF can write this pixmap as PNG.
cs = image.colorspace
# MuPDF can write PNG only if the colorspace is exactly:
# - DeviceGray (fitz.csGRAY)
# - DeviceRGB (fitz.csRGB)
#
# Even if cs.n == 1 or cs.n == 3, the pixmap may still be:
# - ICCBased grayscale/RGB
# - Separation / spot color
# - Indexed colorspace
# - Alpha-only pixmap
#
# All of these will trigger:
# "pixmap must be grayscale or rgb to write as png"
#
# Therefore we must convert unless the colorspace object itself
# is exactly csGRAY or csRGB.
if cs is None or cs not in (fitz.csGRAY, fitz.csRGB):
image = fitz.Pixmap(fitz.csRGB, image)
return {
"type": BlockType.IMAGE.value,
Expand Down
Binary file added test/samples/demo-issue-340.pdf
Binary file not shown.
19 changes: 19 additions & 0 deletions test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,25 @@ def test_rotated_images(self):
docx_file = os.path.join(output_path, f'{filename}.docx')
assert os.path.isfile(docx_file), f'Expected output file: {docx_file}'
assert os.path.getsize(docx_file) > 0, 'Output docx should not be empty'

# ------------------------------------------
# non-grayscale/non-RGB images (issue 340)
# ------------------------------------------
def test_non_grayscale_non_rgb_images(self):
    '''Test converting PDFs with rotated images and non-RGB colorspace (CMYK, alpha).

    Covers: per-image rotation, pixmap tobytes() for non-grayscale/non-RGB.

    Every sample found on disk is converted, and the resulting docx must
    both exist and be non-empty.
    '''
    filenames = [
        'demo-issue-340',
    ]
    for filename in filenames:
        pdf_file = os.path.join(sample_path, f'{filename}.pdf')
        # NOTE(review): a missing sample is skipped silently, so this test
        # passes without converting anything when the PDF is absent --
        # confirm this is intended rather than failing/skipping explicitly.
        if not os.path.isfile(pdf_file):
            continue
        docx_file = os.path.join(output_path, f'{filename}.docx')
        parse(pdf_file, docx_file, start=0, end=None)
        assert os.path.isfile(docx_file), f'Expected output {docx_file}'
        # An existing but zero-byte file would mean the conversion wrote
        # nothing useful, so check the size as well as the existence.
        assert os.path.getsize(docx_file) > 0, 'Output docx should not be empty'


# We make a separate pytest test for each sample file.
Expand Down