2 changes: 2 additions & 0 deletions .gitignore
@@ -152,3 +152,5 @@ Desktop.ini

fusion_result.json
kernel_meta/

hf_model
61 changes: 54 additions & 7 deletions demo_page.py
@@ -6,6 +6,7 @@
import argparse
import glob
import os
import sys

import cv2
import torch
@@ -111,7 +112,7 @@ def chat(self, prompt, image):
return results


def process_document(document_path, model, save_dir, max_batch_size=None):
def process_document(document_path, model, save_dir, max_batch_size=None, processed_images_dir=None):
"""Parse documents with two stages - Handles both images and PDFs"""
file_ext = os.path.splitext(document_path)[1].lower()

@@ -133,7 +134,8 @@ def process_document(document_path, model, save_dir, max_batch_size=None):

# Process this page (don't save individual page results)
json_path, recognition_results = process_single_image(
pil_image, model, save_dir, page_name, max_batch_size, save_individual=False
pil_image, model, save_dir, page_name, max_batch_size, save_individual=False,
processed_images_dir=processed_images_dir
)

# Add page information to results
@@ -155,7 +157,7 @@ def process_document(document_path, model, save_dir, max_batch_size=None):
return process_single_image(pil_image, model, save_dir, base_name, max_batch_size, processed_images_dir=processed_images_dir)


def process_single_image(image, model, save_dir, image_name, max_batch_size=None, save_individual=True):
def process_single_image(image, model, save_dir, image_name, max_batch_size=None, save_individual=True, processed_images_dir=None):
"""Process a single image (either from file or converted from PDF page)

Args:
@@ -173,7 +175,25 @@ def process_single_image(image, model, save_dir, image_name, max_batch_size=None
layout_output = model.chat("Parse the reading order of this document.", image)

# Stage 2: Element-level content parsing
padded_image, dims = prepare_image(image)

# Extract PDF name and page number for organized image saving
pdf_name = None
page_number = None

# Check if this is a PDF page (format: "pdfname_page_001")
if "_page_" in image_name:
parts = image_name.split("_page_")
if len(parts) == 2:
pdf_name = parts[0]
try:
page_number = int(parts[1])
except ValueError:
page_number = None
else:
# For single images, use the image name as pdf_name
pdf_name = image_name

padded_image, dims = prepare_image(image, pdf_name=pdf_name, page_number=page_number, processed_images_dir=processed_images_dir)
recognition_results = process_elements(layout_output, padded_image, dims, model, max_batch_size, save_dir, image_name)

# Save outputs only if requested (skip for PDF pages)
@@ -215,15 +235,17 @@ def process_elements(layout_results, padded_image, dims, model, max_batch_size,
"label": label,
"text": f"![Figure](figures/{figure_filename})",
"figure_path": f"figures/{figure_filename}",
"bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
"bbox": [orig_x1, orig_y1, orig_x2, orig_y2], # Original image coordinates
"padded_bbox": [x1, y1, x2, y2], # Padded image coordinates
"reading_order": reading_order,
})
else:
# Prepare element information
element_info = {
"crop": pil_crop,
"label": label,
"bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
"bbox": [orig_x1, orig_y1, orig_x2, orig_y2], # Original image coordinates
"padded_bbox": [x1, y1, x2, y2], # Padded image coordinates
"reading_order": reading_order,
}

@@ -291,6 +313,7 @@ def process_element_batch(elements, model, prompt, max_batch_size=None):
results.append({
"label": elem["label"],
"bbox": elem["bbox"],
"padded_bbox": elem["padded_bbox"], # Padded coordinates
"text": result.strip(),
"reading_order": elem["reading_order"],
})
@@ -314,8 +337,31 @@ def main():
default=16,
help="Maximum number of document elements to parse in a single batch (default: 16)",
)
parser.add_argument(
"--processed_images_dir",
type=str,
default=None,
help="Directory to save processed images (default: from config or './processed_images_by_dolphin')",
)
args = parser.parse_args()

# Determine processed_images_dir with fallback logic
processed_images_dir = args.processed_images_dir
if processed_images_dir is None:
# Try to get from config file
try:
# Add parent directory to path to access config
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parent_dir)
from config.config import PROCESSED_IMAGES_DIR
processed_images_dir = PROCESSED_IMAGES_DIR
except (ImportError, AttributeError):
processed_images_dir = None

# If not in config, use hardcoded default
if processed_images_dir is None:
processed_images_dir = "./processed_images_by_dolphin"

# Load Model
model = DOLPHIN(args.model_path)

@@ -358,6 +404,7 @@ def main():
model=model,
save_dir=save_dir,
max_batch_size=args.max_batch_size,
processed_images_dir=processed_images_dir
)

print(f"Processing completed. Results saved to {save_dir}")
@@ -368,4 +415,4 @@


if __name__ == "__main__":
main()
main()
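
For reference, a minimal sketch of the three-level fallback that `main()` now applies when choosing the processed-images directory: the `--processed_images_dir` flag, then `config.config.PROCESSED_IMAGES_DIR`, then a hardcoded default. The helper name below is hypothetical and the `sys.path` adjustment from the PR is omitted for brevity:

```python
def resolve_processed_images_dir(cli_value=None):
    """Hypothetical helper mirroring the fallback order in main()."""
    if cli_value is not None:
        return cli_value  # explicit --processed_images_dir wins
    try:
        # The PR inserts the parent directory into sys.path before this import.
        from config.config import PROCESSED_IMAGES_DIR
        if PROCESSED_IMAGES_DIR is not None:
            return PROCESSED_IMAGES_DIR
    except (ImportError, AttributeError):
        pass
    return "./processed_images_by_dolphin"  # hardcoded default from the PR
```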
38 changes: 36 additions & 2 deletions utils/utils.py
@@ -7,6 +7,8 @@
import json
import os
import re
import time
import uuid
from dataclasses import dataclass
from typing import List, Tuple

@@ -297,7 +299,7 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo
return 0, 0, 100, 100, orig_x1, orig_y1, orig_x2, orig_y2, [0, 0, 100, 100]


def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
def prepare_image(image, pdf_name=None, page_number=None, processed_images_dir=None) -> Tuple[np.ndarray, ImageDimensions]:
"""Load and prepare image with padding while maintaining aspect ratio

Args:
@@ -321,6 +323,38 @@ def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
# Apply padding
padded_image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))

# Save the processed padded image with organized filename
try:
if processed_images_dir is None:
processed_images_dir = "./processed_images"
# Create PDF-specific subdirectory if pdf_name is provided
if pdf_name:
pdf_dir = os.path.join(processed_images_dir, pdf_name)
os.makedirs(pdf_dir, exist_ok=True)

# Generate organized filename
if page_number is not None:
unique_filename = f"page-{page_number}.png"
else:
unique_filename = f"{pdf_name}.png"

processed_image_path = os.path.join(pdf_dir, unique_filename)
else:
# Fallback to original naming scheme for non-PDF files
os.makedirs(processed_images_dir, exist_ok=True)
timestamp = int(time.time() * 1000) # milliseconds since epoch
unique_id = str(uuid.uuid4())[:8] # first 8 characters of UUID
unique_filename = f"processed_{timestamp}_{unique_id}.png"
processed_image_path = os.path.join(processed_images_dir, unique_filename)

cv2.imwrite(processed_image_path, padded_image)
print(f"✓ Saved processed padded image: {unique_filename}")

except Exception as save_error:
# Don't let saving errors affect the main functionality
print(f"Warning: Could not save processed image: {str(save_error)}")


padded_h, padded_w = padded_image.shape[:2]

dimensions = ImageDimensions(original_w=original_w, original_h=original_h, padded_w=padded_w, padded_h=padded_h)
@@ -603,4 +637,4 @@ def assign_colors_to_elements(num_elements):
color_idx = i % len(palette)
colors.append(palette[color_idx])

return colors
return colors
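
Combined with the `pdfname_page_001` parsing added in `process_single_image()`, the save logic above resolves output paths roughly as in this sketch (the helper is hypothetical, not part of the PR; note that `prepare_image()` itself falls back to `./processed_images` when no directory is passed):

```python
import os
import time
import uuid


def planned_output_path(image_name, processed_images_dir="./processed_images_by_dolphin"):
    """Hypothetical helper: reproduce the path prepare_image() would write to."""
    pdf_name, page_number = None, None
    if "_page_" in image_name:  # PDF pages arrive as "pdfname_page_001"
        parts = image_name.split("_page_")
        if len(parts) == 2:
            pdf_name = parts[0]
            page_number = int(parts[1]) if parts[1].isdigit() else None
    else:
        pdf_name = image_name  # single images use the image name directly

    if pdf_name:
        filename = f"page-{page_number}.png" if page_number is not None else f"{pdf_name}.png"
        return os.path.join(processed_images_dir, pdf_name, filename)

    # Fallback naming when no usable name could be derived
    return os.path.join(
        processed_images_dir,
        f"processed_{int(time.time() * 1000)}_{uuid.uuid4().hex[:8]}.png",
    )


# e.g. planned_output_path("report_page_003") -> "./processed_images_by_dolphin/report/page-3.png"
#      planned_output_path("scan01")          -> "./processed_images_by_dolphin/scan01/scan01.png"
```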