2 changes: 2 additions & 0 deletions .gitignore
@@ -152,3 +152,5 @@ Desktop.ini

fusion_result.json
kernel_meta/

hf_model
61 changes: 54 additions & 7 deletions demo_page.py
@@ -6,6 +6,7 @@
import argparse
import glob
import os
import sys

import cv2
import torch
@@ -111,7 +112,7 @@ def chat(self, prompt, image):
return results


def process_document(document_path, model, save_dir, max_batch_size=None):
def process_document(document_path, model, save_dir, max_batch_size=None, processed_images_dir=None):
"""Parse documents with two stages - Handles both images and PDFs"""
file_ext = os.path.splitext(document_path)[1].lower()

@@ -133,7 +134,8 @@ def process_document(document_path, model, save_dir, max_batch_size=None):

# Process this page (don't save individual page results)
json_path, recognition_results = process_single_image(
pil_image, model, save_dir, page_name, max_batch_size, save_individual=False
pil_image, model, save_dir, page_name, max_batch_size, save_individual=False,
processed_images_dir=processed_images_dir
)

# Add page information to results
@@ -155,7 +157,7 @@ def process_document(document_path, model, save_dir, max_batch_size=None):
return process_single_image(pil_image, model, save_dir, base_name, max_batch_size, processed_images_dir=processed_images_dir)


def process_single_image(image, model, save_dir, image_name, max_batch_size=None, save_individual=True):
def process_single_image(image, model, save_dir, image_name, max_batch_size=None, save_individual=True, processed_images_dir=None):
"""Process a single image (either from file or converted from PDF page)

Args:
@@ -173,7 +175,25 @@ def process_single_image(image, model, save_dir, image_name, max_batch_size=None
layout_output = model.chat("Parse the reading order of this document.", image)

# Stage 2: Element-level content parsing
padded_image, dims = prepare_image(image)

# Extract PDF name and page number for organized image saving
pdf_name = None
page_number = None

# Check if this is a PDF page (format: "pdfname_page_001")
if "_page_" in image_name:
parts = image_name.split("_page_")
if len(parts) == 2:
pdf_name = parts[0]
try:
page_number = int(parts[1])
except ValueError:
page_number = None
else:
# For single images, use the image name as pdf_name
pdf_name = image_name

padded_image, dims = prepare_image(image, pdf_name=pdf_name, page_number=page_number, processed_images_dir=processed_images_dir)
recognition_results = process_elements(layout_output, padded_image, dims, model, max_batch_size, save_dir, image_name)

# Save outputs only if requested (skip for PDF pages)
@@ -215,15 +235,17 @@ def process_elements(layout_results, padded_image, dims, model, max_batch_size,
"label": label,
"text": f"![Figure](figures/{figure_filename})",
"figure_path": f"figures/{figure_filename}",
"bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
"bbox": [orig_x1, orig_y1, orig_x2, orig_y2], # Original image coordinates
"padded_bbox": [x1, y1, x2, y2], # Padded image coordinates
"reading_order": reading_order,
})
else:
# Prepare element information
element_info = {
"crop": pil_crop,
"label": label,
"bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
"bbox": [orig_x1, orig_y1, orig_x2, orig_y2], # Original image coordinates
"padded_bbox": [x1, y1, x2, y2], # Padded image coordinates
"reading_order": reading_order,
}

@@ -291,6 +313,7 @@ def process_element_batch(elements, model, prompt, max_batch_size=None):
results.append({
"label": elem["label"],
"bbox": elem["bbox"],
"padded_bbox": elem["padded_bbox"], # Padded coordinates
"text": result.strip(),
"reading_order": elem["reading_order"],
})
@@ -314,8 +337,31 @@ def main():
default=16,
help="Maximum number of document elements to parse in a single batch (default: 16)",
)
parser.add_argument(
"--processed_images_dir",
type=str,
default=None,
help="Directory to save processed images (default: from config or './processed_images_by_dolphin')",
)
args = parser.parse_args()

# Determine processed_images_dir with fallback logic
processed_images_dir = args.processed_images_dir
if processed_images_dir is None:
# Try to get from config file
try:
# Add parent directory to path to access config
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, parent_dir)
from config.config import PROCESSED_IMAGES_DIR
processed_images_dir = PROCESSED_IMAGES_DIR
except (ImportError, AttributeError):
processed_images_dir = None

# If not in config, use hardcoded default
if processed_images_dir is None:
processed_images_dir = "./processed_images_by_dolphin"

# Load Model
model = DOLPHIN(args.model_path)

@@ -358,6 +404,7 @@ def main():
model=model,
save_dir=save_dir,
max_batch_size=args.max_batch_size,
processed_images_dir=processed_images_dir
)

print(f"Processing completed. Results saved to {save_dir}")
@@ -368,4 +415,4 @@


if __name__ == "__main__":
main()
main()
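
For reference, a minimal sketch of the three-level fallback that `main()` now applies when choosing the processed-images directory: the `--processed_images_dir` flag, then `config.config.PROCESSED_IMAGES_DIR`, then a hardcoded default. The helper name below is hypothetical and the `sys.path` adjustment from the PR is omitted for brevity:

```python
def resolve_processed_images_dir(cli_value=None):
    """Hypothetical helper mirroring the fallback order in main()."""
    if cli_value is not None:
        return cli_value  # explicit --processed_images_dir wins
    try:
        # The PR inserts the parent directory into sys.path before this import.
        from config.config import PROCESSED_IMAGES_DIR
        if PROCESSED_IMAGES_DIR is not None:
            return PROCESSED_IMAGES_DIR
    except (ImportError, AttributeError):
        pass
    return "./processed_images_by_dolphin"  # hardcoded default from the PR
```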
38 changes: 36 additions & 2 deletions utils/utils.py
@@ -7,6 +7,8 @@
import json
import os
import re
import time
import uuid
from dataclasses import dataclass
from typing import List, Tuple

@@ -297,7 +299,7 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo
return 0, 0, 100, 100, orig_x1, orig_y1, orig_x2, orig_y2, [0, 0, 100, 100]


def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
def prepare_image(image, pdf_name=None, page_number=None, processed_images_dir=None) -> Tuple[np.ndarray, ImageDimensions]:
"""Load and prepare image with padding while maintaining aspect ratio

Args:
@@ -321,6 +323,38 @@ def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
# Apply padding
padded_image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))

# Save the processed padded image with organized filename
try:
if processed_images_dir is None:
processed_images_dir = "./processed_images"
# Create PDF-specific subdirectory if pdf_name is provided
if pdf_name:
pdf_dir = os.path.join(processed_images_dir, pdf_name)
os.makedirs(pdf_dir, exist_ok=True)

# Generate organized filename
if page_number is not None:
unique_filename = f"page-{page_number}.png"
else:
unique_filename = f"{pdf_name}.png"

processed_image_path = os.path.join(pdf_dir, unique_filename)
else:
# Fallback to original naming scheme for non-PDF files
os.makedirs(processed_images_dir, exist_ok=True)
timestamp = int(time.time() * 1000) # milliseconds since epoch
unique_id = str(uuid.uuid4())[:8] # first 8 characters of UUID
unique_filename = f"processed_{timestamp}_{unique_id}.png"
processed_image_path = os.path.join(processed_images_dir, unique_filename)

cv2.imwrite(processed_image_path, padded_image)
print(f"✓ Saved processed padded image: {unique_filename}")

except Exception as save_error:
# Don't let saving errors affect the main functionality
print(f"Warning: Could not save processed image: {str(save_error)}")


padded_h, padded_w = padded_image.shape[:2]

dimensions = ImageDimensions(original_w=original_w, original_h=original_h, padded_w=padded_w, padded_h=padded_h)
@@ -603,4 +637,4 @@ def assign_colors_to_elements(num_elements):
color_idx = i % len(palette)
colors.append(palette[color_idx])

return colors
return colors
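
Combined with the `pdfname_page_001` parsing added in `process_single_image()`, the save logic above resolves output paths roughly as in this sketch (the helper is hypothetical, not part of the PR; note that `prepare_image()` itself falls back to `./processed_images` when no directory is passed):

```python
import os
import time
import uuid


def planned_output_path(image_name, processed_images_dir="./processed_images_by_dolphin"):
    """Hypothetical helper: reproduce the path prepare_image() would write to."""
    pdf_name, page_number = None, None
    if "_page_" in image_name:  # PDF pages arrive as "pdfname_page_001"
        parts = image_name.split("_page_")
        if len(parts) == 2:
            pdf_name = parts[0]
            page_number = int(parts[1]) if parts[1].isdigit() else None
    else:
        pdf_name = image_name  # single images use the image name directly

    if pdf_name:
        filename = f"page-{page_number}.png" if page_number is not None else f"{pdf_name}.png"
        return os.path.join(processed_images_dir, pdf_name, filename)

    # Fallback naming when no usable name could be derived
    return os.path.join(
        processed_images_dir,
        f"processed_{int(time.time() * 1000)}_{uuid.uuid4().hex[:8]}.png",
    )


# e.g. planned_output_path("report_page_003") -> "./processed_images_by_dolphin/report/page-3.png"
#      planned_output_path("scan01")          -> "./processed_images_by_dolphin/scan01/scan01.png"
```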