@@ -100,9 +100,7 @@ def get_image_size_for_max_num_patches(
100100
101101 def get_scaled_image_size (scale : float , size : int , patch_size : int ) -> int :
102102 scaled_size = size * scale
103- scaled_size = (
104- math .ceil (scaled_size / patch_size ) * patch_size
105- ) # make divisible by patch_size
103+ scaled_size = math .ceil (scaled_size / patch_size ) * patch_size # make divisible by patch_size
106104 scaled_size = max (patch_size , scaled_size ) # ensure at least 1 patch
107105 return int (scaled_size )
108106
@@ -133,17 +131,13 @@ def convert_image_to_patches(image: np.ndarray, patch_size: int) -> np.ndarray:
133131 image_height , image_width , num_channels = image .shape
134132 num_patches_height = image_height // patch_size
135133 num_patches_width = image_width // patch_size
136- patched_image = image .reshape (
137- num_patches_height , patch_size , num_patches_width , patch_size , num_channels
138- )
134+ patched_image = image .reshape (num_patches_height , patch_size , num_patches_width , patch_size , num_channels )
139135 patched_image = patched_image .transpose (0 , 2 , 1 , 3 , 4 )
140136 patched_image = patched_image .reshape (num_patches_height * num_patches_width , - 1 )
141137 return patched_image
142138
143139
144- def pad_along_first_dim (
145- array : np .ndarray , target_length : int , pad_value : int = 0
146- ) -> tuple [np .ndarray , np .ndarray ]:
140+ def pad_along_first_dim (array : np .ndarray , target_length : int , pad_value : int = 0 ) -> tuple [np .ndarray , np .ndarray ]:
147141 """
148142 Pad the array along the first dimension.
149143 """
@@ -158,7 +152,6 @@ def pad_along_first_dim(
158152
159153
160154def _determine_max_value (image , patch_size : int = 16 ) -> int :
161-
162155 image_height = image .shape [0 ]
163156 image_width = image .shape [1 ]
164157
@@ -181,34 +174,37 @@ class Fgclip2ImageProcessor(BaseImageProcessor):
181174 Constructs a FG-CLIP2 image processor.
182175
183176 Args:
184- do_resize (`bool`, *optional*, defaults to `True`):
185- Whether to resize the image's dimensions to fit `max_num_patches` according to given `patch_size`.
186- Can be overridden by `do_resize` in the `preprocess` method.
187- resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
188- Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
189- do_rescale (`bool`, *optional*, defaults to `True`):
190- Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
191- the `preprocess` method.
192- rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
193- Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
194- method.
195- do_normalize (`bool`, *optional*, defaults to `True`):
196- Whether to normalize the image by the specified mean and standard deviation. Can be overridden by
197- `do_normalize` in the `preprocess` method.
198- image_mean (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
199- Mean to use if normalizing the image. This is a float or list of floats the length of the number of
200- channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
201- image_std (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
202- Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
203- number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
204- Can be overridden by the `image_std` parameter in the `preprocess` method.
205- do_convert_rgb (`bool`, *optional*, defaults to `True`):
206- Whether to convert the image to RGB.
207- patch_size (`int`, *optional*, defaults to 16):
208- The size (resolution) of each patch the image will be split to.
209- max_num_patches (`int`, *optional*, defaults to 256):
210- The image will be resized to have at most this number of patches,
211- and then padded in "patch" dimension to match this number exactly.
177+ do_resize (`bool`, *optional*, defaults to `True`):
178+ Whether to resize the image's dimensions to fit `max_num_patches` according to given `patch_size`.
179+ Can be overridden by `do_resize` in the `preprocess` method.
180+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
181+ Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
182+ do_rescale (`bool`, *optional*, defaults to `True`):
183+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
184+ the `preprocess` method.
185+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
186+ Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
187+ method.
188+ do_normalize (`bool`, *optional*, defaults to `True`):
189+ Whether to normalize the image by the specified mean and standard deviation. Can be overridden by
190+ `do_normalize` in the `preprocess` method.
191+ image_mean (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
192+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
193+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
194+ image_std (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
195+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
196+ number of channels in the image.
197+ Can be overridden by the `image_std` parameter in the `preprocess` method.
198+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
199+ Whether to convert the image to RGB.
200+ patch_size (`int`, *optional*, defaults to 16):
201+ The size (resolution) of each patch the image will be split to.
202+ max_num_patches (`int`, *optional*, defaults to 256):
203+ The image will be resized to have at most this number of patches,
204+ and then padded in "patch" dimension to match this number exactly.
205+ dynamic_max_patches (`bool`, *optional*):
206+ Whether to dynamically determine `max_num_patches` from the input images, using the largest value computed across them.
207+ If `False`, the given `max_num_patches` is used. Can be overridden by `dynamic_max_patches` in the `preprocess` method.
212208 """
213209
214210 model_input_names = ["pixel_values" , "pixel_attention_mask" , "spatial_shapes" ]
@@ -319,23 +315,15 @@ def preprocess(
319315 do_resize = do_resize if do_resize is not None else self .do_resize
320316 resample = resample if resample is not None else self .resample
321317 do_rescale = do_rescale if do_rescale is not None else self .do_rescale
322- rescale_factor = (
323- rescale_factor if rescale_factor is not None else self .rescale_factor
324- )
318+ rescale_factor = rescale_factor if rescale_factor is not None else self .rescale_factor
325319 do_normalize = do_normalize if do_normalize is not None else self .do_normalize
326320 image_mean = image_mean if image_mean is not None else self .image_mean
327321 image_std = image_std if image_std is not None else self .image_std
328- do_convert_rgb = (
329- do_convert_rgb if do_convert_rgb is not None else self .do_convert_rgb
330- )
322+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self .do_convert_rgb
331323 patch_size = patch_size if patch_size is not None else self .patch_size
332- max_num_patches = (
333- max_num_patches if max_num_patches is not None else self .max_num_patches
334- )
324+ max_num_patches = max_num_patches if max_num_patches is not None else self .max_num_patches
335325 dynamic_max_patches = (
336- dynamic_max_patches
337- if dynamic_max_patches is not None
338- else self .dynamic_max_patches
326+ dynamic_max_patches if dynamic_max_patches is not None else self .dynamic_max_patches
339327 ) # resolve the effective setting (argument overrides the instance default)
340328
341329 data_format = ChannelDimension .LAST
@@ -376,21 +364,14 @@ def preprocess(
376364 spatial_shapes = []
377365
378366 images = [
379- to_channel_dimension_format (
380- image , data_format , input_channel_dim = input_data_format
381- )
382- for image in images
367+ to_channel_dimension_format (image , data_format , input_channel_dim = input_data_format ) for image in images
383368 ]
384369
385370 if dynamic_max_patches :
386371 original_max_num_patches = max_num_patches
387- candidate_values = [
388- _determine_max_value (img , patch_size = patch_size ) for img in images
389- ]
372+ candidate_values = [_determine_max_value (img , patch_size = patch_size ) for img in images ]
390373 max_num_patches = max (candidate_values )
391- logger .info (
392- f"Dynamically set max_num_patches={ max_num_patches } (originally { original_max_num_patches } )"
393- )
374+ logger .info (f"Dynamically set max_num_patches={ max_num_patches } (originally { original_max_num_patches } )" )
394375
395376 for image in images :
396377 if do_resize :
@@ -408,9 +389,7 @@ def preprocess(
408389 )
409390
410391 if do_rescale :
411- image = self .rescale (
412- image = image , scale = rescale_factor , input_data_format = data_format
413- )
392+ image = self .rescale (image = image , scale = rescale_factor , input_data_format = data_format )
414393
415394 if do_normalize :
416395 image = self .normalize (
0 commit comments