Skip to content

Commit 747cd65

Browse files
committed
make fixup done
1 parent 7ede819 commit 747cd65

13 files changed

+184
-495
lines changed

src/transformers/models/auto/configuration_auto.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@
151151
("fastspeech2_conformer", "FastSpeech2ConformerConfig"),
152152
("fastspeech2_conformer_with_hifigan", "FastSpeech2ConformerWithHifiGanConfig"),
153153
("fgclip2", "Fgclip2Config"),
154+
("fgclip2_vision_model", "Fgclip2VisionConfig"),
154155
("flaubert", "FlaubertConfig"),
155156
("flava", "FlavaConfig"),
156157
("flex_olmo", "FlexOlmoConfig"),
@@ -594,7 +595,8 @@
594595
("falcon_mamba", "FalconMamba"),
595596
("fastspeech2_conformer", "FastSpeech2Conformer"),
596597
("fastspeech2_conformer_with_hifigan", "FastSpeech2ConformerWithHifiGan"),
597-
("fgclip2", "FG-CLIP 2"),
598+
("fgclip2", "FGCLIP2"),
599+
("fgclip2_vision_model", "Fgclip2VisionModel"),
598600
("flan-t5", "FLAN-T5"),
599601
("flan-ul2", "FLAN-UL2"),
600602
("flaubert", "FlauBERT"),
@@ -984,6 +986,7 @@
984986
("idefics3_vision", "idefics3"),
985987
("siglip_vision_model", "siglip"),
986988
("siglip2_vision_model", "siglip2"),
989+
("fgclip2_vision_model", "fgclip2"),
987990
("aimv2_vision_model", "aimv2"),
988991
("smolvlm_vision", "smolvlm"),
989992
("chinese_clip_vision_model", "chinese_clip"),

src/transformers/models/auto/modeling_auto.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
153153
("falcon_mamba", "FalconMambaModel"),
154154
("fastspeech2_conformer", "FastSpeech2ConformerModel"),
155155
("fastspeech2_conformer_with_hifigan", "FastSpeech2ConformerWithHifiGan"),
156-
("fgclip2","Fgclip2Model"),
156+
("fgclip2", "Fgclip2Model"),
157157
("fgclip2_vision_model", "Fgclip2VisionModel"),
158158
("flaubert", "FlaubertModel"),
159159
("flava", "FlavaModel"),

src/transformers/models/auto/processing_auto.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@
6868
("edgetam", "Sam2Processor"),
6969
("emu3", "Emu3Processor"),
7070
("evolla", "EvollaProcessor"),
71-
("fgclip2", "FgClip2Processor"),
71+
("fgclip2", "Fgclip2Processor"),
7272
("flava", "FlavaProcessor"),
7373
("florence2", "Florence2Processor"),
7474
("fuyu", "FuyuProcessor"),

src/transformers/models/fgclip2/__init__.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,4 @@
2727
import sys
2828

2929
_file = globals()["__file__"]
30-
sys.modules[__name__] = _LazyModule(
31-
__name__, _file, define_import_structure(_file), module_spec=__spec__
32-
)
30+
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

src/transformers/models/fgclip2/configuration_fgclip2.py

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -108,12 +108,7 @@ def __init__(
108108
longtext_len=196,
109109
**kwargs,
110110
):
111-
super().__init__(
112-
pad_token_id=pad_token_id,
113-
bos_token_id=bos_token_id,
114-
eos_token_id=eos_token_id,
115-
**kwargs,
116-
)
111+
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
117112

118113
self.vocab_size = vocab_size
119114
self.hidden_size = hidden_size
@@ -124,9 +119,7 @@ def __init__(
124119
self.layer_norm_eps = layer_norm_eps
125120
self.hidden_act = hidden_act
126121
self.attention_dropout = attention_dropout
127-
self.projection_size = (
128-
projection_size if projection_size is not None else hidden_size
129-
)
122+
self.projection_size = projection_size if projection_size is not None else hidden_size
130123
self.keep_len = keep_len
131124
self.longtext_len = longtext_len
132125

@@ -256,25 +249,18 @@ class Fgclip2Config(PreTrainedConfig):
256249
```"""
257250

258251
model_type = "fgclip2"
259-
sub_configs = {
260-
"text_config": Fgclip2TextConfig,
261-
"vision_config": Fgclip2VisionConfig,
262-
}
252+
sub_configs = {"text_config": Fgclip2TextConfig, "vision_config": Fgclip2VisionConfig}
263253

264254
def __init__(self, text_config=None, vision_config=None, **kwargs):
265255
if text_config is None:
266256
text_config = Fgclip2TextConfig()
267-
logger.info(
268-
"`text_config` is `None`. Initializing the `Fgclip2TextConfig` with default values."
269-
)
257+
logger.info("`text_config` is `None`. Initializing the `Fgclip2TextConfig` with default values.")
270258
elif isinstance(text_config, dict):
271259
text_config = Fgclip2TextConfig(**text_config)
272260

273261
if vision_config is None:
274262
vision_config = Fgclip2VisionConfig()
275-
logger.info(
276-
"`vision_config` is `None`. initializing the `Fgclip2VisionConfig` with default values."
277-
)
263+
logger.info("`vision_config` is `None`. Initializing the `Fgclip2VisionConfig` with default values.")
278264
elif isinstance(vision_config, dict):
279265
vision_config = Fgclip2VisionConfig(**vision_config)
280266

src/transformers/models/fgclip2/image_processing_fgclip2.py

Lines changed: 42 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,7 @@ def get_image_size_for_max_num_patches(
100100

101101
def get_scaled_image_size(scale: float, size: int, patch_size: int) -> int:
102102
scaled_size = size * scale
103-
scaled_size = (
104-
math.ceil(scaled_size / patch_size) * patch_size
105-
) # make divisible by patch_size
103+
scaled_size = math.ceil(scaled_size / patch_size) * patch_size # make divisible by patch_size
106104
scaled_size = max(patch_size, scaled_size) # ensure at least 1 patch
107105
return int(scaled_size)
108106

@@ -133,17 +131,13 @@ def convert_image_to_patches(image: np.ndarray, patch_size: int) -> np.ndarray:
133131
image_height, image_width, num_channels = image.shape
134132
num_patches_height = image_height // patch_size
135133
num_patches_width = image_width // patch_size
136-
patched_image = image.reshape(
137-
num_patches_height, patch_size, num_patches_width, patch_size, num_channels
138-
)
134+
patched_image = image.reshape(num_patches_height, patch_size, num_patches_width, patch_size, num_channels)
139135
patched_image = patched_image.transpose(0, 2, 1, 3, 4)
140136
patched_image = patched_image.reshape(num_patches_height * num_patches_width, -1)
141137
return patched_image
142138

143139

144-
def pad_along_first_dim(
145-
array: np.ndarray, target_length: int, pad_value: int = 0
146-
) -> tuple[np.ndarray, np.ndarray]:
140+
def pad_along_first_dim(array: np.ndarray, target_length: int, pad_value: int = 0) -> tuple[np.ndarray, np.ndarray]:
147141
"""
148142
Pad the array along the first dimension.
149143
"""
@@ -158,7 +152,6 @@ def pad_along_first_dim(
158152

159153

160154
def _determine_max_value(image, patch_size: int = 16) -> int:
161-
162155
image_height = image.shape[0]
163156
image_width = image.shape[1]
164157

@@ -181,34 +174,37 @@ class Fgclip2ImageProcessor(BaseImageProcessor):
181174
Constructs a FG-CLIP2 image processor.
182175
183176
Args:
184-
do_resize (`bool`, *optional*, defaults to `True`):
185-
Whether to resize the image's dimensions to fit `max_num_patches` according to given `patch_size`.
186-
Can be overridden by `do_resize` in the `preprocess` method.
187-
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
188-
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
189-
do_rescale (`bool`, *optional*, defaults to `True`):
190-
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
191-
the `preprocess` method.
192-
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
193-
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
194-
method.
195-
do_normalize (`bool`, *optional*, defaults to `True`):
196-
Whether to normalize the image by the specified mean and standard deviation. Can be overridden by
197-
`do_normalize` in the `preprocess` method.
198-
image_mean (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
199-
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
200-
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
201-
image_std (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
202-
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
203-
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
204-
Can be overridden by the `image_std` parameter in the `preprocess` method.
205-
do_convert_rgb (`bool`, *optional*, defaults to `True`):
206-
Whether to convert the image to RGB.
207-
patch_size (`int`, *optional*, defaults to 16):
208-
The size (resolution) of each patch the image will be split to.
209-
max_num_patches (`int`, *optional*, defaults to 256):
210-
The image will be resized to have at most this number of patches,
211-
and then padded in "patch" dimension to match this number exactly.
177+
do_resize (`bool`, *optional*, defaults to `True`):
178+
Whether to resize the image's dimensions to fit `max_num_patches` according to given `patch_size`.
179+
Can be overridden by `do_resize` in the `preprocess` method.
180+
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
181+
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
182+
do_rescale (`bool`, *optional*, defaults to `True`):
183+
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
184+
the `preprocess` method.
185+
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
186+
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
187+
method.
188+
do_normalize (`bool`, *optional*, defaults to `True`):
189+
Whether to normalize the image by the specified mean and standard deviation. Can be overridden by
190+
`do_normalize` in the `preprocess` method.
191+
image_mean (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
192+
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
193+
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
194+
image_std (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
195+
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
196+
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
197+
Can be overridden by the `image_std` parameter in the `preprocess` method.
198+
do_convert_rgb (`bool`, *optional*, defaults to `True`):
199+
Whether to convert the image to RGB.
200+
patch_size (`int`, *optional*, defaults to 16):
201+
The size (resolution) of each patch the image will be split to.
202+
max_num_patches (`int`, *optional*, defaults to 256):
203+
The image will be resized to have at most this number of patches,
204+
and then padded in "patch" dimension to match this number exactly.
205+
dynamic_max_patches (`bool`, *optional*):
206+
Whether to dynamically determine `max_num_patches` from the largest input image.
207+
If `False`, uses `max_num_patches` (either passed or default).
212208
"""
213209

214210
model_input_names = ["pixel_values", "pixel_attention_mask", "spatial_shapes"]
@@ -319,23 +315,15 @@ def preprocess(
319315
do_resize = do_resize if do_resize is not None else self.do_resize
320316
resample = resample if resample is not None else self.resample
321317
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
322-
rescale_factor = (
323-
rescale_factor if rescale_factor is not None else self.rescale_factor
324-
)
318+
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
325319
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
326320
image_mean = image_mean if image_mean is not None else self.image_mean
327321
image_std = image_std if image_std is not None else self.image_std
328-
do_convert_rgb = (
329-
do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
330-
)
322+
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
331323
patch_size = patch_size if patch_size is not None else self.patch_size
332-
max_num_patches = (
333-
max_num_patches if max_num_patches is not None else self.max_num_patches
334-
)
324+
max_num_patches = max_num_patches if max_num_patches is not None else self.max_num_patches
335325
dynamic_max_patches = (
336-
dynamic_max_patches
337-
if dynamic_max_patches is not None
338-
else self.dynamic_max_patches
326+
dynamic_max_patches if dynamic_max_patches is not None else self.dynamic_max_patches
339327
) # resolve the configured setting
340328

341329
data_format = ChannelDimension.LAST
@@ -376,21 +364,14 @@ def preprocess(
376364
spatial_shapes = []
377365

378366
images = [
379-
to_channel_dimension_format(
380-
image, data_format, input_channel_dim=input_data_format
381-
)
382-
for image in images
367+
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
383368
]
384369

385370
if dynamic_max_patches:
386371
original_max_num_patches = max_num_patches
387-
candidate_values = [
388-
_determine_max_value(img, patch_size=patch_size) for img in images
389-
]
372+
candidate_values = [_determine_max_value(img, patch_size=patch_size) for img in images]
390373
max_num_patches = max(candidate_values)
391-
logger.info(
392-
f"Dynamically set max_num_patches={max_num_patches} (originally {original_max_num_patches})"
393-
)
374+
logger.info(f"Dynamically set max_num_patches={max_num_patches} (originally {original_max_num_patches})")
394375

395376
for image in images:
396377
if do_resize:
@@ -408,9 +389,7 @@ def preprocess(
408389
)
409390

410391
if do_rescale:
411-
image = self.rescale(
412-
image=image, scale=rescale_factor, input_data_format=data_format
413-
)
392+
image = self.rescale(image=image, scale=rescale_factor, input_data_format=data_format)
414393

415394
if do_normalize:
416395
image = self.normalize(

src/transformers/models/fgclip2/image_processing_fgclip2_fast.py

Lines changed: 6 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,7 @@ def convert_image_to_patches(image: "torch.Tensor", patch_size: int) -> "torch.T
6363
num_channels, image_height, image_width = image.shape
6464
num_patches_height = image_height // patch_size
6565
num_patches_width = image_width // patch_size
66-
patched_image = image.reshape(
67-
num_channels, num_patches_height, patch_size, num_patches_width, patch_size
68-
)
66+
patched_image = image.reshape(num_channels, num_patches_height, patch_size, num_patches_width, patch_size)
6967
patched_image = patched_image.permute(1, 3, 2, 4, 0)
7068
patched_image = patched_image.reshape(num_patches_height * num_patches_width, -1)
7169
return patched_image
@@ -82,15 +80,12 @@ def pad_along_first_dim(
8280
mask = torch.ones((target_length,), dtype=torch.int32)
8381
if padding_length > 0:
8482
padding = [0, 0] * (tensor.ndim - 1) + [0, padding_length]
85-
tensor = torch.nn.functional.pad(
86-
tensor, padding, mode="constant", value=pad_value
87-
)
83+
tensor = torch.nn.functional.pad(tensor, padding, mode="constant", value=pad_value)
8884
mask[-padding_length:] = 0
8985
return tensor, mask
9086

9187

9288
def _determine_max_value(image, patch_size: int = 16) -> int:
93-
9489
image_height = image.shape[1]
9590
image_width = image.shape[2]
9691

@@ -131,9 +126,7 @@ def _validate_preprocess_kwargs(self, **kwargs) -> tuple:
131126
return super()._validate_preprocess_kwargs(**kwargs)
132127

133128
@auto_docstring
134-
def preprocess(
135-
self, images: ImageInput, **kwargs: Unpack[Fgclip2ImageProcessorKwargs]
136-
) -> BatchFeature:
129+
def preprocess(self, images: ImageInput, **kwargs: Unpack[Fgclip2ImageProcessorKwargs]) -> BatchFeature:
137130
return super().preprocess(images, **kwargs)
138131

139132
def _preprocess(
@@ -152,15 +145,12 @@ def _preprocess(
152145
return_tensors: Optional[Union[str, TensorType]],
153146
**kwargs,
154147
) -> BatchFeature:
155-
156148
pixel_masks = []
157149
pixel_values = []
158150
spatial_shapes = []
159151

160152
if dynamic_max_patches:
161-
candidate_values = [
162-
_determine_max_value(img, patch_size=patch_size) for img in images
163-
]
153+
candidate_values = [_determine_max_value(img, patch_size=patch_size) for img in images]
164154
max_num_patches = max(candidate_values)
165155

166156
for image in images:
@@ -172,13 +162,9 @@ def _preprocess(
172162
max_num_patches=max_num_patches,
173163
)
174164
side_dict = SizeDict(height=height, width=width)
175-
image = self.resize(
176-
image=image, size=side_dict, interpolation=interpolation
177-
)
165+
image = self.resize(image=image, size=side_dict, interpolation=interpolation)
178166

179-
image = self.rescale_and_normalize(
180-
image, do_rescale, rescale_factor, do_normalize, image_mean, image_std
181-
)
167+
image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std)
182168

183169
# (num_channels, height, width) -> (num_patches, patch_size * patch_size * num_channels)
184170
patches = convert_image_to_patches(image, patch_size)

0 commit comments

Comments
 (0)