Skip to content

Commit 747cd65

Browse files
committed
make fixup done
1 parent 7ede819 commit 747cd65

13 files changed

+184
-495
lines changed

src/transformers/models/auto/configuration_auto.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@
151151
("fastspeech2_conformer", "FastSpeech2ConformerConfig"),
152152
("fastspeech2_conformer_with_hifigan", "FastSpeech2ConformerWithHifiGanConfig"),
153153
("fgclip2", "Fgclip2Config"),
154+
("fgclip2_vision_model", "Fgclip2VisionConfig"),
154155
("flaubert", "FlaubertConfig"),
155156
("flava", "FlavaConfig"),
156157
("flex_olmo", "FlexOlmoConfig"),
@@ -594,7 +595,8 @@
594595
("falcon_mamba", "FalconMamba"),
595596
("fastspeech2_conformer", "FastSpeech2Conformer"),
596597
("fastspeech2_conformer_with_hifigan", "FastSpeech2ConformerWithHifiGan"),
597-
("fgclip2", "FG-CLIP 2"),
598+
("fgclip2", "FGCLIP2"),
599+
("fgclip2_vision_model", "Fgclip2VisionModel"),
598600
("flan-t5", "FLAN-T5"),
599601
("flan-ul2", "FLAN-UL2"),
600602
("flaubert", "FlauBERT"),
@@ -984,6 +986,7 @@
984986
("idefics3_vision", "idefics3"),
985987
("siglip_vision_model", "siglip"),
986988
("siglip2_vision_model", "siglip2"),
989+
("fgclip2_vision_model", "fgclip2"),
987990
("aimv2_vision_model", "aimv2"),
988991
("smolvlm_vision", "smolvlm"),
989992
("chinese_clip_vision_model", "chinese_clip"),

src/transformers/models/auto/modeling_auto.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
153153
("falcon_mamba", "FalconMambaModel"),
154154
("fastspeech2_conformer", "FastSpeech2ConformerModel"),
155155
("fastspeech2_conformer_with_hifigan", "FastSpeech2ConformerWithHifiGan"),
156-
("fgclip2","Fgclip2Model"),
156+
("fgclip2", "Fgclip2Model"),
157157
("fgclip2_vision_model", "Fgclip2VisionModel"),
158158
("flaubert", "FlaubertModel"),
159159
("flava", "FlavaModel"),

src/transformers/models/auto/processing_auto.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@
6868
("edgetam", "Sam2Processor"),
6969
("emu3", "Emu3Processor"),
7070
("evolla", "EvollaProcessor"),
71-
("fgclip2", "FgClip2Processor"),
71+
("fgclip2", "Fgclip2Processor"),
7272
("flava", "FlavaProcessor"),
7373
("florence2", "Florence2Processor"),
7474
("fuyu", "FuyuProcessor"),

src/transformers/models/fgclip2/__init__.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,4 @@
2727
import sys
2828

2929
_file = globals()["__file__"]
30-
sys.modules[__name__] = _LazyModule(
31-
__name__, _file, define_import_structure(_file), module_spec=__spec__
32-
)
30+
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

src/transformers/models/fgclip2/configuration_fgclip2.py

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -108,12 +108,7 @@ def __init__(
108108
longtext_len=196,
109109
**kwargs,
110110
):
111-
super().__init__(
112-
pad_token_id=pad_token_id,
113-
bos_token_id=bos_token_id,
114-
eos_token_id=eos_token_id,
115-
**kwargs,
116-
)
111+
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
117112

118113
self.vocab_size = vocab_size
119114
self.hidden_size = hidden_size
@@ -124,9 +119,7 @@ def __init__(
124119
self.layer_norm_eps = layer_norm_eps
125120
self.hidden_act = hidden_act
126121
self.attention_dropout = attention_dropout
127-
self.projection_size = (
128-
projection_size if projection_size is not None else hidden_size
129-
)
122+
self.projection_size = projection_size if projection_size is not None else hidden_size
130123
self.keep_len = keep_len
131124
self.longtext_len = longtext_len
132125

@@ -256,25 +249,18 @@ class Fgclip2Config(PreTrainedConfig):
256249
```"""
257250

258251
model_type = "fgclip2"
259-
sub_configs = {
260-
"text_config": Fgclip2TextConfig,
261-
"vision_config": Fgclip2VisionConfig,
262-
}
252+
sub_configs = {"text_config": Fgclip2TextConfig, "vision_config": Fgclip2VisionConfig}
263253

264254
def __init__(self, text_config=None, vision_config=None, **kwargs):
265255
if text_config is None:
266256
text_config = Fgclip2TextConfig()
267-
logger.info(
268-
"`text_config` is `None`. Initializing the `Fgclip2TextConfig` with default values."
269-
)
257+
logger.info("`text_config` is `None`. Initializing the `Fgclip2TextConfig` with default values.")
270258
elif isinstance(text_config, dict):
271259
text_config = Fgclip2TextConfig(**text_config)
272260

273261
if vision_config is None:
274262
vision_config = Fgclip2VisionConfig()
275-
logger.info(
276-
"`vision_config` is `None`. initializing the `Fgclip2VisionConfig` with default values."
277-
)
263+
logger.info("`vision_config` is `None`. Initializing the `Fgclip2VisionConfig` with default values.")
278264
elif isinstance(vision_config, dict):
279265
vision_config = Fgclip2VisionConfig(**vision_config)
280266

src/transformers/models/fgclip2/image_processing_fgclip2.py

Lines changed: 42 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,7 @@ def get_image_size_for_max_num_patches(
100100

101101
def get_scaled_image_size(scale: float, size: int, patch_size: int) -> int:
102102
scaled_size = size * scale
103-
scaled_size = (
104-
math.ceil(scaled_size / patch_size) * patch_size
105-
) # make divisible by patch_size
103+
scaled_size = math.ceil(scaled_size / patch_size) * patch_size # make divisible by patch_size
106104
scaled_size = max(patch_size, scaled_size) # ensure at least 1 patch
107105
return int(scaled_size)
108106

@@ -133,17 +131,13 @@ def convert_image_to_patches(image: np.ndarray, patch_size: int) -> np.ndarray:
133131
image_height, image_width, num_channels = image.shape
134132
num_patches_height = image_height // patch_size
135133
num_patches_width = image_width // patch_size
136-
patched_image = image.reshape(
137-
num_patches_height, patch_size, num_patches_width, patch_size, num_channels
138-
)
134+
patched_image = image.reshape(num_patches_height, patch_size, num_patches_width, patch_size, num_channels)
139135
patched_image = patched_image.transpose(0, 2, 1, 3, 4)
140136
patched_image = patched_image.reshape(num_patches_height * num_patches_width, -1)
141137
return patched_image
142138

143139

144-
def pad_along_first_dim(
145-
array: np.ndarray, target_length: int, pad_value: int = 0
146-
) -> tuple[np.ndarray, np.ndarray]:
140+
def pad_along_first_dim(array: np.ndarray, target_length: int, pad_value: int = 0) -> tuple[np.ndarray, np.ndarray]:
147141
"""
148142
Pad the array along the first dimension.
149143
"""
@@ -158,7 +152,6 @@ def pad_along_first_dim(
158152

159153

160154
def _determine_max_value(image, patch_size: int = 16) -> int:
161-
162155
image_height = image.shape[0]
163156
image_width = image.shape[1]
164157

@@ -181,34 +174,37 @@ class Fgclip2ImageProcessor(BaseImageProcessor):
181174
Constructs a FG-CLIP2 image processor.
182175
183176
Args:
184-
do_resize (`bool`, *optional*, defaults to `True`):
185-
Whether to resize the image's dimensions to fit `max_num_patches` according to given `patch_size`.
186-
Can be overridden by `do_resize` in the `preprocess` method.
187-
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
188-
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
189-
do_rescale (`bool`, *optional*, defaults to `True`):
190-
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
191-
the `preprocess` method.
192-
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
193-
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
194-
method.
195-
do_normalize (`bool`, *optional*, defaults to `True`):
196-
Whether to normalize the image by the specified mean and standard deviation. Can be overridden by
197-
`do_normalize` in the `preprocess` method.
198-
image_mean (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
199-
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
200-
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
201-
image_std (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
202-
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
203-
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
204-
Can be overridden by the `image_std` parameter in the `preprocess` method.
205-
do_convert_rgb (`bool`, *optional*, defaults to `True`):
206-
Whether to convert the image to RGB.
207-
patch_size (`int`, *optional*, defaults to 16):
208-
The size (resolution) of each patch the image will be split to.
209-
max_num_patches (`int`, *optional*, defaults to 256):
210-
The image will be resized to have at most this number of patches,
211-
and then padded in "patch" dimension to match this number exactly.
177+
do_resize (`bool`, *optional*, defaults to `True`):
178+
Whether to resize the image's dimensions to fit `max_num_patches` according to given `patch_size`.
179+
Can be overridden by `do_resize` in the `preprocess` method.
180+
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
181+
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
182+
do_rescale (`bool`, *optional*, defaults to `True`):
183+
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
184+
the `preprocess` method.
185+
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
186+
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
187+
method.
188+
do_normalize (`bool`, *optional*, defaults to `True`):
189+
Whether to normalize the image by the specified mean and standard deviation. Can be overridden by
190+
`do_normalize` in the `preprocess` method.
191+
image_mean (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
192+
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
193+
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
194+
image_std (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
195+
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
196+
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
197+
Can be overridden by the `image_std` parameter in the `preprocess` method.
198+
do_convert_rgb (`bool`, *optional*, defaults to `True`):
199+
Whether to convert the image to RGB.
200+
patch_size (`int`, *optional*, defaults to 16):
201+
The size (resolution) of each patch the image will be split to.
202+
max_num_patches (`int`, *optional*, defaults to 256):
203+
The image will be resized to have at most this number of patches,
204+
and then padded in "patch" dimension to match this number exactly.
205+
dynamic_max_patches (`bool`, *optional*):
206+
Whether to dynamically determine `max_num_patches` from the largest input image.
207+
If `False`, uses `max_num_patches` (either passed or default).
212208
"""
213209

214210
model_input_names = ["pixel_values", "pixel_attention_mask", "spatial_shapes"]
@@ -319,23 +315,15 @@ def preprocess(
319315
do_resize = do_resize if do_resize is not None else self.do_resize
320316
resample = resample if resample is not None else self.resample
321317
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
322-
rescale_factor = (
323-
rescale_factor if rescale_factor is not None else self.rescale_factor
324-
)
318+
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
325319
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
326320
image_mean = image_mean if image_mean is not None else self.image_mean
327321
image_std = image_std if image_std is not None else self.image_std
328-
do_convert_rgb = (
329-
do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
330-
)
322+
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
331323
patch_size = patch_size if patch_size is not None else self.patch_size
332-
max_num_patches = (
333-
max_num_patches if max_num_patches is not None else self.max_num_patches
334-
)
324+
max_num_patches = max_num_patches if max_num_patches is not None else self.max_num_patches
335325
dynamic_max_patches = (
336-
dynamic_max_patches
337-
if dynamic_max_patches is not None
338-
else self.dynamic_max_patches
326+
dynamic_max_patches if dynamic_max_patches is not None else self.dynamic_max_patches
339327
) # resolve the configured setting
340328

341329
data_format = ChannelDimension.LAST
@@ -376,21 +364,14 @@ def preprocess(
376364
spatial_shapes = []
377365

378366
images = [
379-
to_channel_dimension_format(
380-
image, data_format, input_channel_dim=input_data_format
381-
)
382-
for image in images
367+
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
383368
]
384369

385370
if dynamic_max_patches:
386371
original_max_num_patches = max_num_patches
387-
candidate_values = [
388-
_determine_max_value(img, patch_size=patch_size) for img in images
389-
]
372+
candidate_values = [_determine_max_value(img, patch_size=patch_size) for img in images]
390373
max_num_patches = max(candidate_values)
391-
logger.info(
392-
f"Dynamically set max_num_patches={max_num_patches} (originally {original_max_num_patches})"
393-
)
374+
logger.info(f"Dynamically set max_num_patches={max_num_patches} (originally {original_max_num_patches})")
394375

395376
for image in images:
396377
if do_resize:
@@ -408,9 +389,7 @@ def preprocess(
408389
)
409390

410391
if do_rescale:
411-
image = self.rescale(
412-
image=image, scale=rescale_factor, input_data_format=data_format
413-
)
392+
image = self.rescale(image=image, scale=rescale_factor, input_data_format=data_format)
414393

415394
if do_normalize:
416395
image = self.normalize(

src/transformers/models/fgclip2/image_processing_fgclip2_fast.py

Lines changed: 6 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,7 @@ def convert_image_to_patches(image: "torch.Tensor", patch_size: int) -> "torch.T
6363
num_channels, image_height, image_width = image.shape
6464
num_patches_height = image_height // patch_size
6565
num_patches_width = image_width // patch_size
66-
patched_image = image.reshape(
67-
num_channels, num_patches_height, patch_size, num_patches_width, patch_size
68-
)
66+
patched_image = image.reshape(num_channels, num_patches_height, patch_size, num_patches_width, patch_size)
6967
patched_image = patched_image.permute(1, 3, 2, 4, 0)
7068
patched_image = patched_image.reshape(num_patches_height * num_patches_width, -1)
7169
return patched_image
@@ -82,15 +80,12 @@ def pad_along_first_dim(
8280
mask = torch.ones((target_length,), dtype=torch.int32)
8381
if padding_length > 0:
8482
padding = [0, 0] * (tensor.ndim - 1) + [0, padding_length]
85-
tensor = torch.nn.functional.pad(
86-
tensor, padding, mode="constant", value=pad_value
87-
)
83+
tensor = torch.nn.functional.pad(tensor, padding, mode="constant", value=pad_value)
8884
mask[-padding_length:] = 0
8985
return tensor, mask
9086

9187

9288
def _determine_max_value(image, patch_size: int = 16) -> int:
93-
9489
image_height = image.shape[1]
9590
image_width = image.shape[2]
9691

@@ -131,9 +126,7 @@ def _validate_preprocess_kwargs(self, **kwargs) -> tuple:
131126
return super()._validate_preprocess_kwargs(**kwargs)
132127

133128
@auto_docstring
134-
def preprocess(
135-
self, images: ImageInput, **kwargs: Unpack[Fgclip2ImageProcessorKwargs]
136-
) -> BatchFeature:
129+
def preprocess(self, images: ImageInput, **kwargs: Unpack[Fgclip2ImageProcessorKwargs]) -> BatchFeature:
137130
return super().preprocess(images, **kwargs)
138131

139132
def _preprocess(
@@ -152,15 +145,12 @@ def _preprocess(
152145
return_tensors: Optional[Union[str, TensorType]],
153146
**kwargs,
154147
) -> BatchFeature:
155-
156148
pixel_masks = []
157149
pixel_values = []
158150
spatial_shapes = []
159151

160152
if dynamic_max_patches:
161-
candidate_values = [
162-
_determine_max_value(img, patch_size=patch_size) for img in images
163-
]
153+
candidate_values = [_determine_max_value(img, patch_size=patch_size) for img in images]
164154
max_num_patches = max(candidate_values)
165155

166156
for image in images:
@@ -172,13 +162,9 @@ def _preprocess(
172162
max_num_patches=max_num_patches,
173163
)
174164
side_dict = SizeDict(height=height, width=width)
175-
image = self.resize(
176-
image=image, size=side_dict, interpolation=interpolation
177-
)
165+
image = self.resize(image=image, size=side_dict, interpolation=interpolation)
178166

179-
image = self.rescale_and_normalize(
180-
image, do_rescale, rescale_factor, do_normalize, image_mean, image_std
181-
)
167+
image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std)
182168

183169
# (num_channels, height, width) -> (num_patches, patch_size * patch_size * num_channels)
184170
patches = convert_image_to_patches(image, patch_size)

0 commit comments

Comments
 (0)