1+ import torch
2+ import glob
3+ import argparse
4+ import os
5+ import json
6+ from tqdm import tqdm
7+ import itertools
8+ import numpy as np
9+ from fgclip2 .model .strcs .fgclip2 import FG_CLIP2_Model
10+ from fgclip2 .model .strcs .image_processing_fgclip2 import Fgclip2ImageProcessor
11+ from transformers import AutoTokenizer
12+
13+ from PIL import Image
14+ import numpy as np
15+
def normalize_and_tensorize_boxes_naflex(bbox, image_width, image_height, real_w, real_h):
    """Rescale an (x, y, w, h) box from original-image coordinates into the
    processed (naflex) resolution and wrap it as a [1, 5] float32 tensor.

    The leading 0 in the row is the per-batch image index expected by the
    ROI-style box layout: [image_idx, x1, y1, x2, y2].
    """
    box_x, box_y, box_w, box_h = bbox
    # Corner coordinates mapped into the processed-resolution frame.
    # Arithmetic order matches the (coord / size) * real_size convention.
    corners = [
        (box_x / image_width) * real_w,
        (box_y / image_height) * real_h,
        ((box_x + box_w) / image_width) * real_w,
        ((box_y + box_h) / image_height) * real_h,
    ]
    return torch.tensor([[0, *corners]], dtype=torch.float32)
26+
def eval_fgovd(model, image_processor, tokenizer, device, args):
    """Run FG-OVD-style region/text retrieval evaluation.

    For each JSONL annotation, the ground-truth region is encoded with the
    model's box-ROI image head and compared against the positive
    expression(s) (listed first) followed by the negative expressions.
    A sample counts as correct when the caption at index 0 attains the
    maximum similarity.

    Fixes over the previous version: removed a redundant nested
    ``torch.no_grad()``, a dead ``else: pass`` branch, and unused locals
    (``image_size``, ``patch_size``, and the never-used integer crop
    coordinates).

    Args:
        model: FG_CLIP2_Model exposing get_image_box_roi_features and
            get_text_features.
        image_processor: naflex image processor; its output must include
            "spatial_shapes" (processed height/width per image).
        tokenizer: tokenizer producing inputs for get_text_features.
        device: torch device the model lives on.
        args: namespace with image_folder, ann_file, max_length, walk_type.
    """
    pred_true = 0
    index_i = 0
    with torch.no_grad():
        with open(args.ann_file, "r") as file:
            jsonlist = file.readlines()
        itemnum = len(jsonlist)

        for item in jsonlist:
            msg = json.loads(item)
            image_path = os.path.join(args.image_folder, msg["img_path"])
            # Positives first, negatives after: the correct answer is
            # therefore always similarity index 0.
            captions = [c.lower() for c in msg["pos_expression"] + msg["neg_expression"]]

            bbox = (msg["bbox"][0], msg["bbox"][1], msg["bbox"][2], msg["bbox"][3])

            img = Image.open(image_path).convert("RGB")
            image_width, image_height = img.size
            image_input = image_processor(images=img, return_tensors="pt").to(device)
            # naflex processing resizes per image; map the box into the
            # processed (real_h, real_w) frame before pooling.
            spatial_values = image_input["spatial_shapes"][0]
            real_h = spatial_values[0].item()
            real_w = spatial_values[1].item()
            boxinfo_tensor = normalize_and_tensorize_boxes_naflex(
                bbox, image_width, image_height, real_w, real_h
            )
            boxinfo_tensor = boxinfo_tensor.to(device).unsqueeze(dim=0)

            image_features = model.get_image_box_roi_features(
                **image_input, box_info=boxinfo_tensor
            ).to(device)
            image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)

            caption_input = tokenizer(
                captions,
                max_length=args.max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            ).to(device)
            text_features = model.get_text_features(**caption_input, walk_type=args.walk_type)
            text_features /= text_features.norm(dim=-1, keepdim=True)

            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
            # torch.equal-based check: a tie at the maximum still counts
            # as a correct prediction (preserves original behavior).
            if torch.equal(torch.max(similarity[0]), similarity[0][0]):
                pred_true += 1
            index_i += 1
            # NOTE: the denominator is the full dataset size, so the
            # intermediate prints under-report precision; only the final
            # line is the true result (original reporting kept).
            print(index_i, " / ", itemnum, " precision: ", pred_true / itemnum)
84+
def eval_model(args):
    """Instantiate processor, tokenizer, and model from the configured
    checkpoints, then run the FG-OVD evaluation loop."""
    # This script only supports the naflex (variable-resolution) pipeline.
    assert args.naflex

    processor = Fgclip2ImageProcessor.from_pretrained(args.model_base)
    text_tokenizer = AutoTokenizer.from_pretrained(args.model_base)
    clip_model = FG_CLIP2_Model.from_pretrained(args.model_path, device_map="cuda")
    clip_model = clip_model.cuda().eval()

    eval_fgovd(clip_model, processor, text_tokenizer, clip_model.device, args)
93+
if __name__ == "__main__":
    # Command-line interface for the FG-OVD evaluation script.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--model-path", type=str, default="qihoo360/fg-clip2-base")
    arg_parser.add_argument("--model-base", type=str, default="qihoo360/fg-clip2-base")
    arg_parser.add_argument("--max_length", type=int, default=64)
    arg_parser.add_argument("--image_size", type=int, default=224, help='for no-naflex siglip2')
    # NOTE(review): store_true with default=True means this flag can never
    # be turned off from the CLI — kept as-is to preserve behavior.
    arg_parser.add_argument("--naflex", action='store_true', default=True)
    arg_parser.add_argument("--walk_type", type=str, default="box")
    arg_parser.add_argument("--image-folder", type=str, default="data/coco/")
    arg_parser.add_argument("--ann_file", type=str, default="")

    eval_model(arg_parser.parse_args())