Skip to content

Commit f07f7e0

Browse files
committed
add fgovd eval
1 parent 2205809 commit f07f7e0

File tree

3 files changed

+118
-235
lines changed

3 files changed

+118
-235
lines changed

fgclip2/eval/fgovd.py

Lines changed: 0 additions & 233 deletions
This file was deleted.

fgclip2/eval/fgovd_bbox_roi.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import torch
2+
import glob
3+
import argparse
4+
import os
5+
import json
6+
from tqdm import tqdm
7+
import itertools
8+
import numpy as np
9+
from fgclip2.model.strcs.fgclip2 import FG_CLIP2_Model
10+
from fgclip2.model.strcs.image_processing_fgclip2 import Fgclip2ImageProcessor
11+
from transformers import AutoTokenizer
12+
13+
from PIL import Image
14+
import numpy as np
15+
16+
def normalize_and_tensorize_boxes_naflex(bbox, image_width, image_height, real_w, real_h):
    """Rescale an (x, y, w, h) pixel box to the naflex-processed resolution.

    The box is given in original-image pixel coordinates; the processed image
    has size (real_w, real_h). Returns a float32 tensor of shape (1, 5) laid
    out as [batch_index, x1, y1, x2, y2] with batch_index fixed to 0, which is
    the ROI format expected by the box-feature extractor.
    """
    x, y, w, h = bbox
    corners = [
        (x / image_width) * real_w,          # left edge, scaled
        (y / image_height) * real_h,         # top edge, scaled
        ((x + w) / image_width) * real_w,    # right edge, scaled
        ((y + h) / image_height) * real_h,   # bottom edge, scaled
    ]
    return torch.tensor([[0, *corners]], dtype=torch.float32)
26+
27+
def eval_fgovd(model, image_processor, tokenizer, device, args):
    """Run FG-OVD box-level retrieval evaluation.

    Each line of ``args.ann_file`` is a JSON object with an image path, a
    bounding box, one positive expression list and a list of hard negatives.
    The ROI feature of the box is compared against all captions; a sample
    counts as correct when the positive caption (index 0) attains the maximum
    similarity. The running score is printed after every sample.

    Fixes vs. the original: removed unused locals (``left``/``top``/``right``/
    ``bottom``, ``image_size``, ``patch_size``), the redundant inner
    ``torch.no_grad()`` (already inside one), and the dead ``else: pass``.
    """
    pred_true = 0
    index_i = 0
    image_folder = args.image_folder
    ann_file = args.ann_file
    with torch.no_grad():
        with open(ann_file, 'r') as file:
            jsonlist = file.readlines()
        itemnum = len(jsonlist)

        for item in jsonlist:
            msg = json.loads(item)
            image_path = os.path.join(image_folder, msg["img_path"])
            # Positive caption(s) first, then the hard negatives; the correct
            # answer is therefore always at index 0.
            captions = msg["pos_expression"] + msg["neg_expression"]
            captions = [caption.lower() for caption in captions]

            boxmsg = msg["bbox"]
            bbox = (boxmsg[0], boxmsg[1], boxmsg[2], boxmsg[3])  # (x, y, w, h)

            img = Image.open(image_path).convert('RGB')
            image_width, image_height = img.size
            image_input = image_processor(images=img, return_tensors="pt").to(device)
            # spatial_shapes[0] is the (h, w) actually used by the naflex
            # processor for this image — presumably in patch-grid units; the
            # box is rescaled into that coordinate system.
            spatial_values = image_input["spatial_shapes"][0]
            real_h = spatial_values[0].item()
            real_w = spatial_values[1].item()
            boxinfo_tensor = normalize_and_tensorize_boxes_naflex(
                bbox, image_width, image_height, real_w, real_h
            )
            boxinfo_tensor = boxinfo_tensor.to(device).unsqueeze(dim=0)

            image_features = model.get_image_box_roi_features(
                **image_input, box_info=boxinfo_tensor
            ).to(device)
            image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)

            caption_input = tokenizer(
                captions,
                max_length=args.max_length,
                padding="max_length",
                truncation=True,
                return_tensors='pt',
            ).to(device)
            text_features = model.get_text_features(**caption_input, walk_type=args.walk_type)
            text_features /= text_features.norm(dim=-1, keepdim=True)

            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
            # Correct when the positive caption (index 0) ties for the maximum
            # score (torch.equal also accepts an exact tie, as before).
            max_value = torch.max(similarity[0])
            if torch.equal(max_value, similarity[0][0]):
                pred_true += 1
            index_i += 1
            # NOTE(review): denominator is the full set size, so intermediate
            # prints are a lower bound; the final line is the true precision.
            print(index_i, " / ", itemnum, " precision: ", pred_true / itemnum)
84+
85+
def eval_model(args):
    """Build the FG-CLIP2 model, tokenizer and image processor, then run FG-OVD."""
    # Only the naflex (variable-resolution) path is supported by this script.
    assert args.naflex

    tokenizer = AutoTokenizer.from_pretrained(args.model_base)
    image_processor = Fgclip2ImageProcessor.from_pretrained(args.model_base)
    model = (
        FG_CLIP2_Model.from_pretrained(args.model_path, device_map="cuda")
        .cuda()
        .eval()
    )

    eval_fgovd(model, image_processor, tokenizer, model.device, args)
93+
94+
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Checkpoints: weights (--model-path) and processor/tokenizer config (--model-base).
    parser.add_argument("--model-path", type=str, default="qihoo360/fg-clip2-base")
    parser.add_argument("--model-base", type=str, default="qihoo360/fg-clip2-base")
    # Preprocessing options.
    parser.add_argument("--max_length", type=int, default=64)
    parser.add_argument("--image_size", type=int, default=224, help='for no-naflex siglip2')
    # NOTE(review): store_true with default=True means this flag is always on.
    parser.add_argument("--naflex", action='store_true', default=True)
    parser.add_argument("--walk_type", type=str, default="box")
    # Dataset locations.
    parser.add_argument("--image-folder", type=str, default="data/coco/")
    parser.add_argument("--ann_file", type=str, default="")

    eval_model(parser.parse_args())

scripts/eval/eval.sh

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@ basename="fgclip2-base-patch16/"
6464
# --walk_type box \
6565

6666

67-
6867
# S EVAL LVIS-BOXCLS
6968
# lvis_box_ann="lvis/lvis_v1_val.json"
7069
# torchrun --master_port=8888 --nproc_per_node 8 -m fgclip2.eval.in1k.lvis_box_cls_ddp \
@@ -83,7 +82,18 @@ basename="fgclip2-base-patch16/"
8382
# --max_length 64 \
8483
# --ann_file $bcn_box_ann \
8584
# --walk_type box \
86-
85+
86+
87+
# S EVAL FGOVD
88+
# fg_ovd_ann="1_attributes_llava.jsonl"
89+
# python -m fgclip2.eval.fgovd_bbox_roi \
90+
# --model-path $INIT_MODEL_PATH/$basename \
91+
# --model-base $INIT_MODEL_PATH/$basename \
92+
# --max_length 64 \
93+
# --ann_file $fg_ovd_ann \
94+
# --image-folder data/coco \
95+
# --walk_type box \
96+
8797

8898
# S EVAL ShareGPT4V
8999
# ShareGPT4V_ann="share-captioner_coco_lcs_sam_1246k_1107.json"

0 commit comments

Comments
 (0)