"""Extract and cache LLM text embeddings for the retrieval evaluation sets
(Flickr30k, COCO, ShareGPT4V, Urban1k, DOCCI) with an LLM2Vec-wrapped
LLM2CLIP Llama-3-8B text encoder."""

import os
import json
import logging
from typing import List, Dict, Any

import torch
from llm2vec import LLM2Vec
from transformers import AutoModel, AutoConfig, AutoTokenizer

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

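# Per-dataset annotation paths, dataset roots, and output filenames for the cached text features.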
CONFIG = {
    "llm_model_name": "microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned",
    "flickr": {
        "ann_path": "eval_data/flickr30k/test.json",
        "root": "eval_data/flickr30k/",
        "save_filename": "flickr30k_8B_llm_features.dpt"
    },
    "coco": {
        "ann_path": "eval_data/coco/coco_karpathy_test.json",
        "root": "eval_data/coco/",
        "save_filename": "coco_8B_llm_features.dpt"
    },
    "sharegpt4v": {
        "path": "eval_data/sharegpt4v/share-captioner_coco_lcs_sam_1246k_1107.json",
        "ann_path": "eval_data/sharegpt4v/validation_1k.json",
        "root": "eval_data/sharegpt4v/",
        "save_filename": "sv_8B_llm_features.dpt",
        "total_len": 1000
    },
    "urban1k": {
        "ann_path": "eval_data/Urban1k/test.json",
        "root": "eval_data/Urban1k",
        "save_filename": "urban1k_8B_llm_features.dpt"
    },
    "docci": {
        "path": "eval_data/docci/docci_descriptions.jsonlines",
        "ann_path": "eval_data/docci/test.json",
        "root": "eval_data/docci",
        "save_filename": "docci_8B_llm_features.dpt"
    }
}

def load_json(file_path: str) -> List[Dict[str, Any]]:
    try:
        with open(file_path, 'r') as f:
            return json.load(f)
    except Exception as e:
        logging.error(f"Failed to load JSON file {file_path}: {e}")
        raise

def save_embeddings(embeddings: torch.Tensor, save_path: str) -> None:
    try:
        torch.save(embeddings, save_path)
        logging.info(f"Embeddings saved to {save_path}")
    except Exception as e:
        logging.error(f"Failed to save embeddings to {save_path}: {e}")
        raise

def process_multi_texts_dataset(data: List[Dict[str, Any]], llm_model: LLM2Vec, save_path: str) -> None:
    # Every item is expected to carry the same number of captions (e.g. 5 per image for
    # Flickr30k/COCO); flatten for encoding, then reshape to (images, captions_per_image, dim).
    texts = [caption for item in data for caption in item['caption']]
    with torch.no_grad():
        embeddings = llm_model.encode(texts, convert_to_tensor=True, batch_size=196)

    texts_num = len(data[0]['caption'])
    embeddings = embeddings.view(-1, texts_num, embeddings.size(-1))
    save_embeddings(embeddings, save_path)

def process_dataset(texts: List[str], llm_model: LLM2Vec, save_path: str) -> None:
    with torch.no_grad():
        embeddings = llm_model.encode(texts, convert_to_tensor=True, batch_size=128)
    save_embeddings(embeddings, save_path)

def flickr(llm_model: LLM2Vec) -> None:
    config = CONFIG["flickr"]
    data = load_json(config["ann_path"])
    save_path = os.path.join(config["root"], config["save_filename"])
    process_multi_texts_dataset(data, llm_model, save_path)


def coco(llm_model: LLM2Vec) -> None:
    config = CONFIG["coco"]
    data = load_json(config["ann_path"])
    save_path = os.path.join(config["root"], config["save_filename"])
    process_multi_texts_dataset(data, llm_model, save_path)

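# ShareGPT4V, Urban1k, and DOCCI do not ship test annotations in the format above,
# so each driver below first builds and writes its own ann_path file before encoding.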
def sharegpt4v(llm_model: LLM2Vec) -> None:
    config = CONFIG["sharegpt4v"]
    data = load_json(config["path"])[:config["total_len"]]
    # The caption is the assistant turn of each ShareGPT4V conversation.
    captions = []
    for it in data:
        captions.append({'caption': it['conversations'][1]['value'], 'image': it['image']})

    with open(config['ann_path'], 'w') as f:
        json.dump(captions, f)

    texts = [item['caption'] for item in captions]
    save_path = os.path.join(config["root"], config["save_filename"])
    process_dataset(texts, llm_model, save_path)


def urban1k(llm_model: LLM2Vec) -> None:
    config = CONFIG["urban1k"]
    eval_data = []
    for i in range(1, 1001):
        caption_path = os.path.join(config["root"], f'caption/{i}.txt')
        image_path = os.path.join(config["root"], f'image/{i}.jpg')
        with open(caption_path, 'r') as f:
            caption = f.readlines()[0]
        eval_data.append({'caption': caption, 'image': image_path})

    with open(config['ann_path'], 'w') as f:
        json.dump(eval_data, f)

    texts = [item['caption'] for item in eval_data]
    save_path = os.path.join(config["root"], config["save_filename"])
    process_dataset(texts, llm_model, save_path)

def docci(llm_model: LLM2Vec) -> None:
    config = CONFIG["docci"]
    eval_data = []
    # docci_descriptions.jsonlines holds one JSON record per line; keep the test split only.
    with open(config["path"], 'r') as f:
        for line in f:
            dic = json.loads(line)
            if dic['split'] == "test":
                eval_data.append({'caption': dic['description'], 'image': dic['image_file']})

    with open(config['ann_path'], 'w') as f:
        json.dump(eval_data, f)

    texts = [item['caption'] for item in eval_data]
    save_path = os.path.join(config["root"], config["save_filename"])
    process_dataset(texts, llm_model, save_path)

def main() -> None:
    llm_model_name = CONFIG["llm_model_name"]
    config = AutoConfig.from_pretrained(llm_model_name, trust_remote_code=True)
    llm_model = AutoModel.from_pretrained(
        llm_model_name,
        torch_dtype=torch.bfloat16,
        config=config,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
    # The checkpoint is a fine-tune of Meta-Llama-3-8B-Instruct; point the config back
    # at the base-model name before wrapping it with LLM2Vec.
    llm_model.config._name_or_path = "meta-llama/Meta-Llama-3-8B-Instruct"
    model = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)

    flickr(model)
    coco(model)
    sharegpt4v(model)
    urban1k(model)
    docci(model)


if __name__ == '__main__':
    main()
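# Usage sketch (assumes the default paths in CONFIG): each *.dpt file written above is a
# plain torch tensor, so downstream evaluation code can reload it directly, e.g.
#
#   feats = torch.load(os.path.join(CONFIG["coco"]["root"], CONFIG["coco"]["save_filename"]))
#   # shape (num_images, captions_per_image, dim) for Flickr30k/COCO,
#   # shape (num_captions, dim) for ShareGPT4V, Urban1k, and DOCCI.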