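GPT testing also supports benchmark mode: pass `benchmark_testing` through to `test_generate` and fix the misspelled `benchmark_tesitng` keyword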

yeshaokai · yeshaokai · commit 79829dba17d8 · 2025-02-05T21:19:34.000Z
diff --git a/llava/action/benchmark.py b/llava/action/benchmark.py
@@ -10,9 +10,10 @@
 n_frames = 4
 topk = 5
 action_representation = 'GT_random_narration'
-gpt_model = 'gpt-4o-mini-2024-07-18'
-# gpt_model = 'gpt-4o-2024-08-06'
-perspective = 'third_person'
+#gpt_model = 'gpt-4o-mini-2024-07-18'
+gpt_model = 'gpt-4o-2024-08-06'
+perspective = 'first_person'
+benchmark_testing = True
 
 
 def benchmark_avion_mcq(n_samples):
@@ -26,6 +27,7 @@ def benchmark_avion_mcq(n_samples):
                                         question_type = 'mc_',
                                         action_representation=action_representation,
                                         perspective = perspective,
+                                        benchmark_testing = benchmark_testing,
                                         topk = topk)
     inferencer.multi_process_run(n_samples)
                                        
@@ -40,6 +42,7 @@ def benchmark_tim_mcq(n_samples):
                                         question_type = 'mc_',
                                         action_representation=action_representation,
                                         perspective = perspective,
+                                        benchmark_testing = benchmark_testing,
                                         topk = topk) 
     inferencer.multi_process_run(n_samples)    
 
@@ -53,6 +56,7 @@ def benchmark_random_mcq(n_samples):
                                         question_type = 'mc_',
                                         action_representation=action_representation,
                                         perspective = perspective,
+                                        benchmark_testing = benchmark_testing,
                                         topk = topk) 
     
     inferencer.multi_process_run(n_samples)
@@ -61,4 +65,4 @@ def benchmark_random_mcq(n_samples):
 if __name__ == '__main__':
     benchmark_avion_mcq(100)
     benchmark_tim_mcq(100)
-    benchmark_random_mcq(100)    
+    #benchmark_random_mcq(100)    
diff --git a/llava/action/chatgpt_utils.py b/llava/action/chatgpt_utils.py
@@ -411,7 +411,7 @@ def init_data(self):
                                                             self.mapping_vn2narration,
                                                             self.verb_maps,
                                                             self.noun_maps,
-                                                            benchmark_tesitng = self.benchmark_testing,
+                                                            benchmark_testing = self.benchmark_testing,
                                                             is_train = False)
             else:
                 mc_data = self.mc_generator.generate_multi_choice(gt_vn,
diff --git a/llava/action/utils.py b/llava/action/utils.py
@@ -411,7 +411,8 @@ def generate_multi_choice(self,
                               mapping_vn2narration, 
                               verb_maps, 
                               noun_maps,
-                              is_train = True
+                              is_train = True,
+                              benchmark_testing = False
                               ):
 
         """
@@ -425,7 +426,7 @@ def generate_multi_choice(self,
         if is_train:
             return self.train_generate(gt_vn, narration, k, action_representation, n_narrations, labels, mapping_vn2narration, verb_maps, noun_maps)
         else:
-            return self.test_generate(gt_vn, narration, k, action_representation, n_narrations, labels, mapping_vn2narration, verb_maps, noun_maps)
+            return self.test_generate(gt_vn, narration, k, action_representation, n_narrations, labels, mapping_vn2narration, verb_maps, noun_maps, benchmark_testing = benchmark_testing)
     
     def train_generate(self, gt_vn, narration, k, action_representation, n_narrations, labels, mapping_vn2narration, verb_maps, noun_maps):
         # letters as A, B, C, D, .. Note we maximally support 26 letters