
Commit 5bb019b

Merge remote-tracking branch 'dev/spatial_video_loader' into temp_work

2 parents 5edb39b + 050d5d2

4 files changed: +297 -17 lines

llava/action/chatgpt_utils.py

Lines changed: 43 additions & 17 deletions
@@ -2,6 +2,8 @@
 import io
 import json
 import os
+import sys
+sys.path[0] = os.path.dirname(os.path.dirname(sys.path[0]))
 import numpy as np
 import openai
 from pydantic import BaseModel
@@ -12,7 +14,7 @@
 from tqdm import tqdm
 import csv
 import llava
-from llava.action.utils import avion_video_loader, create_multi_choice_from_avion_predictions, generate_label_map, AvionMultiChoiceGenerator
+from llava.action.utils import avion_video_loader, create_multi_choice_from_avion_predictions, generate_label_map, AvionMultiChoiceGenerator, avion_video_render_loader
 from llava.action.dataset import datetime2sec
 
 client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
@@ -34,6 +36,9 @@ def generate_prompt(cls, start_second, end_second, option_text, gt_answer):
 You are seeing video frames from an egocentric view of a person.
 Please talk as if you are the person in the video and describe what action you are performing.
 To assist you for how to describe the action, the video's start time is {start_second} and the end time is {end_second} and the duration is {end_second - start_second} seconds.
+Meanwhile, the left hand region is marked as 'L' in a blue bounding box and the right hand region is marked as 'R' in a red bounding box.
+The contact information is also provided in the bounding box tags, with 'N' for no contact, 'S' for self contact, 'O' for other person contact, 'P' for portable object contact, and 'F' for stationary object contact.
+The contacted objects are also marked as 'O' in yellow bounding boxes.
 To further assist you for how to describe the action, note that in a multi-choice video question answering, you were given following options {option_text} and the correct answer is {gt_answer}.
 In addition to describe what you see, describe why wrong answers were wrong and why right answer was right.
 When you explain why wrong answers were wrong and why right answer was right, you should use the following flow of reasoning:
@@ -192,7 +197,8 @@ def prepare_multiple_images(self, images):
         return multi_image_content
 
     def extract_frames(self, vid_path, start_second, end_second):
-        frames, time_meta = avion_video_loader(self.root,
+        frames, time_meta = avion_video_render_loader(self.root, self.handobj_root,
+        # frames, time_meta = avion_video_loader(self.root,
                                                vid_path,
                                                'MP4',
                                                start_second,
@@ -216,6 +222,7 @@ def __init__(self,
                  root,
                  annotation_file,
                  avion_prediction_file,
+                 handobj_root,
                  clip_length = 4,
                  action_representation = 'GT_random_narration',
                  debug = False,
@@ -233,6 +240,7 @@ def __init__(self,
         self.topk = topk
         self.annotation_file = annotation_file
         self.avion_prediction_file = avion_prediction_file
+        self.handobj_root = handobj_root
         self.annotation_root = Path(annotation_file).parent
         self.action_representation = action_representation
         self.labels, self.mapping_vn2narration, self.mapping_vn2act, self.verb_maps, self.noun_maps = generate_label_map(self.annotation_root,
@@ -289,7 +297,7 @@ def init_data(self):
     def multi_process_run(self):
         # to initialize it
 
-        indices = list(range(len(self.data)))
+        indices = list(range(len(self.data)))[:500]
 
         num_chunks = os.cpu_count() if not self.debug else 2
 
@@ -312,8 +320,11 @@ def multi_process_run(self):
 
         self.checkpoint(combined_results, "gpt_inference_results.json")
 
-    def run(self, indices):
-        data_batch = {i : self.data[i] for i in range(len(self.data)) if i in indices}
+    def run(self, indices=None):
+        if indices is None:
+            data_batch = {i : self.data[i] for i in range(len(self.data)) if i in list(range(len(self.data)))}
+        else:
+            data_batch = {i : self.data[i] for i in range(len(self.data)) if i in indices}
         ret = {}
 
         for k,v in tqdm(data_batch.items()):
@@ -337,7 +348,11 @@ def run(self, indices):
             }
             if self.debug:
                 break
-        return ret
+        if indices is None:
+            calculation = calculate_gpt_accuracy(data = ret)
+            self.checkpoint(ret, "gpt_inference_results.json")
+        else:
+            return ret
 
 
 
@@ -508,6 +523,7 @@ def multi_process_annotate(train_file_path, root, debug = False, anno_type = 'gp
 def multi_process_inference(root,
                             annotation_file,
                             avion_prediction_file,
+                            handobj_root,
                             action_representation = 'GT_random_narration',
                             clip_length = 4,
                             topk = 5,
@@ -516,11 +532,15 @@ def multi_process_inference(root,
     annotator = GPTInferenceAnnotator(root,
                                       annotation_file,
                                       avion_prediction_file,
+                                      handobj_root,
                                       clip_length = clip_length,
                                       debug = debug,
                                       action_representation = action_representation,
                                       topk = topk)
 
+    # indices = list(range(len(annotator.data)))[:100]
+    # annotator.run()
+
    annotator.multi_process_run()
 
 def calculate_gpt_accuracy(path = None, data = None):
@@ -557,10 +577,15 @@ def convert_json_to_jsonl(path):
 
 if __name__ == '__main__':
 
-    train_file_path = '/data/epic_kitchen/AVION_PREDS/avion_mc_top5_GT_random_narration/train_convs_narration.jsonl'
-    root = '/data/EK100/EK100_320p_15sec_30fps_libx264'
-    val_file = '/data/epic_kitchen/epic-kitchens-100-annotations/EPIC_100_validation.csv'
-    avion_prediction_file = '/data/epic_kitchen/AVION_PREDS/avion_pred_ids_val.json'
+    # train_file_path = '/data/epic_kitchen/AVION_PREDS/avion_mc_top5_GT_random_narration/train_convs_narration.jsonl'
+    # root = '/data/EK100/EK100_320p_15sec_30fps_libx264'
+    # val_file = '/data/epic_kitchen/epic-kitchens-100-annotations/EPIC_100_validation.csv'
+    # avion_prediction_file = '/data/epic_kitchen/AVION_PREDS/avion_pred_ids_val.json'
+
+    root = '/mediaPFM/data/haozhe/onevision/llava_video/EK100'
+    val_file = '/mediaPFM/data/haozhe/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv'
+    avion_prediction_file = '/mediaPFM/data/haozhe/EK100/EK100_in_LLAVA/avion_pred_ids_val.json'
+    handobj_root = '/mnt/SV_storage/VFM/hand_object_detector/Save_dir'
 
 
 
@@ -569,13 +594,14 @@ def convert_json_to_jsonl(path):
 
     # multi_process_annotate(train_file_path, root, debug = False, n_samples = 10000)
 
-    # multi_process_inference(root,
-    #                         val_file,
-    #                         avion_prediction_file,
-    #                         debug = True,
-    #                         clip_length = 4,
-    #                         topk = 5)
+    multi_process_inference(root,
+                            val_file,
+                            avion_prediction_file,
+                            handobj_root,
+                            debug = False,
+                            clip_length = 4,
+                            topk = 5)
 
     #calculate_gpt_accuracy('valset_chatgpt_inference_results/gpt-4o-avion_top10_4frames_fixed_narration.json')
 
-    convert_json_to_jsonl('train_anno_gpt-gt-reason_4_10000.json')
+    # convert_json_to_jsonl('train_anno_gpt-gt-reason_4_10000.json')
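
Note: `avion_video_render_loader` is imported from `llava.action.utils`, whose diff is not part of this capture. Judging from the call site in `extract_frames` and the `render_utils.py` module added below, it presumably decodes the clip exactly as `avion_video_loader` does and then overlays the hand/object detections stored under `handobj_root`. A minimal sketch of that assumed behavior (the detection-file layout, the column names, and the `time_meta['frame_ids']` key are illustrative assumptions, not the actual implementation):

# Hypothetical sketch only; the real loader lives in llava.action.utils.
# Assumes one CSV of per-frame hand_object_detector outputs per video under
# handobj_root, with 'frame', 'hand_dets', and 'obj_dets' columns, and that
# time_meta exposes the sampled frame indices as 'frame_ids'.
import csv
import os

from llava.action.render_utils import render_frame

def avion_video_render_loader(root, handobj_root, vid_path, ext,
                              start_second, end_second, **kwargs):
    # Decode the clip the same way avion_video_loader does
    # (avion_video_loader is assumed in scope, as in llava.action.utils).
    frames, time_meta = avion_video_loader(root, vid_path, ext,
                                           start_second, end_second, **kwargs)
    # Load this video's detections (assumed file layout).
    det_file = os.path.join(handobj_root, vid_path + '.csv')
    with open(det_file) as f:
        dets = {int(row['frame']): row for row in csv.DictReader(f)}
    # Overlay the 'L'/'R' hand boxes, contact tags, and 'O' object boxes
    # referenced by the new prompt onto every sampled frame.
    rendered = []
    for frame_id, frame in zip(time_meta['frame_ids'], frames):
        row = dets.get(frame_id)
        rendered.append(frame if row is None
                        else render_frame(frame, row['hand_dets'], row['obj_dets']))
    return rendered, time_meta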

llava/action/render_utils.py

Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
+import numpy as np
+import cv2
+import ast
+from PIL import Image, ImageDraw, ImageFont
+
+color_rgb = [(255,255,0), (255, 128,0), (128,255,0), (0,128,255), (0,0,255), (127,0,255), (255,0,255), (255,0,127), (255,0,0), (255,204,153), (255,102,102), (153,255,153), (153,153,255), (0,0,153)]
+color_rgba = [(255,255,0,70), (255, 128,0,70), (128,255,0,70), (0,128,255,70), (0,0,255,70), (127,0,255,70), (255,0,255,70), (255,0,127,70), (255,0,0,70), (255,204,153,70), (255,102,102,70), (153,255,153,70), (153,153,255,70), (0,0,153,70)]
+
+
+hand_rgb = [(0, 90, 181), (220, 50, 32)]
+hand_rgba = [(0, 90, 181, 70), (220, 50, 32, 70)]
+
+obj_rgb = (255, 194, 10)
+obj_rgba = (255, 194, 10, 70)
+
+side_map = {'l':'Left', 'r':'Right'}
+side_map2 = {0:'Left', 1:'Right'}
+side_map3 = {0:'L', 1:'R'}
+state_map = {0:'No Contact', 1:'Self Contact', 2:'Another Person', 3:'Portable Object', 4:'Stationary Object'}
+state_map2 = {0:'N', 1:'S', 2:'O', 3:'P', 4:'F'}
+
+vis_settings = {'font_size':20, 'line_width':2, 'point_radius':4, 'hand_color':hand_rgb, 'hand_alpha':[None, None], 'obj_color':obj_rgb, 'obj_alpha':None, 'text_alpha':(255, 255, 255, 255)}
+
+def calculate_center(bb):
+    return [(bb[0] + bb[2])/2, (bb[1] + bb[3])/2]
+
+def filter_object(obj_dets, hand_dets):
+    filtered_object = []
+    object_cc_list = []
+    for j in range(obj_dets.shape[0]):
+        object_cc_list.append(calculate_center(obj_dets[j,:4]))
+    object_cc_list = np.array(object_cc_list)
+    img_obj_id = []
+    for i in range(hand_dets.shape[0]):
+        if hand_dets[i, 5] <= 0:
+            img_obj_id.append(-1)
+            continue
+        hand_cc = np.array(calculate_center(hand_dets[i,:4]))
+        point_cc = np.array([(hand_cc[0]+hand_dets[i,6]*10000*hand_dets[i,7]), (hand_cc[1]+hand_dets[i,6]*10000*hand_dets[i,8])])
+        dist = np.sum((object_cc_list - point_cc)**2,axis=1)
+        dist_min = np.argmin(dist)
+        img_obj_id.append(dist_min)
+    return img_obj_id
+
+def draw_obj_mask(image, draw, obj_idx, obj_bbox, obj_score, width, height):
+    font = ImageFont.truetype('llava/action/times_b.ttf', size=vis_settings['font_size'])
+    mask = Image.new('RGBA', (width, height))
+    pmask = ImageDraw.Draw(mask)
+    pmask.rectangle(obj_bbox, outline=vis_settings['obj_color'], width=vis_settings['line_width'], fill=vis_settings['obj_alpha'])
+    image.paste(mask, (0,0), mask)
+
+    draw.rectangle([obj_bbox[0], max(0, obj_bbox[1]-vis_settings['font_size']), obj_bbox[0]+vis_settings['font_size']+2,
+                    max(0, obj_bbox[1]-vis_settings['font_size'])+vis_settings['font_size']],
+                   fill=vis_settings['text_alpha'], outline=vis_settings['obj_color'], width=vis_settings['line_width'])
+    draw.text((obj_bbox[0]+5, max(0, obj_bbox[1]-vis_settings['font_size'])-2), f'O', font=font, fill=(0,0,0)) #
+
+    return image
+
+def draw_hand_mask(image, draw, hand_idx, hand_bbox, hand_score, side, state, width, height):
+    font = ImageFont.truetype('llava/action/times_b.ttf', size=vis_settings['font_size'])
+    if side == 0:
+        side_idx = 0
+    elif side == 1:
+        side_idx = 1
+    mask = Image.new('RGBA', (width, height))
+    pmask = ImageDraw.Draw(mask)
+    pmask.rectangle(hand_bbox, outline=vis_settings['hand_color'][side_idx], width=vis_settings['line_width'], fill=vis_settings['hand_alpha'][side_idx])
+    image.paste(mask, (0,0), mask)
+    # text
+
+    draw = ImageDraw.Draw(image)
+    draw.rectangle([hand_bbox[0], max(0, hand_bbox[1]-vis_settings['font_size']), hand_bbox[0]+vis_settings['font_size']*2+2,
+                    max(0, hand_bbox[1]-vis_settings['font_size'])+vis_settings['font_size']],
+                   fill=vis_settings['text_alpha'], outline=vis_settings['hand_color'][side_idx], width=vis_settings['line_width'])
+    draw.text((hand_bbox[0]+6, max(0, hand_bbox[1]-vis_settings['font_size'])-2), f'{side_map3[int(float(side))]}-{state_map2[int(float(state))]}', font=font, fill=(0,0,0)) #
+
+    return image
+
+def draw_line_point(draw, side_idx, hand_center, object_center):
+
+    draw.line([hand_center, object_center], fill=vis_settings['hand_color'][side_idx], width=vis_settings['line_width'])
+    x, y = hand_center[0], hand_center[1]
+    r=vis_settings['point_radius']
+    draw.ellipse((x-r, y-r, x+r, y+r), fill=vis_settings['hand_color'][side_idx])
+    x, y = object_center[0], object_center[1]
+    draw.ellipse((x-r, y-r, x+r, y+r), fill=vis_settings['obj_color'])
+
+def vis_detections_PIL(im, class_name, dets, thresh=0.8):
+    """Visual debugging of detections."""
+
+    image = Image.fromarray(im).convert("RGBA")
+    draw = ImageDraw.Draw(image)
+    width, height = image.size
+
+    for hand_idx, i in enumerate(range(np.minimum(10, dets.shape[0]))):
+        bbox = list(int(np.round(x)) for x in dets[i, :4])
+        score = dets[i, 4]
+        lr = dets[i, -1]
+        state = dets[i, 5]
+        if score > thresh:
+            image = draw_hand_mask(image, draw, hand_idx, bbox, score, lr, state, width, height)
+
+    return image
+
+def vis_detections_filtered_objects_PIL(im, obj_dets, hand_dets, thresh_hand=0.8, thresh_obj=0.01):
+
+    # convert to PIL
+    im = im[:,:,::-1]
+    image = Image.fromarray(im).convert("RGBA")
+    draw = ImageDraw.Draw(image)
+    width, height = image.size
+
+    if (obj_dets is not None) and (hand_dets is not None):
+        img_obj_id = filter_object(obj_dets, hand_dets)
+        for obj_idx, i in enumerate(range(np.minimum(10, obj_dets.shape[0]))):
+            bbox = list(int(np.round(x)) for x in obj_dets[i, :4])
+            score = obj_dets[i, 4]
+            if score > thresh_obj and i in img_obj_id:
+                # viz obj by PIL
+                image = draw_obj_mask(image, draw, obj_idx, bbox, score, width, height)
+
+        for hand_idx, i in enumerate(range(np.minimum(10, hand_dets.shape[0]))):
+            bbox = list(int(np.round(x)) for x in hand_dets[i, :4])
+            score = hand_dets[i, 4]
+            lr = hand_dets[i, -1]
+            state = hand_dets[i, 5]
+            if score > thresh_hand:
+                # viz hand by PIL
+                image = draw_hand_mask(image, draw, hand_idx, bbox, score, lr, state, width, height)
+
+                if state > 0: # in contact hand
+
+                    obj_cc, hand_cc = calculate_center(obj_dets[img_obj_id[i],:4]), calculate_center(bbox)
+                    # viz line by PIL
+                    if lr == 0:
+                        side_idx = 0
+                    elif lr == 1:
+                        side_idx = 1
+                    draw_line_point(draw, side_idx, (int(hand_cc[0]), int(hand_cc[1])), (int(obj_cc[0]), int(obj_cc[1])))
+
+    elif hand_dets is not None:
+        image = vis_detections_PIL(im, 'hand', hand_dets, thresh_hand)
+
+    return image
+
+def render_frame(im, hand_dets, obj_dets, thresh_hand=0.5, thresh_obj=0.5):
+    im_show = im.copy()
+    im_show = cv2.cvtColor(im_show, cv2.COLOR_RGB2BGR)
+    hand_dets = np.array(ast.literal_eval(hand_dets)) if hand_dets != '[]' else None
+    obj_dets = np.array(ast.literal_eval(obj_dets)) if obj_dets != '[]' else None
+    im_show = vis_detections_filtered_objects_PIL(im_show, obj_dets, hand_dets, thresh_hand, thresh_obj)
+    # im_show.save('test.png')
+    im_show = np.array(im_show)
+    return im_show
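
`render_frame` is the module's entry point: it parses the stringified detection arrays, draws the left (blue 'L') and right (red 'R') hand boxes with their contact-state tags, yellow 'O' boxes for in-contact objects, and hand-to-object contact lines, then returns the frame as an RGBA numpy array. A standalone usage sketch (the image path and detection values are made up; the ten columns per row follow how the code above indexes them: x1, y1, x2, y2, score, contact state, offset magnitude, offset dx, offset dy, side):

# Hypothetical usage sketch; input values are illustrative only.
# Run from the repo root so the relative font path
# 'llava/action/times_b.ttf' used by the drawing helpers resolves.
import cv2

from llava.action.render_utils import render_frame

# render_frame expects an RGB frame (it handles channel conversion itself).
frame = cv2.cvtColor(cv2.imread('frame_000123.jpg'), cv2.COLOR_BGR2RGB)

# One left hand (side=0) in portable-object contact (state=3) and one object.
hand_dets = '[[10.0, 20.0, 120.0, 220.0, 0.95, 3.0, 0.5, 0.1, 0.2, 0.0]]'
obj_dets = '[[130.0, 40.0, 260.0, 180.0, 0.80, 0.0, 0.0, 0.0, 0.0, 0.0]]'

rendered = render_frame(frame, hand_dets, obj_dets)  # RGBA numpy array
cv2.imwrite('rendered.png', cv2.cvtColor(rendered, cv2.COLOR_RGBA2BGR))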

llava/action/times_b.ttf

93.4 KB
Binary file not shown.
