
Commit e5f99ce

more improvements
1 parent 36c7044 commit e5f99ce

2 files changed: +203 additions, -66 deletions


llava/action/chatgpt_utils.py

Lines changed: 188 additions & 56 deletions
@@ -36,52 +36,38 @@ def generate_prompt(cls, start_second, end_second, option_text, gt_answer):
 
         reason_mc_string = gt_answer
 
-        prompt = f"""Your job is to create 3 question and answer pairs based on the text below.
-        {reason_mc_string}
-        Example questions you can ask include. Note you are not limited to these questions:
-        What object the person is interacting with?
-        What objects are visible in the video?
-        What is the sequence of the atomic actions that the person is performing?
-        Make sure your only ask questions that can be answered with enough grounding in the text.
+        prompt = f"""Your job is to create 3 question-answer pairs based on the text below. The text contains a first-person narrative of video frames from an egocentric perspective of a person interacting with objects in a kitchen.
+        {reason_mc_string}
+        You can ask questions such as:
+        What object am I interacting with?
+        What objects are visible in the video?
+        What is the sequence of the atomic actions I am performing?
+
+        Make sure your questions can be answered based on the information provided in the text. Do not ask questions that require additional context or information beyond what is given.
 
         """
         return prompt
 
 
-class LLaVAWrongAnswerAwarePrompt:
-    """
-    The prompt for the annotation
-    """
+class GPTHandObjectPrompt:
     @classmethod
-    def generate_prompt(cls, start_second, end_second, option_text, gt_answer):
-        prompt = f"""
-        You are seeing video frames from an egocentric view of a person.
-        Please talk as if you are the person in the video and describe what action you are performing.
-        To assist you for how to describe the action, the video's start time is {start_second} and the end time is {end_second} and the duration is {end_second - start_second} seconds.
-
-        To further assist you for how to describe the action, note that in a multi-choice video question answering, you were given following options {option_text} and the correct answer is {gt_answer}.
-        In addition to describe what you see, describe why wrong answers were wrong and why right answer was right.
-        When you explain why wrong answers were wrong and why right answer was right, you should use the following flow of reasoning:
-
-        The flow of reasoning:
-        1. What objects need to be visible to support the answer?
-        2. Whether the duration in time supports that answer?
-
-        Based on the answers above, why right answer is right and why wrong answers were wrong."""
-        return prompt
-
-
-class GPTReasoningWithoutGTPrompt:
-    """
-    The perhaps simplest reasoning explanation.
-    """
-    @classmethod
-    def generate_prompt(cls, start_second, end_second, option_text, gt_answer):
-        prompt = f"""
-        You are seeing video frames from an egocentric view of a person. The person is interacting with objects in a kitchen.
-        Describe the action the person is performing. Pay attention to the objects the person's hands are interacting.
-        Explain in details what are the supporting evidences for the action. Useful evidences include the duration of the video, the objects the person is interacting with, and the context of the video.
-        """
+    def generate_prompt(cls, left_hand_state, right_hand_state, gt_narration):
+        prompt = f"""
+        You are a helpful AI assistant, and you will assist in creating question-answer pairs.
+        I will provide you with the state of the left hand and the right hand, as well as the ground-truth narration.
+        For the hand states:
+        - -1 denotes the hand is not visible
+        - 0 denotes the hand is visible but not interacting with objects
+        - 1 denotes the hand is interacting with another hand
+        - 3 denotes the hand is interacting with a portable object
+        - 4 denotes the hand is interacting with a stationary object
+
+        The state for the left hand is {left_hand_state}, and the state for the right hand is {right_hand_state}.
+        Using this information, create 3 question-answer pairs. Pretend you are seeing an image from a first-person perspective and can see your hands and the objects you are interacting with.
+        Do not ask questions about the action, as you are viewing an image and not a video.
+        Do not describe what the object is, only mention whether it's portable or stationary.
+        Ask and answer the questions in the first-person perspective.
+        """
         return prompt
 
 class GPTReasoningWithGTPrompt:
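For reference, a minimal sketch of how the new GPTHandObjectPrompt might be invoked; the hand-state codes follow the -1/0/1/3/4 convention listed in the prompt, and the concrete values and narration below are illustrative only:

    # Illustrative values, not taken from the dataset
    system_prompt = GPTHandObjectPrompt.generate_prompt(
        left_hand_state=3,        # left hand holds a portable object
        right_hand_state=0,       # right hand visible but not interacting
        gt_narration='open the fridge',
    )
    # The returned string becomes the system message in GPTHandObjectAnnotator.annotate,
    # added further down in this file.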
@@ -95,6 +81,7 @@ def generate_prompt(cls, start_second, end_second, option_text, gt_answer):
         Your reasoning steps should include supporting evidence for the action, such as the duration of the video, the sequence of actions the person performs, the objects they interact with, and the overall context of the video.
         As a general guideline, for videos longer than 3 seconds, provide detailed reasoning steps, and for videos shorter than 3 seconds, generate less detailed reasoning.
         The video duration is {end_second - start_second:.3f} seconds.
+        Make sure you use the first-person perspective in your reasoning.
         """
         print (prompt)
         return prompt
@@ -107,6 +94,17 @@ class GT_Augmentation_Response(BaseModel):
     disagree_with_human_annotation: bool
 
 
+class GPTHandObjectResponse(BaseModel):
+    """
+    The response for the GPTHandObjectPrompt
+    """
+    first_question: str
+    first_answer: str
+    second_question: str
+    second_answer: str
+    third_question: str
+    third_answer: str
+
 class ExpandReasonMCResponse(BaseModel):
     """
     The response for the ExpandReasonMCPrompt
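Since the annotator added below converts the parsed result with dict(...), a GPTHandObjectResponse reduces to a plain mapping of its six fields; a hedged sketch with made-up values:

    # Field values are invented for illustration
    parsed = GPTHandObjectResponse(
        first_question='Is my left hand visible?',
        first_answer='Yes, my left hand is visible and holding a portable object.',
        second_question='Is my right hand interacting with anything?',
        second_answer='No, my right hand is visible but not interacting with objects.',
        third_question='Is the object in my left hand portable or stationary?',
        third_answer='It is portable.',
    )
    gpt_answer = dict(parsed)   # same conversion used later in GPTHandObjectAnnotator.run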
@@ -119,12 +117,14 @@ class ExpandReasonMCResponse(BaseModel):
     third_answer: str
 
 PROMPT_FACTORY = {'gpt-gt-reason': GPTReasoningWithGTPrompt,
-                  'gpt-gt-instruct-reason': ExpandReasonMCPrompt}
+                  'gpt-gt-instruct-reason': ExpandReasonMCPrompt,
+                  'gpt-hand-object': GPTHandObjectPrompt}
 
 REQUIRES_VIS = set(['gpt-gt-reason'])
 
 RESPONSE_FACTORY = {'gpt-gt-reason': GT_Augmentation_Response,
-                    'gpt-gt-instruct-reason': ExpandReasonMCResponse}
+                    'gpt-gt-instruct-reason': ExpandReasonMCResponse,
+                    'gpt-hand-object': GPTHandObjectResponse}
 
 class ChatGPT:
     """
@@ -436,6 +436,108 @@ def predict_images(self, images, parsed_item):
 
         return response.choices[0].message.parsed
 
+class GPTHandObjectAnnotator(ChatGPT):
+    """
+    No need to see the video frames. Just annotate the hand and object
+    """
+    def __init__(self, ann_file, debug = False):
+        super().__init__()
+        self.ann_file = ann_file
+        self.anno_type = 'gpt-hand-object'
+        self.data = []
+        self.debug = debug
+        with open(ann_file, 'r') as f:
+            for line in f:
+                self.data.append(json.loads(line))
+
+    def parse_conversation_from_train_convs(self, item):
+        """
+        The item has the structure of convs defined in the train anno.
+        """
+        left_hand_state = item['left_hand_state']
+        right_hand_state = item['right_hand_state']
+        gt_narration = item['narration']
+
+        ret = {'left_hand_state': left_hand_state,
+               'right_hand_state': right_hand_state,
+               'gt_narration': gt_narration}
+
+        return ret
+
+    def run(self, indices):
+
+        ret = {}
+        for index in tqdm(indices):
+            item = self.data[index]
+            parsed_item = self.parse_conversation_from_train_convs(item)
+            print ('gt_narration', parsed_item['gt_narration'])
+            try:
+                gpt_answer = dict(self.annotate(parsed_item))
+            except Exception as e:
+                print ("An exception occurred: ", e)
+                continue
+            conversations = [{'from': 'human', 'value':''}, {'from': 'gpt', 'value': ''}]
+            item['conversations'] = conversations
+            item['conversations'][1]['value'] = gpt_answer
+            item['question_type'] = self.anno_type
+            ret[index] = item
+            print (item)
+            if self.debug:
+                break
+
+        return ret
+
+    def annotate(self, data_item):
+        """
+        Assuming that data_item already has the multi-choice options and the gt_answer
+        """
+        gt_narration = data_item['gt_narration']
+        left_hand_state = data_item['left_hand_state']
+        right_hand_state = data_item['right_hand_state']
+        temperature = 0
+        system_prompt = GPTHandObjectPrompt.generate_prompt(left_hand_state, right_hand_state, gt_narration)
+        system_message = [{"role": "system", "content": system_prompt}]
+
+        user_message = [{"role": "user", "content": ""}]
+
+        response = client.beta.chat.completions.parse(
+            model=GPT_MODEL,
+            messages=system_message + user_message,
+            response_format = RESPONSE_FACTORY[self.anno_type],
+            temperature = temperature
+        )
+
+        total_cost = self.calculate_cost(response)
+        ret = response.choices[0].message.parsed
+        return ret
+
+    def multi_process_run(self, n_samples = -1):
+        if n_samples == -1:
+            indices = list(range(len(self.data)))
+        else:
+            indices = list(range(n_samples))[:n_samples]
+
+        sample_suffix = 'all' if n_samples == -1 else str(n_samples)
+
+        num_cores = os.cpu_count() * 2 if not self.debug else 2
+        indices_groups = self.split_indices(indices, num_cores)
+        with ProcessPoolExecutor(max_workers=num_cores) as executor:
+            # Pass additional arguments to the function
+            futures = [executor.submit(self.run, group) for group in indices_groups]
+
+            # Wait for all futures to complete
+            combined_results = {}
+            for future in futures:
+                result_dict = future.result()
+                combined_results.update(result_dict)
+
+        if self.debug:
+            self.checkpoint(combined_results, 'train_anno_debug.json')
+        else:
+            self.checkpoint(combined_results, f"train_anno_{self.anno_type}_{sample_suffix}.json")
+        print ('finished the annotation')
+        return combined_results
+
 
 class GPTAugmentationAnnotator(ChatGPT):
     """
@@ -513,7 +615,6 @@ def multi_process_run(self, n_samples = -1):
         return combined_results
 
     def run(self, indices):
-
         ret = {}
         for index in tqdm(indices):
             item = self.data[index]
@@ -646,7 +747,21 @@ def convert_json_to_jsonl(path):
         for k,v in data.items():
             json.dump(v, f)
             f.write('\n')
-def convert_instruct_json_to_jsonl(path):
+
+def calc_disagree_ratio_from_jsonl(path):
+    # note it's a jsonl file
+    with open(path, 'r') as f:
+        data = [json.loads(line) for line in f]
+
+    disagree_count = 0
+    for item in data:
+        if item['conversations'][1]['value']['disagree_with_human_annotation']:
+            print (item)
+            disagree_count += 1
+
+    print ('disagree ratio', disagree_count / len(data))
+
+def convert_instruct_json_to_jsonl(path, apply_filter = False):
     """
     We split multiple-question answer into multiple lines in the jsonl format. An example of such a json
     "2": {
@@ -705,9 +820,14 @@ def convert_instruct_json_to_jsonl(path):
             temp_3['conversations'][0]['value'] = third_question
             temp_3['conversations'][1]['value'] = third_answer
 
-            ret.append(temp_1)
-            ret.append(temp_2)
-            ret.append(temp_3)
+            if apply_filter:
+                if 'disagree_with_human_annotation' in v['conversations'][1]['value'] and v['conversations'][1]['value']['disagree_with_human_annotation'] is True:
+                    continue
+                ret.append(temp_1)
+            else:
+                ret.append(temp_1)
+                ret.append(temp_2)
+                ret.append(temp_3)
 
         for item in ret:
             json.dump(item, f)
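Both calc_disagree_ratio_from_jsonl above and this apply_filter branch key off the same field stored in the GPT answer. A hedged sketch of the record shape they expect (field values are invented; disagree_with_human_annotation is only present for annotation types whose response schema defines it, such as gpt-gt-reason):

    # One json object per jsonl line, or per key in the checkpointed json
    item = {
        'conversations': [
            {'from': 'human', 'value': ''},
            {'from': 'gpt', 'value': {
                'first_question': '...', 'first_answer': '...',
                'second_question': '...', 'second_answer': '...',
                'third_question': '...', 'third_answer': '...',
                'disagree_with_human_annotation': False,
            }},
        ],
    }
    flagged = item['conversations'][1]['value'].get('disagree_with_human_annotation') is True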
@@ -734,13 +854,16 @@ def convert_instruct_json_to_jsonl(path):
 #train_file_path = '/storage-rcp-pure/upmwmathis_scratch/shaokai/AVION_PREDS/avion_mc_top5_GT_random_narration/train_convs_narration.jsonl'
 
 #train_file_path = '/data/epic_kitchen/shaokai_explore/LLaVA-NeXT/train_anno_gpt-gt-reason_4_all.jsonl'
-train_file_path = '/data/epic_kitchen/AVION_PREDS/avion_mc_top5_GT_random_narration/train_convs_narration.jsonl'
-root = '/data/EK100/EK100_320p_15sec_30fps_libx264'
-multi_process_annotate(train_file_path,
-                       root,
-                       debug = True,
-                       clip_length = 8,
-                       n_samples = -1, anno_type = 'gpt-gt-reason')
+# train_file_path = '/data/epic_kitchen/AVION_PREDS/avion_mc_top5_GT_random_narration/train_convs_narration.jsonl'
+
+# train_file_path = '/data/epic_kitchen/shaokai_explore/LLaVA-NeXT/train_anno_gpt-gt-reason_4_first_person_all.jsonl'
+# root = '/data/EK100/EK100_320p_15sec_30fps_libx264'
+# multi_process_annotate(train_file_path,
+#                        root,
+#                        debug = False,
+#                        clip_length = 4,
+#                        n_samples = -1,
+#                        anno_type = 'gpt-gt-instruct-reason')
 
 # multi_process_inference(root,
 #                         val_file,
@@ -751,7 +874,16 @@ def convert_instruct_json_to_jsonl(path):
 #                        topk = 5,
 #                        n_samples = 100)
 
-
 # convert_json_to_jsonl('train_anno_gpt-gt-reason_4_10000.json')
 
-#convert_instruct_json_to_jsonl('train_anno_gpt-gt-instruct-reason_4_all.json')
+#convert_instruct_json_to_jsonl('train_anno_gpt-gt-instruct-reason_4_all.json')
+
+# train_file_path = '/data/epic_kitchen/haozhe/handobj_imageset/train/EPIC_100_handobj_imageset_train_8.jsonl'
+# ann = GPTHandObjectAnnotator(train_file_path, debug = False)
+# ann.multi_process_run(n_samples = -1)
+
+#convert_json_to_jsonl('train_anno_gpt-gt-reason_4_first_person_all.json')
+
+#calc_disagree_ratio_from_jsonl('train_anno_gpt-gt-reason_4_first_person_all.jsonl')
+
+convert_instruct_json_to_jsonl('train_anno_gpt-hand-object_all.json', apply_filter = True)

llava/action/utils.py

Lines changed: 15 additions & 10 deletions
@@ -161,30 +161,35 @@ def generate_label_map(anno_root, action_representation, cache_file = None):
 
 
 
-def format_task_related_prompt(question, question_type):
+def format_task_related_prompt(question, question_type, perspective = "first_person"):
     """
     Task related prompt is impacted by the question_type.
     We currently support mc_{action_representation} and gpt-gt-reason
     We are thinking about tweaking the prompt based on the action representation.
     """
-
+    if perspective == "first_person":
+        perspective_prefix = "You are seeing this video from egocentric view and your hands are sometimes interacting with obects. What action are you performing? "
+    elif perspective == "third_person":
+        perspective_prefix = "The video is taken from egocentric view. What action is the person performing? "
     if question_type.startswith("mc_"):
-        action_rep_suffix = "Given multiple choices, format your answer briefly such as 'A. move knife'"
-        prefix = f"The video is taken from egocentric view. What action is the person performing? {action_rep_suffix}\n"
+        action_rep_suffix = "Given multiple choices, format your answer briefly such as 'A. move knife'. "
+        prefix = f"{perspective_prefix}{action_rep_suffix}\n"
         assert isinstance(question, list)
         suffix = ",".join(question)
-        suffix = "Here are the options you are tasked :\n" + suffix
+        suffix = "Here are the options you are tasked:\n" + suffix
         ret = prefix + suffix
     elif question_type == "gpt-gt-reason":
-        ret = "The video is taken from egocentric view. What action is the person performing? Please explain your reasoning steps before reaching to your answer."
+        ret = f"{perspective_prefix}Please explain your reasoning steps before reaching to your answer. "
     elif question_type == "gpt-gt-instruct-reason":
         ret = question
+    elif question_type == "gpt-hand-object":
+        ret = question
     elif question_type == "cot_mc":
         """
         Explain the reasoning first and do the multiple-choice.
         """
-        action_rep_suffix = "Given multiple choices, explain your reasoning steps before you reach to your answer."
-        prefix = f"The video is taken from egocentric view. What action is the person performing? {action_rep_suffix}\n"
+        action_rep_suffix = "Given multiple choices, explain your reasoning steps before you reach to your answer. "
+        prefix = f"{perspective_prefix} {action_rep_suffix}\n"
         assert isinstance(question, list)
         suffix = ",".join(question)
         suffix = "Here are the options you are tasked:\n" + suffix
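A quick illustration of the new perspective switch; the option strings and the mc_ suffix here are made up, since any question_type starting with 'mc_' takes this branch:

    options = ['A. move knife', 'B. wash plate']
    first = format_task_related_prompt(options, 'mc_example', perspective='first_person')
    # starts with the first-person prefix and ends with
    # "Here are the options you are tasked:\nA. move knife,B. wash plate"
    third = format_task_related_prompt(options, 'mc_example', perspective='third_person')
    # same layout, but prefixed with
    # "The video is taken from egocentric view. What action is the person performing? "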
@@ -197,14 +202,14 @@ def format_task_related_prompt(question, question_type):
 
 def format_time_instruction(video_duration, n_frames, include_frame_time = False):
 
-    prefix = f"You are seeing a video taken from egocentric view. The video lasts for {video_duration:.2f} seconds, and {n_frames} frames are uniformly sampled from it."
+    prefix = f"You are seeing a video taken from egocentric view. The video lasts for {video_duration:.3f} seconds, and {n_frames} frames are uniformly sampled from it."
 
     frame_time = [i * (video_duration / n_frames) for i in range(n_frames)]
     frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
 
     suffix = ""
     if include_frame_time:
-        suffix = f"These frames are located at {frame_time}. The video duration is {video_duration:.2f} seconds."
+        suffix = f"These frames are located at {frame_time}. The video duration is {video_duration:.3f} seconds. "
 
     return prefix + suffix
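For concreteness, the updated formatting for a 2-second clip with 4 uniformly sampled frames would read roughly as follows (note that prefix and suffix are concatenated without a separating space):

    text = format_time_instruction(2.0, 4, include_frame_time=True)
    # "You are seeing a video taken from egocentric view. The video lasts for 2.000 seconds,
    #  and 4 frames are uniformly sampled from it.These frames are located at
    #  0.00s,0.50s,1.00s,1.50s. The video duration is 2.000 seconds. "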
