
Commit 36c7044

improved gpt annotation
1 parent 0839d9c commit 36c7044

File tree

1 file changed: +103 additions, -27 deletions


llava/action/chatgpt_utils.py

Lines changed: 103 additions & 27 deletions
```diff
@@ -8,7 +8,11 @@
 from tqdm import tqdm
 from llava.action.utils import AvionMultiChoiceGenerator
 from llava.action.utils import avion_video_loader, avion_video_render_loader
-
+import copy
+import torch
+import io
+import numpy as np
+import base64
 
 client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
 
```
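The new imports (copy, torch, io, numpy, base64) are not exercised in the hunks shown here; copy is used by the new converter further down, while io, numpy, and base64 suggest that sampled frames are serialized to base64 images before being sent to the OpenAI vision endpoint. A minimal sketch of that common pattern, assuming Pillow is available and frames arrive as HxWx3 uint8 RGB arrays; the helper name is hypothetical and not part of this commit.

```python
import base64
import io

import numpy as np
from PIL import Image  # assumption: Pillow is available in this environment


def encode_frame_to_base64(frame: np.ndarray) -> str:
    """Encode an HxWx3 uint8 RGB frame as a base64 JPEG string (hypothetical helper)."""
    buffer = io.BytesIO()
    Image.fromarray(frame).save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


# A string produced this way can be attached to a chat message as an image_url
# part, e.g. {"url": f"data:image/jpeg;base64,{encode_frame_to_base64(frame)}"}.
```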

```diff
@@ -84,30 +88,23 @@ class GPTReasoningWithGTPrompt:
     @classmethod
     def generate_prompt(cls, start_second, end_second, option_text, gt_answer):
         prompt = f"""
-You are seeing video frames from an egocentric view of a person. The person is interacting with objects in a kitchen.
-Describe the action the person is performing but do not say you see the person as you can only see the person's hands.
-You can say something that the video is showing the egocentric view of person doing something.
-Pay attention to the objects the person's hands are interacting.
-The true ground-truth action is {gt_answer}. However, I want you to come to your ownconclusion from your own observation and show your reasoning steps. Make sure it matches the ground-truth action.
-Your reasoning steps should include supporting evidences for the action. Useful evidences include the duration of the video, the objects the person is interacting with, and the context of the video.
+You are viewing video frames from an egocentric perspective of a person interacting with objects in a kitchen. Describe the video frames in detail and reason about the actions the person is performing. You will be provided with the human-annotated ground-truth for the action, but you should independently come to your own conclusion.
+If you disagree with the human annotation, indicate "true" in the "disagree_with_human_annotation" field of your response, and provide your reasoning without mentioning the ground-truth answer. This will keep your reasoning clean. If you agree with the human annotation, indicate "false" in the "disagree_with_human_annotation" field and provide your reasoning without referencing the ground-truth to maintain a clean description.
+Pay close attention to the objects the person's hands are interacting with.
+The true ground-truth action is {gt_answer}.
+Your reasoning steps should include supporting evidence for the action, such as the duration of the video, the sequence of actions the person performs, the objects they interact with, and the overall context of the video.
+As a general guideline, for videos longer than 3 seconds, provide detailed reasoning steps, and for videos shorter than 3 seconds, generate less detailed reasoning.
 The video duration is {end_second - start_second:.3f} seconds.
 """
         print (prompt)
         return prompt
 
-
-class GT_Agnostic_Response(BaseModel):
-    """
-    The GT was not known. The response is to generate a new answer
-    """
-    explanation: str
-    answer: str
-
 class GT_Augmentation_Response(BaseModel):
     """
     The GT was known. The response is to add more information to the GT
     """
-    explanation: str
+    caption_with_reasoning: str
+    disagree_with_human_annotation: bool
 
 
 class ExpandReasonMCResponse(BaseModel):
```
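The reworked GT_Augmentation_Response drops the GT-agnostic model and replaces the single explanation field with caption_with_reasoning plus a disagree_with_human_annotation flag that the new prompt asks the model to set. Since annotate() returns response.choices[0].message.parsed (see the later hunk), the result is a parsed Pydantic instance. A minimal standalone sketch of how such a structured response is requested and consumed; the model name and message construction are assumptions, not shown in this diff.

```python
from pydantic import BaseModel


class GT_Augmentation_Response(BaseModel):
    caption_with_reasoning: str
    disagree_with_human_annotation: bool


# Sketch of requesting a structured response (client and messages as built in
# chatgpt_utils.py; the model name here is an assumption):
# completion = client.beta.chat.completions.parse(
#     model="gpt-4o-2024-08-06",
#     messages=messages,
#     response_format=GT_Augmentation_Response,
# )
# parsed = completion.choices[0].message.parsed

# dict() over a Pydantic model yields its fields, which is what run() now stores
# in the conversation instead of a bare explanation string.
example = GT_Augmentation_Response(
    caption_with_reasoning="The hands pull a drawer open inside a refrigerator.",
    disagree_with_human_annotation=False,
)
print(dict(example))  # prints a dict with the two fields above
```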
```diff
@@ -496,7 +493,7 @@ def multi_process_run(self, n_samples = -1):
 
         sample_suffix = 'all' if n_samples == -1 else str(n_samples)
 
-        num_cores = os.cpu_count() if not self.debug else 2
+        num_cores = os.cpu_count() * 2 if not self.debug else 2
         indices_groups = self.split_indices(indices, num_cores)
         with ProcessPoolExecutor(max_workers=num_cores) as executor:
             # Pass additional arguments to the function
@@ -530,7 +527,7 @@ def run(self, indices):
             parsed_item = self.parse_conversation_from_train_convs(item)
             try:
                 if self.anno_type == 'gpt-gt-reason':
-                    gpt_answer = self.annotate(frames, parsed_item).explanation
+                    gpt_answer = dict(self.annotate(frames, parsed_item))
                 elif self.anno_type == 'gpt-gt-instruct-reason':
                     gpt_answer = dict(self.annotate(frames, parsed_item))
             except Exception as e:
@@ -540,8 +537,9 @@ def run(self, indices):
             item['conversations'][1]['value'] = gpt_answer
             item['question_type'] = self.anno_type
             ret[index] = item
+            print (item)
             if self.debug:
-                print (item)
+
                 break
 
         return ret
```
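multi_process_run now oversubscribes the pool (os.cpu_count() * 2 workers) outside debug mode, which is reasonable when each worker spends most of its time waiting on API responses rather than on the CPU. A self-contained sketch of the split-and-fan-out pattern; split_indices and the per-worker body are not shown in this diff, so their implementations below are assumptions.

```python
import os
from concurrent.futures import ProcessPoolExecutor


def split_indices(indices, num_groups):
    """Assumed behavior: split indices into roughly equal contiguous chunks."""
    group_size = (len(indices) + num_groups - 1) // num_groups
    return [indices[i:i + group_size] for i in range(0, len(indices), group_size)]


def annotate_group(indices):
    """Stand-in for the per-worker annotation loop (GPTAugmentationAnnotator.run)."""
    return {i: f"annotation-{i}" for i in indices}


if __name__ == "__main__":
    indices = list(range(1000))
    # Oversubscribe because the workers are I/O-bound on API calls.
    num_cores = (os.cpu_count() or 1) * 2
    indices_groups = split_indices(indices, num_cores)
    with ProcessPoolExecutor(max_workers=num_cores) as executor:
        results = {}
        for partial in executor.map(annotate_group, indices_groups):
            results.update(partial)
    print(len(results))  # 1000
```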
```diff
@@ -578,14 +576,19 @@ def annotate(self, images, data_item):
         return response.choices[0].message.parsed
 
 
-def multi_process_annotate(train_file_path, root, debug = False, anno_type = 'gpt-gt-reason', n_samples = -1):
+def multi_process_annotate(train_file_path,
+                           root,
+                           debug = False,
+                           clip_length = 4,
+                           anno_type = 'gpt-gt-reason',
+                           n_samples = -1):
     annotator = GPTAugmentationAnnotator(train_file_path,
                                          root,
-                                         clip_length = 4,
+                                         clip_length = clip_length,
                                          debug = debug,
                                          anno_type = anno_type)
 
-    results = annotator.multi_process_run(n_samples = n_samples)
+    annotator.multi_process_run(n_samples = n_samples)
 
 def multi_process_inference(root,
                             annotation_file,
```
```diff
@@ -643,6 +646,73 @@ def convert_json_to_jsonl(path):
         for k,v in data.items():
             json.dump(v, f)
             f.write('\n')
+def convert_instruct_json_to_jsonl(path):
+    """
+    We split multiple-question answer into multiple lines in the jsonl format. An example of such a json
+    "2": {
+        "video": "P01-P01_01",
+        "conversations": [
+            {
+                "from": "human",
+                "value": "['A. open tap', 'B. pick up knife', 'C. turn off tap', 'D. open drawer', 'E. open cupboard']"
+            },
+            {
+                "from": "gpt",
+                "value": {
+                    "first_question": "What action is the person performing in the video?",
+                    "first_answer": "The person is pulling a drawer open inside a refrigerator.",
+                    "second_question": "What evidence suggests that the person is opening a drawer?",
+                    "second_answer": "The movement of the drawer outward and the person's hand gripping the handle indicate that the person is opening the drawer.",
+                    "third_question": "What is the duration of the action shown in the video?",
+                    "third_answer": "The action of opening the drawer is shown in a short duration of 1.230 seconds."
+                }
+            }
+        ],
+        "id": "P01-P01_01",
+        "split": "train",
+        "task_instruction": "",
+        "num_samples": 1,
+        "question_type": "gpt-gt-instruct-reason",
+        "dataset_name": "EK100",
+        "start_timestamp": 24.97,
+        "end_timestamp": 26.2}
+    """
+    with open(path, 'r') as f:
+        data = json.load(f)
+    ret = []
+    with open(path.replace('.json', '.jsonl'), 'w') as f:
+        for k,v in data.items():
+            temp_1 = copy.deepcopy(v)
+            temp_2 = copy.deepcopy(v)
+            temp_3 = copy.deepcopy(v)
+
+            conversations = v['conversations']
+            first_question = conversations[1]['value']['first_question']
+            first_answer = conversations[1]['value']['first_answer']
+
+            temp_1['conversations'][0]['value'] = first_question
+            temp_1['conversations'][1]['value'] = first_answer
+
+            second_question = conversations[1]['value']['second_question']
+            second_answer = conversations[1]['value']['second_answer']
+
+            temp_2['conversations'][0]['value'] = second_question
+            temp_2['conversations'][1]['value'] = second_answer
+
+            third_question = conversations[1]['value']['third_question']
+            third_answer = conversations[1]['value']['third_answer']
+
+            temp_3['conversations'][0]['value'] = third_question
+            temp_3['conversations'][1]['value'] = third_answer
+
+            ret.append(temp_1)
+            ret.append(temp_2)
+            ret.append(temp_3)
+
+        for item in ret:
+            json.dump(item, f)
+            f.write('\n')
+
 
 if __name__ == '__main__':
 
```
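For reference, each entry converted by convert_instruct_json_to_jsonl becomes three JSONL lines, one per question/answer pair, with the conversation values flattened to plain strings. A small sketch of reading the output back, assuming the converter has already been run on the file referenced in the commented-out call at the bottom of this diff.

```python
import json

# Assumes convert_instruct_json_to_jsonl('train_anno_gpt-gt-instruct-reason_4_all.json')
# has already written the corresponding .jsonl file next to it.
with open('train_anno_gpt-gt-instruct-reason_4_all.jsonl', 'r') as f:
    for line in f:
        item = json.loads(line)
        question = item['conversations'][0]['value']
        answer = item['conversations'][1]['value']
        print(f"{item['video']} | {question} -> {answer}")
```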

```diff
@@ -663,10 +733,14 @@ def convert_json_to_jsonl(path):
     #root = '/storage-rcp-pure/upmwmathis_scratch/shaokai/EK100'
     #train_file_path = '/storage-rcp-pure/upmwmathis_scratch/shaokai/AVION_PREDS/avion_mc_top5_GT_random_narration/train_convs_narration.jsonl'
 
-    train_file_path = '/storage-rcp-pure/upmwmathis_scratch/shaokai/train_anno_gpt-gt-reason_4_all.jsonl'
-
-    root = '/storage-rcp-pure/upmwmathis_scratch/shaokai/EK100'
-    multi_process_annotate(train_file_path, root, debug = False, n_samples = -1, anno_type = 'gpt-gt-instruct-reason')
+    #train_file_path = '/data/epic_kitchen/shaokai_explore/LLaVA-NeXT/train_anno_gpt-gt-reason_4_all.jsonl'
+    train_file_path = '/data/epic_kitchen/AVION_PREDS/avion_mc_top5_GT_random_narration/train_convs_narration.jsonl'
+    root = '/data/EK100/EK100_320p_15sec_30fps_libx264'
+    multi_process_annotate(train_file_path,
+                           root,
+                           debug = True,
+                           clip_length = 8,
+                           n_samples = -1, anno_type = 'gpt-gt-reason')
 
     # multi_process_inference(root,
     #                         val_file,
```
```diff
@@ -678,4 +752,6 @@ def convert_json_to_jsonl(path):
     #                         n_samples = 100)
 
 
-    # convert_json_to_jsonl('train_anno_gpt-gt-reason_4_10000.json')
+    # convert_json_to_jsonl('train_anno_gpt-gt-reason_4_10000.json')
+
+    #convert_instruct_json_to_jsonl('train_anno_gpt-gt-instruct-reason_4_all.json')
```
