Commit f7ebbe8

[evals] added format() to ModelGradedSpec (#597)
- 'in_message' and 'out_message' formatting for modelgraded evals
- factored out append_answer_prompt function
1 parent b928cd4 commit f7ebbe8


7 files changed: +122 -46 lines


evals/elsuite/modelgraded/base.py

Lines changed: 78 additions & 10 deletions
@@ -2,7 +2,8 @@
 from typing import TYPE_CHECKING, Optional, Union
 
 from evals.elsuite.modelgraded.classify_utils import ANSWER_PROMPTS, choice_to_str, expand_args_dict
-from evals.prompt.base import OpenAICreateChatPrompt
+from evals.elsuite.utils import format_prompt
+from evals.prompt.base import OpenAICreateChatPrompt, is_chat_prompt
 
 if TYPE_CHECKING:
     from dataclasses import dataclass
@@ -14,12 +15,12 @@
 class ModelGradedSpec:
     prompt: Union[str, OpenAICreateChatPrompt]
     choice_strings: Union[list[str], str]
-    eval_type: str
     input_outputs: dict[str, str]
 
+    eval_type: Optional[str] = None
+    format_type: str = "in_message"
     choice_scores: Optional[Union[dict[str, Union[float, int]], str]] = None
     multicomp_n: Optional[int] = None
-    append_answer_prompt: bool = False
     args: Optional[dict[str, dict[str, str]]] = None
     expand_args_dict: Optional[dict[str, dict[str, tuple[str]]]] = None
     completion_sample_templates: Optional[dict[str, str]] = None
@@ -45,13 +46,9 @@ def __post_init__(self):
         if self.choice_scores == "from_strings":
             self.choice_scores = {c: float(c) for c in self.choice_strings}
 
-        # 'prompt' is a string that specifies the model-graded evaluation
-        assert isinstance(self.prompt, str), f"prompt must be a string, not {type(self.prompt)}"
-        if self.append_answer_prompt:
-            self.prompt += "\n\n" + ANSWER_PROMPTS[self.eval_type].format(
-                choices=choice_to_str(self.choice_strings)
-            )
-        self.prompt = [{"role": "user", "content": self.prompt}]
+        if isinstance(self.prompt, str):
+            self.prompt = [{"role": "user", "content": self.prompt}]
+        assert is_chat_prompt(self.prompt)
 
         # 'input_outputs' is a dict that specifies the input and output keys in the sample
         # output key is the model's raw response to input key. These are used for filling 'prompt' template.
@@ -75,3 +72,74 @@ def __post_init__(self):
             assert (
                 self.completion_sample_templates
             ), "completion_sample_templates must be specified if multicomp_n > 1"
+
+    def append_answer_prompt(
+        self,
+        eval_type: str,
+        append_type: str = "as_content",
+        prompt: Optional[OpenAICreateChatPrompt] = None,
+    ):
+        """Append answer prompt to prompt. Can only be called once."""
+        assert self.eval_type is None, f"eval_type already set: {eval_type}"
+        prompt = prompt or ANSWER_PROMPTS[eval_type]
+        prompt = format_prompt(prompt, choices=choice_to_str(self.choice_strings))
+        if append_type == "as_content":
+            assert isinstance(prompt, str), f"prompt must be str, not {type(prompt)}"
+            self.prompt[-1]["content"] += "\n\n" + prompt
+        elif append_type == "as_message":
+            assert is_chat_prompt(prompt), f"prompt must be chat prompt, not {prompt}"
+            self.prompt += prompt
+        else:
+            raise ValueError(f"append_type must be 'as_content' or 'as_message', not {append_type}")
+        self.eval_type = eval_type
+
+    def format(self, **kwargs: dict[str, OpenAICreateChatPrompt]) -> OpenAICreateChatPrompt:
+        """Return an OpenAICreateChatPrompt that can be passed PromptFn for modelgraded eval.
+
+        'in_message' returns: [
+            {
+                "role": "user",
+                "content": \"""
+                    User: {input}
+                    Assistant: {completion}
+
+                    Was the assistant response helpful?
+                \""".strip(),
+            }
+        ]
+
+        'out_message' returns: [
+            {"role": "user", "content": "{input}"},
+            {"role": "assistant", "content": "{completion}"},
+            {"role": "user", "content": "Was the last assistant response helpful?"},
+        ]
+        """
+        if self.format_type == "in_message":
+            return format_prompt(self.prompt, **kwargs)
+        elif self.format_type == "out_message":
+            assert len(self.input_outputs) == 1, "out_message only supports one input/output pair"
+            # extra input-output data, as it is treated specially
+            input_completions = {
+                k: (k, kwargs[k], v, kwargs[v]) for k, v in self.input_outputs.items()
+            }
+            kwargs = {
+                k: v
+                for k, v in kwargs.items()
+                if k not in self.input_outputs.values() and k not in self.input_outputs
+            }
+            convo = []
+            for input_key, input, completion_key, completion in input_completions.values():
+                del input_key, completion_key
+                assert isinstance(
+                    completion, str
+                ), f"completion must be str, not {type(completion)}"
+                if is_chat_prompt(input):
+                    convo += input
+                else:
+                    convo.append({"role": "user", "content": input})
+                convo.append({"role": "assistant", "content": completion})
+            return convo + format_prompt(self.prompt, **kwargs)
+        else:
+            raise ValueError(
+                f"format_type must be 'in_message' or 'out_message', not {self.format_type}"
+            )
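
Taken together, __post_init__ now only normalizes prompt into a chat prompt, while the answer instructions and the final layout are handled by the two new methods. A minimal usage sketch of the new surface (the grading prompt and sample strings are invented for illustration; the field names, methods, and the "cot_classify" answer-prompt key come from this diff and the registry files below):

# Minimal sketch, assuming the evals package from this commit is importable.
# The prompt and sample strings are invented; only the API comes from the diff.
from evals.elsuite.modelgraded.base import ModelGradedSpec

spec = ModelGradedSpec(
    prompt="User: {input}\nAssistant: {completion}\n\nWas the assistant response helpful?",
    choice_strings=["Yes", "No"],
    input_outputs={"input": "completion"},
    multicomp_n=1,
)  # __post_init__ wraps the bare string into [{"role": "user", "content": ...}]

# eval_type is no longer a required dataclass field; the answer instructions are
# attached explicitly, either into the last message ("as_content", the default)
# or as an additional chat message ("as_message").
spec.append_answer_prompt("cot_classify")

# With the default format_type="in_message", format() substitutes the sample
# into the single templated user message, as before.
messages = spec.format(input="Tell me a joke.", completion="Why did the chicken cross the road?")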

evals/elsuite/modelgraded/classify.py

Lines changed: 4 additions & 5 deletions
@@ -19,7 +19,7 @@
     concat_n_completions,
     get_choice,
 )
-from evals.elsuite.utils import PromptFn, format_prompt, scrub_formatting_from_prompt
+from evals.elsuite.utils import PromptFn, scrub_formatting_from_prompt
 
 
 class ModelBasedClassify(evals.Eval):
@@ -72,14 +72,13 @@ def __init__(
         self.eval_modelspec = ModelSpec(name=eval_model, model=eval_model, is_chat=True)
 
         spec_kwargs = {"multicomp_n": self.multicomp_n}
-        if eval_type:
-            spec_kwargs["eval_type"] = eval_type
-            spec_kwargs["append_answer_prompt"] = True  # append answer prompt to prompt
         if modelgraded_spec_args:
             spec_kwargs["args"] = modelgraded_spec_args
         self.mg: ModelGradedSpec = self.registry.get_modelgraded_spec(
             modelgraded_spec, **spec_kwargs
         )
+        if eval_type:
+            self.mg.append_answer_prompt(eval_type)
 
     def eval_sample(self, test_sample: dict, rng: Random) -> None:
         """Evaluate a single sample.
@@ -148,7 +147,7 @@ def eval_sample(self, test_sample: dict, rng: Random) -> None:
             args_dict = {CHOICE_KEY: {}}
         for metric, args in args_dict.items():
            args = {k: v[1] for k, v in args.items()}
-            prompt = format_prompt(self.mg.prompt, **args, **completions, **test_sample)
+            prompt = self.mg.format(**args, **completions, **test_sample)
             evaluate = PromptFn(
                 prompt,
                 model_spec=self.eval_modelspec,
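
The net change to the eval flow: the spec is fetched without eval_type, the answer prompt is attached once via append_answer_prompt, and per-sample prompt construction is delegated to ModelGradedSpec.format. A condensed sketch of that flow outside the Eval class (sample text invented; it assumes the default Registry from evals.registry, and "humor"/"cot_classify" are the values used in the registry files below):

# Condensed sketch mirroring the updated __init__/eval_sample above; sample
# strings are invented.
from evals.registry import Registry

registry = Registry()
mg = registry.get_modelgraded_spec("humor", multicomp_n=1)  # eval_type no longer passed as a spec kwarg
mg.append_answer_prompt("cot_classify")  # replaces append_answer_prompt=True in spec_kwargs

# eval_sample now asks the spec itself to lay out the grading prompt:
prompt = mg.format(
    input="Tell me a joke about fruit.",
    completion="What do you call sad fruit? A blueberry.",
)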
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:782d5ba4d1fa47ad25490e39a7598021b864930bf3d586f0670861d06bb485df
+size 522

evals/registry/eval_sets/test-all.yaml

Lines changed: 2 additions & 2 deletions
@@ -9,9 +9,9 @@ test:
     - coqa-closedqa
     - coqa-closedqa-correct
     - logic-fact
-    - joke-animals
-    - joke-animals-likert
     - joke-fruits
+    - joke-fruits-v2
+    - joke-fruits-likert
     - joke-fruits-meta
     - joke-fruits-expl-meta
     - diversity

evals/registry/eval_sets/test-modelgraded.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,9 +1,9 @@
 test-modelgraded:
   evals:
     - logic-fact
-    - joke-animals
-    - joke-animals-likert
     - joke-fruits
+    - joke-fruits-v2
+    - joke-fruits-likert
     - joke-fruits-meta
     - joke-fruits-expl-meta
     - joke-fruits-ans-meta

evals/registry/evals/test-modelgraded.yaml

Lines changed: 18 additions & 27 deletions
@@ -1,42 +1,35 @@
 # a simple modelgraded eval checking if a completion is funny or not
-joke-animals:
-  id: joke-animals.dev.v0
+joke-fruits:
+  id: joke-fruits.dev.v0
   metrics: [accuracy]
-joke-animals.dev.v0:
+joke-fruits.dev.v0:
   class: evals.elsuite.modelgraded.classify:ModelBasedClassify
   args:
-    samples_jsonl: test_multiio/battles/joke_animals_vs_fruits.jsonl
-    samples_renamings:
-      input1: "input"
-      completion1: "completion"
+    samples_jsonl: test_modelgraded/joke_fruits.jsonl
     eval_type: cot_classify
     modelgraded_spec: humor
 
-# (same eval as above, but with likert scale of 1-5)
-joke-animals-likert:
-  id: joke-animals-likert.dev.v0
+# (same eval as above, but with format_type="out_message")
+joke-fruits-v2:
+  id: joke-fruits-v2.dev.v0
   metrics: [accuracy]
-joke-animals-likert.dev.v0:
+joke-fruits-v2.dev.v0:
   class: evals.elsuite.modelgraded.classify:ModelBasedClassify
   args:
-    samples_jsonl: test_multiio/battles/joke_animals_vs_fruits.jsonl
-    samples_renamings:
-      input1: "input"
-      completion1: "completion"
+    samples_jsonl: test_modelgraded/joke_fruits.jsonl
     eval_type: cot_classify
-    modelgraded_spec: humor_likert
+    modelgraded_spec: humor_out_message
 
-# a simple modelgraded eval checking if a completion is funny or not
-# this example uses a labeled dataset, but ignores "completion" and "choice"
-joke-fruits:
-  id: joke-fruits.dev.v0
+# (same eval as above, but with likert scale of 1-5)
+joke-fruits-likert:
+  id: joke-fruits-likert.dev.v0
   metrics: [accuracy]
-joke-fruits.dev.v0:
+joke-fruits-likert.dev.v0:
   class: evals.elsuite.modelgraded.classify:ModelBasedClassify
   args:
-    samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
+    samples_jsonl: test_modelgraded/joke_fruits.jsonl
     eval_type: cot_classify
-    modelgraded_spec: humor
+    modelgraded_spec: humor_likert
 
 # a meta-evaluation of a modelgraded eval checking if a completion is funny or not
 # this example uses a labeled dataset with "completion" and "choice"
@@ -76,29 +69,27 @@ joke-fruits-ans-meta.dev.v0:
     metaeval: true
 
 # a simple modelgraded eval checking if 4 completions to the sample prompt is diverse
-# this example uses a labeled dataset, but ignore "completion" and "choice"
 diversity:
   id: diversity.dev.v0
   metrics: [accuracy]
 diversity.dev.v0:
   class: evals.elsuite.modelgraded.classify:ModelBasedClassify
   args:
-    samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
+    samples_jsonl: test_modelgraded/joke_fruits.jsonl
     eval_type: cot_classify
     modelgraded_spec: diversity
     multicomp_n: 4
     multicomp_temperature: 0.4
 
 # a simple modelgraded eval checking which of N completions to the sample prompt is the best response
-# this example uses a labeled dataset, but ignore "completion" and "choice"
 # command: `oaleval gpt-3.5-turbo,gpt-4 best`
 best:
   id: best.dev.v0
   metrics: [accuracy]
 best.dev.v0:
   class: evals.elsuite.modelgraded.classify:ModelBasedClassify
   args:
-    samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
+    samples_jsonl: test_modelgraded/joke_fruits.jsonl
     eval_type: cot_classify
     modelgraded_spec: best
     multicomp_n: from_models

evals/registry/modelgraded/humor.yaml

Lines changed: 15 additions & 0 deletions
@@ -14,6 +14,21 @@ humor:
   input_outputs:
     input: completion
 
+humor_out_message:
+  prompt: |-
+    Was the assistant's (your) last response above to the user funny?
+  choice_scores:
+    "Yes": 1.0
+    "No": 0.0
+    "Unsure": 0.5
+  choice_strings:
+    - "Yes"
+    - "No"
+    - "Unsure"
+  format_type: out_message
+  input_outputs:
+    input: completion
+
 humor_likert:
   prompt: |-
     Is the following funny?
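
Because format_type: out_message replays the graded sample as real chat turns, the new humor_out_message prompt can refer to "the assistant's (your) last response above". Roughly, the chat prompt that joke-fruits-v2 would send to the grading model looks like the sketch below (the joke is an invented sample and the cot_classify answer instructions are abbreviated):

# Approximate shape of ModelGradedSpec.format() output for humor_out_message
# after append_answer_prompt("cot_classify"); sample joke invented, answer
# instructions abbreviated.
expected_messages = [
    {"role": "user", "content": "Tell me a joke about fruit."},
    {"role": "assistant", "content": "What do you call sad fruit? A blueberry."},
    {
        "role": "user",
        "content": "Was the assistant's (your) last response above to the user funny?"
        "\n\n<cot_classify answer instructions, asking for one of Yes, No, Unsure>",
    },
]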
