Skip to content

Commit 4b5ab80

Browse files
authored
[rule] Update rule health for append/commit fails (prometheus#8619)
* [rule] Update rule health for append/commit fails Similar to prometheus#8410 will provide more context. Signed-off-by: Goutham Veeramachaneni <[email protected]> * Add test for updating health on append fails Signed-off-by: Goutham Veeramachaneni <[email protected]>
1 parent 195611e commit 4b5ab80

File tree

3 files changed

+67
-2
lines changed

3 files changed

+67
-2
lines changed

rules/manager.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,9 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
592592

593593
vector, err := rule.Eval(ctx, ts, g.opts.QueryFunc, g.opts.ExternalURL)
594594
if err != nil {
595+
rule.SetHealth(HealthBad)
596+
rule.SetLastError(err)
597+
595598
// Canceled queries are intentional termination of queries. This normally
596599
// happens on shutdown and thus we skip logging of any errors here.
597600
if _, ok := err.(promql.ErrQueryCanceled); !ok {
@@ -616,13 +619,20 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
616619
seriesReturned := make(map[string]labels.Labels, len(g.seriesInPreviousEval[i]))
617620
defer func() {
618621
if err := app.Commit(); err != nil {
622+
rule.SetHealth(HealthBad)
623+
rule.SetLastError(err)
624+
619625
level.Warn(g.logger).Log("msg", "Rule sample appending failed", "err", err)
620626
return
621627
}
622628
g.seriesInPreviousEval[i] = seriesReturned
623629
}()
630+
624631
for _, s := range vector {
625632
if _, err := app.Append(0, s.Metric, s.T, s.V); err != nil {
633+
rule.SetHealth(HealthBad)
634+
rule.SetLastError(err)
635+
626636
switch errors.Cause(err) {
627637
case storage.ErrOutOfOrderSample:
628638
numOutOfOrder++

rules/manager_test.go

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1163,3 +1163,60 @@ func TestGroupHasAlertingRules(t *testing.T) {
11631163
require.Equal(t, test.want, got, "test case %d failed, expected:%t got:%t", i, test.want, got)
11641164
}
11651165
}
1166+
1167+
func TestRuleHealthUpdates(t *testing.T) {
1168+
st := teststorage.New(t)
1169+
defer st.Close()
1170+
engineOpts := promql.EngineOpts{
1171+
Logger: nil,
1172+
Reg: nil,
1173+
MaxSamples: 10,
1174+
Timeout: 10 * time.Second,
1175+
}
1176+
engine := promql.NewEngine(engineOpts)
1177+
opts := &ManagerOptions{
1178+
QueryFunc: EngineQueryFunc(engine, st),
1179+
Appendable: st,
1180+
Queryable: st,
1181+
Context: context.Background(),
1182+
Logger: log.NewNopLogger(),
1183+
}
1184+
1185+
expr, err := parser.ParseExpr("a + 1")
1186+
require.NoError(t, err)
1187+
rule := NewRecordingRule("a_plus_one", expr, labels.Labels{})
1188+
group := NewGroup(GroupOptions{
1189+
Name: "default",
1190+
Interval: time.Second,
1191+
Rules: []Rule{rule},
1192+
ShouldRestore: true,
1193+
Opts: opts,
1194+
})
1195+
1196+
// A time series that has two samples.
1197+
app := st.Appender(context.Background())
1198+
app.Append(0, labels.FromStrings(model.MetricNameLabel, "a"), 0, 1)
1199+
app.Append(0, labels.FromStrings(model.MetricNameLabel, "a"), 1000, 2)
1200+
err = app.Commit()
1201+
require.NoError(t, err)
1202+
1203+
ctx := context.Background()
1204+
1205+
rules := group.Rules()[0]
1206+
require.NoError(t, rules.LastError())
1207+
require.Equal(t, HealthUnknown, rules.Health())
1208+
1209+
// Execute 2 times, it should be all green.
1210+
group.Eval(ctx, time.Unix(0, 0))
1211+
group.Eval(ctx, time.Unix(1, 0))
1212+
1213+
rules = group.Rules()[0]
1214+
require.NoError(t, rules.LastError())
1215+
require.Equal(t, HealthGood, rules.Health())
1216+
1217+
// Now execute the rule in the past again, this should cause append failures.
1218+
group.Eval(ctx, time.Unix(0, 0))
1219+
rules = group.Rules()[0]
1220+
require.EqualError(t, rules.LastError(), storage.ErrOutOfOrderSample.Error())
1221+
require.Equal(t, HealthBad, rules.Health())
1222+
}

rules/recording.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,6 @@ func (rule *RecordingRule) Labels() labels.Labels {
7676
func (rule *RecordingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, _ *url.URL) (promql.Vector, error) {
7777
vector, err := query(ctx, rule.vector.String(), ts)
7878
if err != nil {
79-
rule.SetHealth(HealthBad)
80-
rule.SetLastError(err)
8179
return nil, err
8280
}
8381
// Override the metric name and labels.

0 commit comments

Comments
 (0)