Skip to content

Commit 10d3952

Browse files
gouthamve authored and krasi-georgiev committed
Avoid creation of 0 sized segments. (prometheus#527)
If the corrupt segment is full, then we set donePages on open: https://github.com/prometheus/tsdb/blob/c59ed492b284c1f5c09d9f27167b2d27df7d09c3/wal/wal.go#L235-L243 Then, when we try to repair, we set the segment to be a new segment but we don't update donePages: https://github.com/prometheus/tsdb/blob/c59ed492b284c1f5c09d9f27167b2d27df7d09c3/wal/wal.go#L334 When we try to log to this segment, because donePages still indicates a full segment, we never log anything to it and create a new one instead: https://github.com/prometheus/tsdb/blob/c59ed492b284c1f5c09d9f27167b2d27df7d09c3/wal/wal.go#L486 This does not cause issues because we simply concatenate the segments on read, thereby transparently skipping this `0b` segment.
1 parent b48394e commit 10d3952

File tree

2 files changed

+37
-12
lines changed

2 files changed

+37
-12
lines changed

wal/wal.go

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -228,19 +228,23 @@ func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSi
228228
}
229229
// Fresh dir, no segments yet.
230230
if j == -1 {
231-
if w.segment, err = CreateSegment(w.dir, 0); err != nil {
231+
segment, err := CreateSegment(w.dir, 0)
232+
if err != nil {
232233
return nil, err
233234
}
234-
} else {
235-
if w.segment, err = OpenWriteSegment(logger, w.dir, j); err != nil {
235+
236+
if err := w.setSegment(segment); err != nil {
236237
return nil, err
237238
}
238-
// Correctly initialize donePages.
239-
stat, err := w.segment.Stat()
239+
} else {
240+
segment, err := OpenWriteSegment(logger, w.dir, j)
240241
if err != nil {
241242
return nil, err
242243
}
243-
w.donePages = int(stat.Size() / pageSize)
244+
245+
if err := w.setSegment(segment); err != nil {
246+
return nil, err
247+
}
244248
}
245249
go w.run()
246250

@@ -331,7 +335,9 @@ func (w *WAL) Repair(origErr error) error {
331335
if err != nil {
332336
return err
333337
}
334-
w.segment = s
338+
if err := w.setSegment(s); err != nil {
339+
return err
340+
}
335341

336342
f, err := os.Open(tmpfn)
337343
if err != nil {
@@ -382,8 +388,9 @@ func (w *WAL) nextSegment() error {
382388
return errors.Wrap(err, "create new segment file")
383389
}
384390
prev := w.segment
385-
w.segment = next
386-
w.donePages = 0
391+
if err := w.setSegment(next); err != nil {
392+
return err
393+
}
387394

388395
// Don't block further writes by fsyncing the last segment.
389396
w.actorc <- func() {
@@ -397,6 +404,19 @@ func (w *WAL) nextSegment() error {
397404
return nil
398405
}
399406

407+
func (w *WAL) setSegment(segment *Segment) error {
408+
w.segment = segment
409+
410+
// Correctly initialize donePages.
411+
stat, err := segment.Stat()
412+
if err != nil {
413+
return err
414+
}
415+
w.donePages = int(stat.Size() / pageSize)
416+
417+
return nil
418+
}
419+
400420
// flushPage writes the new contents of the page to disk. If no more records will fit into
401421
// the page, the remaining bytes will be set to zero and a new page will be started.
402422
// If clear is true, this is enforced regardless of how many bytes are left in the page.

wal/wal_test.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ import (
2727
)
2828

2929
func TestWAL_Repair(t *testing.T) {
30-
3130
for name, test := range map[string]struct {
3231
corrSgm int // Which segment to corrupt.
3332
corrFunc func(f *os.File) // Func that applies the corruption.
@@ -115,7 +114,8 @@ func TestWAL_Repair(t *testing.T) {
115114
// We create 3 segments with 3 records each and
116115
// then corrupt a given record in a given segment.
117116
// As a result we want a repaired WAL with given intact records.
118-
w, err := NewSize(nil, nil, dir, 3*pageSize)
117+
segSize := 3 * pageSize
118+
w, err := NewSize(nil, nil, dir, segSize)
119119
testutil.Ok(t, err)
120120

121121
var records [][]byte
@@ -136,7 +136,7 @@ func TestWAL_Repair(t *testing.T) {
136136

137137
testutil.Ok(t, f.Close())
138138

139-
w, err = New(nil, nil, dir)
139+
w, err = NewSize(nil, nil, dir, segSize)
140140
testutil.Ok(t, err)
141141

142142
sr, err := NewSegmentsReader(dir)
@@ -166,6 +166,11 @@ func TestWAL_Repair(t *testing.T) {
166166
t.Fatalf("record %d diverges: want %x, got %x", i, records[i][:10], r[:10])
167167
}
168168
}
169+
170+
// Make sure the last segment is the corrupt segment.
171+
_, last, err := w.Segments()
172+
testutil.Ok(t, err)
173+
testutil.Equals(t, test.corrSgm, last)
169174
})
170175
}
171176
}

0 commit comments

Comments
 (0)