Skip to content

Commit 4e5ed83

Browse files
committed
runtime: use bit-parallel operations to compute heap bit summaries
The new implementation is much faster in all cases.

name                                              old time/op  new time/op  delta
PallocBitsSummarize/Unpacked00-16                 142ns ± 1%   7ns ± 2%     -94.75%  (p=0.000 n=10+9)
PallocBitsSummarize/UnpackedFFFFFFFFFFFFFFFF-16   172ns ± 0%   24ns ± 0%    -86.02%  (p=0.000 n=9+9)
PallocBitsSummarize/UnpackedAA-16                 145ns ± 0%   32ns ± 0%    -78.16%  (p=0.000 n=8+10)
PallocBitsSummarize/UnpackedAAAAAAAAAAAAAAAA-16   172ns ± 0%   33ns ± 0%    -80.95%  (p=0.000 n=9+9)
PallocBitsSummarize/Unpacked80000000AAAAAAAA-16   162ns ± 1%   60ns ± 0%    -62.69%  (p=0.000 n=10+9)
PallocBitsSummarize/UnpackedAAAAAAAA00000001-16   163ns ± 0%   68ns ± 1%    -58.47%  (p=0.000 n=8+10)
PallocBitsSummarize/UnpackedBBBBBBBBBBBBBBBB-16   172ns ± 0%   35ns ± 0%    -79.70%  (p=0.000 n=9+9)
PallocBitsSummarize/Unpacked80000000BBBBBBBB-16   161ns ± 0%   63ns ± 0%    -60.61%  (p=0.000 n=8+10)
PallocBitsSummarize/UnpackedBBBBBBBB00000001-16   163ns ± 0%   60ns ± 0%    -63.14%  (p=0.000 n=9+10)
PallocBitsSummarize/UnpackedCCCCCCCCCCCCCCCC-16   172ns ± 0%   39ns ± 0%    -77.41%  (p=0.000 n=7+10)
PallocBitsSummarize/Unpacked4444444444444444-16   172ns ± 0%   39ns ± 0%    -77.42%  (p=0.000 n=7+10)
PallocBitsSummarize/Unpacked4040404040404040-16   173ns ± 2%   51ns ± 1%    -70.55%  (p=0.000 n=10+10)
PallocBitsSummarize/Unpacked4000400040004000-16   160ns ± 1%   53ns ± 0%    -66.78%  (p=0.000 n=10+10)
PallocBitsSummarize/Unpacked1000404044CCAAFF-16   169ns ± 1%   59ns ± 1%    -65.28%  (p=0.000 n=10+10)

Change-Id: I94daa645b76a9cf9c93edeb2058d7132216fcb72
Reviewed-on: https://go-review.googlesource.com/c/go/+/240900
Run-TryBot: Keith Randall <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Michael Knyszek <[email protected]>
1 parent 88c094c commit 4e5ed83

File tree

2 files changed

+108
-75
lines changed

2 files changed

+108
-75
lines changed

src/runtime/mpallocbits.go

Lines changed: 84 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -120,78 +120,99 @@ func (b *pageBits) popcntRange(i, n uint) (s uint) {
120120
// sake of documentation, 0s are free pages and 1s are allocated pages.
121121
type pallocBits pageBits
122122

123-
// consec8tab is a table containing the number of consecutive
124-
// zero bits for any uint8 value.
125-
//
126-
// The table is generated by calling consec8(i) for each
127-
// possible uint8 value, which is defined as:
128-
//
129-
// // consec8 counts the maximum number of consecutive 0 bits
130-
// // in a uint8.
131-
// func consec8(n uint8) int {
132-
// n = ^n
133-
// i := 0
134-
// for n != 0 {
135-
// n &= (n << 1)
136-
// i++
137-
// }
138-
// return i
139-
// }
140-
var consec8tab = [256]uint{
141-
8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
142-
4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
143-
5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
144-
4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
145-
6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
146-
4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
147-
5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
148-
4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
149-
7, 6, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
150-
4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
151-
5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
152-
4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
153-
6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
154-
4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
155-
5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
156-
4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 0,
157-
}
158-
159123
// summarize returns a packed summary of the bitmap in pallocBits.
160124
func (b *pallocBits) summarize() pallocSum {
161-
// TODO(mknyszek): There may be something more clever to be done
162-
// here to make the summarize operation more efficient. For example,
163-
// we can compute start and end with 64-bit wide operations easily,
164-
// but max is a bit more complex. Perhaps there exists some way to
165-
// leverage the 64-bit start and end to our advantage?
166-
var start, max, end uint
125+
var start, max, cur uint
126+
const notSetYet = ^uint(0) // sentinel for start value
127+
start = notSetYet
167128
for i := 0; i < len(b); i++ {
168-
a := b[i]
169-
for j := 0; j < 64; j += 8 {
170-
k := uint8(a >> j)
171-
172-
// Compute start.
173-
si := uint(sys.TrailingZeros8(k))
174-
if start == uint(i*64+j) {
175-
start += si
176-
}
129+
x := b[i]
130+
if x == 0 {
131+
cur += 64
132+
continue
133+
}
134+
t := uint(sys.TrailingZeros64(x))
135+
l := uint(sys.LeadingZeros64(x))
177136

178-
// Compute max.
179-
if end+si > max {
180-
max = end + si
181-
}
182-
if mi := consec8tab[k]; mi > max {
183-
max = mi
137+
// Finish any region spanning the uint64s
138+
cur += t
139+
if start == notSetYet {
140+
start = cur
141+
}
142+
if cur > max {
143+
max = cur
144+
}
145+
// Final region that might span to next uint64
146+
cur = l
147+
}
148+
if start == notSetYet {
149+
// Made it all the way through without finding a single 1 bit.
150+
const n = uint(64 * len(b))
151+
return packPallocSum(n, n, n)
152+
}
153+
if cur > max {
154+
max = cur
155+
}
156+
if max >= 64-2 {
157+
// There is no way an internal run of zeros could beat max.
158+
return packPallocSum(start, max, cur)
159+
}
160+
// Now look inside each uint64 for runs of zeros.
161+
// All uint64s must be nonzero, or we would have aborted above.
162+
outer:
163+
for i := 0; i < len(b); i++ {
164+
x := b[i]
165+
166+
// Look inside this uint64. We have a pattern like
167+
// 000000 1xxxxx1 000000
168+
// We need to look inside the 1xxxxx1 for any contiguous
169+
// region of zeros.
170+
171+
// We already know the trailing zeros are no larger than max. Remove them.
172+
x >>= sys.TrailingZeros64(x) & 63
173+
if x&(x+1) == 0 { // no more zeros (except at the top).
174+
continue
175+
}
176+
177+
// Strategy: shrink all runs of zeros by max. If any runs of zero
178+
// remain, then we've identified a larger maximum zero run.
179+
p := max // number of zeros we still need to shrink by.
180+
k := uint(1) // current minimum length of runs of ones in x.
181+
for {
182+
// Shrink all runs of zeros by p places (except the top zeros).
183+
for p > 0 {
184+
if p <= k {
185+
// Shift p ones down into the top of each run of zeros.
186+
x |= x >> (p & 63)
187+
if x&(x+1) == 0 { // no more zeros (except at the top).
188+
continue outer
189+
}
190+
break
191+
}
192+
// Shift k ones down into the top of each run of zeros.
193+
x |= x >> (k & 63)
194+
if x&(x+1) == 0 { // no more zeros (except at the top).
195+
continue outer
196+
}
197+
p -= k
198+
// We've just doubled the minimum length of 1-runs.
199+
// This allows us to shift farther in the next iteration.
200+
k *= 2
184201
}
185202

186-
// Compute end.
187-
if k == 0 {
188-
end += 8
189-
} else {
190-
end = uint(sys.LeadingZeros8(k))
203+
// The length of the lowest-order zero run is an increment to our maximum.
204+
j := uint(sys.TrailingZeros64(^x)) // count contiguous trailing ones
205+
x >>= j & 63 // remove trailing ones
206+
j = uint(sys.TrailingZeros64(x)) // count contiguous trailing zeros
207+
x >>= j & 63 // remove zeros
208+
max += j // we have a new maximum!
209+
if x&(x+1) == 0 { // no more zeros (except at the top).
210+
continue outer
191211
}
212+
p = j // remove j more zeros from each zero run.
192213
}
193214
}
194-
return packPallocSum(start, max, end)
215+
return packPallocSum(start, max, cur)
195216
}
196217

197218
// find searches for npages contiguous free pages in pallocBits and returns

src/runtime/mpallocbits_test.go

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ func invertPallocBits(b *PallocBits) {
101101

102102
// Ensures two packed summaries are identical, and reports a detailed description
103103
// of the difference if they're not.
104-
func checkPallocSum(t *testing.T, got, want PallocSum) {
104+
func checkPallocSum(t testing.TB, got, want PallocSum) {
105105
if got.Start() != want.Start() {
106106
t.Errorf("inconsistent start: got %d, want %d", got.Start(), want.Start())
107107
}
@@ -297,17 +297,29 @@ func TestPallocBitsSummarize(t *testing.T) {
297297

298298
// Benchmarks how quickly we can summarize a PallocBits.
299299
func BenchmarkPallocBitsSummarize(b *testing.B) {
300-
buf0 := new(PallocBits)
301-
buf1 := new(PallocBits)
302-
for i := 0; i < len(buf1); i++ {
303-
buf1[i] = ^uint64(0)
304-
}
305-
bufa := new(PallocBits)
306-
for i := 0; i < len(bufa); i++ {
307-
bufa[i] = 0xaa
308-
}
309-
for _, buf := range []*PallocBits{buf0, buf1, bufa} {
310-
b.Run(fmt.Sprintf("Unpacked%02X", buf[0]), func(b *testing.B) {
300+
patterns := []uint64{
301+
0,
302+
^uint64(0),
303+
0xaa,
304+
0xaaaaaaaaaaaaaaaa,
305+
0x80000000aaaaaaaa,
306+
0xaaaaaaaa00000001,
307+
0xbbbbbbbbbbbbbbbb,
308+
0x80000000bbbbbbbb,
309+
0xbbbbbbbb00000001,
310+
0xcccccccccccccccc,
311+
0x4444444444444444,
312+
0x4040404040404040,
313+
0x4000400040004000,
314+
0x1000404044ccaaff,
315+
}
316+
for _, p := range patterns {
317+
buf := new(PallocBits)
318+
for i := 0; i < len(buf); i++ {
319+
buf[i] = p
320+
}
321+
b.Run(fmt.Sprintf("Unpacked%02X", p), func(b *testing.B) {
322+
checkPallocSum(b, buf.Summarize(), SummarizeSlow(buf))
311323
for i := 0; i < b.N; i++ {
312324
buf.Summarize()
313325
}

0 commit comments

Comments
 (0)