Skip to content

Commit 410dae2

Browse files
authored
Unroll sketch increment (#653)
* unroll
* bench
* unroll freq
* comment
* rem extra file

---------
1 parent b34b12c commit 410dae2

File tree

5 files changed

+75
-28
lines changed

5 files changed

+75
-28
lines changed

BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs

+3
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212

1313
namespace BitFaster.Caching.Benchmarks.Lfu
1414
{
15+
// Block sketch implementation without:
16+
// - Pinned buffer for vector code paths
17+
// - Loop unroll for non-vector code paths
1518
internal class CmSketchNoPin<T, I>
1619
where T : notnull
1720
where I : struct, IsaProbe

BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs

+15-3
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ public class SketchFrequency
2424
private CmSketchFlat<int, DisableHardwareIntrinsics> flatStd;
2525
private CmSketchFlat<int, DetectIsa> flatAvx;
2626

27-
private CmSketchCore<int, DisableHardwareIntrinsics> blockStd;
27+
private CmSketchNoPin<int, DisableHardwareIntrinsics> blockStdNoUnroll;
28+
private CmSketchCore<int, DisableHardwareIntrinsics> blockStdUnroll;
2829
private CmSketchNoPin<int, DetectIsa> blockAvxNoPin;
2930
private CmSketchCore<int, DetectIsa> blockAvx;
3031

@@ -37,7 +38,8 @@ public void Setup()
3738
flatStd = new CmSketchFlat<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
3839
flatAvx = new CmSketchFlat<int, DetectIsa>(Size, EqualityComparer<int>.Default);
3940

40-
blockStd = new CmSketchCore<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
41+
blockStdNoUnroll = new CmSketchNoPin<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
42+
blockStdUnroll = new CmSketchCore<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
4143
blockAvxNoPin = new CmSketchNoPin<int, DetectIsa>(Size, EqualityComparer<int>.Default);
4244
blockAvx = new CmSketchCore<int, DetectIsa>(Size, EqualityComparer<int>.Default);
4345
}
@@ -67,7 +69,17 @@ public int FrequencyBlock()
6769
{
6870
int count = 0;
6971
for (int i = 0; i < iterations; i++)
70-
count += blockStd.EstimateFrequency(i) > blockStd.EstimateFrequency(i + 1) ? 1 : 0;
72+
count += blockStdNoUnroll.EstimateFrequency(i) > blockStdNoUnroll.EstimateFrequency(i + 1) ? 1 : 0;
73+
74+
return count;
75+
}
76+
77+
[Benchmark(OperationsPerInvoke = iterations)]
78+
public int FrequencyBlockUnroll()
79+
{
80+
int count = 0;
81+
for (int i = 0; i < iterations; i++)
82+
count += blockStdUnroll.EstimateFrequency(i) > blockStdUnroll.EstimateFrequency(i + 1) ? 1 : 0;
7183

7284
return count;
7385
}

BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs

+14-3
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ public class SketchIncrement
2323
private CmSketchFlat<int, DisableHardwareIntrinsics> flatStd;
2424
private CmSketchFlat<int, DetectIsa> flatAvx;
2525

26-
private CmSketchCore<int, DisableHardwareIntrinsics> blockStd;
26+
private CmSketchNoPin<int, DisableHardwareIntrinsics> blockStdNoUnroll;
27+
private CmSketchCore<int, DisableHardwareIntrinsics> blockStdUnroll;
2728
private CmSketchNoPin<int, DetectIsa> blockAvxNoPin;
2829
private CmSketchCore<int, DetectIsa> blockAvx;
2930

@@ -37,7 +38,8 @@ public void Setup()
3738
flatStd = new CmSketchFlat<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
3839
flatAvx = new CmSketchFlat<int, DetectIsa>(Size, EqualityComparer<int>.Default);
3940

40-
blockStd = new CmSketchCore<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
41+
blockStdNoUnroll = new CmSketchNoPin<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
42+
blockStdUnroll = new CmSketchCore<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
4143
blockAvxNoPin = new CmSketchNoPin<int, DetectIsa>(Size, EqualityComparer<int>.Default);
4244
blockAvx = new CmSketchCore<int, DetectIsa>(Size, EqualityComparer<int>.Default);
4345
}
@@ -65,7 +67,16 @@ public void IncBlock()
6567
{
6668
for (int i = 0; i < iterations; i++)
6769
{
68-
blockStd.Increment(i);
70+
blockStdNoUnroll.Increment(i);
71+
}
72+
}
73+
74+
[Benchmark(OperationsPerInvoke = iterations)]
75+
public void IncBlockUnroll()
76+
{
77+
for (int i = 0; i < iterations; i++)
78+
{
79+
blockStdUnroll.Increment(i);
6980
}
7081
}
7182

BitFaster.Caching/BitFaster.Caching.csproj

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
<PropertyGroup>
44
<TargetFrameworks>netstandard2.0;netcoreapp3.1;net6.0</TargetFrameworks>
5-
<LangVersion>10.0</LangVersion>
5+
<LangVersion>11.0</LangVersion>
66
<Authors>Alex Peck</Authors>
77
<Company />
88
<Product>BitFaster.Caching</Product>

BitFaster.Caching/Lfu/CmSketchCore.cs

+42-21
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#if !NETSTANDARD2_0
99
using System.Runtime.Intrinsics;
1010
using System.Runtime.Intrinsics.X86;
11+
1112
#endif
1213

1314
#if NET6_0_OR_GREATER
@@ -169,41 +170,61 @@ private void EnsureCapacity(long maximumSize)
169170

170171
private unsafe int EstimateFrequencyStd(T value)
171172
{
172-
var count = stackalloc int[4];
173173
int blockHash = Spread(comparer.GetHashCode(value));
174174
int counterHash = Rehash(blockHash);
175175
int block = (blockHash & blockMask) << 3;
176176

177-
for (int i = 0; i < 4; i++)
178-
{
179-
int h = (int)((uint)counterHash >> (i << 3));
180-
int index = (h >> 1) & 15;
181-
int offset = h & 1;
182-
count[i] = (int)(((ulong)table[block + offset + (i << 1)] >> (index << 2)) & 0xfL);
183-
}
184-
return Math.Min(Math.Min(count[0], count[1]), Math.Min(count[2], count[3]));
177+
// Loop unrolling improves throughput
178+
int h0 = counterHash;
179+
int h1 = counterHash >>> 8;
180+
int h2 = counterHash >>> 16;
181+
int h3 = counterHash >>> 24;
182+
183+
int index0 = (h0 >>> 1) & 15;
184+
int index1 = (h1 >>> 1) & 15;
185+
int index2 = (h2 >>> 1) & 15;
186+
int index3 = (h3 >>> 1) & 15;
187+
188+
int slot0 = block + (h0 & 1);
189+
int slot1 = block + (h1 & 1) + 2;
190+
int slot2 = block + (h2 & 1) + 4;
191+
int slot3 = block + (h3 & 1) + 6;
192+
193+
int count0 = (int)((table[slot0] >>> (index0 << 2)) & 0xfL);
194+
int count1 = (int)((table[slot1] >>> (index1 << 2)) & 0xfL);
195+
int count2 = (int)((table[slot2] >>> (index2 << 2)) & 0xfL);
196+
int count3 = (int)((table[slot3] >>> (index3 << 2)) & 0xfL);
197+
198+
return Math.Min(Math.Min(count0, count1), Math.Min(count2, count3));
185199
}
186200

187201
private unsafe void IncrementStd(T value)
188202
{
189-
var index = stackalloc int[8];
190203
int blockHash = Spread(comparer.GetHashCode(value));
191204
int counterHash = Rehash(blockHash);
192205
int block = (blockHash & blockMask) << 3;
193206

194-
for (int i = 0; i < 4; i++)
195-
{
196-
int h = (int)((uint)counterHash >> (i << 3));
197-
index[i] = (h >> 1) & 15;
198-
int offset = h & 1;
199-
index[i + 4] = block + offset + (i << 1);
200-
}
207+
// Loop unrolling improves throughput
208+
int h0 = counterHash;
209+
int h1 = counterHash >>> 8;
210+
int h2 = counterHash >>> 16;
211+
int h3 = counterHash >>> 24;
212+
213+
int index0 = (h0 >>> 1) & 15;
214+
int index1 = (h1 >>> 1) & 15;
215+
int index2 = (h2 >>> 1) & 15;
216+
int index3 = (h3 >>> 1) & 15;
217+
218+
int slot0 = block + (h0 & 1);
219+
int slot1 = block + (h1 & 1) + 2;
220+
int slot2 = block + (h2 & 1) + 4;
221+
int slot3 = block + (h3 & 1) + 6;
201222

202223
bool added =
203-
IncrementAt(index[4], index[0])
204-
| IncrementAt(index[5], index[1])
205-
| IncrementAt(index[6], index[2])
206-
| IncrementAt(index[7], index[3]);
224+
IncrementAt(slot0, index0)
225+
| IncrementAt(slot1, index1)
226+
| IncrementAt(slot2, index2)
227+
| IncrementAt(slot3, index3);
207228

208229
if (added && (++size == sampleSize))
209230
{

0 commit comments

Comments
 (0)