-
Notifications
You must be signed in to change notification settings - Fork 1.9k
MurmurHash Onnx Export #5013
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
MurmurHash Onnx Export #5013
Changes from 27 commits
e9c522d
e909636
c14b085
f9dd77c
f0d513c
d1a4737
edf6d2b
c0167ec
257ed6d
d7e6c19
aff61f5
566dd50
9f91326
be1f77e
25fe50b
c1aa8d1
e04e4f8
46036be
17c7893
501f49e
dbc3e33
46e7ead
63d31dd
6e5dac7
ae4ec77
c649393
306d0ca
57d2f35
bc3b068
ac31058
a0712db
072a9db
0fe7de7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -174,6 +174,94 @@ public static uint MurmurHash(uint hash, ReadOnlySpan<char> span, bool toUpper = | |
return hash; | ||
} | ||
|
||
// MurmurHashV2 is an implementation of MurmurHash3_x86_32 (https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp#L94) used by Onnxruntime's | ||
// MurmurHash operator, implemented to have matching hashing results between ORT and ML.NET. | ||
// One key difference between the two implementations is that strings use a different hashing algorithm in the previous version, but | ||
// every type uses the same implementation on V2. | ||
// The V1 String Hashing Algorithm had the following properties: | ||
// - Case Conversion: used inside the hashing algorithm in ML.Net. | ||
// - Mock UTF8 encoding: strings in C# are UTF16 and need to be converted to UTF8 before hashing | ||
/// <summary>
/// Hashes a span of UTF-16 characters by encoding each character into a pseudo UTF-8 byte
/// sequence and feeding those bytes, four at a time, through the MurmurHash3 (x86, 32-bit)
/// block/tail/finalization steps.
/// The encoding is "mock" UTF-8: every <see cref="char"/> is encoded on its own as a one-, two-
/// or three-byte sequence, so surrogate halves are NOT combined into a four-byte sequence.
/// For text outside the Basic Multilingual Plane (for example emoji) the bytes therefore differ
/// from real UTF-8.
/// </summary>
/// <param name="hash">Seed, or the running hash value to continue from.</param>
/// <param name="span">Characters to hash.</param>
/// <param name="toUpper">When true, each character is passed through
/// <see cref="char.ToUpperInvariant(char)"/> before being encoded.</param>
/// <returns>The finalized 32-bit hash.</returns>
public static uint MurmurHashV2(uint hash, ReadOnlySpan<char> span, bool toUpper = false)
{
    // Byte length (in pseudo UTF-8 form).
    int len = 0;

    // Pending bytes that have not been folded into the hash yet. They are kept little-endian:
    // the first pending byte lives in the lowest 8 bits of cur, and bits is the number of
    // valid bits (always a multiple of 8, at most 24 between iterations).
    ulong cur = 0;
    int bits = 0;
    for (int ich = 0; ich < span.Length; ich++)
    {
        Contracts.Assert((bits & 0x7) == 0);
        Contracts.Assert((uint)bits <= 24);
        Contracts.Assert(cur <= 0x00FFFFFF);

        uint ch = toUpper ? char.ToUpperInvariant(span[ich]) : span[ich];
        if (ch <= 0x007F)
        {
            // Single byte: 0xxxxxxx.
            cur |= (ulong)ch << bits;
            bits += 8;
        }
        else if (ch <= 0x07FF)
        {
            // Two bytes: 110xxxxx 10xxxxxx. The sequence is assembled with the lead byte in the
            // lowest position and only then shifted to the insertion point, so bytes that are
            // already pending in cur are left untouched.
            ulong encoded = (0xC0 | (ch >> 6)) | ((0x80 | (ch & 0x3F)) << 8);
            cur |= encoded << bits;
            bits += 16;
        }
        else
        {
            // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx. A single UTF-16 code unit never exceeds
            // 0xFFFF, so no four-byte case can occur here.
            Contracts.Assert(ch <= 0xFFFF);
            ulong encoded = (0xE0 | (ch >> 12)) | ((0x80 | ((ch >> 6) & 0x3F)) << 8) | ((0x80 | (ch & 0x3F)) << 16);
            cur |= encoded << bits;
            bits += 24;
        }

        if (bits >= 32)
        {
            // A full 4-byte block is available: fold it in and keep the overflow bytes pending.
            hash = MurmurRound(hash, (uint)cur);
            cur = cur >> 32;
            bits -= 32;
            len += 4;
        }
    }
    Contracts.Assert((bits & 0x7) == 0);
    Contracts.Assert((uint)bits <= 24);
    Contracts.Assert(cur <= 0x00FFFFFF);

    // Account for the 0-3 bytes still pending; len was a multiple of 4 up to this point,
    // so (len & 3) below equals the number of pending bytes.
    len += bits / 8;

    // Tail processing: mix the remaining bytes as in the MurmurHash3 tail step.
    uint k1 = 0;
    switch (len & 3)
    {
        case 3:
            k1 ^= (uint)(((cur >> 16) & 0xFF) << 16);
            goto case 2;
        case 2:
            k1 ^= (uint)((cur >> 8) & 0xFF) << 8;
            goto case 1;
        case 1:
            k1 ^= (uint)(cur & 0xFF);
            k1 *= 0xCC9E2D51;
            k1 = Rotate(k1, 15);
            k1 *= 0x1B873593;
            hash ^= k1;
            break;
    }

    // Final mixing ritual for the hash; the byte length takes part in the finalization.
    hash = MixHashV2(hash, len);

    return hash;
}
|
||
/// <summary> | ||
/// Implements the murmur hash 3 algorithm, using a mock UTF-8 encoding. | ||
/// The UTF-8 conversion ignores the possibilities of unicode planes other than the 0th. | ||
|
@@ -284,6 +372,18 @@ public static uint MixHash(uint hash) | |
return hash; | ||
} | ||
|
||
/// <summary>
/// Finalization step of the V2 hash: XORs the byte length into the running hash and then
/// applies a fixed sequence of xor-shift and multiply steps to the result.
/// </summary>
/// <param name="hash">Running hash value to finalize.</param>
/// <param name="len">Number of bytes that were hashed.</param>
/// <returns>The finalized 32-bit hash.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static uint MixHashV2(uint hash, int len)
{
    uint mixed = hash ^ (uint)len;
    mixed = (mixed ^ (mixed >> 16)) * 0x85ebca6b;
    mixed = (mixed ^ (mixed >> 13)) * 0xc2b2ae35;
    return mixed ^ (mixed >> 16);
}
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
private static uint Rotate(uint x, int r) | ||
{ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please elaborate on what Mock UTF8 encoding is. If I recall correctly, we are omitting certain code pages.
@KsenijaS Didn't you have an implementation that supported the full UTF8 encoding? Can we use that?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, but also partial UTF8 support is sufficient. Mock UTF8 doesn't cover emojis and some special math characters