Closed
Description
There is a Neon SHA3 XAR operation for v2i64, but not for v4i32, v8i16, or v16i8. If sve2-sha3 is available, we can use the SVE instructions for those types instead.
https://godbolt.org/z/9hdqKoWMx (G1 and F1 are already OK).
Compare with scalable vectors: https://godbolt.org/z/GhazeoaWY
https://godbolt.org/z/fejTchexj
/* 16-byte vector types (GCC/Clang vector_size extension).  */
typedef char v16qi __attribute__ ((vector_size (16)));
typedef unsigned short v8hi __attribute__ ((vector_size (16)));
typedef unsigned int v4si __attribute__ ((vector_size (16)));
typedef unsigned long long v2di __attribute__ ((vector_size (16)));

/* 8-byte vector types.  */
typedef char v8qi __attribute__ ((vector_size (8)));
typedef unsigned short v4hi __attribute__ ((vector_size (8)));
typedef unsigned int v2si __attribute__ ((vector_size (8)));
/* Per-lane (r >> 39) | (r << 25) on a v2di; with 64-bit lanes this is a
   rotate right by 39.  */
v2di
G1 (v2di r)
{
  v2di low_bits = r >> 39;
  v2di high_bits = r << 25;
  return low_bits | high_bits;
}
/* Per-lane (r >> 23) | (r << 9) on a v4si; with 32-bit lanes this is a
   rotate right by 23.  */
v4si
G2 (v4si r)
{
  v4si rotated = r >> 23;
  rotated |= r << 9;
  return rotated;
}
/* Per-lane (r >> 5) | (r << 11) on a v8hi; with 16-bit lanes this is a
   rotate right by 5.  */
v8hi
G3 (v8hi r)
{
  v8hi shifted_down = r >> 5;
  v8hi shifted_up = r << 11;
  return shifted_down | shifted_up;
}
/* Per-lane (r << 2) | (r >> 6) on a v16qi.  This is a rotate left by 2
   only if plain char is unsigned on the target, so that >> is a logical
   shift -- NOTE(review): confirm for the target being tested.  */
v16qi
G4 (v16qi r)
{
  v16qi shifted_up = r << 2;
  v16qi shifted_down = r >> 6;
  return shifted_up | shifted_down;
}
/* Per-lane (r >> 22) | (r << 10) on a v2si; with 32-bit lanes this is a
   rotate right by 22.  */
v2si
G5 (v2si r)
{
  v2si rotated = r >> 22;
  rotated |= r << 10;
  return rotated;
}
/* Per-lane (r >> 7) | (r << 9) on a v4hi; with 16-bit lanes this is a
   rotate right by 7.  */
v4hi
G6 (v4hi r)
{
  v4hi low_bits = r >> 7;
  v4hi high_bits = r << 9;
  return low_bits | high_bits;
}
/* Per-lane (r << 3) | (r >> 5) on a v8qi.  This is a rotate left by 3
   only if plain char is unsigned on the target, so that >> is a logical
   shift -- NOTE(review): confirm for the target being tested.  */
v8qi
G7 (v8qi r)
{
  v8qi rotated = r << 3;
  rotated |= r >> 5;
  return rotated;
}
See #137162; this is an extension of that issue.