From 1d6bc62cb49e6ed0ec79a298504d33940f857aac Mon Sep 17 00:00:00 2001 From: Yuri Gaevsky Date: Sat, 13 Jan 2024 11:52:05 +0300 Subject: [PATCH 1/6] 8322174: RISC-V: C2 VectorizedHashCode RVV Version --- .../cpu/riscv/c2_MacroAssembler_riscv.cpp | 162 +++++++++++++++ .../cpu/riscv/c2_MacroAssembler_riscv.hpp | 8 +- src/hotspot/cpu/riscv/riscv.ad | 196 ++++++++++++++++++ src/hotspot/cpu/riscv/riscv_v.ad | 31 +++ src/hotspot/cpu/riscv/stubGenerator_riscv.cpp | 20 ++ src/hotspot/cpu/riscv/stubRoutines_riscv.cpp | 1 + src/hotspot/cpu/riscv/stubRoutines_riscv.hpp | 5 + 7 files changed, 422 insertions(+), 1 deletion(-) diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp index 711eb2100912b..5c150d02f7e50 100644 --- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp @@ -1465,6 +1465,7 @@ void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register res Register tmp4, Register tmp5, Register tmp6, BasicType eltype) { + assert(!UseRVV, "sanity"); assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1); const int elsize = arrays_hashcode_elsize(eltype); @@ -1539,6 +1540,137 @@ void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register res BLOCK_COMMENT("} // arrays_hashcode"); } +void C2_MacroAssembler::arrays_hashcode_v(Register ary, Register cnt, Register result, + Register tmp1, Register tmp2, Register tmp3, + Register tmp4, Register tmp5, Register tmp6, + BasicType eltype) +{ + assert(UseRVV, "sanity"); + assert(StubRoutines::riscv::arrays_hashcode_powers_of_31() != nullptr, "sanity"); + assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1); + + const int num_8b_elems_in_vec = MaxVectorSize; + const int elsize_bytes = arrays_hashcode_elsize(eltype); + const int elsize_shift = exact_log2(elsize_bytes); + const int vec_step_bytes = num_8b_elems_in_vec << elsize_shift; + const address adr_pows31 = StubRoutines::riscv::arrays_hashcode_powers_of_31() + + sizeof(jint); + + switch (eltype) { + case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode_v(unsigned byte) {"); break; + case T_CHAR: BLOCK_COMMENT("arrays_hashcode_v(char) {"); break; + case T_BYTE: BLOCK_COMMENT("arrays_hashcode_v(byte) {"); break; + case T_SHORT: BLOCK_COMMENT("arrays_hashcode_v(short) {"); break; + case T_INT: BLOCK_COMMENT("arrays_hashcode_v(int) {"); break; + default: + ShouldNotReachHere(); + } + + const int scalar_stride = 4; + const Register pow31_4 = tmp1; + const Register pow31_3 = tmp2; + const Register pow31_2 = tmp3; + const Register chunks = tmp4; + const Register chunks_end = chunks; + + const Register pows31 = tmp1; + const VectorRegister v_coeffs = v4; + const VectorRegister v_src = v8; + const VectorRegister v_sum = v12; + const VectorRegister v_powmax = v16; + const VectorRegister v_result = v20; + const VectorRegister v_tmp = v24; + const VectorRegister v_zred = v28; + + Label DONE, TAIL, TAIL_LOOP, WIDE_TAIL, WIDE_LOOP, VEC_LOOP; + + // result has a value initially + + beqz(cnt, DONE); + + andi(chunks, cnt, ~(num_8b_elems_in_vec-1)); + beqz(chunks, WIDE_TAIL); + + subw(cnt, cnt, chunks); + slli(chunks_end, chunks, elsize_shift); + add(chunks_end, ary, chunks_end); + + // load pre-calculated powers of 31: + // 31^^MaxVectorSize ==> scalar register + // 31^^(MaxVectorSize-1)...31^^0 ==> vector registers + la(pows31, ExternalAddress(adr_pows31)); + mv(t1, num_8b_elems_in_vec); + vsetvli(t0, t1, Assembler::e32, Assembler::m4); + vle32_v(v_coeffs, pows31); + lw(pows31, Address(pows31, -1 * sizeof(jint))); + // clear vector registers used in intermediate calculations + vmv_v_i(v_sum, 0); + vmv_v_i(v_powmax, 0); + vmv_v_i(v_result, 0); + vmv_s_x(v_zred, x0); + // set initial values + vmv_s_x(v_powmax, pows31); + vmv_s_x(v_result, result); + + bind(VEC_LOOP); + vmul_vv(v_result, v_result, v_powmax); + arrays_hashcode_vec_elload(v_src, v_tmp, ary, eltype); + vmul_vv(v_src, v_src, v_coeffs); + vredsum_vs(v_sum, v_src, v_zred); + vadd_vv(v_result, v_result, v_sum); + addi(ary, ary, vec_step_bytes); + bne(ary, chunks_end, VEC_LOOP); + // finally remember calculated result value in scalar register + vmv_x_s(result, v_result); + beqz(cnt, DONE); + + bind(WIDE_TAIL); + andi(chunks, cnt, ~(scalar_stride-1)); + beqz(chunks, TAIL); + + mv(pow31_4, 923521); // [31^^4] + mv(pow31_3, 29791); // [31^^3] + mv(pow31_2, 961); // [31^^2] + + slli(chunks_end, chunks, elsize_shift); + add(chunks_end, ary, chunks_end); + andi(cnt, cnt, scalar_stride-1); // don't forget about tail! + + bind(WIDE_LOOP); + mulw(result, result, pow31_4); // 31^^4 * h + arrays_hashcode_elload(t0, Address(ary, 0 * elsize_bytes), eltype); + arrays_hashcode_elload(t1, Address(ary, 1 * elsize_bytes), eltype); + arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize_bytes), eltype); + arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize_bytes), eltype); + mulw(t0, t0, pow31_3); // 31^^3 * ary[i+0] + addw(result, result, t0); + mulw(t1, t1, pow31_2); // 31^^2 * ary[i+1] + addw(result, result, t1); + slli(t0, tmp5, 5); // optimize 31^^1 * ary[i+2] + subw(tmp5, t0, tmp5); // with ary[i+2]<<5 - ary[i+2] + addw(result, result, tmp5); + addw(result, result, tmp6); // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1] + // + 31^^1 * ary[i+2] + 31^^0 * ary[i+3] + addi(ary, ary, elsize_bytes * scalar_stride); + bne(ary, chunks_end, WIDE_LOOP); + beqz(cnt, DONE); + + bind(TAIL); + slli(chunks_end, cnt, elsize_shift); + add(chunks_end, ary, chunks_end); + + bind(TAIL_LOOP); + arrays_hashcode_elload(t0, Address(ary), eltype); + slli(t1, result, 5); // optimize 31 * result + subw(result, t1, result); // with result<<5 - result + addw(result, result, t0); + addi(ary, ary, elsize_bytes); + bne(ary, chunks_end, TAIL_LOOP); + + bind(DONE); + BLOCK_COMMENT("} // arrays_hashcode_v"); +} + int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { switch (eltype) { case T_BOOLEAN: return sizeof(jboolean); @@ -1565,6 +1697,36 @@ void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicT } } +void C2_MacroAssembler::arrays_hashcode_vec_elload(VectorRegister varr, + VectorRegister vtmp, + Register array, + BasicType eltype) { + assert((T_INT == eltype) || (varr != vtmp), "should be"); + switch (eltype) { + case T_BOOLEAN: + vle8_v(vtmp, array); + vzext_vf4(varr, vtmp); + break; + case T_BYTE: + vle8_v(vtmp, array); + vsext_vf4(varr, vtmp); + break; + case T_CHAR: + vle16_v(vtmp, array); + vzext_vf2(varr, vtmp); + break; + case T_SHORT: + vle16_v(vtmp, array); + vsext_vf2(varr, vtmp); + break; + case T_INT: + vle32_v(varr, array); + break; + default: + ShouldNotReachHere(); + } +} + typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far); typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label, bool is_far, bool is_unordered); diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp index 4940ce5fe9e94..f572c7eea67f1 100644 --- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp @@ -87,9 +87,15 @@ Register tmp3, Register tmp4, Register tmp5, Register tmp6, BasicType eltype); - // helper function for arrays_hashcode + void arrays_hashcode_v(Register ary, Register cnt, Register result, + Register tmp1, Register tmp2, + Register tmp3, Register tmp4, + Register tmp5, Register tmp6, + BasicType eltype); + // helper functions for arrays_hashcode int arrays_hashcode_elsize(BasicType eltype); void arrays_hashcode_elload(Register dst, Address src, BasicType eltype); + void arrays_hashcode_vec_elload(VectorRegister varr, VectorRegister vtmp, Register array, BasicType eltype); void string_equals(Register r1, Register r2, Register result, Register cnt1, diff --git a/src/hotspot/cpu/riscv/riscv.ad b/src/hotspot/cpu/riscv/riscv.ad index 7e1291f49d74c..94753876d191c 100644 --- a/src/hotspot/cpu/riscv/riscv.ad +++ b/src/hotspot/cpu/riscv/riscv.ad @@ -954,6 +954,71 @@ reg_class v15_reg( V15, V15_H, V15_J, V15_K ); +// class for vector register v16 +reg_class v16_reg( + V16, V16_H, V16_J, V16_K +); + +// class for vector register v17 +reg_class v17_reg( + V17, V17_H, V17_J, V17_K +); + +// class for vector register v18 +reg_class v18_reg( + V18, V18_H, V18_J, V18_K +); + +// class for vector register v19 +reg_class v19_reg( + V19, V19_H, V19_J, V19_K +); + +// class for vector register v20 +reg_class v20_reg( + V20, V20_H, V20_J, V20_K +); + +// class for vector register v21 +reg_class v21_reg( + V21, V21_H, V21_J, V21_K +); + +// class for vector register v22 +reg_class v22_reg( + V22, V22_H, V22_J, V22_K +); + +// class for vector register v23 +reg_class v23_reg( + V23, V23_H, V23_J, V23_K +); + +// class for vector register v24 +reg_class v24_reg( + V24, V24_H, V24_J, V24_K +); + +// class for vector register v25 +reg_class v25_reg( + V25, V25_H, V25_J, V25_K +); + +// class for vector register v26 +reg_class v26_reg( + V26, V26_H, V26_J, V26_K +); + +// class for vector register v27 +reg_class v27_reg( + V27, V27_H, V27_J, V27_K +); + +// class for vector register v28 +reg_class v28_reg( + V28, V28_H, V28_J, V28_K +); + // class for condition codes reg_class reg_flags(RFLAGS); @@ -3573,6 +3638,136 @@ operand vReg_V15() interface(REG_INTER); %} +operand vReg_V16() +%{ + constraint(ALLOC_IN_RC(v16_reg)); + match(VecA); + match(vReg); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V17() +%{ + constraint(ALLOC_IN_RC(v17_reg)); + match(VecA); + match(vReg); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V18() +%{ + constraint(ALLOC_IN_RC(v18_reg)); + match(VecA); + match(vReg); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V19() +%{ + constraint(ALLOC_IN_RC(v19_reg)); + match(VecA); + match(vReg); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V20() +%{ + constraint(ALLOC_IN_RC(v20_reg)); + match(VecA); + match(vReg); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V21() +%{ + constraint(ALLOC_IN_RC(v21_reg)); + match(VecA); + match(vReg); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V22() +%{ + constraint(ALLOC_IN_RC(v22_reg)); + match(VecA); + match(vReg); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V23() +%{ + constraint(ALLOC_IN_RC(v23_reg)); + match(VecA); + match(vReg); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V24() +%{ + constraint(ALLOC_IN_RC(v24_reg)); + match(VecA); + match(vReg); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V25() +%{ + constraint(ALLOC_IN_RC(v25_reg)); + match(VecA); + match(vReg); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V26() +%{ + constraint(ALLOC_IN_RC(v26_reg)); + match(VecA); + match(vReg); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V27() +%{ + constraint(ALLOC_IN_RC(v27_reg)); + match(VecA); + match(vReg); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vReg_V28() +%{ + constraint(ALLOC_IN_RC(v28_reg)); + match(VecA); + match(vReg); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + operand vRegMask() %{ constraint(ALLOC_IN_RC(vmask_reg)); @@ -10406,6 +10601,7 @@ instruct arrays_hashcode(iRegP_R11 ary, iRegI_R12 cnt, iRegI_R10 result, immI ba iRegLNoSp tmp3, iRegLNoSp tmp4, iRegLNoSp tmp5, iRegLNoSp tmp6, rFlagsReg cr) %{ + predicate(!UseRVV); match(Set result (VectorizedHashCode (Binary ary cnt) (Binary result basic_type))); effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, USE_KILL ary, USE_KILL cnt, USE basic_type, KILL cr); diff --git a/src/hotspot/cpu/riscv/riscv_v.ad b/src/hotspot/cpu/riscv/riscv_v.ad index c163325fc815f..674c873a86933 100644 --- a/src/hotspot/cpu/riscv/riscv_v.ad +++ b/src/hotspot/cpu/riscv/riscv_v.ad @@ -2666,6 +2666,37 @@ instruct varray_equalsC(iRegP_R11 ary1, iRegP_R12 ary2, iRegI_R10 result, ins_pipe(pipe_class_memory); %} +// fast ArraysSupport.vectorizedHashCode +instruct varrays_hashcode(iRegP_R11 ary, iRegI_R12 cnt, iRegI_R10 result, immI basic_type, + vReg_V4 v4, vReg_V5 v5, vReg_V6 v6, vReg_V7 v7, + vReg_V8 v8, vReg_V9 v9, vReg_V10 v10, vReg_V11 v11, + vReg_V12 v12, vReg_V13 v13, vReg_V14 v14, vReg_V15 v15, + vReg_V16 v16, vReg_V17 v17, vReg_V18 v18, vReg_V19 v19, + vReg_V20 v20, vReg_V21 v21, vReg_V22 v22, vReg_V23 v23, + vReg_V24 v24, vReg_V25 v25, vReg_V26 v26, vReg_V27 v27, + vReg_V28 v28, + iRegLNoSp tmp1, iRegLNoSp tmp2, iRegLNoSp tmp3, + iRegLNoSp tmp4, iRegLNoSp tmp5, iRegLNoSp tmp6, rFlagsReg cr) +%{ + predicate(UseRVV && (MaxVectorSize >= 16)); + match(Set result (VectorizedHashCode (Binary ary cnt) (Binary result basic_type))); + effect(USE_KILL ary, USE_KILL cnt, USE basic_type, + TEMP v4, TEMP v5, TEMP v6, TEMP v7, TEMP v8, TEMP v9, TEMP v10, TEMP v11, + TEMP v12, TEMP v13, TEMP v14, TEMP v15, TEMP v16, TEMP v17, TEMP v18, TEMP v19, + TEMP v20, TEMP v21, TEMP v22, TEMP v23, TEMP v24, TEMP v25, TEMP v26, TEMP v27, + TEMP v28, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, + KILL cr); + + format %{ "Array HashCode array[] $ary,$cnt,$result,$basic_type -> $result // KILL all" %} + ins_encode %{ + __ arrays_hashcode_v($ary$$Register, $cnt$$Register, $result$$Register, + $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, + $tmp4$$Register, $tmp5$$Register, $tmp6$$Register, + (BasicType)$basic_type$$constant); + %} + ins_pipe(pipe_class_memory); +%} + instruct vstring_compareU(iRegP_R11 str1, iRegI_R12 cnt1, iRegP_R13 str2, iRegI_R14 cnt2, iRegI_R10 result, vReg_V2 v2, vReg_V3 v3, vReg_V4 v4, vReg_V5 v5, iRegP_R28 tmp1, iRegL_R29 tmp2) diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp index 4bd33d08f8928..17a8b9a367f19 100644 --- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp @@ -5015,6 +5015,22 @@ static const int64_t right_3_bits = right_n_bits(3); return start; } + address generate_arrays_hashcode_powers_of_31() { + const int num_8b_elems_in_vec = MaxVectorSize; + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "arrays_hashcode_powers_of_31"); + address start = __ pc(); + for (int i = num_8b_elems_in_vec; i >= 0; i--) { + jint power_of_31 = 1; + for (int j = i; j > 0; j--) { + power_of_31 = java_multiply(power_of_31, 31); + } + __ emit_int32(power_of_31); + } + + return start; + } + #endif // COMPILER2 #if INCLUDE_JFR @@ -5246,6 +5262,10 @@ static const int64_t right_3_bits = right_n_bits(3); StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); } + + if (UseVectorizedHashCodeIntrinsic && UseRVV && (MaxVectorSize >= 16)) { + StubRoutines::riscv::_arrays_hashcode_powers_of_31 = generate_arrays_hashcode_powers_of_31(); + } #endif // COMPILER2 if (UseSHA256Intrinsics) { diff --git a/src/hotspot/cpu/riscv/stubRoutines_riscv.cpp b/src/hotspot/cpu/riscv/stubRoutines_riscv.cpp index 39068a9a026ac..bae11450379f9 100644 --- a/src/hotspot/cpu/riscv/stubRoutines_riscv.cpp +++ b/src/hotspot/cpu/riscv/stubRoutines_riscv.cpp @@ -53,5 +53,6 @@ address StubRoutines::riscv::_string_indexof_linear_ll = nullptr; address StubRoutines::riscv::_string_indexof_linear_uu = nullptr; address StubRoutines::riscv::_string_indexof_linear_ul = nullptr; address StubRoutines::riscv::_large_byte_array_inflate = nullptr; +address StubRoutines::riscv::_arrays_hashcode_powers_of_31 = nullptr; bool StubRoutines::riscv::_completed = false; diff --git a/src/hotspot/cpu/riscv/stubRoutines_riscv.hpp b/src/hotspot/cpu/riscv/stubRoutines_riscv.hpp index 7c604e8c11cc2..fc4a280b5be65 100644 --- a/src/hotspot/cpu/riscv/stubRoutines_riscv.hpp +++ b/src/hotspot/cpu/riscv/stubRoutines_riscv.hpp @@ -69,6 +69,7 @@ class riscv { static address _string_indexof_linear_uu; static address _string_indexof_linear_ul; static address _large_byte_array_inflate; + static address _arrays_hashcode_powers_of_31; static bool _completed; @@ -146,6 +147,10 @@ class riscv { return _large_byte_array_inflate; } + static address arrays_hashcode_powers_of_31() { + return _arrays_hashcode_powers_of_31; + } + static bool complete() { return _completed; } From b976ca79bf44bb2316a64e03ca850f5b6edebd11 Mon Sep 17 00:00:00 2001 From: Yuri Gaevsky Date: Thu, 25 Jan 2024 17:25:51 +0300 Subject: [PATCH 2/6] Removed checks for (MaxVectorSize >= 16) per @RealFYang suggestion. --- src/hotspot/cpu/riscv/riscv_v.ad | 2 +- src/hotspot/cpu/riscv/stubGenerator_riscv.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hotspot/cpu/riscv/riscv_v.ad b/src/hotspot/cpu/riscv/riscv_v.ad index 674c873a86933..db1e0380c9702 100644 --- a/src/hotspot/cpu/riscv/riscv_v.ad +++ b/src/hotspot/cpu/riscv/riscv_v.ad @@ -2678,7 +2678,7 @@ instruct varrays_hashcode(iRegP_R11 ary, iRegI_R12 cnt, iRegI_R10 result, immI b iRegLNoSp tmp1, iRegLNoSp tmp2, iRegLNoSp tmp3, iRegLNoSp tmp4, iRegLNoSp tmp5, iRegLNoSp tmp6, rFlagsReg cr) %{ - predicate(UseRVV && (MaxVectorSize >= 16)); + predicate(UseRVV); match(Set result (VectorizedHashCode (Binary ary cnt) (Binary result basic_type))); effect(USE_KILL ary, USE_KILL cnt, USE basic_type, TEMP v4, TEMP v5, TEMP v6, TEMP v7, TEMP v8, TEMP v9, TEMP v10, TEMP v11, diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp index 17a8b9a367f19..1eaf09a1cf32e 100644 --- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp @@ -5263,7 +5263,7 @@ static const int64_t right_3_bits = right_n_bits(3); StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); } - if (UseVectorizedHashCodeIntrinsic && UseRVV && (MaxVectorSize >= 16)) { + if (UseVectorizedHashCodeIntrinsic && UseRVV) { StubRoutines::riscv::_arrays_hashcode_powers_of_31 = generate_arrays_hashcode_powers_of_31(); } #endif // COMPILER2 From 7ed3d86eab5f4b18c3cb9f2b9d1f59d3113df46c Mon Sep 17 00:00:00 2001 From: Yuri Gaevsky Date: Thu, 25 Jan 2024 17:40:01 +0300 Subject: [PATCH 3/6] num_8b_elems_in_vec --> nof_vec_elems --- src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp index 5c150d02f7e50..1021ac4360bfd 100644 --- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp @@ -1549,10 +1549,10 @@ void C2_MacroAssembler::arrays_hashcode_v(Register ary, Register cnt, Register r assert(StubRoutines::riscv::arrays_hashcode_powers_of_31() != nullptr, "sanity"); assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1); - const int num_8b_elems_in_vec = MaxVectorSize; + const int nof_vec_elems = MaxVectorSize; const int elsize_bytes = arrays_hashcode_elsize(eltype); const int elsize_shift = exact_log2(elsize_bytes); - const int vec_step_bytes = num_8b_elems_in_vec << elsize_shift; + const int vec_step_bytes = nof_vec_elems << elsize_shift; const address adr_pows31 = StubRoutines::riscv::arrays_hashcode_powers_of_31() + sizeof(jint); @@ -1588,7 +1588,7 @@ void C2_MacroAssembler::arrays_hashcode_v(Register ary, Register cnt, Register r beqz(cnt, DONE); - andi(chunks, cnt, ~(num_8b_elems_in_vec-1)); + andi(chunks, cnt, ~(nof_vec_elems-1)); beqz(chunks, WIDE_TAIL); subw(cnt, cnt, chunks); @@ -1599,7 +1599,7 @@ void C2_MacroAssembler::arrays_hashcode_v(Register ary, Register cnt, Register r // 31^^MaxVectorSize ==> scalar register // 31^^(MaxVectorSize-1)...31^^0 ==> vector registers la(pows31, ExternalAddress(adr_pows31)); - mv(t1, num_8b_elems_in_vec); + mv(t1, nof_vec_elems); vsetvli(t0, t1, Assembler::e32, Assembler::m4); vle32_v(v_coeffs, pows31); lw(pows31, Address(pows31, -1 * sizeof(jint))); From 9ba2768618f3f0842d9288250db0e313b72faec7 Mon Sep 17 00:00:00 2001 From: Yuri Gaevsky Date: Thu, 1 May 2025 17:11:53 +0300 Subject: [PATCH 4/6] Fixed git rebase artifacts. --- src/hotspot/cpu/riscv/stubGenerator_riscv.cpp | 52 ------------------- 1 file changed, 52 deletions(-) diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp index 315a8522829d9..4076383fcbe6c 100644 --- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp @@ -6458,58 +6458,6 @@ static const int64_t right_3_bits = right_n_bits(3); return start; } - void generate_vector_math_stubs() { - if (!UseRVV) { - log_info(library)("vector is not supported, skip loading vector math (sleef) library!"); - return; - } - - // Get native vector math stub routine addresses - void* libsleef = nullptr; - char ebuf[1024]; - char dll_name[JVM_MAXPATHLEN]; - if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) { - libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf); - } - if (libsleef == nullptr) { - log_info(library)("Failed to load native vector math (sleef) library, %s!", ebuf); - return; - } - - // Method naming convention - // All the methods are named as _ - // - // Where: - // is the operation name, e.g. sin, cos - // is to indicate float/double - // "fx/dx" for vector float/double operation - // is the precision level - // "u10/u05" represents 1.0/0.5 ULP error bounds - // We use "u10" for all operations by default - // But for those functions do not have u10 support, we use "u05" instead - // rvv, indicates riscv vector extension - // - // e.g. sinfx_u10rvv is the method for computing vector float sin using rvv instructions - // - log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef)); - - for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { - int vop = VectorSupport::VECTOR_OP_MATH_START + op; - if (vop == VectorSupport::VECTOR_OP_TANH) { // skip tanh because of performance regression - continue; - } - - // The native library does not support u10 level of "hypot". - const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; - - snprintf(ebuf, sizeof(ebuf), "%sfx_%srvv", VectorSupport::mathname[op], ulf); - StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); - - snprintf(ebuf, sizeof(ebuf), "%sdx_%srvv", VectorSupport::mathname[op], ulf); - StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); - } - } - address generate_arrays_hashcode_powers_of_31() { const int num_8b_elems_in_vec = MaxVectorSize; __ align(CodeEntryAlignment); From a64dc26e56b464f77fd558c8f2921db2462bde3d Mon Sep 17 00:00:00 2001 From: Yuri Gaevsky Date: Mon, 5 May 2025 11:15:46 +0300 Subject: [PATCH 5/6] reorder instructions to make RVV instructions contiguous --- src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp index 5a4672c3b554c..5cefd7dc9078c 100644 --- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp @@ -2050,19 +2050,19 @@ void C2_MacroAssembler::arrays_hashcode_v(Register ary, Register cnt, Register r // load pre-calculated powers of 31: // 31^^MaxVectorSize ==> scalar register // 31^^(MaxVectorSize-1)...31^^0 ==> vector registers - la(pows31, ExternalAddress(adr_pows31)); + la(tmp3, ExternalAddress(adr_pows31)); + lw(pows31, Address(tmp3, -1 * sizeof(jint))); mv(t1, nof_vec_elems); vsetvli(t0, t1, Assembler::e32, Assembler::m4); - vle32_v(v_coeffs, pows31); - lw(pows31, Address(pows31, -1 * sizeof(jint))); + vle32_v(v_coeffs, tmp3); // clear vector registers used in intermediate calculations vmv_v_i(v_sum, 0); vmv_v_i(v_powmax, 0); vmv_v_i(v_result, 0); - vmv_s_x(v_zred, x0); // set initial values vmv_s_x(v_powmax, pows31); vmv_s_x(v_result, result); + vmv_s_x(v_zred, x0); bind(VEC_LOOP); vmul_vv(v_result, v_result, v_powmax); From 4e9ad18fc42787596d9f7109b9940869530e16d9 Mon Sep 17 00:00:00 2001 From: Yuri Gaevsky Date: Mon, 5 May 2025 13:13:24 +0300 Subject: [PATCH 6/6] change slli+add sequence to shadd --- src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp index 5cefd7dc9078c..41bbda189525c 100644 --- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp @@ -2084,8 +2084,7 @@ void C2_MacroAssembler::arrays_hashcode_v(Register ary, Register cnt, Register r mv(pow31_3, 29791); // [31^^3] mv(pow31_2, 961); // [31^^2] - slli(chunks_end, chunks, elsize_shift); - add(chunks_end, ary, chunks_end); + shadd(chunks_end, chunks, ary, chunks, elsize_shift); andi(cnt, cnt, scalar_stride-1); // don't forget about tail! bind(WIDE_LOOP); @@ -2108,8 +2107,7 @@ void C2_MacroAssembler::arrays_hashcode_v(Register ary, Register cnt, Register r beqz(cnt, DONE); bind(TAIL); - slli(chunks_end, cnt, elsize_shift); - add(chunks_end, ary, chunks_end); + shadd(chunks_end, cnt, ary, chunks, elsize_shift); bind(TAIL_LOOP); arrays_hashcode_elload(t0, Address(ary), eltype);