< prev index next >
src/cpu/ppc/vm/macroAssembler_ppc.cpp
Print this page
rev 13389 : PPC: Implement MulAdd and SquareToLen intrinsics
This implementation is based on the algorithm implemented in java. It
yields a performance speedup of:
JDK8 - 23%
JDK9 - 5%
JDK10 - 5%
*** 5226,5235 ****
--- 5226,5269 ----
mr_if_needed(carry, product);
bind(L_post_third_loop_done);
} // multiply_128_x_128_loop
+ void MacroAssembler::muladd(Register out, Register in,
+ Register offset, Register len, Register k,
+ Register tmp1, Register tmp2, Register carry) {
+
+ // Labels
+ Label LOOP, SKIP;
+
+ // Make sure length is positive.
+ cmpdi (CCR0, len, 0);
+
+ // Prepare variables
+ subi (offset, offset, 4);
+ li (carry, 0);
+ ble (CCR0, SKIP);
+
+ mtctr (len);
+ subi (len, len, 1 );
+ sldi (len, len, 2 );
+
+ // Main loop
+ bind(LOOP);
+ lwzx (tmp1, len, in );
+ lwzx (tmp2, offset, out );
+ mulld (tmp1, tmp1, k );
+ add (tmp2, carry, tmp2 );
+ add (tmp2, tmp1, tmp2 );
+ stwx (tmp2, offset, out );
+ srdi (carry, tmp2, 32 );
+ subi (offset, offset, 4 );
+ subi (len, len, 4 );
+ bdnz (LOOP);
+ bind(SKIP);
+ }
+
void MacroAssembler::multiply_to_len(Register x, Register xlen,
Register y, Register ylen,
Register z, Register zlen,
Register tmp1, Register tmp2,
Register tmp3, Register tmp4,
< prev index next >