
src/cpu/ppc/vm/macroAssembler_ppc.cpp

rev 13389 : PPC: Implement MulAdd and SquareToLen intrinsics

This implementation is based on the algorithm implemented in Java. It
yields the following performance speedups:
JDK8 - 23%
JDK9 - 5%
JDK10 - 5%
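The Java algorithm in question is BigInteger's mulAdd step: multiply a run of 32-bit words by a single 32-bit factor k, add the products into the destination array, and propagate a 32-bit carry from word to word. A minimal C++ sketch of that scalar step, with an illustrative name and simplified offset handling (neither taken from the JDK sources), looks like this:

  #include <cstdint>

  // Hypothetical scalar reference for the mulAdd step: multiply 'len' 32-bit
  // words of 'in' by 'k', add the products into 'out' starting at word index
  // 'offset', and return the final carry. Word order follows BigInteger:
  // index 0 is the most significant word, so the loop starts at the least
  // significant word of the window and the indices decrease from there.
  static uint32_t mul_add_ref(uint32_t* out, const uint32_t* in,
                              int offset, int len, uint32_t k) {
    uint64_t carry = 0;
    for (int j = len - 1; j >= 0; j--) {
      uint64_t sum = (uint64_t)in[j] * k + out[offset] + carry;
      out[offset--] = (uint32_t)sum;  // store the low 32 bits
      carry = sum >> 32;              // keep the high 32 bits as the carry
    }
    return (uint32_t)carry;
  }

The muladd() routine added below emits a PPC64 loop with the same structure, one 32-bit word per iteration.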


5211   blt(CCR0, L_post_third_loop_done);
5212 
5213   sldi(tmp, idx, LogBytesPerInt);
5214   lwzx(yz_idx, y, tmp);
5215   multiply64(product_high, product, x_xstart, yz_idx);
5216   lwzx(yz_idx, z, tmp);
5217 
5218   add2_with_carry(product_high, product, yz_idx, carry);
5219 
5220   sldi(tmp, idx, LogBytesPerInt);
5221   stwx(product, z, tmp);
5222   srdi(product, product, 32);
5223 
5224   sldi(product_high, product_high, 32);
5225   orr(product, product, product_high);
5226   mr_if_needed(carry, product);
5227 
5228   bind(L_post_third_loop_done);
5229 }   // multiply_128_x_128_loop
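For the final odd 32-bit word of y, the tail above forms the up-to-96-bit product of the 64-bit x_xstart and the 32-bit y word, adds the matching z word and the running carry, writes the low 32 bits back into z, and keeps everything above bit 31 as the new carry. As plain arithmetic (a sketch using GCC's unsigned __int128; the names mirror the registers above):

  #include <cstdint>

  // Sketch of the last-word step; unsigned __int128 stands in for the
  // product_high:product register pair used above.
  static inline void last_word_step(uint64_t x_xstart, const uint32_t* y,
                                    uint32_t* z, int idx, uint64_t& carry) {
    unsigned __int128 acc = (unsigned __int128)x_xstart * y[idx]; // up to 96 bits
    acc += z[idx];                                                // add existing z word
    acc += carry;                                                 // add running carry
    z[idx] = (uint32_t)acc;                                       // store low 32 bits
    carry  = (uint64_t)(acc >> 32);                               // rest becomes the carry
  }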
5230 
5231 void MacroAssembler::muladd(Register out, Register in,
5232                             Register offset, Register len, Register k,
5233                             Register tmp1, Register tmp2, Register carry) {
5234 
5235   // Labels
5236   Label LOOP, SKIP;
5237
5238   // Make sure length is positive.
5239   cmpdi  (CCR0,    len,     0);
5240
5241   // Prepare variables
5242   subi   (offset,  offset,  4);
5243   li     (carry,   0);              // no incoming carry
5244   ble    (CCR0,    SKIP);
5245
5246   mtctr  (len);                     // CTR drives the loop count
5247   subi   (len,     len,     1    );
5248   sldi   (len,     len,     2    ); // len = byte offset of the last in word
5249
5250   // Main loop
5251   bind(LOOP);
5252   lwzx   (tmp1,    len,     in   ); // load next 32-bit in word (zero-extended)
5253   lwzx   (tmp2,    offset,  out  ); // load matching 32-bit out word
5254   mulld  (tmp1,    tmp1,    k    ); // 64-bit product: in word * k
5255   add    (tmp2,    carry,   tmp2 ); // add the running carry
5256   add    (tmp2,    tmp1,    tmp2 ); // add the product (sum fits in 64 bits)
5257   stwx   (tmp2,    offset,  out  ); // store the low 32 bits back to out
5258   srdi   (carry,   tmp2,    32   ); // high 32 bits become the next carry
5259   subi   (offset,  offset,  4    ); // step to the next out word
5260   subi   (len,     len,     4    ); // step to the next in word
5261   bdnz   (LOOP);
5262   bind(SKIP);
5263 }
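The emitted loop maps directly onto the scalar mulAdd reference above: len is converted to the byte offset of the last in word (sldi), offset is expected to arrive as a byte offset into out, and both are stepped down by 4 bytes each pass while CTR (mtctr/bdnz) carries the iteration count. mulld on the zero-extended 32-bit in word and k yields the full 64-bit product, assuming the caller passes k zero-extended to 64 bits. Since product, out word, and carry together never exceed 2^64 - 1, the plain 64-bit adds cannot overflow, and srdi by 32 leaves exactly the next 32-bit carry.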
5264 
5265 void MacroAssembler::multiply_to_len(Register x, Register xlen,
5266                                      Register y, Register ylen,
5267                                      Register z, Register zlen,
5268                                      Register tmp1, Register tmp2,
5269                                      Register tmp3, Register tmp4,
5270                                      Register tmp5, Register tmp6,
5271                                      Register tmp7, Register tmp8,
5272                                      Register tmp9, Register tmp10,
5273                                      Register tmp11, Register tmp12,
5274                                      Register tmp13) {
5275 
5276   ShortBranchVerifier sbv(this);
5277 
5278   assert_different_registers(x, xlen, y, ylen, z, zlen,
5279                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
5280   assert_different_registers(x, xlen, y, ylen, z, zlen,
5281                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
5282   assert_different_registers(x, xlen, y, ylen, z, zlen,
5283                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
5284 

