
src/cpu/ppc/vm/macroAssembler_ppc.cpp

rev 13389 : Fix vsldoi interface according to ISA

vsldoi takes a 4-bit unsigned immediate (the SHB field) according to the
Power ISA, but OpenJDK had declared the operand as a 5-bit signed integer.

Changed not only the interface but also all related code that relied on
the signed interpretation (hence -8 is now 8).
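Why the old code worked at all: SHB occupies a 4-bit field in the VA-form
encoding, so a signed -8 was truncated to the same bit pattern as +8 and the
emitted instruction never changed. A minimal sketch of that truncation (the
masking below models the field encoding, not OpenJDK's actual argument
checks):

#include <cassert>
#include <cstdint>

int main() {
  // Only the low 4 bits of the operand reach the SHB field,
  // so -8 and 8 yield identical instruction encodings.
  int old_arg = -8;
  uint32_t shb = static_cast<uint32_t>(old_arg) & 0xF;
  assert(shb == 8u);
  return 0;
}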


--- old/src/cpu/ppc/vm/macroAssembler_ppc.cpp

4559   offsetInt -= 8; std(R28, offsetInt, R1_SP);
4560   offsetInt -= 8; std(R29, offsetInt, R1_SP);
4561   offsetInt -= 8; std(R30, offsetInt, R1_SP);
4562   offsetInt -= 8; std(R31, offsetInt, R1_SP);
4563 
4564   // Set constants
4565   li(off16, 16);
4566   li(off32, 32);
4567   li(off48, 48);
4568   li(off64, 64);
4569   li(off80, 80);
4570   li(off96, 96);
4571   li(off112, 112);
4572 
4573   clrldi(crc, crc, 32);
4574 
4575   vxor(zeroes, zeroes, zeroes);
4576   vspltisw(VR0, -1);
4577 
4578   vsldoi(mask_32bit, zeroes, VR0, 4);
4579   vsldoi(mask_64bit, zeroes, VR0, -8);
4580 
4581   // Get the initial value into v8
4582   vxor(VR8, VR8, VR8);
4583   mtvrd(VR8, crc);
4584   vsldoi(VR8, zeroes, VR8, -8); // shift into bottom 32 bits
4585 
4586   li (rLoaded, 0);
4587 
4588   rldicr(rIdx, len, 0, 56);
4589 
4590   {
4591     BIND(L_1);
4592     // Checksum in blocks of MAX_SIZE (32768)
4593     lis(rMax, 0);
4594     ori(rMax, rMax, 32768);
4595     mr(rTmp2, rMax);
4596     cmpd(CCR0, rIdx, rMax);
4597     bgt(CCR0, L_2);
4598     mr(rMax, rIdx);
4599 
4600     BIND(L_2);
4601     subf(rIdx, rMax, rIdx);
4602 
4603     // our main loop does 128 bytes at a time
4604     srdi(rMax, rMax, 7);


4913 
4914   BIND(L_first_warm_up_done);
4915   lvx(const1, constantsPos);
4916   addi(constantsPos, constantsPos, 16);
4917   vpmsumd(VR8,  VR16, const1);
4918   vpmsumd(VR9,  VR17, const1);
4919   vpmsumd(VR10, VR18, const1);
4920   vpmsumd(VR11, VR19, const1);
4921   vpmsumd(VR12, VR20, const1);
4922   vpmsumd(VR13, VR21, const1);
4923   vpmsumd(VR14, VR22, const1);
4924   vpmsumd(VR15, VR23, const1);
4925   b(L_second_cool_down);
4926 
4927   BIND(L_barrett_reduction);
4928 
4929   lvx(const1, barretConstants);
4930   addi(barretConstants, barretConstants, 16);
4931   lvx(const2, barretConstants);
4932 
4933   vsldoi(VR1, VR0, VR0, -8);
4934   vxor(VR0, VR0, VR1);    // xor two 64 bit results together
4935 
4936   // shift left one bit
4937   vspltisb(VR1, 1);
4938   vsl(VR0, VR0, VR1);
4939 
4940   vand(VR0, VR0, mask_64bit);
4941 
4942   /*
4943    * The reflected version of Barrett reduction. Instead of bit
4944    * reflecting our data (which is expensive to do), we bit reflect our
4945    * constants and our algorithm, which means the intermediate data in
4946    * our vector registers goes from 0-63 instead of 63-0. We can reflect
4947    * the algorithm because we don't carry in mod 2 arithmetic.
4948    */
4949   vand(VR1, VR0, mask_32bit);  // bottom 32 bits of a
4950   vpmsumd(VR1, VR1, const1);   // ma
4951   vand(VR1, VR1, mask_32bit);  // bottom 32bits of ma
4952   vpmsumd(VR1, VR1, const2);   // qn
4953   vxor(VR0, VR0, VR1);         // a - qn, subtraction is xor in GF(2)




+++ new/src/cpu/ppc/vm/macroAssembler_ppc.cpp

4559   offsetInt -= 8; std(R28, offsetInt, R1_SP);
4560   offsetInt -= 8; std(R29, offsetInt, R1_SP);
4561   offsetInt -= 8; std(R30, offsetInt, R1_SP);
4562   offsetInt -= 8; std(R31, offsetInt, R1_SP);
4563 
4564   // Set constants
4565   li(off16, 16);
4566   li(off32, 32);
4567   li(off48, 48);
4568   li(off64, 64);
4569   li(off80, 80);
4570   li(off96, 96);
4571   li(off112, 112);
4572 
4573   clrldi(crc, crc, 32);
4574 
4575   vxor(zeroes, zeroes, zeroes);
4576   vspltisw(VR0, -1);
4577 
4578   vsldoi(mask_32bit, zeroes, VR0, 4);
4579   vsldoi(mask_64bit, zeroes, VR0, 8);
4580 
4581   // Get the initial value into v8
4582   vxor(VR8, VR8, VR8);
4583   mtvrd(VR8, crc);
4584   vsldoi(VR8, zeroes, VR8, 8); // shift into bottom 32 bits
4585 
4586   li (rLoaded, 0);
4587 
4588   rldicr(rIdx, len, 0, 56);
4589 
4590   {
4591     BIND(L_1);
4592     // Checksum in blocks of MAX_SIZE (32768)
4593     lis(rMax, 0);
4594     ori(rMax, rMax, 32768);
4595     mr(rTmp2, rMax);
4596     cmpd(CCR0, rIdx, rMax);
4597     bgt(CCR0, L_2);
4598     mr(rMax, rIdx);
4599 
4600     BIND(L_2);
4601     subf(rIdx, rMax, rIdx);
4602 
4603     // our main loop does 128 bytes at a time
4604     srdi(rMax, rMax, 7);


4913 
4914   BIND(L_first_warm_up_done);
4915   lvx(const1, constantsPos);
4916   addi(constantsPos, constantsPos, 16);
4917   vpmsumd(VR8,  VR16, const1);
4918   vpmsumd(VR9,  VR17, const1);
4919   vpmsumd(VR10, VR18, const1);
4920   vpmsumd(VR11, VR19, const1);
4921   vpmsumd(VR12, VR20, const1);
4922   vpmsumd(VR13, VR21, const1);
4923   vpmsumd(VR14, VR22, const1);
4924   vpmsumd(VR15, VR23, const1);
4925   b(L_second_cool_down);
4926 
4927   BIND(L_barrett_reduction);
4928 
4929   lvx(const1, barretConstants);
4930   addi(barretConstants, barretConstants, 16);
4931   lvx(const2, barretConstants);
4932 
4933   vsldoi(VR1, VR0, VR0, 8);
4934   vxor(VR0, VR0, VR1);    // xor two 64 bit results together
4935 
4936   // shift left one bit
4937   vspltisb(VR1, 1);
4938   vsl(VR0, VR0, VR1);
4939 
4940   vand(VR0, VR0, mask_64bit);
4941 
4942   /*
4943    * The reflected version of Barrett reduction. Instead of bit
4944    * reflecting our data (which is expensive to do), we bit reflect our
4945    * constants and our algorithm, which means the intermediate data in
4946    * our vector registers goes from 0-63 instead of 63-0. We can reflect
4947    * the algorithm because we don't carry in mod 2 arithmetic.
4948    */
4949   vand(VR1, VR0, mask_32bit);  // bottom 32 bits of a
4950   vpmsumd(VR1, VR1, const1);   // ma
4951   vand(VR1, VR1, mask_32bit);  // bottom 32bits of ma
4952   vpmsumd(VR1, VR1, const2);   // qn
4953   vxor(VR0, VR0, VR1);         // a - qn, subtraction is xor in GF(2)
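A scalar model of the five instructions after the comment block may help.
clmul_lo is a hypothetical software carry-less multiply, and m and p stand
for the two constants loaded from barretConstants (their values are table
data and are not reproduced here):

#include <cstdint>

// Software carry-less (GF(2)) multiply, keeping the low 64 bits.
static uint64_t clmul_lo(uint64_t a, uint64_t b) {
  uint64_t r = 0;
  for (int i = 0; i < 64; i++) {
    if ((b >> i) & 1) r ^= a << i;
  }
  return r;
}

// Mirrors the vand/vpmsumd/vand/vpmsumd/vxor sequence above.
uint32_t barrett_reflected(uint64_t a, uint64_t m, uint64_t p) {
  uint64_t v = a & 0xFFFFFFFFull;  // bottom 32 bits of a
  v = clmul_lo(v, m);              // ma
  v &= 0xFFFFFFFFull;              // bottom 32 bits of ma
  v = clmul_lo(v, p);              // qn
  v ^= a;                          // a - qn, subtraction is xor in GF(2)
  return static_cast<uint32_t>(v >> 32); // remainder sits in the upper half
}

Because the algorithm works in the bit-reflected domain, the low 32 bits of
a and qn agree and cancel in the final xor, leaving the 32-bit remainder in
bits 32..63.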

