4559 offsetInt -= 8; std(R28, offsetInt, R1_SP);
4560 offsetInt -= 8; std(R29, offsetInt, R1_SP);
4561 offsetInt -= 8; std(R30, offsetInt, R1_SP);
4562 offsetInt -= 8; std(R31, offsetInt, R1_SP);
4563
4564 // Set constants
4565 li(off16, 16);
4566 li(off32, 32);
4567 li(off48, 48);
4568 li(off64, 64);
4569 li(off80, 80);
4570 li(off96, 96);
4571 li(off112, 112);
4572
4573 clrldi(crc, crc, 32);
4574
4575 vxor(zeroes, zeroes, zeroes);
4576 vspltisw(VR0, -1);
4577
4578 vsldoi(mask_32bit, zeroes, VR0, 4);
4579 vsldoi(mask_64bit, zeroes, VR0, -8);
4580
4581 // Get the initial value into v8
4582 vxor(VR8, VR8, VR8);
4583 mtvrd(VR8, crc);
4584 vsldoi(VR8, zeroes, VR8, -8); // shift into bottom 32 bits
4585
4586 li (rLoaded, 0);
4587
4588 rldicr(rIdx, len, 0, 56);
4589
4590 {
4591 BIND(L_1);
4592 // Checksum in blocks of MAX_SIZE (32768)
4593 lis(rMax, 0);
4594 ori(rMax, rMax, 32768);
4595 mr(rTmp2, rMax);
4596 cmpd(CCR0, rIdx, rMax);
4597 bgt(CCR0, L_2);
4598 mr(rMax, rIdx);
4599
4600 BIND(L_2);
4601 subf(rIdx, rMax, rIdx);
4602
4603 // our main loop does 128 bytes at a time
4604 srdi(rMax, rMax, 7);
4913
4914 BIND(L_first_warm_up_done);
4915 lvx(const1, constantsPos);
4916 addi(constantsPos, constantsPos, 16);
4917 vpmsumd(VR8, VR16, const1);
4918 vpmsumd(VR9, VR17, const1);
4919 vpmsumd(VR10, VR18, const1);
4920 vpmsumd(VR11, VR19, const1);
4921 vpmsumd(VR12, VR20, const1);
4922 vpmsumd(VR13, VR21, const1);
4923 vpmsumd(VR14, VR22, const1);
4924 vpmsumd(VR15, VR23, const1);
4925 b(L_second_cool_down);
4926
4927 BIND(L_barrett_reduction);
4928
4929 lvx(const1, barretConstants);
4930 addi(barretConstants, barretConstants, 16);
4931 lvx(const2, barretConstants);
4932
4933 vsldoi(VR1, VR0, VR0, -8);
4934 vxor(VR0, VR0, VR1); // xor two 64 bit results together
4935
4936 // shift left one bit
4937 vspltisb(VR1, 1);
4938 vsl(VR0, VR0, VR1);
4939
4940 vand(VR0, VR0, mask_64bit);
4941
4942 /*
4943 * The reflected version of Barrett reduction. Instead of bit
4944 * reflecting our data (which is expensive to do), we bit reflect our
4945 * constants and our algorithm, which means the intermediate data in
4946 * our vector registers goes from 0-63 instead of 63-0. We can reflect
4947 * the algorithm because we don't carry in mod 2 arithmetic.
4948 */
4949 vand(VR1, VR0, mask_32bit); // bottom 32 bits of a
4950 vpmsumd(VR1, VR1, const1); // ma
4951 vand(VR1, VR1, mask_32bit); // bottom 32bits of ma
4952 vpmsumd(VR1, VR1, const2); // qn */
4953 vxor(VR0, VR0, VR1); // a - qn, subtraction is xor in GF(2)
|
4559 offsetInt -= 8; std(R28, offsetInt, R1_SP);
4560 offsetInt -= 8; std(R29, offsetInt, R1_SP);
4561 offsetInt -= 8; std(R30, offsetInt, R1_SP);
4562 offsetInt -= 8; std(R31, offsetInt, R1_SP);
4563
4564 // Set constants
4565 li(off16, 16);
4566 li(off32, 32);
4567 li(off48, 48);
4568 li(off64, 64);
4569 li(off80, 80);
4570 li(off96, 96);
4571 li(off112, 112);
4572
4573 clrldi(crc, crc, 32);
4574
4575 vxor(zeroes, zeroes, zeroes);
4576 vspltisw(VR0, -1);
4577
4578 vsldoi(mask_32bit, zeroes, VR0, 4);
4579 vsldoi(mask_64bit, zeroes, VR0, 8);
4580
4581 // Get the initial value into v8
4582 vxor(VR8, VR8, VR8);
4583 mtvrd(VR8, crc);
4584 vsldoi(VR8, zeroes, VR8, 8); // shift into bottom 32 bits
4585
4586 li (rLoaded, 0);
4587
4588 rldicr(rIdx, len, 0, 56);
4589
4590 {
4591 BIND(L_1);
4592 // Checksum in blocks of MAX_SIZE (32768)
4593 lis(rMax, 0);
4594 ori(rMax, rMax, 32768);
4595 mr(rTmp2, rMax);
4596 cmpd(CCR0, rIdx, rMax);
4597 bgt(CCR0, L_2);
4598 mr(rMax, rIdx);
4599
4600 BIND(L_2);
4601 subf(rIdx, rMax, rIdx);
4602
4603 // our main loop does 128 bytes at a time
4604 srdi(rMax, rMax, 7);
4913
4914 BIND(L_first_warm_up_done);
4915 lvx(const1, constantsPos);
4916 addi(constantsPos, constantsPos, 16);
4917 vpmsumd(VR8, VR16, const1);
4918 vpmsumd(VR9, VR17, const1);
4919 vpmsumd(VR10, VR18, const1);
4920 vpmsumd(VR11, VR19, const1);
4921 vpmsumd(VR12, VR20, const1);
4922 vpmsumd(VR13, VR21, const1);
4923 vpmsumd(VR14, VR22, const1);
4924 vpmsumd(VR15, VR23, const1);
4925 b(L_second_cool_down);
4926
4927 BIND(L_barrett_reduction);
4928
4929 lvx(const1, barretConstants);
4930 addi(barretConstants, barretConstants, 16);
4931 lvx(const2, barretConstants);
4932
4933 vsldoi(VR1, VR0, VR0, 8);
4934 vxor(VR0, VR0, VR1); // xor two 64 bit results together
4935
4936 // shift left one bit
4937 vspltisb(VR1, 1);
4938 vsl(VR0, VR0, VR1);
4939
4940 vand(VR0, VR0, mask_64bit);
4941
4942 /*
4943 * The reflected version of Barrett reduction. Instead of bit
4944 * reflecting our data (which is expensive to do), we bit reflect our
4945 * constants and our algorithm, which means the intermediate data in
4946 * our vector registers goes from 0-63 instead of 63-0. We can reflect
4947 * the algorithm because we don't carry in mod 2 arithmetic.
4948 */
4949 vand(VR1, VR0, mask_32bit); // bottom 32 bits of a
4950 vpmsumd(VR1, VR1, const1); // ma
4951 vand(VR1, VR1, mask_32bit); // bottom 32bits of ma
4952 vpmsumd(VR1, VR1, const2); // qn */
4953 vxor(VR0, VR0, VR1); // a - qn, subtraction is xor in GF(2)
|