// Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_ppc.hpp"

/**********************************************************************
 * SHA 256
 *********************************************************************/

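// Produce the three non-trivial rotations of src (by 12, 8 and 4 bytes).
// Together with src itself, this exposes each of the four k+w words
// packed in src to the per-round additions, one register per round.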
void MacroAssembler::sha256_deque(const VectorRegister& src,
                                  const VectorRegister& dst1,
                                  const VectorRegister& dst2,
                                  const VectorRegister& dst3) {
  vsldoi (dst1, src, src, 12);
  vsldoi (dst2, src, src, 8);
  vsldoi (dst3, src, src, 4);
}

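// One SHA-256 round (FIPS 180-4):
//   T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]  (K[t] + W[t] arrives
//                                                  precomputed in kpw)
//   T2 = Sigma0(a) + Maj(a,b,c)
//   d += T1;  h = T1 + T2
// vshasigmaw computes the SHA-256 sigma functions: the third operand
// selects lowercase sigma (0) or uppercase Sigma (1), the fourth picks
// variant 0 or 1 per word (0x0 = all variant 0, 0xf = all variant 1).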
void MacroAssembler::sha256_round(const VectorRegister* hs,
                                  const int total_hs,
                                  int& h_cnt,
                                  const VectorRegister& kpw) {
  // convenience registers: the a..h roles rotate backwards through hs[]
  // as h_cnt grows, so no data ever moves between registers
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  VectorRegister ch  = VR0;
  VectorRegister maj = VR1;
  VectorRegister bsa = VR2;
  VectorRegister bse = VR3;
  VectorRegister vt0 = VR4;
  VectorRegister vt1 = VR5;
  VectorRegister vt2 = VR6;
  VectorRegister vt3 = VR7;

  vsel       (ch,  g,   f, e);
  vxor       (maj, a,   b);
  vshasigmaw (bse, e,   1, 0xf);
  vadduwm    (vt2, ch,  kpw);
  vadduwm    (vt1, h,   bse);
  vsel       (maj, b,   c, maj);
  vadduwm    (vt3, vt1, vt2);
  vshasigmaw (bsa, a,   1, 0);
  vadduwm    (vt0, bsa, maj);

  vadduwm    (d,   d,   vt3);
  vadduwm    (h,   vt3, vt0);

  // advance vector pointer to the next iteration
  h_cnt++;
}

void MacroAssembler::sha256_load_h_vec(const VectorRegister& a,
                                       const VectorRegister& b,
                                       const VectorRegister& c,
                                       const VectorRegister& d,
                                       const VectorRegister& e,
                                       const VectorRegister& f,
                                       const VectorRegister& g,
                                       const VectorRegister& h,
                                       const Register& hptr) {
  // temporaries
  Register tmp = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vRb = VR6;
  // labels
  Label sha256_aligned, sha256_load_end;

  andi_  (tmp,  hptr, 0xf);
  addi   (tmp,  hptr, 16);
  beq    (CCR0, sha256_aligned);

  // handle unaligned accesses
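  // lvx ignores the low four address bits, so a potentially unaligned
  // state is read as consecutive aligned quadwords and re-assembled
  // with vperm, using the shuffle mask that lvsr derives from the low
  // bits of hptr. The same idiom recurs in the loaders below.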
  lvx    (a,    hptr);
  lvsr   (vRb,  hptr);

  lvx    (e,    tmp);
  addi   (tmp,  tmp,  16);
  vperm  (a,    e,    a, vRb);

  lvx    (vt0,  tmp);
  vperm  (e,    vt0,  e, vRb);
  this->b (sha256_load_end);

  // aligned accesses
  bind(sha256_aligned);
  lvx    (a,    hptr);
  addi   (tmp,  hptr, 16);
  lvx    (e,    tmp);

  bind(sha256_load_end);
}

void MacroAssembler::sha256_load_w_plus_k_vec(const Register& buf_in,
                                              const VectorRegister* ws,
                                              const int total_ws,
                                              const Register& k,
                                              const VectorRegister* kpws,
                                              const int total_kpws) {
  Label w_aligned, after_w_load;

  Register tmp       = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vt1 = VR1;
  VectorRegister vRb = VR6;

  andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, w_aligned); // branch if buf_in is 16-byte aligned

  // deal with unaligned addresses
  lvx    (ws[0], buf_in);
  addi   (buf_in, buf_in, 16);
  lvsl   (vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur = ws[n];
    VectorRegister w_prev = ws[n-1];

    lvx  (w_cur, buf_in);
    addi (buf_in, buf_in, 16);
    vperm(w_prev, w_cur, w_prev, vRb);
  }

  lvx    (vt0, buf_in);
  vperm  (ws[total_ws-1], vt0, ws[total_ws-1], vRb);

  this->b(after_w_load);

  bind(w_aligned);

  // deal with aligned addresses
  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    lvx  (w, buf_in);
    addi (buf_in, buf_in, 16);
  }

  bind(after_w_load);

  // Byte swapping on little endian
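  // Build the byte-reversal mask without touching memory: lvsl with an
  // address of 8 yields the byte indices {8, 9, ..., 23}; xor-ing each
  // with 0x0b maps them to {3,2,1,0, 7,6,5,4, ...}, i.e. a vperm control
  // vector that reverses the bytes inside every 4-byte word.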
  li       (tmp, 8);
  lvsl     (vt0, tmp);
  vspltisb (vt1, 0xb);
  vxor     (vt1, vt0, vt1);
  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];
    vperm  (w,   w,   w,   vt1);
  }

  // Loading k, which is always 16-byte aligned
  lvx    (kpws[0], k);
  addi   (tmp, k, 16);
  for (int n = 1; n < total_kpws-1; n++) {
    VectorRegister kpw = kpws[n];

    lvx  (kpw, tmp);
    addi (tmp, tmp, 16);
  }
  lvx  (kpws[total_kpws-1], tmp);

  // Add w to K
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_kpws; n++) {
    VectorRegister kpw = kpws[n];
    VectorRegister w   = ws[n];

    vadduwm  (kpw, kpw, w);
  }
}

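// SHA-256 message schedule (FIPS 180-4), computed four words per call:
//   w[j] = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16]
// The upper two lanes depend on w[j] and w[j+1] produced in the same
// step, which is why s1 is applied a second time below.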
void MacroAssembler::sha256_calc_4w(const VectorRegister& w0,
                                    const VectorRegister& w1,
                                    const VectorRegister& w2,
                                    const VectorRegister& w3,
                                    const VectorRegister& kpw0,
                                    const VectorRegister& kpw1,
                                    const VectorRegister& kpw2,
                                    const VectorRegister& kpw3,
                                    const Register& j,
                                    const Register& k) {
  // Temporaries
  const VectorRegister  vt0   = VR0;
  const VectorRegister  vt1   = VR1;
  const VectorSRegister vsrt1 = vt1->to_vsr();
  const VectorRegister  vt2   = VR2;
  const VectorRegister  vt3   = VR3;
  const VectorSRegister vst3  = vt3->to_vsr();
  const VectorRegister  vt4   = VR4;

  // load to k[j]
  lvx        (vt0, j,   k);

  // advance j
  addi       (j,   j,   16); // 16 bytes were read

  // b = w[j-15], w[j-14], w[j-13], w[j-12]
  vsldoi     (vt1, w1,  w0, 12);

  // c = w[j-7], w[j-6], w[j-5], w[j-4]
  vsldoi     (vt2, w3,  w2, 12);

  // d = w[j-2], w[j-1], w[j-4], w[j-3]
  vsldoi     (vt3, w3,  w3, 8);

  // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
  vshasigmaw (vt1, vt1, 0,  0);

  // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
  vshasigmaw (vt3, vt3, 0,  0xf);

  // c = s0(w[j-15]) + w[j-7],
  //     s0(w[j-14]) + w[j-6],
  //     s0(w[j-13]) + w[j-5],
  //     s0(w[j-12]) + w[j-4]
  vadduwm    (vt2, vt1, vt2);

  // c = s0(w[j-15]) + w[j-7] + w[j-16],
  //     s0(w[j-14]) + w[j-6] + w[j-15],
  //     s0(w[j-13]) + w[j-5] + w[j-14],
  //     s0(w[j-12]) + w[j-4] + w[j-13]
  vadduwm    (vt2, vt2, w0);

  // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])  // UNDEFINED
  vadduwm    (vt4, vt2, vt3);

  // At this point, e[0] and e[1] are the correct values to be stored at w[j]
  // and w[j+1].
  // e[2] and e[3] are not considered.
  // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
  vshasigmaw (vt1, vt4, 0,  0xf);

  // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
  xxmrgld    (vst3, vsrt1, vst3);

  // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),   // w[j+2]
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])  // w[j+3]
  vadduwm    (vt2, vt2, vt3);

  // Updating w0 to w3 to hold the new previous 16 values from w.
  vmr        (w0,  w1);
  vmr        (w1,  w2);
  vmr        (w2,  w3);
  vmr        (w3,  vt2);

  // store k + w to kpw0 (4 values at once)
  vadduwm    (kpw0, vt2,  vt0);

  vsldoi     (kpw1, kpw0, kpw0, 12);
  vsldoi     (kpw2, kpw0, kpw0, 8);
  vsldoi     (kpw3, kpw0, kpw0, 4);
}

void MacroAssembler::sha256_update_sha_state(const VectorRegister& a,
                                             const VectorRegister& b,
                                             const VectorRegister& c,
                                             const VectorRegister& d,
                                             const VectorRegister& e,
                                             const VectorRegister& f,
                                             const VectorRegister& g,
                                             const VectorRegister& h,
                                             const Register& hptr) {
  // temporaries
  VectorRegister vt0  = VR0;
  VectorRegister vt1  = VR1;
  VectorRegister vt2  = VR2;
  VectorRegister vt3  = VR3;
  VectorRegister vt4  = VR4;
  VectorRegister vt5  = VR5;
  VectorRegister vaux = VR6;
  VectorRegister vRb  = VR6;
  Register tmp        = R8;
  Register of16       = R8;
  Register of32       = R9;
  Label state_load_aligned, after_state_load_aligned;

  // Load hptr
  andi_   (tmp, hptr, 0xf);
  li      (of16, 16);
  beq     (CCR0, state_load_aligned);

  // handle unaligned accesses
  li      (of32, 32);
  lvx     (vt0, hptr);
  lvsr    (vRb, hptr);

  lvx     (vt5, hptr, of16);
  vperm   (vt0, vt5, vt0, vRb);    // vt0 = hptr[0]..hptr[3]

  lvx     (vt1, hptr, of32);
  vperm   (vt5, vt1, vt5, vRb);    // vt5 = hptr[4]..hptr[7]
  this->b (after_state_load_aligned);

  // aligned accesses
  bind(state_load_aligned);
  lvx     (vt0, hptr);
  lvx     (vt5, of16, hptr);

  bind(after_state_load_aligned);

  vmrglw  (vt1, b, a);             // vt1 = {a, b, ?, ?}
  vmrglw  (vt2, d, c);             // vt2 = {c, d, ?, ?}
  vmrglw  (vt3, f, e);             // vt3 = {e, f, ?, ?}
  vmrglw  (vt4, h, g);             // vt4 = {g, h, ?, ?}
  xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d}
  xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h}
  vadduwm (a,   vt0, vt1);         // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
  vadduwm (e,   vt5, vt3);         // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}

  // Save hptr back, works for any alignment
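  // stxvd2x, unlike stvx, honors the low address bits, so no alignment
  // fix-up is needed; on little endian it stores the two doublewords
  // swapped relative to the register layout, which the preceding
  // xxswapd compensates for.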
  xxswapd (vt0->to_vsr(), a->to_vsr());
  stxvd2x (vt0->to_vsr(), hptr);
  xxswapd (vt5->to_vsr(), e->to_vsr());
  stxvd2x (vt5->to_vsr(), of16, hptr);
}


//   R3_ARG1   - byte[]  input, already padded, in big-endian byte order
//   R4_ARG2   - int[]   SHA.state (initially the SHA-256 IV: the fractional
//                       parts of the square roots of the first eight primes)
//   R5_ARG3   - int     offset
//   R6_ARG4   - int     limit
//
//   Internal Register usage:
//   R7        - k
//   R8        - tmp | j | of16
//   R9        - of32
//   VR0-VR7   - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb
//   VR9-VR16  - a-h
//   VR17-VR20 - w0-w3
//   VR21-VR23 - vRb | vaux0-vaux2
//   VR24-VR27 - kpw0-kpw3
void MacroAssembler::sha256(bool multi_block) {
  static const ssize_t base_size = sizeof(uint32_t);
  static const ssize_t buf_size = 64;
  static uint32_t waux[buf_size / base_size] __attribute__((aligned (16)));
  static const uint32_t round_consts[64] __attribute__((aligned (16))) = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
  };
  static const uint8_t w_size = sizeof(round_consts)/sizeof(uint32_t);

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, bsw_loop, core_loop;

  // Save non-volatile vector registers in the red zone
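  // The PPC64 ELF ABI guarantees a red zone below the stack pointer
  // (288 bytes in the ELFv2 ABI), so the 8 * 16 = 128 bytes used here
  // need no explicit stack frame.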
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li  (tmp, (c - nv_size) * 16);
    stvx(nv[c], tmp, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR9;
  VectorRegister b = VR10;
  VectorRegister c = VR11;
  VectorRegister d = VR12;
  VectorRegister e = VR13;
  VectorRegister f = VR14;
  VectorRegister g = VR15;
  VectorRegister h = VR16;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed by state
  sha256_load_h_vec(a, b, c, d, e, f, g, h, state);

  // keep k loaded also during multi-block loops
  Register k = R7;
  load_const(k, const_cast<uint32_t *>(round_consts));

  // Avoiding redundant loads
  bind(sha_loop);
  sha256_deque(a, b, c, d);
  sha256_deque(e, f, g, h);

  align(OptoLoopAlignment);

  // Load 16 elements from w outside the loop
  VectorRegister w0 = VR17;
  VectorRegister w1 = VR18;
  VectorRegister w2 = VR19;
  VectorRegister w3 = VR20;
  static const VectorRegister ws[] = {w0, w1, w2, w3};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  VectorRegister kpw0 = VR24;
  VectorRegister kpw1 = VR25;
  VectorRegister kpw2 = VR26;
  VectorRegister kpw3 = VR27;
  static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3};
  static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister);

  sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws);

  // Cycle through the first 16 elements
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_ws; n++) {
    VectorRegister vaux0 = VR21;
    VectorRegister vaux1 = VR22;
    VectorRegister vaux2 = VR23;

    sha256_deque(kpws[n], vaux0, vaux1, vaux2);

    sha256_round(hs, total_hs, h_cnt, kpws[n]);
    sha256_round(hs, total_hs, h_cnt, vaux0);
    sha256_round(hs, total_hs, h_cnt, vaux1);
    sha256_round(hs, total_hs, h_cnt, vaux2);
  }

  Register tmp = R8;
  // run rounds 16 to 63: 8 rounds per iteration, so (64 - 16) / 8 = 6
  // ctr iterations
  li   (tmp, (w_size - 16) / total_hs);
  mtctr(tmp);

  // j is the byte offset into the round-constant table; the first
  // 16 words (16 * 4 bytes) were consumed above. It advances inside
  // sha256_calc_4w as constants are read.
  Register j = R8;
  li   (j, 16*4);

  align(OptoLoopAlignment);
  bind(core_loop);

  // because the a..h register roles rotate, the loop body must run a
  // multiple of total_hs rounds
  for (int n = 0; n < total_hs/4; n++) {
    sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k);
    sha256_round(hs, total_hs, h_cnt, kpw0);
    sha256_round(hs, total_hs, h_cnt, kpw1);
    sha256_round(hs, total_hs, h_cnt, kpw2);
    sha256_round(hs, total_hs, h_cnt, kpw3);
  }

  bdnz   (core_loop);

  // Update hash state
  sha256_update_sha_state(a, b, c, d, e, f, g, h, state);

  if (multi_block) {
    // process next 512 bit block (buf_in already updated)
    addi(ofs, ofs, buf_size);
    cmpd(CCR0, ofs, limit);
    blt(CCR0, sha_loop);

    // return ofs
    mr(R3_ARG1, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li  (tmp, (c - nv_size) * 16);
    lvx(nv[c], tmp, R1);
  }
}

/**********************************************************************
 * SHA 512
 *********************************************************************/

void MacroAssembler::sha512_load_w_vec(const Register& buf_in,
                                       const VectorRegister* ws,
                                       const int total_ws) {
  Register tmp       = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;
  Label is_aligned, after_alignment;

  andi_  (tmp, buf_in, 0xF);
  beq    (CCR0, is_aligned); // branch if buf_in is 16-byte aligned

  // deal with unaligned addresses
  lvx    (ws[0], buf_in);
  addi   (buf_in, buf_in, 16);
  lvsl   (vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur = ws[n];
    VectorRegister w_prev = ws[n-1];

    lvx  (w_cur, buf_in);
    addi (buf_in, buf_in, 16);
    vperm(w_prev, w_cur, w_prev, vRb);
  }

  lvx    (aux, buf_in);
  vperm  (ws[total_ws-1], aux, ws[total_ws-1], vRb);

  this->b(after_alignment);

  bind(is_aligned);

  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    lvx  (w, buf_in);
    addi (buf_in, buf_in, 16);
  }

  bind(after_alignment);
}

// Update hash state
void MacroAssembler::sha512_update_sha_state(const Register& state,
                                             const VectorRegister* hs,
                                             const int total_hs) {

  // load initial hash from the memory pointed by state
  VectorRegister ini_a = VR10;
  VectorRegister ini_c = VR12;
  VectorRegister ini_e = VR14;
  VectorRegister ini_g = VR16;
  static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g};
  static const int total_inis = sizeof(inis)/sizeof(VectorRegister);

  Label state_save_aligned, after_state_save_aligned;

  Register addr      = R7;
  Register tmp       = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_save_aligned);
  // deal with unaligned addresses

  {
    VectorRegister a = hs[0];
    VectorRegister b = hs[1];
    VectorRegister c = hs[2];
    VectorRegister d = hs[3];
    VectorRegister e = hs[4];
    VectorRegister f = hs[5];
    VectorRegister g = hs[6];
    VectorRegister h = hs[7];
    lvsr   (vRb, state);
    lvx    (ini_a, state);
    addi   (addr, state, 16);

    lvx    (ini_c, addr);
    addi   (addr, addr, 16);
    vperm  (ini_a, ini_c, ini_a, vRb);

    lvx    (ini_e, addr);
    addi   (addr, addr, 16);
    vperm  (ini_c, ini_e, ini_c, vRb);

    lvx    (ini_g, addr);
    addi   (addr, addr, 16);
    vperm  (ini_e, ini_g, ini_e, vRb);

    lvx    (aux, addr);
    vperm  (ini_g, aux, ini_g, vRb);

    xxmrgld(a->to_vsr(), b->to_vsr(), a->to_vsr());
    xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr());
    xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr());
    xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr());

    for (int n = 0; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

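    // Unlike loads, unaligned vector stores cannot be patched up with
    // vperm, so the summed state leaves through GPRs: mfvrd copies one
    // doubleword of the VR, and std stores it at any alignment.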
    for (int n = 0; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      mfvrd  (tmp, h_cur);
      std    (tmp, 8*n + 8, state);
      vsldoi (aux, h_cur, h_cur, 8);
      mfvrd  (tmp, aux);
      std    (tmp, 8*n + 0, state);
    }

    this->b(after_state_save_aligned);
  }

  bind(state_save_aligned);

  {
    mr(addr, state);
    for (int n = 0; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];
      VectorRegister h_next = hs[n+1];
      VectorRegister ini_cur = inis[n/2];

      lvx(ini_cur, addr);
      addi(addr, addr, 16);
      xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr());
    }

    for (int n = 0; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    mr(addr, state);
    for (int n = 0; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      stvx(h_cur, addr);
      addi(addr, addr, 16);
    }
  }

  bind(after_state_save_aligned);
}

// Use h_cnt to cycle through hs elements but also increment it at the end
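// Same round function as sha256_round, on 64-bit words:
//   T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]  (kpw = K[t] + W[t])
//   T2 = Sigma0(a) + Maj(a,b,c)
//   d += T1;  h = T1 + T2
// vshasigmad is the doubleword (SHA-512) variant of vshasigmaw.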
void MacroAssembler::sha512_round(const VectorRegister* hs,
                                  const int total_hs, int& h_cnt,
                                  const VectorRegister& kpw) {

  // convenience registers: the a..h roles rotate backwards through hs[]
  // as h_cnt grows, so no data ever moves between registers
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  const VectorRegister Ch   = VR20;
  const VectorRegister Maj  = VR21;
  const VectorRegister bsa  = VR22;
  const VectorRegister bse  = VR23;
  const VectorRegister tmp1 = VR24;
  const VectorRegister tmp2 = VR25;

  vsel      (Ch,   g,    f,   e);
  vxor      (Maj,  a,    b);
  vshasigmad(bse,  e,    1,   0xf);
  vaddudm   (tmp2, Ch,   kpw);
  vaddudm   (tmp1, h,    bse);
  vsel      (Maj,  b,    c,   Maj);
  vaddudm   (tmp1, tmp1, tmp2);
  vshasigmad(bsa,  a,    1,   0);
  vaddudm   (tmp2, bsa,  Maj);
  vaddudm   (d,    d,    tmp1);
  vaddudm   (h,    tmp1, tmp2);

  // advance vector pointer to the next iteration
  h_cnt++;
}

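// SHA-512 message schedule (FIPS 180-4), computed two words per call:
//   w[j] = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16]
// With two 64-bit words per vector, w[j-2] and w[j-1] already sit
// together in w7, so a single vshasigmad pass suffices.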
void MacroAssembler::sha512_calc_2w(const VectorRegister& w0,
                                    const VectorRegister& w1,
                                    const VectorRegister& w2,
                                    const VectorRegister& w3,
                                    const VectorRegister& w4,
                                    const VectorRegister& w5,
                                    const VectorRegister& w6,
                                    const VectorRegister& w7,
                                    const VectorRegister& kpw0,
                                    const VectorRegister& kpw1,
                                    const Register& j,
                                    const VectorRegister& vRb,
                                    const Register& k) {
  // Temporaries
  const VectorRegister VR_a = VR20;
  const VectorRegister VR_b = VR21;
  const VectorRegister VR_c = VR22;
  const VectorRegister VR_d = VR23;

  // load to k[j]
  lvx        (VR_a, j,    k);
  // advance j
  addi       (j,    j,    16); // 16 bytes were read
  // VR_b = w[j-15], w[j-14]
  vperm      (VR_b, w1,   w0,  vRb);
  // VR_c = w[j-7], w[j-6]
  vperm      (VR_c, w5,   w4,  vRb);
  // VR_b = s0(w[j-15]) , s0(w[j-14])
  vshasigmad (VR_b, VR_b,    0,   0);
  // VR_d = s1(w[j-2]) , s1(w[j-1])
  vshasigmad (VR_d, w7,      0,   0xf);
  // VR_b = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6]
  vaddudm    (VR_b, VR_b, VR_c);
  // VR_d = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15]
  vaddudm    (VR_d, VR_d, w0);
  // VR_c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //        s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  vaddudm    (VR_c, VR_d, VR_b);
  // Updating w0 to w7 to hold the new previous 16 values from w.
  vmr        (w0,   w1);
  vmr        (w1,   w2);
  vmr        (w2,   w3);
  vmr        (w3,   w4);
  vmr        (w4,   w5);
  vmr        (w5,   w6);
  vmr        (w6,   w7);
  vmr        (w7,   VR_c);
  // store k + w to kpw0 (2 values at once)
  vaddudm    (kpw0, VR_c, VR_a);
  // kpw1 holds (k + w)[1]
  vsldoi     (kpw1, kpw0, kpw0, 8);
}

void MacroAssembler::sha512_load_h_vec(const Register& state,
                                       const VectorRegister* hs,
                                       const int total_hs) {
  VectorRegister a   = hs[0];
  VectorRegister b   = hs[1];
  VectorRegister c   = hs[2];
  VectorRegister d   = hs[3];
  VectorRegister e   = hs[4];
  VectorRegister f   = hs[5];
  VectorRegister g   = hs[6];
  VectorRegister h   = hs[7];

  Register addr      = R7;
  VectorRegister vRb = VR8;
  Register tmp       = R8;
  Label state_aligned, after_state_aligned;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_aligned);

  // deal with unaligned addresses
  VectorRegister aux = VR9;

  lvx    (a,    state);
  addi   (addr, state, 16);
  lvsl   (vRb,  addr);

  for (int n = 2; n < total_hs; n += 2) {
    VectorRegister h_cur   = hs[n];
    VectorRegister h_prev2 = hs[n - 2];

    lvx    (h_cur,   addr);
    addi   (addr,    addr,  16);
    vperm  (h_prev2, h_cur, h_prev2, vRb);
  }
  lvx    (aux, addr);
  vperm  (g,   aux, g, vRb);

  this->b(after_state_aligned);

  bind(state_aligned);

  // deal with aligned addresses
  mr(addr, state);
  for (int n = 0; n < total_hs; n += 2) {
    VectorRegister h_cur = hs[n];

    lvx    (h_cur, addr);
    addi   (addr, addr, 16);
  }

  bind(after_state_aligned);
}

//   R3_ARG1   - byte[]  input, already padded, in big-endian byte order
//   R4_ARG2   - long[]  SHA.state (initially the SHA-512 IV: the fractional
//                       parts of the square roots of the first eight primes)
//   R5_ARG3   - int     offset
//   R6_ARG4   - int     limit
//
//   Internal Register usage:
//   R7 R8 R9  - volatile temporaries
//   VR0-VR7   - a-h
//   VR8       - vRb
//   VR9       - aux (highly volatile, use with care)
//   VR10-VR17 - w0-w7 | ini_a-ini_h
//   VR18      - vsp16 | kplusw0
//   VR19      - vsp32 | kplusw1
//   VR20-VR25 - sha512_calc_2w and sha512_round temporaries
void MacroAssembler::sha512(bool multi_block) {
  static const ssize_t base_size = sizeof(uint64_t);
  static const ssize_t buf_size = 128;
  static uint64_t waux[buf_size / base_size] __attribute__((aligned (16)));
  static const uint64_t round_consts[80] __attribute__((aligned (16))) = {
    0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f,
    0xe9b5dba58189dbbc, 0x3956c25bf348b538, 0x59f111f1b605d019,
    0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 0xd807aa98a3030242,
    0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
    0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235,
    0xc19bf174cf692694, 0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
    0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 0x2de92c6f592b0275,
    0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
    0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f,
    0xbf597fc7beef0ee4, 0xc6e00bf33da88fc2, 0xd5a79147930aa725,
    0x06ca6351e003826f, 0x142929670a0e6e70, 0x27b70a8546d22ffc,
    0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
    0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6,
    0x92722c851482353b, 0xa2bfe8a14cf10364, 0xa81a664bbc423001,
    0xc24b8b70d0f89791, 0xc76c51a30654be30, 0xd192e819d6ef5218,
    0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
    0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99,
    0x34b0bcb5e19b48a8, 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
    0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, 0x748f82ee5defb2fc,
    0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
    0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915,
    0xc67178f2e372532b, 0xca273eceea26619c, 0xd186b8c721c0c207,
    0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 0x06f067aa72176fba,
    0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
    0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc,
    0x431d67c49c100d4c, 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
    0x5fcb6fab3ad6faec, 0x6c44198c4a475817
  };
  static const uint8_t w_size = sizeof(round_consts)/sizeof(uint64_t);

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, bsw_loop, core_loop;

  // Save non-volatile vector registers in the red zone
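  // As in sha256 above, the ABI red zone below R1 is large enough for
  // these 6 * 16 = 96 bytes.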
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li  (idx, (c - nv_size) * 16);
    stvx(nv[c], idx, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR0;
  VectorRegister b = VR1;
  VectorRegister c = VR2;
  VectorRegister d = VR3;
  VectorRegister e = VR4;
  VectorRegister f = VR5;
  VectorRegister g = VR6;
  VectorRegister h = VR7;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed by state
  sha512_load_h_vec(state, hs, total_hs);

  align(OptoLoopAlignment);
  bind(sha_loop);

  for (int n = 0; n < total_hs; n += 2) {
    VectorRegister h_cur = hs[n];
    VectorRegister h_next = hs[n + 1];

    vsldoi (h_next, h_cur, h_cur, 8);
  }
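  // The state was loaded two hash words per vector ({a,b}, {c,d}, ...);
  // the rotate above gives each odd register the same pair swapped, so
  // every register holds its own working variable in a known lane.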

  Register k = R9;
  load_const(k, const_cast<uint64_t *>(round_consts));

  // Load 16 elements from w outside the loop
  VectorRegister w0 = VR10;
  VectorRegister w1 = VR11;
  VectorRegister w2 = VR12;
  VectorRegister w3 = VR13;
  VectorRegister w4 = VR14;
  VectorRegister w5 = VR15;
  VectorRegister w6 = VR16;
  VectorRegister w7 = VR17;
  static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  // Load the 16 initial w doublewords into vectors; sha512_load_w_vec
  // sets up an lvsl mask and realigns with vperm if needed
  sha512_load_w_vec(buf_in, ws, total_ws);

  VectorRegister vsp16 = VR18;
  VectorRegister vsp32 = VR19;
  VectorRegister shiftarg = VR9;

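  // vspltisw can only splat immediates in -16..15, so the rotate
  // counts 16 and 32 are derived from 8 by shifting left.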
  vspltisw(vsp16,    8);
  vspltisw(shiftarg, 1);
  vsl     (vsp16,    vsp16, shiftarg);
  vsl     (vsp32,    vsp16, shiftarg);

  VectorRegister vsp8 = VR9;
  vspltish(vsp8,     8);

  // Convert input from Big Endian to Little Endian
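  // Reverse all eight bytes of each doubleword by composing three
  // rotates: rotating halfwords by 8 swaps the bytes within each
  // halfword, rotating words by 16 swaps the halfwords, and rotating
  // doublewords by 32 swaps the words.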
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlh  (w, w, vsp8);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlw  (w, w, vsp16);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrld  (w, w, vsp32);
  }

  Register Rb        = R10;
  VectorRegister vRb = VR8;
  li      (Rb, 8);
  lvsl    (vRb, Rb);

  VectorRegister kplusw0 = VR18;
  VectorRegister kplusw1 = VR19;

  Register addr      = R7;
  mr      (addr, k);

  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    lvx    (kplusw0, addr);
    addi   (addr, addr, 16);
    vaddudm(kplusw0, kplusw0, w);

    sha512_round(hs, total_hs, h_cnt, kplusw0);
    vsldoi      (kplusw1, kplusw0, kplusw0, 8);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  Register tmp       = R8;
  // run rounds 16 to 79: 8 rounds per iteration, so (80 - 16) / 8 = 8
  // ctr iterations
  li    (tmp, (w_size-16)/total_hs);
  mtctr (tmp);
  // j is the byte offset into the round-constant table; the first
  // 16 doublewords (16 * 8 bytes) were consumed above. It advances
  // inside sha512_calc_2w as constants are read.
  Register j = tmp;
  li     (j, 8*16);

  align(OptoLoopAlignment);
  bind(core_loop);

  // because the a..h register roles rotate, the loop body must run a
  // multiple of total_hs rounds
  for (int n = 0; n < total_hs/2; n++) {
    sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k);
    sha512_round(hs, total_hs, h_cnt, kplusw0);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  bdnz   (core_loop);

  sha512_update_sha_state(state, hs, total_hs);

  if (multi_block) {
    // process next 1024 bit block (buf_in already updated)
    addi(ofs, ofs, buf_size);
    cmpd(CCR0, ofs, limit);
    blt(CCR0, sha_loop);

    // return ofs
    mr(R3_ARG1, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li  (idx, (c - nv_size) * 16);
    lvx(nv[c], idx, R1);
  }
}