1 // Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
   2 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   3 //
   4 // This code is free software; you can redistribute it and/or modify it
   5 // under the terms of the GNU General Public License version 2 only, as
   6 // published by the Free Software Foundation.
   7 //
   8 // This code is distributed in the hope that it will be useful, but WITHOUT
   9 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  11 // version 2 for more details (a copy is included in the LICENSE file that
  12 // accompanied this code).
  13 //
  14 // You should have received a copy of the GNU General Public License version
  15 // 2 along with this work; if not, write to the Free Software Foundation,
  16 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  17 //
  18 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  19 // or visit www.oracle.com if you need additional information or have any
  20 // questions.
  21 
  22 // This implementation was contributed by the following people:
  23 // Bruno Rosa <bruno.rosa@eldorado.org.br>
  24 // Gustavo Serra Scalet <gustavo.scalet@eldorado.org.br>
  25 // Igor Nunes <igor.nunes@eldorado.org.br>
  26 
  27 // Support to Big Endian by:
  28 // Martin Doerr <martin.doerr@sap.com>
  29 
  30 // Implemented according to "Descriptions of SHA-256, SHA-384, and SHA-512"
  31 // (http://www.iwar.org.uk/comsec/resources/cipher/sha256-384-512.pdf).
  32 
  33 #include "asm/macroAssembler.inline.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 
  36 /**********************************************************************
  37  * SHA 256
  38  *********************************************************************/
  39 
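// Produces the three non-trivial rotations (by 12, 8 and 4 bytes) of src.
// Used to spread the packed a..d / e..h state words across separate registers
// and to turn one vector of four k+w values into four per-round inputs.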
  40 void MacroAssembler::sha256_deque(const VectorRegister src,
  41                                   const VectorRegister dst1,
  42                                   const VectorRegister dst2,
  43                                   const VectorRegister dst3) {
  44   vsldoi (dst1, src, src, 12);
  45   vsldoi (dst2, src, src, 8);
  46   vsldoi (dst3, src, src, 4);
  47 }
  48 
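// Computes one round of the SHA-256 compression function (FIPS 180-4):
//   T1 = h + Sigma1(e) + Ch(e,f,g) + Kt + Wt
//   T2 = Sigma0(a) + Maj(a,b,c)
//   h = g; g = f; f = e; e = d + T1; d = c; c = b; b = a; a = T1 + T2
// Instead of moving values between registers, the role played by each entry
// of hs[] rotates with h_cnt, so only d and h (the two values that actually
// change) are written.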
  49 void MacroAssembler::sha256_round(const VectorRegister* hs,
  50                                   const int total_hs,
  51                                   int& h_cnt,
  52                                   const VectorRegister kpw) {
  53   // convenience registers: cycle from 0-7 downwards
  54   const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  55   const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  56   const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  57   const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  58   const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  59   const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  60   const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  61   const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  62   // temporaries
  63   VectorRegister ch  = VR0;
  64   VectorRegister maj = VR1;
  65   VectorRegister bsa = VR2;
  66   VectorRegister bse = VR3;
  67   VectorRegister vt0 = VR4;
  68   VectorRegister vt1 = VR5;
  69   VectorRegister vt2 = VR6;
  70   VectorRegister vt3 = VR7;
  71 
  72   vsel       (ch,  g,   f, e);
  73   vxor       (maj, a,   b);
  74   vshasigmaw (bse, e,   1, 0xf);
  75   vadduwm    (vt2, ch,  kpw);
  76   vadduwm    (vt1, h,   bse);
  77   vsel       (maj, b,   c, maj);
  78   vadduwm    (vt3, vt1, vt2);
  79   vshasigmaw (bsa, a,   1, 0);
  80   vadduwm    (vt0, bsa, maj);
  81 
  82   vadduwm    (d,   d,   vt3);
  83   vadduwm    (h,   vt3, vt0);
  84 
  // advance the rotation of the hs roles for the next round
  86   h_cnt++;
  87 }
  88 
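// Loads the eight 32-bit state words pointed to by hptr into two vectors: on
// little endian the caller passes a and e so that a receives state[0..3] and
// e receives state[4..7] (on big endian d and h are passed instead). An
// unaligned hptr is handled with lvsr/vperm over three 16-byte loads.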
  89 void MacroAssembler::sha256_load_h_vec(const VectorRegister a,
  90                                        const VectorRegister e,
  91                                        const Register hptr) {
  92   // temporaries
  93   Register tmp = R8;
  94   VectorRegister vt0 = VR0;
  95   VectorRegister vRb = VR6;
  96   // labels
  Label sha256_aligned, sha256_load_end;
  98 
  99   andi_  (tmp,  hptr, 0xf);
 100   addi   (tmp,  hptr, 16);
 101   beq    (CCR0, sha256_aligned);
 102 
 103   // handle unaligned accesses
 104   lvx    (a,    hptr);
 105   lvsr   (vRb,  hptr);
 106 
 107   lvx    (e,    tmp);
 108   addi   (tmp,  tmp,  16);
 109   vec_perm(a,   e,    vRb);
 110 
 111   lvx    (vt0,  tmp);
 112   vec_perm(e,   vt0,  vRb);
 113   b      (sha256_load_end);
 114 
 115   // aligned accesses
 116   bind(sha256_aligned);
 117   lvx    (a,    hptr);
 118   addi   (tmp,  hptr, 16);
 119   lvx    (e,    tmp);
 120 
 121   bind(sha256_load_end);
 122 }
 123 
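// Loads the 16 message words of the current block from buf_in (handling an
// unaligned buf_in), byte-swaps them to host order on little endian, loads
// the first 16 round constants from k and leaves kpws[n] = ws[n] + k[4n..4n+3].
// buf_in is advanced past the block.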
 124 void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in,
 125                                               const VectorRegister* ws,
 126                                               const int total_ws,
 127                                               const Register k,
 128                                               const VectorRegister* kpws,
 129                                               const int total_kpws) {
 130   Label w_aligned, after_w_load;
 131 
 132   Register tmp       = R8;
 133   VectorRegister vt0 = VR0;
 134   VectorRegister vt1 = VR1;
 135   VectorRegister vRb = VR6;
 136 
 137   andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, w_aligned); // buf_in is 16-byte aligned (low nibble 0x0, not 0x8)
 139 
 140   // deal with unaligned addresses
 141   lvx    (ws[0], buf_in);
 142   addi   (buf_in, buf_in, 16);
 143   lvsr   (vRb, buf_in);
 144 
 145   for (int n = 1; n < total_ws; n++) {
 146     VectorRegister w_cur = ws[n];
 147     VectorRegister w_prev = ws[n-1];
 148 
 149     lvx  (w_cur, buf_in);
 150     addi (buf_in, buf_in, 16);
 151     vec_perm(w_prev, w_cur, vRb);
 152   }
 153 
 154   lvx    (vt0, buf_in);
 155   vec_perm(ws[total_ws-1], vt0, vRb);
 156 
 157   b      (after_w_load);
 158 
 159   bind(w_aligned);
 160 
 161   // deal with aligned addresses
 162   for (int n = 0; n < total_ws; n++) {
 163     VectorRegister w = ws[n];
 164 
 165     lvx  (w, buf_in);
 166     addi (buf_in, buf_in, 16);
 167   }
 168 
 169   bind(after_w_load);
 170 
 171 #if defined(VM_LITTLE_ENDIAN)
 172   // Byte swapping within int values
 173   li       (tmp, 8);
 174   lvsl     (vt0, tmp);
 175   vspltisb (vt1, 0xb);
 176   vxor     (vt1, vt0, vt1);
 177   for (int n = 0; n < total_ws; n++) {
 178     VectorRegister w = ws[n];
 179     vec_perm(w, w, vt1);
 180   }
 181 #endif
 182 
  // Load k, which is always 16-byte aligned
 184   lvx    (kpws[0], k);
 185   addi   (tmp, k, 16);
 186   for (int n = 1; n < total_kpws-1; n++) {
 187     VectorRegister kpw = kpws[n];
 188 
 189     lvx  (kpw, tmp);
 190     addi (tmp, tmp, 16);
 191   }
 192   lvx  (kpws[total_kpws-1], tmp);
 193 
 194   // Add w to K
 195   assert(total_ws == total_kpws, "Redesign the loop below");
 196   for (int n = 0; n < total_kpws; n++) {
 197     VectorRegister kpw = kpws[n];
 198     VectorRegister w   = ws[n];
 199 
 200     vadduwm  (kpw, kpw, w);
 201   }
 202 }
 203 
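// Computes the next four words of the message schedule,
//   w[j] = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16],
// loads the next four round constants from k[j] and leaves the four k+w sums
// in kpw0..kpw3. Since w[j+2] and w[j+3] depend on w[j] and w[j+1], the s1
// term for those two lanes is recomputed in a second pass. w0..w3 are shifted
// so that they again hold the 16 most recent w values.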
 204 void MacroAssembler::sha256_calc_4w(const VectorRegister w0,
 205                                     const VectorRegister w1,
 206                                     const VectorRegister w2,
 207                                     const VectorRegister w3,
 208                                     const VectorRegister kpw0,
 209                                     const VectorRegister kpw1,
 210                                     const VectorRegister kpw2,
 211                                     const VectorRegister kpw3,
 212                                     const Register j,
 213                                     const Register k) {
 214   // Temporaries
 215   const VectorRegister  vt0  = VR0;
 216   const VectorRegister  vt1  = VR1;
 217   const VectorSRegister vsrt1 = vt1->to_vsr();
 218   const VectorRegister  vt2  = VR2;
 219   const VectorRegister  vt3  = VR3;
 220   const VectorSRegister vst3 = vt3->to_vsr();
 221   const VectorRegister  vt4  = VR4;
 222 
  // load k[j..j+3]
 224   lvx        (vt0, j,   k);
 225 
 226   // advance j
 227   addi       (j,   j,   16); // 16 bytes were read
 228 
 229 #if defined(VM_LITTLE_ENDIAN)
 230   // b = w[j-15], w[j-14], w[j-13], w[j-12]
 231   vsldoi     (vt1, w1,  w0, 12);
 232 
 233   // c = w[j-7], w[j-6], w[j-5], w[j-4]
 234   vsldoi     (vt2, w3,  w2, 12);
 235 
 236 #else
 237   // b = w[j-15], w[j-14], w[j-13], w[j-12]
 238   vsldoi     (vt1, w0,  w1, 4);
 239 
 240   // c = w[j-7], w[j-6], w[j-5], w[j-4]
 241   vsldoi     (vt2, w2,  w3, 4);
 242 #endif
 243 
 244   // d = w[j-2], w[j-1], w[j-4], w[j-3]
 245   vsldoi     (vt3, w3,  w3, 8);
 246 
 247   // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
 248   vshasigmaw (vt1, vt1, 0,  0);
 249 
 250   // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
 251   vshasigmaw (vt3, vt3, 0,  0xf);
 252 
 253   // c = s0(w[j-15]) + w[j-7],
 254   //     s0(w[j-14]) + w[j-6],
 255   //     s0(w[j-13]) + w[j-5],
 256   //     s0(w[j-12]) + w[j-4]
 257   vadduwm    (vt2, vt1, vt2);
 258 
 259   // c = s0(w[j-15]) + w[j-7] + w[j-16],
 260   //     s0(w[j-14]) + w[j-6] + w[j-15],
 261   //     s0(w[j-13]) + w[j-5] + w[j-14],
 262   //     s0(w[j-12]) + w[j-4] + w[j-13]
 263   vadduwm    (vt2, vt2, w0);
 264 
 265   // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
 266   //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
 267   //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED
 268   //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])  // UNDEFINED
 269   vadduwm    (vt4, vt2, vt3);
 270 
 271   // At this point, e[0] and e[1] are the correct values to be stored at w[j]
 272   // and w[j+1].
 273   // e[2] and e[3] are not considered.
  // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
 275   vshasigmaw (vt1, vt4, 0,  0xf);
 276 
  // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
 278 #if defined(VM_LITTLE_ENDIAN)
 279   xxmrgld    (vst3, vsrt1, vst3);
 280 #else
 281   xxmrghd    (vst3, vst3, vsrt1);
 282 #endif
 283 
 284   // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
 285   //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
 286   //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),   // w[j+2]
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])  // w[j+3]
 288   vadduwm    (vt2, vt2, vt3);
 289 
  // Update w0..w3 so they again hold the 16 most recent values of w.
 291   vmr        (w0,  w1);
 292   vmr        (w1,  w2);
 293   vmr        (w2,  w3);
 294   vmr        (w3,  vt2);
 295 
  // compute k + w into kpw0..kpw3 (4 values at once)
 297 #if defined(VM_LITTLE_ENDIAN)
 298   vadduwm    (kpw0, vt2, vt0);
 299 
 300   vsldoi     (kpw1, kpw0, kpw0, 12);
 301   vsldoi     (kpw2, kpw0, kpw0, 8);
 302   vsldoi     (kpw3, kpw0, kpw0, 4);
 303 #else
 304   vadduwm    (kpw3, vt2, vt0);
 305 
 306   vsldoi     (kpw2, kpw3, kpw3, 12);
 307   vsldoi     (kpw1, kpw3, kpw3, 8);
 308   vsldoi     (kpw0, kpw3, kpw3, 4);
 309 #endif
 310 }
 311 
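// Adds the working variables back into the hash state at hptr
// (state[i] += work[i]) and stores the result, handling an unaligned hptr.
// On little endian the updated state is also left packed in a (state[0..3])
// and e (state[4..7]) for the next block; on big endian in d and h.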
 312 void MacroAssembler::sha256_update_sha_state(const VectorRegister a,
 313                                              const VectorRegister b_,
 314                                              const VectorRegister c,
 315                                              const VectorRegister d,
 316                                              const VectorRegister e,
 317                                              const VectorRegister f,
 318                                              const VectorRegister g,
 319                                              const VectorRegister h,
 320                                              const Register hptr) {
 321   // temporaries
 322   VectorRegister vt0  = VR0;
 323   VectorRegister vt1  = VR1;
 324   VectorRegister vt2  = VR2;
 325   VectorRegister vt3  = VR3;
 326   VectorRegister vt4  = VR4;
 327   VectorRegister vt5  = VR5;
 328   VectorRegister vaux = VR6;
 329   VectorRegister vRb  = VR6;
 330   Register tmp        = R8;
 331   Register of16       = R8;
 332   Register of32       = R9;
 333   Label state_load_aligned, after_state_load_aligned;
 334 
 335   // Load hptr
 336   andi_   (tmp, hptr, 0xf);
 337   li      (of16, 16);
 338   beq     (CCR0, state_load_aligned);
 339 
 340   // handle unaligned accesses
 341   li      (of32, 32);
 342   lvx     (vt0, hptr);
 343   lvsr    (vRb, hptr);
 344 
 345   lvx     (vt5, hptr, of16);
 346   vec_perm(vt0, vt5,  vRb);        // vt0 = hptr[0]..hptr[3]
 347 
 348   lvx     (vt1, hptr, of32);
 349   vec_perm(vt5, vt1,  vRb);        // vt5 = hptr[4]..hptr[7]
 350   b       (after_state_load_aligned);
 351 
 352   // aligned accesses
 353   bind(state_load_aligned);
 354   lvx     (vt0, hptr);
 355   lvx     (vt5, of16, hptr);
 356 
 357   bind(after_state_load_aligned);
 358 
 359 #if defined(VM_LITTLE_ENDIAN)
 360   vmrglw  (vt1, b_, a);            // vt1 = {a, b, ?, ?}
 361   vmrglw  (vt2, d, c);             // vt2 = {c, d, ?, ?}
 362   vmrglw  (vt3, f, e);             // vt3 = {e, f, ?, ?}
 363   vmrglw  (vt4, h, g);             // vt4 = {g, h, ?, ?}
 364   xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d}
 365   xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h}
 366   vadduwm (a,   vt0, vt1);         // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
 367   vadduwm (e,   vt5, vt3);         // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}
 368 
 369   // Save hptr back, works for any alignment
 370   xxswapd (vt0->to_vsr(), a->to_vsr());
 371   stxvd2x (vt0->to_vsr(), hptr);
 372   xxswapd (vt5->to_vsr(), e->to_vsr());
 373   stxvd2x (vt5->to_vsr(), of16, hptr);
 374 #else
 375   vmrglw  (vt1, a, b_);            // vt1 = {a, b, ?, ?}
 376   vmrglw  (vt2, c, d);             // vt2 = {c, d, ?, ?}
 377   vmrglw  (vt3, e, f);             // vt3 = {e, f, ?, ?}
 378   vmrglw  (vt4, g, h);             // vt4 = {g, h, ?, ?}
 379   xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d}
 380   xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h}
 381   vadduwm (d,   vt0, vt1);         // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
 382   vadduwm (h,   vt5, vt3);         // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}
 383 
 384   // Save hptr back, works for any alignment
 385   stxvd2x (d->to_vsr(), hptr);
 386   stxvd2x (h->to_vsr(), of16, hptr);
 387 #endif
 388 }
 389 
 390 
//   R3_ARG1   - byte[]  input string with padding, in big-endian byte order
//   R4_ARG2   - int[]   SHA.state (initially the SHA-256 IV: the fractional
//                       parts of the square roots of the first eight primes)
 393 //   R5_ARG3   - int     offset
 394 //   R6_ARG4   - int     limit
 395 //
 396 //   Internal Register usage:
 397 //   R7        - k
 398 //   R8        - tmp | j | of16
 399 //   R9        - of32
 400 //   VR0-VR8   - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb
 401 //   VR9-VR16  - a-h
 402 //   VR17-VR20 - w0-w3
 403 //   VR21-VR23 - vRb | vaux0-vaux2
 404 //   VR24-VR27 - kpw0-kpw3
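//
//   For each 64-byte block: the packed state words are spread across a..h,
//   the 16 message words are loaded, byte-swapped on little endian and
//   combined with the round constants, the first 16 rounds consume those k+w
//   values, the remaining 48 rounds interleave with the message schedule
//   expansion in sha256_calc_4w, and the result is added back into SHA.state.
//   With multi_block the whole sequence loops until ofs reaches limit.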
 405 void MacroAssembler::sha256(bool multi_block) {
 406   static const ssize_t base_size = sizeof(uint32_t);
 407   static const ssize_t buf_size = 64;
 408   static uint32_t waux[buf_size / base_size] __attribute((aligned (16)));
 409   static const uint32_t round_consts[64] __attribute((aligned (16))) = {
 410     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 411     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 412     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 413     0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 414     0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 415     0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 416     0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 417     0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 418     0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 419     0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 420     0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 421     0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 422     0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 423     0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 424     0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 425     0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 426   };
 427   static const uint8_t w_size = sizeof(round_consts)/sizeof(uint32_t);
 428 
 429   Register buf_in = R3_ARG1;
 430   Register state  = R4_ARG2;
 431   Register ofs    = R5_ARG3;
 432   Register limit  = R6_ARG4;
 433 
 434   Label sha_loop, bsw_loop, core_loop;
 435 
 436   // Save non-volatile vector registers in the red zone
 437   static const VectorRegister nv[] = {
 438     VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/
 439   };
 440   static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);
 441 
 442   for (int c = 0; c < nv_size; c++) {
 443     Register tmp = R8;
 444     li  (tmp, (c - (nv_size)) * 16);
 445     stvx(nv[c], tmp, R1);
 446   }
 447 
 448   // Load hash state to registers
 449   VectorRegister a = VR9;
 450   VectorRegister b = VR10;
 451   VectorRegister c = VR11;
 452   VectorRegister d = VR12;
 453   VectorRegister e = VR13;
 454   VectorRegister f = VR14;
 455   VectorRegister g = VR15;
 456   VectorRegister h = VR16;
 457   static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
 458   static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
 459   // counter for cycling through hs vector to avoid register moves between iterations
 460   int h_cnt = 0;
 461 
 462   // Load a-h registers from the memory pointed by state
 463 #if defined(VM_LITTLE_ENDIAN)
 464   sha256_load_h_vec(a, e, state);
 465 #else
 466   sha256_load_h_vec(d, h, state);
 467 #endif
 468 
  // keep k loaded across multi-block iterations as well
 470   Register k = R7;
 471   load_const_optimized(k, const_cast<uint32_t *>(round_consts), R0);
 472 
 473   // Avoiding redundant loads
 474   if (multi_block) {
 475     align(OptoLoopAlignment);
 476   }
 477   bind(sha_loop);
 478 #if defined(VM_LITTLE_ENDIAN)
 479   sha256_deque(a, b, c, d);
 480   sha256_deque(e, f, g, h);
 481 #else
 482   sha256_deque(d, c, b, a);
 483   sha256_deque(h, g, f, e);
 484 #endif
 485 
  // Load the 16 message words out of the loop.
  // The order of the int values within each vector is endianness specific.
 488   VectorRegister w0 = VR17;
 489   VectorRegister w1 = VR18;
 490   VectorRegister w2 = VR19;
 491   VectorRegister w3 = VR20;
 492   static const VectorRegister ws[] = {w0, w1, w2, w3};
 493   static const int total_ws = sizeof(ws)/sizeof(VectorRegister);
 494 
 495   VectorRegister kpw0 = VR24;
 496   VectorRegister kpw1 = VR25;
 497   VectorRegister kpw2 = VR26;
 498   VectorRegister kpw3 = VR27;
 499   static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3};
 500   static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister);
 501 
 502   sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws);
 503 
  // Run the first 16 rounds using the preloaded k+w values
 505   assert(total_ws == total_kpws, "Redesign the loop below");
 506   for (int n = 0; n < total_ws; n++) {
 507     VectorRegister vaux0 = VR21;
 508     VectorRegister vaux1 = VR22;
 509     VectorRegister vaux2 = VR23;
 510 
 511     sha256_deque(kpws[n], vaux0, vaux1, vaux2);
 512 
 513 #if defined(VM_LITTLE_ENDIAN)
 514     sha256_round(hs, total_hs, h_cnt, kpws[n]);
 515     sha256_round(hs, total_hs, h_cnt, vaux0);
 516     sha256_round(hs, total_hs, h_cnt, vaux1);
 517     sha256_round(hs, total_hs, h_cnt, vaux2);
 518 #else
 519     sha256_round(hs, total_hs, h_cnt, vaux2);
 520     sha256_round(hs, total_hs, h_cnt, vaux1);
 521     sha256_round(hs, total_hs, h_cnt, vaux0);
 522     sha256_round(hs, total_hs, h_cnt, kpws[n]);
 523 #endif
 524   }
 525 
 526   Register tmp = R8;
  // loop from round 16 to round 64, eight rounds per core_loop iteration
 528   li   (tmp, (w_size - 16) / total_hs);
 529   mtctr(tmp);
 530 
  // j is the byte offset into the round constant table; it starts past the 16
  // constants already consumed and is advanced wherever it is used
  // (e.g. inside sha256_calc_4w).
 533   Register j = R8;
 534   li   (j, 16*4);
 535 
 536   align(OptoLoopAlignment);
 537   bind(core_loop);
 538 
 539   // due to VectorRegister rotate, always iterate in multiples of total_hs
 540   for (int n = 0; n < total_hs/4; n++) {
 541     sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k);
 542     sha256_round(hs, total_hs, h_cnt, kpw0);
 543     sha256_round(hs, total_hs, h_cnt, kpw1);
 544     sha256_round(hs, total_hs, h_cnt, kpw2);
 545     sha256_round(hs, total_hs, h_cnt, kpw3);
 546   }
 547 
 548   bdnz   (core_loop);
 549 
 550   // Update hash state
 551   sha256_update_sha_state(a, b, c, d, e, f, g, h, state);
 552 
 553   if (multi_block) {
    // process next 512 bit block (buf_in already updated)
 555     addi(ofs, ofs, buf_size);
 556     cmpd(CCR0, ofs, limit);
 557     blt(CCR0, sha_loop);
 558 
 559     // return ofs
 560     mr(R3_ARG1, ofs);
 561   }
 562 
 563   // Restore non-volatile registers
 564   for (int c = 0; c < nv_size; c++) {
 565     Register tmp = R8;
 566     li  (tmp, (c - (nv_size)) * 16);
 567     lvx(nv[c], tmp, R1);
 568   }
 569 }
 570 
 571 
 572 /**********************************************************************
 573  * SHA 512
 574  *********************************************************************/
 575 
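// Loads the 16 64-bit message words of the current block from buf_in into
// ws[0..7], two words per vector, handling an unaligned buf_in with
// lvsr/vperm. buf_in is advanced past the block.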
 576 void MacroAssembler::sha512_load_w_vec(const Register buf_in,
 577                                        const VectorRegister* ws,
 578                                        const int total_ws) {
 579   Register tmp       = R8;
 580   VectorRegister vRb = VR8;
 581   VectorRegister aux = VR9;
 582   Label is_aligned, after_alignment;
 583 
 584   andi_  (tmp, buf_in, 0xF);
  beq    (CCR0, is_aligned); // buf_in is 16-byte aligned (low nibble 0x0, not 0x8)
 586 
 587   // deal with unaligned addresses
 588   lvx    (ws[0], buf_in);
 589   addi   (buf_in, buf_in, 16);
 590   lvsr   (vRb, buf_in);
 591 
 592   for (int n = 1; n < total_ws; n++) {
 593     VectorRegister w_cur = ws[n];
 594     VectorRegister w_prev = ws[n-1];
 595 
 596     lvx  (w_cur, buf_in);
 597     addi (buf_in, buf_in, 16);
 598     vec_perm(w_prev, w_cur, vRb);
 599   }
 600 
 601   lvx    (aux, buf_in);
 602   vec_perm(ws[total_ws-1], aux, vRb);
 603 
 604   b      (after_alignment);
 605 
 606   bind(is_aligned);
 607 
 608   for (int n = 0; n < total_ws; n++) {
 609     VectorRegister w = ws[n];
 610 
 611     lvx  (w, buf_in);
 612     addi (buf_in, buf_in, 16);
 613   }
 614 
 615   bind(after_alignment);
 616 }
 617 
// Add the working variables back into the hash state in memory
// (state[i] += work[i]); handles aligned and unaligned state pointers.
 619 void MacroAssembler::sha512_update_sha_state(const Register state,
 620                                              const VectorRegister* hs,
 621                                              const int total_hs) {
 622 
 623 #if defined(VM_LITTLE_ENDIAN)
 624   int start_idx = 0;
 625 #else
 626   int start_idx = 1;
 627 #endif
 628 
 629   // load initial hash from the memory pointed by state
 630   VectorRegister ini_a = VR10;
 631   VectorRegister ini_c = VR12;
 632   VectorRegister ini_e = VR14;
 633   VectorRegister ini_g = VR16;
 634   static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g};
 635   static const int total_inis = sizeof(inis)/sizeof(VectorRegister);
 636 
 637   Label state_save_aligned, after_state_save_aligned;
 638 
 639   Register addr      = R7;
 640   Register tmp       = R8;
 641   VectorRegister vRb = VR8;
 642   VectorRegister aux = VR9;
 643 
 644   andi_(tmp, state, 0xf);
 645   beq(CCR0, state_save_aligned);
 646   // deal with unaligned addresses
 647 
 648   {
 649     VectorRegister a = hs[0];
 650     VectorRegister b_ = hs[1];
 651     VectorRegister c = hs[2];
 652     VectorRegister d = hs[3];
 653     VectorRegister e = hs[4];
 654     VectorRegister f = hs[5];
 655     VectorRegister g = hs[6];
 656     VectorRegister h = hs[7];
 657     lvsr   (vRb, state);
 658     lvx    (ini_a, state);
 659     addi   (addr, state, 16);
 660 
 661     lvx    (ini_c, addr);
 662     addi   (addr, addr, 16);
 663     vec_perm(ini_a, ini_c, vRb);
 664 
 665     lvx    (ini_e, addr);
 666     addi   (addr, addr, 16);
 667     vec_perm(ini_c, ini_e, vRb);
 668 
 669     lvx    (ini_g, addr);
 670     addi   (addr, addr, 16);
 671     vec_perm(ini_e, ini_g, vRb);
 672 
 673     lvx    (aux, addr);
 674     vec_perm(ini_g, aux, vRb);
 675 
 676 #if defined(VM_LITTLE_ENDIAN)
 677     xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr());
 678     xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr());
 679     xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr());
 680     xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr());
 681 #else
 682     xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr());
 683     xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr());
 684     xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr());
 685     xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr());
 686 #endif
 687 
 688     for (int n = start_idx; n < total_hs; n += 2) {
 689       VectorRegister h_cur = hs[n];
 690       VectorRegister ini_cur = inis[n/2];
 691 
 692       vaddudm(h_cur, ini_cur, h_cur);
 693     }
 694 
 695     for (int n = start_idx; n < total_hs; n += 2) {
 696       VectorRegister h_cur = hs[n];
 697 
 698       mfvrd  (tmp, h_cur);
 699 #if defined(VM_LITTLE_ENDIAN)
 700       std    (tmp, 8*n + 8, state);
 701 #else
 702       std    (tmp, 8*n - 8, state);
 703 #endif
 704       vsldoi (aux, h_cur, h_cur, 8);
 705       mfvrd  (tmp, aux);
 706       std    (tmp, 8*n + 0, state);
 707     }
 708 
 709     b      (after_state_save_aligned);
 710   }
 711 
 712   bind(state_save_aligned);
 713   {
 714     mr(addr, state);
 715     for (int n = 0; n < total_hs; n += 2) {
 716 #if defined(VM_LITTLE_ENDIAN)
 717       VectorRegister h_cur = hs[n];
 718       VectorRegister h_next = hs[n+1];
 719 #else
 720       VectorRegister h_cur = hs[n+1];
 721       VectorRegister h_next = hs[n];
 722 #endif
 723       VectorRegister ini_cur = inis[n/2];
 724 
 725       lvx(ini_cur, addr);
 726       addi(addr, addr, 16);
 727       xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr());
 728     }
 729 
 730     for (int n = start_idx; n < total_hs; n += 2) {
 731       VectorRegister h_cur = hs[n];
 732       VectorRegister ini_cur = inis[n/2];
 733 
 734       vaddudm(h_cur, ini_cur, h_cur);
 735     }
 736 
 737     mr(addr, state);
 738     for (int n = start_idx; n < total_hs; n += 2) {
 739       VectorRegister h_cur = hs[n];
 740 
 741       stvx(h_cur, addr);
 742       addi(addr, addr, 16);
 743     }
 744   }
 745 
 746   bind(after_state_save_aligned);
 747 }
 748 
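// Computes one round of the SHA-512 compression function, the 64-bit analogue
// of sha256_round above:
//   T1 = h + Sigma1(e) + Ch(e,f,g) + Kt + Wt
//   T2 = Sigma0(a) + Maj(a,b,c)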
// h_cnt selects which hs entry currently plays each of the roles a..h and is
// incremented at the end.
 750 void MacroAssembler::sha512_round(const VectorRegister* hs,
 751                                   const int total_hs, int& h_cnt,
 752                                   const VectorRegister kpw) {
 753 
 754   // convenience registers: cycle from 0-7 downwards
 755   const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
 756   const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
 757   const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
 758   const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
 759   const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
 760   const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
 761   const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
 762   const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
 763   // temporaries
 764   const VectorRegister Ch   = VR20;
 765   const VectorRegister Maj  = VR21;
 766   const VectorRegister bsa  = VR22;
 767   const VectorRegister bse  = VR23;
 768   const VectorRegister tmp1 = VR24;
 769   const VectorRegister tmp2 = VR25;
 770 
 771   vsel      (Ch,   g,    f,   e);
 772   vxor      (Maj,  a,    b);
 773   vshasigmad(bse,  e,    1,   0xf);
 774   vaddudm   (tmp2, Ch,   kpw);
 775   vaddudm   (tmp1, h,    bse);
 776   vsel      (Maj,  b,    c,   Maj);
 777   vaddudm   (tmp1, tmp1, tmp2);
 778   vshasigmad(bsa,  a,    1,   0);
 779   vaddudm   (tmp2, bsa,  Maj);
 780   vaddudm   (d,    d,    tmp1);
 781   vaddudm   (h,    tmp1, tmp2);
 782 
  // advance the rotation of the hs roles for the next round
 784   h_cnt++;
 785 }
 786 
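// Computes the next two 64-bit words of the message schedule,
//   w[j] = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16],
// loads the next two round constants from k[j] and leaves the two k+w sums in
// kpw0 and kpw1. Unlike the SHA-256 variant no fix-up pass is needed, since
// neither new word depends on the other. w0..w7 are shifted so that they
// again hold the 16 most recent w values.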
 787 void MacroAssembler::sha512_calc_2w(const VectorRegister w0,
 788                                     const VectorRegister w1,
 789                                     const VectorRegister w2,
 790                                     const VectorRegister w3,
 791                                     const VectorRegister w4,
 792                                     const VectorRegister w5,
 793                                     const VectorRegister w6,
 794                                     const VectorRegister w7,
 795                                     const VectorRegister kpw0,
 796                                     const VectorRegister kpw1,
 797                                     const Register j,
 798                                     const VectorRegister vRb,
 799                                     const Register k) {
 800   // Temporaries
 801   const VectorRegister VR_a = VR20;
 802   const VectorRegister VR_b = VR21;
 803   const VectorRegister VR_c = VR22;
 804   const VectorRegister VR_d = VR23;
 805 
  // load k[j] and k[j+1]
 807   lvx        (VR_a, j,    k);
 808   // advance j
 809   addi       (j,    j,    16); // 16 bytes were read
 810 
 811 #if defined(VM_LITTLE_ENDIAN)
  // VR_b = w[j-15], w[j-14]
 813   vperm      (VR_b, w1,   w0,  vRb);
  // VR_c = w[j-7], w[j-6]
 815   vperm      (VR_c, w5,   w4,  vRb);
 816 #else
  // VR_b = w[j-15], w[j-14]
 818   vperm      (VR_b, w0,   w1,  vRb);
  // VR_c = w[j-7], w[j-6]
 820   vperm      (VR_c, w4,   w5,  vRb);
 821 #endif
 822 
  // VR_b = s0(w[j-15]) , s0(w[j-14])
 824   vshasigmad (VR_b, VR_b,    0,   0);
  // VR_d = s1(w[j-2]) , s1(w[j-1])
 826   vshasigmad (VR_d, w7,      0,   0xf);
  // VR_b = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6]
 828   vaddudm    (VR_b, VR_b, VR_c);
  // VR_d = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15]
 830   vaddudm    (VR_d, VR_d, w0);
  // VR_c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //        s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
 833   vaddudm    (VR_c, VR_d, VR_b);
  // Update w0..w7 so they again hold the 16 most recent values of w.
 835   vmr        (w0,   w1);
 836   vmr        (w1,   w2);
 837   vmr        (w2,   w3);
 838   vmr        (w3,   w4);
 839   vmr        (w4,   w5);
 840   vmr        (w5,   w6);
 841   vmr        (w6,   w7);
 842   vmr        (w7,   VR_c);
 843 
 844 #if defined(VM_LITTLE_ENDIAN)
 845   // store k + w to kpw0 (2 values at once)
 846   vaddudm    (kpw0, VR_c, VR_a);
 847   // kpw1 holds (k + w)[1]
 848   vsldoi     (kpw1, kpw0, kpw0, 8);
 849 #else
  // store k + w to kpw1 (2 values at once)
 851   vaddudm    (kpw1, VR_c, VR_a);
  // kpw0 holds (k + w)[0]
 853   vsldoi     (kpw0, kpw1, kpw1, 8);
 854 #endif
 855 }
 856 
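// Loads the eight 64-bit state words pointed to by state into hs[], two words
// per vector (the even-indexed hs entries on little endian, the odd-indexed
// ones on big endian), handling an unaligned state pointer with lvsr/vperm.
// The caller splits the pairs afterwards so that each working variable ends
// up in its own register.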
 857 void MacroAssembler::sha512_load_h_vec(const Register state,
 858                                        const VectorRegister* hs,
 859                                        const int total_hs) {
 860 #if defined(VM_LITTLE_ENDIAN)
 861   VectorRegister a   = hs[0];
 862   VectorRegister g   = hs[6];
 863   int start_idx = 0;
 864 #else
 865   VectorRegister a   = hs[1];
 866   VectorRegister g   = hs[7];
 867   int start_idx = 1;
 868 #endif
 869 
 870   Register addr      = R7;
 871   VectorRegister vRb = VR8;
 872   Register tmp       = R8;
 873   Label state_aligned, after_state_aligned;
 874 
 875   andi_(tmp, state, 0xf);
 876   beq(CCR0, state_aligned);
 877 
 878   // deal with unaligned addresses
 879   VectorRegister aux = VR9;
 880 
 881   lvx    (a,    state);
 882   addi   (addr, state, 16);
 883   lvsr   (vRb,  addr);
 884 
 885   for (int n = start_idx + 2; n < total_hs; n += 2) {
 886     VectorRegister h_cur   = hs[n];
 887     VectorRegister h_prev2 = hs[n - 2];
 888 
 889     lvx    (h_cur,   addr);
 890     addi   (addr,    addr,  16);
 891     vec_perm(h_prev2, h_cur, vRb);
 892   }
 893   lvx    (aux, addr);
 894   vec_perm(g, aux, vRb);
 895 
 896   b      (after_state_aligned);
 897 
 898   bind(state_aligned);
 899 
 900   // deal with aligned addresses
 901   mr(addr, state);
 902   for (int n = start_idx; n < total_hs; n += 2) {
 903     VectorRegister h_cur = hs[n];
 904 
 905     lvx    (h_cur, addr);
 906     addi   (addr, addr, 16);
 907   }
 908 
 909   bind(after_state_aligned);
 910 }
 911 
//   R3_ARG1   - byte[]  input string with padding, in big-endian byte order
//   R4_ARG2   - long[]  SHA.state (initially the SHA-512 IV: the fractional
//                       parts of the square roots of the first eight primes)
 914 //   R5_ARG3   - int     offset
 915 //   R6_ARG4   - int     limit
 916 //
 917 //   Internal Register usage:
 918 //   R7 R8 R9  - volatile temporaries
 919 //   VR0-VR7   - a-h
 920 //   VR8       - vRb
 921 //   VR9       - aux (highly volatile, use with care)
 922 //   VR10-VR17 - w0-w7 | ini_a-ini_h
 923 //   VR18      - vsp16 | kplusw0
 924 //   VR19      - vsp32 | kplusw1
 925 //   VR20-VR25 - sha512_calc_2w and sha512_round temporaries
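//
//   For each 128-byte block: the packed state words are split across a..h,
//   the 16 message words are loaded and converted to host order on little
//   endian, the first 16 rounds consume the preloaded k+w sums, the remaining
//   64 rounds interleave with the message schedule expansion in
//   sha512_calc_2w, and the result is added back into SHA.state. With
//   multi_block the whole sequence loops until ofs reaches limit.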
 926 void MacroAssembler::sha512(bool multi_block) {
 927   static const ssize_t base_size = sizeof(uint64_t);
 928   static const ssize_t buf_size = 128;
 929   static uint64_t waux[buf_size / base_size] __attribute((aligned (16)));
 930   static const uint64_t round_consts[80] __attribute((aligned (16))) = {
 931     0x428a2f98d728ae22, 0x7137449123ef65cd,
 932     0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
 933     0x3956c25bf348b538, 0x59f111f1b605d019,
 934     0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
 935     0xd807aa98a3030242, 0x12835b0145706fbe,
 936     0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
 937     0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
 938     0x9bdc06a725c71235, 0xc19bf174cf692694,
 939     0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
 940     0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
 941     0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
 942     0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
 943     0x983e5152ee66dfab, 0xa831c66d2db43210,
 944     0xb00327c898fb213f, 0xbf597fc7beef0ee4,
 945     0xc6e00bf33da88fc2, 0xd5a79147930aa725,
 946     0x06ca6351e003826f, 0x142929670a0e6e70,
 947     0x27b70a8546d22ffc, 0x2e1b21385c26c926,
 948     0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
 949     0x650a73548baf63de, 0x766a0abb3c77b2a8,
 950     0x81c2c92e47edaee6, 0x92722c851482353b,
 951     0xa2bfe8a14cf10364, 0xa81a664bbc423001,
 952     0xc24b8b70d0f89791, 0xc76c51a30654be30,
 953     0xd192e819d6ef5218, 0xd69906245565a910,
 954     0xf40e35855771202a, 0x106aa07032bbd1b8,
 955     0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
 956     0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
 957     0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
 958     0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
 959     0x748f82ee5defb2fc, 0x78a5636f43172f60,
 960     0x84c87814a1f0ab72, 0x8cc702081a6439ec,
 961     0x90befffa23631e28, 0xa4506cebde82bde9,
 962     0xbef9a3f7b2c67915, 0xc67178f2e372532b,
 963     0xca273eceea26619c, 0xd186b8c721c0c207,
 964     0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
 965     0x06f067aa72176fba, 0x0a637dc5a2c898a6,
 966     0x113f9804bef90dae, 0x1b710b35131c471b,
 967     0x28db77f523047d84, 0x32caab7b40c72493,
 968     0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
 969     0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
 970     0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
 971   };
 972   static const uint8_t w_size = sizeof(round_consts)/sizeof(uint64_t);
 973 
 974   Register buf_in = R3_ARG1;
 975   Register state  = R4_ARG2;
 976   Register ofs    = R5_ARG3;
 977   Register limit  = R6_ARG4;
 978 
 979   Label sha_loop, bsw_loop, core_loop;
 980 
 981   // Save non-volatile vector registers in the red zone
 982   static const VectorRegister nv[] = {
 983     VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/
 984   };
 985   static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);
 986 
 987   for (int c = 0; c < nv_size; c++) {
 988     Register idx = R7;
 989     li  (idx, (c - (nv_size)) * 16);
 990     stvx(nv[c], idx, R1);
 991   }
 992 
 993   // Load hash state to registers
 994   VectorRegister a = VR0;
 995   VectorRegister b = VR1;
 996   VectorRegister c = VR2;
 997   VectorRegister d = VR3;
 998   VectorRegister e = VR4;
 999   VectorRegister f = VR5;
1000   VectorRegister g = VR6;
1001   VectorRegister h = VR7;
1002   static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
1003   static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
1004   // counter for cycling through hs vector to avoid register moves between iterations
1005   int h_cnt = 0;
1006 
1007   // Load a-h registers from the memory pointed by state
1008   sha512_load_h_vec(state, hs, total_hs);
1009 
1010   if (multi_block) {
1011     align(OptoLoopAlignment);
1012   }
1013   bind(sha_loop);
1014 
1015   for (int n = 0; n < total_hs; n += 2) {
1016 #if defined(VM_LITTLE_ENDIAN)
1017     VectorRegister h_cur = hs[n];
1018     VectorRegister h_next = hs[n + 1];
1019 #else
1020     VectorRegister h_cur = hs[n + 1];
1021     VectorRegister h_next = hs[n];
1022 #endif
1023     vsldoi (h_next, h_cur, h_cur, 8);
1024   }
1025 
1026   Register k = R9;
1027   load_const_optimized(k, const_cast<uint64_t *>(round_consts), R0);
1028 
  // Load the 16 message words out of the loop.
  // The order of the long values within each vector is endianness specific.
1031   VectorRegister w0 = VR10;
1032   VectorRegister w1 = VR11;
1033   VectorRegister w2 = VR12;
1034   VectorRegister w3 = VR13;
1035   VectorRegister w4 = VR14;
1036   VectorRegister w5 = VR15;
1037   VectorRegister w6 = VR16;
1038   VectorRegister w7 = VR17;
1039   static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7};
1040   static const int total_ws = sizeof(ws)/sizeof(VectorRegister);
1041 
  // Load the 16 message words into w0..w7 (unaligned input is handled inside)
1043   sha512_load_w_vec(buf_in, ws, total_ws);
1044 
1045 #if defined(VM_LITTLE_ENDIAN)
1046   VectorRegister vsp16 = VR18;
1047   VectorRegister vsp32 = VR19;
1048   VectorRegister shiftarg = VR9;
1049 
1050   vspltisw(vsp16,    8);
1051   vspltisw(shiftarg, 1);
1052   vsl     (vsp16,    vsp16, shiftarg);
1053   vsl     (vsp32,    vsp16, shiftarg);
1054 
1055   VectorRegister vsp8 = VR9;
1056   vspltish(vsp8,     8);
1057 
  // Convert input from big endian to little endian: byte-reverse each 64-bit
  // word by rotating halfwords by 8 bits, words by 16 bits and doublewords by
  // 32 bits.
1059   for (int c = 0; c < total_ws; c++) {
1060     VectorRegister w = ws[c];
1061     vrlh  (w, w, vsp8);
1062   }
1063   for (int c = 0; c < total_ws; c++) {
1064     VectorRegister w = ws[c];
1065     vrlw  (w, w, vsp16);
1066   }
1067   for (int c = 0; c < total_ws; c++) {
1068     VectorRegister w = ws[c];
1069     vrld  (w, w, vsp32);
1070   }
1071 #endif
1072 
1073   Register Rb        = R10;
1074   VectorRegister vRb = VR8;
1075   li      (Rb, 8);
1076   lvsr    (vRb, Rb);
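  // vRb is a vperm control for an 8-byte shift; sha512_calc_2w uses it to
  // pair doublewords taken from adjacent w vectors.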
1077 
1078   VectorRegister kplusw0 = VR18;
1079   VectorRegister kplusw1 = VR19;
1080 
1081   Register addr      = R7;
1082   mr      (addr, k);
1083 
1084   for (int n = 0; n < total_ws; n++) {
1085     VectorRegister w = ws[n];
1086 
1087     lvx    (kplusw0, addr);
1088     addi   (addr, addr, 16);
1089 #if defined(VM_LITTLE_ENDIAN)
1090     vaddudm(kplusw0, kplusw0, w);
1091     vsldoi (kplusw1, kplusw0, kplusw0, 8);
1092 #else
1093     vaddudm(kplusw1, kplusw0, w);
1094     vsldoi (kplusw0, kplusw1, kplusw1, 8);
1095 #endif
1096 
1097     sha512_round(hs, total_hs, h_cnt, kplusw0);
1098     sha512_round(hs, total_hs, h_cnt, kplusw1);
1099   }
1100 
1101   Register tmp       = R8;
1102   li    (tmp, (w_size-16)/total_hs);
1103   mtctr (tmp);
  // j is the byte offset into the round constant table; it starts past the 16
  // constants already consumed and is advanced wherever it is used
  // (e.g. inside sha512_calc_2w).
1106   Register j = tmp;
1107   li     (j, 8*16);
1108 
1109   align(OptoLoopAlignment);
1110   bind(core_loop);
1111 
1112   // due to VectorRegister rotate, always iterate in multiples of total_hs
1113   for (int n = 0; n < total_hs/2; n++) {
1114     sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k);
1115     sha512_round(hs, total_hs, h_cnt, kplusw0);
1116     sha512_round(hs, total_hs, h_cnt, kplusw1);
1117   }
1118 
1119   bdnz   (core_loop);
1120 
1121   sha512_update_sha_state(state, hs, total_hs);
1122 
1123   if (multi_block) {
1124     // process next 1024 bit block (buf_in already updated)
1125     addi(ofs, ofs, buf_size);
1126     cmpd(CCR0, ofs, limit);
1127     blt(CCR0, sha_loop);
1128 
1129     // return ofs
1130     mr(R3_ARG1, ofs);
1131   }
1132 
1133   // Restore non-volatile registers
1134   for (int c = 0; c < nv_size; c++) {
1135     Register idx = R7;
1136     li  (idx, (c - (nv_size)) * 16);
1137     lvx(nv[c], idx, R1);
1138   }
1139 }