// Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.

// This implementation was contributed by the following people:
//   Bruno Rosa <bruno.rosa@eldorado.org.br>
//   Gustavo Serra Scalet <gustavo.scalet@eldorado.org.br>
//   Igor Nunes <igor.nunes@eldorado.org.br>

// Support to Big Endian by:
//   Martin Doerr <martin.doerr@sap.com>

// Implemented according to "Descriptions of SHA-256, SHA-384, and SHA-512"
// (http://www.iwar.org.uk/comsec/resources/cipher/sha256-384-512.pdf).

#include "asm/macroAssembler.inline.hpp"
#include "runtime/stubRoutines.hpp"

/**********************************************************************
 * SHA 256
 *********************************************************************/

void MacroAssembler::sha256_deque(const VectorRegister src,
                                  const VectorRegister dst1,
                                  const VectorRegister dst2,
                                  const VectorRegister dst3) {
  vsldoi (dst1, src, src, 12);
  vsldoi (dst2, src, src, 8);
  vsldoi (dst3, src, src, 4);
}

void MacroAssembler::sha256_round(const VectorRegister* hs,
                                  const int total_hs,
                                  int& h_cnt,
                                  const VectorRegister kpw) {
  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  VectorRegister ch  = VR0;
  VectorRegister maj = VR1;
  VectorRegister bsa = VR2;
  VectorRegister bse = VR3;
  VectorRegister vt0 = VR4;
  VectorRegister vt1 = VR5;
  VectorRegister vt2 = VR6;
  VectorRegister vt3 = VR7;

  vsel       (ch,  g,   f, e);
  vxor       (maj, a,   b);
  vshasigmaw (bse, e,   1, 0xf);
  vadduwm    (vt2, ch,  kpw);
  vadduwm    (vt1, h,   bse);
  vsel       (maj, b,   c, maj);
  vadduwm    (vt3, vt1, vt2);
  vshasigmaw (bsa, a,   1, 0);
  vadduwm    (vt0, bsa, maj);

  vadduwm    (d,   d,   vt3);
  vadduwm    (h,   vt3, vt0);

  // advance vector pointer to the next iteration
  h_cnt++;
}

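// For reference (FIPS 180-4): each sha256_round call performs one compression
// round, T1 = h + Sigma1(e) + Ch(e,f,g) + (K[t] + W[t]) and
// T2 = Sigma0(a) + Maj(a,b,c), followed by d := d + T1 and h := T1 + T2.
// Instead of moving register contents between rounds, the roles of a..h
// rotate by indexing hs[] with h_cnt.
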
void MacroAssembler::sha256_load_h_vec(const VectorRegister a,
                                       const VectorRegister e,
                                       const Register hptr) {
  // temporaries
  Register tmp = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vRb = VR6;
  // labels
  Label sha256_aligned, sha256_load_end;

  andi_ (tmp, hptr, 0xf);
  addi  (tmp, hptr, 16);
  beq   (CCR0, sha256_aligned);

  // handle unaligned accesses
  lvx     (a, hptr);
  lvsr    (vRb, hptr);

  lvx     (e, tmp);
  addi    (tmp, tmp, 16);
  vec_perm(a, e, vRb);

  lvx     (vt0, tmp);
  vec_perm(e, vt0, vRb);
  b       (sha256_load_end);

  // aligned accesses
  bind(sha256_aligned);
  lvx  (a, hptr);
  addi (tmp, hptr, 16);
  lvx  (e, tmp);

  bind(sha256_load_end);
}

void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in,
                                              const VectorRegister* ws,
                                              const int total_ws,
                                              const Register k,
                                              const VectorRegister* kpws,
                                              const int total_kpws) {
  Label w_aligned, after_w_load;

  Register tmp = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vt1 = VR1;
  VectorRegister vRb = VR6;

  andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, w_aligned); // address ends with 0x0, not 0x8

  // deal with unaligned addresses
  lvx     (ws[0], buf_in);
  addi    (buf_in, buf_in, 16);
  lvsr    (vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur  = ws[n];
    VectorRegister w_prev = ws[n-1];

    lvx     (w_cur, buf_in);
    addi    (buf_in, buf_in, 16);
    vec_perm(w_prev, w_cur, vRb);
  }

  lvx     (vt0, buf_in);
  vec_perm(ws[total_ws-1], vt0, vRb);

  b       (after_w_load);

  bind(w_aligned);

  // deal with aligned addresses
  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    lvx  (w, buf_in);
    addi (buf_in, buf_in, 16);
  }

  bind(after_w_load);

#if defined(VM_LITTLE_ENDIAN)
  // Byte swapping within int values
  li       (tmp, 8);
  lvsl     (vt0, tmp);
  vspltisb (vt1, 0xb);
  vxor     (vt1, vt0, vt1);
  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];
    vec_perm(w, w, vt1);
  }
#endif

  // Loading k, which is always aligned to 16-bytes
  lvx   (kpws[0], k);
  addi  (tmp, k, 16);
  for (int n = 1; n < total_kpws-1; n++) {
    VectorRegister kpw = kpws[n];

    lvx  (kpw, tmp);
    addi (tmp, tmp, 16);
  }
  lvx   (kpws[total_kpws-1], tmp);

  // Add w to K
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_kpws; n++) {
    VectorRegister kpw = kpws[n];
    VectorRegister w   = ws[n];

    vadduwm (kpw, kpw, w);
  }
}

void MacroAssembler::sha256_calc_4w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const VectorRegister kpw2,
                                    const VectorRegister kpw3,
                                    const Register j,
                                    const Register k) {
  // Temporaries
  const VectorRegister  vt0   = VR0;
  const VectorRegister  vt1   = VR1;
  const VectorSRegister vsrt1 = vt1->to_vsr();
  const VectorRegister  vt2   = VR2;
  const VectorRegister  vt3   = VR3;
  const VectorSRegister vst3  = vt3->to_vsr();
  const VectorRegister  vt4   = VR4;

  // load to k[j]
  lvx        (vt0, j, k);

  // advance j
  addi       (j, j, 16); // 16 bytes were read

#if defined(VM_LITTLE_ENDIAN)
  // b = w[j-15], w[j-14], w[j-13], w[j-12]
  vsldoi     (vt1, w1, w0, 12);

  // c = w[j-7], w[j-6], w[j-5], w[j-4]
  vsldoi     (vt2, w3, w2, 12);

#else
  // b = w[j-15], w[j-14], w[j-13], w[j-12]
  vsldoi     (vt1, w0, w1, 4);

  // c = w[j-7], w[j-6], w[j-5], w[j-4]
  vsldoi     (vt2, w2, w3, 4);
#endif

  // d = w[j-2], w[j-1], w[j-4], w[j-3]
  vsldoi     (vt3, w3, w3, 8);

  // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
  vshasigmaw (vt1, vt1, 0, 0);

  // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
  vshasigmaw (vt3, vt3, 0, 0xf);

  // c = s0(w[j-15]) + w[j-7],
  //     s0(w[j-14]) + w[j-6],
  //     s0(w[j-13]) + w[j-5],
  //     s0(w[j-12]) + w[j-4]
  vadduwm    (vt2, vt1, vt2);

  // c = s0(w[j-15]) + w[j-7] + w[j-16],
  //     s0(w[j-14]) + w[j-6] + w[j-15],
  //     s0(w[j-13]) + w[j-5] + w[j-14],
  //     s0(w[j-12]) + w[j-4] + w[j-13]
  vadduwm    (vt2, vt2, w0);

  // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])  // UNDEFINED
  vadduwm    (vt4, vt2, vt3);

  // At this point, e[0] and e[1] are the correct values to be stored at w[j]
  // and w[j+1].
  // e[2] and e[3] are not considered.
  // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
  vshasigmaw (vt1, vt4, 0, 0xf);

  // v5 = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
#if defined(VM_LITTLE_ENDIAN)
  xxmrgld    (vst3, vsrt1, vst3);
#else
  xxmrghd    (vst3, vst3, vsrt1);
#endif

  // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),   // w[j+2]
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])  // w[j+3]
  vadduwm    (vt2, vt2, vt3);

  // Updating w0 to w3 to hold the new previous 16 values from w.
  vmr (w0, w1);
  vmr (w1, w2);
  vmr (w2, w3);
  vmr (w3, vt2);

  // store k + w to v9 (4 values at once)
#if defined(VM_LITTLE_ENDIAN)
  vadduwm    (kpw0, vt2, vt0);

  vsldoi     (kpw1, kpw0, kpw0, 12);
  vsldoi     (kpw2, kpw0, kpw0,  8);
  vsldoi     (kpw3, kpw0, kpw0,  4);
#else
  vadduwm    (kpw3, vt2, vt0);

  vsldoi     (kpw2, kpw3, kpw3, 12);
  vsldoi     (kpw1, kpw3, kpw3,  8);
  vsldoi     (kpw0, kpw3, kpw3,  4);
#endif
}

void MacroAssembler::sha256_update_sha_state(const VectorRegister a,
                                             const VectorRegister b_,
                                             const VectorRegister c,
                                             const VectorRegister d,
                                             const VectorRegister e,
                                             const VectorRegister f,
                                             const VectorRegister g,
                                             const VectorRegister h,
                                             const Register hptr) {
  // temporaries
  VectorRegister vt0  = VR0;
  VectorRegister vt1  = VR1;
  VectorRegister vt2  = VR2;
  VectorRegister vt3  = VR3;
  VectorRegister vt4  = VR4;
  VectorRegister vt5  = VR5;
  VectorRegister vaux = VR6;
  VectorRegister vRb  = VR6;
  Register tmp  = R8;
  Register of16 = R8;
  Register of32 = R9;
  Label state_load_aligned, after_state_load_aligned;

  // Load hptr
  andi_ (tmp, hptr, 0xf);
  li    (of16, 16);
  beq   (CCR0, state_load_aligned);

  // handle unaligned accesses
  li      (of32, 32);
  lvx     (vt0, hptr);
  lvsr    (vRb, hptr);

  lvx     (vt5, hptr, of16);
  vec_perm(vt0, vt5, vRb);        // vt0 = hptr[0]..hptr[3]

  lvx     (vt1, hptr, of32);
  vec_perm(vt5, vt1, vRb);        // vt5 = hptr[4]..hptr[7]
  b       (after_state_load_aligned);

  // aligned accesses
  bind(state_load_aligned);
  lvx     (vt0, hptr);
  lvx     (vt5, of16, hptr);

  bind(after_state_load_aligned);

#if defined(VM_LITTLE_ENDIAN)
  vmrglw  (vt1, b_, a);           // vt1 = {a, b, ?, ?}
  vmrglw  (vt2, d, c);            // vt2 = {c, d, ?, ?}
  vmrglw  (vt3, f, e);            // vt3 = {e, f, ?, ?}
  vmrglw  (vt4, h, g);            // vt4 = {g, h, ?, ?}
  xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d}
  xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h}
  vadduwm (a, vt0, vt1);          // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
  vadduwm (e, vt5, vt3);          // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}

  // Save hptr back, works for any alignment
  xxswapd (vt0->to_vsr(), a->to_vsr());
  stxvd2x (vt0->to_vsr(), hptr);
  xxswapd (vt5->to_vsr(), e->to_vsr());
  stxvd2x (vt5->to_vsr(), of16, hptr);
#else
  vmrglw  (vt1, a, b_);           // vt1 = {a, b, ?, ?}
  vmrglw  (vt2, c, d);            // vt2 = {c, d, ?, ?}
  vmrglw  (vt3, e, f);            // vt3 = {e, f, ?, ?}
  vmrglw  (vt4, g, h);            // vt4 = {g, h, ?, ?}
  xxmrgld (vt1->to_vsr(), vt1->to_vsr(), vt2->to_vsr()); // vt1 = {a, b, c, d}
  xxmrgld (vt3->to_vsr(), vt3->to_vsr(), vt4->to_vsr()); // vt3 = {e, f, g, h}
  vadduwm (d, vt0, vt1);          // d = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
  vadduwm (h, vt5, vt3);          // h = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}

  // Save hptr back, works for any alignment
  stxvd2x (d->to_vsr(), hptr);
  stxvd2x (h->to_vsr(), of16, hptr);
#endif
}


//   R3_ARG1   - byte[]  Input string with padding but in Big Endian
//   R4_ARG2   - int[]   SHA.state (at first, the root of primes)
//   R5_ARG3   - int     offset
//   R6_ARG4   - int     limit
//
//   Internal Register usage:
//   R7        - k
//   R8        - tmp | j | of16
//   R9        - of32
//   VR0-VR8   - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb
//   VR9-VR16  - a-h
//   VR17-VR20 - w0-w3
//   VR21-VR23 - vRb | vaux0-vaux2
//   VR24-VR27 - kpw0-kpw3
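//
//   Outline: the first 16 rounds consume K[t] + W[t] values preloaded by
//   sha256_load_w_plus_k_vec; the remaining 48 rounds extend the message
//   schedule four words at a time with sha256_calc_4w, with sha256_round
//   consuming one K[t] + W[t] lane per call.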
void MacroAssembler::sha256(bool multi_block) {
  static const ssize_t base_size = sizeof(uint32_t);
  static const ssize_t buf_size = 64;
  static uint32_t waux[buf_size / base_size] __attribute((aligned (16)));
  static const uint32_t round_consts[64] __attribute((aligned (16))) = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
  };
  static const uint8_t w_size = sizeof(round_consts)/sizeof(uint32_t);

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, bsw_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li  (tmp, (c - (nv_size)) * 16);
    stvx(nv[c], tmp, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR9;
  VectorRegister b = VR10;
  VectorRegister c = VR11;
  VectorRegister d = VR12;
  VectorRegister e = VR13;
  VectorRegister f = VR14;
  VectorRegister g = VR15;
  VectorRegister h = VR16;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed by state
#if defined(VM_LITTLE_ENDIAN)
  sha256_load_h_vec(a, e, state);
#else
  sha256_load_h_vec(d, h, state);
#endif

  // keep k loaded also during MultiBlock loops
  Register k = R7;
  load_const_optimized(k, const_cast<uint32_t *>(round_consts), R0);

  // Avoiding redundant loads
  if (multi_block) {
    align(OptoLoopAlignment);
  }
  bind(sha_loop);
#if defined(VM_LITTLE_ENDIAN)
  sha256_deque(a, b, c, d);
  sha256_deque(e, f, g, h);
#else
  sha256_deque(d, c, b, a);
  sha256_deque(h, g, f, e);
#endif

  // Load 16 elements from w out of the loop.
  // Order of the int values is Endianness specific.
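  // w0-w3 hold the sliding window of the 16 most recent message words
  // (four 32-bit words per vector); kpw0-kpw3 hold the corresponding
  // K[t] + W[t] sums consumed by sha256_round below.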
  VectorRegister w0 = VR17;
  VectorRegister w1 = VR18;
  VectorRegister w2 = VR19;
  VectorRegister w3 = VR20;
  static const VectorRegister ws[] = {w0, w1, w2, w3};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  VectorRegister kpw0 = VR24;
  VectorRegister kpw1 = VR25;
  VectorRegister kpw2 = VR26;
  VectorRegister kpw3 = VR27;
  static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3};
  static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister);

  sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws);

  // Cycle through the first 16 elements
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_ws; n++) {
    VectorRegister vaux0 = VR21;
    VectorRegister vaux1 = VR22;
    VectorRegister vaux2 = VR23;

    sha256_deque(kpws[n], vaux0, vaux1, vaux2);

#if defined(VM_LITTLE_ENDIAN)
    sha256_round(hs, total_hs, h_cnt, kpws[n]);
    sha256_round(hs, total_hs, h_cnt, vaux0);
    sha256_round(hs, total_hs, h_cnt, vaux1);
    sha256_round(hs, total_hs, h_cnt, vaux2);
#else
    sha256_round(hs, total_hs, h_cnt, vaux2);
    sha256_round(hs, total_hs, h_cnt, vaux1);
    sha256_round(hs, total_hs, h_cnt, vaux0);
    sha256_round(hs, total_hs, h_cnt, kpws[n]);
#endif
  }

  Register tmp = R8;
  // loop the 16th to the 64th iteration by 8 steps
  li   (tmp, (w_size - 16) / total_hs);
  mtctr(tmp);

  // j will be aligned to 4 for loading words.
  // Whenever read, advance the pointer (e.g: when j is used in a function)
  Register j = R8;
  li   (j, 16*4);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/4; n++) {
    sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k);
    sha256_round(hs, total_hs, h_cnt, kpw0);
    sha256_round(hs, total_hs, h_cnt, kpw1);
    sha256_round(hs, total_hs, h_cnt, kpw2);
    sha256_round(hs, total_hs, h_cnt, kpw3);
  }

  bdnz (core_loop);

  // Update hash state
  sha256_update_sha_state(a, b, c, d, e, f, g, h, state);

  if (multi_block) {
    // process next 512 bit block (buf_in already updated)
    addi(ofs, ofs, buf_size);
    cmpd(CCR0, ofs, limit);
    blt(CCR0, sha_loop);

    // return ofs
    mr(R3_ARG1, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li  (tmp, (c - (nv_size)) * 16);
    lvx(nv[c], tmp, R1);
  }
}


/**********************************************************************
 * SHA 512
 *********************************************************************/

void MacroAssembler::sha512_load_w_vec(const Register buf_in,
                                       const VectorRegister* ws,
                                       const int total_ws) {
  Register tmp = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;
  Label is_aligned, after_alignment;

  andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, is_aligned); // address ends with 0x0, not 0x8

  // deal with unaligned addresses
  lvx     (ws[0], buf_in);
  addi    (buf_in, buf_in, 16);
  lvsr    (vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur  = ws[n];
    VectorRegister w_prev = ws[n-1];

    lvx     (w_cur, buf_in);
    addi    (buf_in, buf_in, 16);
    vec_perm(w_prev, w_cur, vRb);
  }

  lvx     (aux, buf_in);
  vec_perm(ws[total_ws-1], aux, vRb);

  b       (after_alignment);

  bind(is_aligned);

  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    lvx  (w, buf_in);
    addi (buf_in, buf_in, 16);
  }

  bind(after_alignment);
}

// Update hash state
void MacroAssembler::sha512_update_sha_state(const Register state,
                                             const VectorRegister* hs,
                                             const int total_hs) {

#if defined(VM_LITTLE_ENDIAN)
  int start_idx = 0;
#else
  int start_idx = 1;
#endif

  // load initial hash from the memory pointed by state
  VectorRegister ini_a = VR10;
  VectorRegister ini_c = VR12;
  VectorRegister ini_e = VR14;
  VectorRegister ini_g = VR16;
  static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g};
  static const int total_inis = sizeof(inis)/sizeof(VectorRegister);

  Label state_save_aligned, after_state_save_aligned;

  Register addr      = R7;
  Register tmp       = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_save_aligned);
  // deal with unaligned addresses

  {
    VectorRegister a  = hs[0];
    VectorRegister b_ = hs[1];
    VectorRegister c  = hs[2];
    VectorRegister d  = hs[3];
    VectorRegister e  = hs[4];
    VectorRegister f  = hs[5];
    VectorRegister g  = hs[6];
    VectorRegister h  = hs[7];
    lvsr    (vRb, state);
    lvx     (ini_a, state);
    addi    (addr, state, 16);

    lvx     (ini_c, addr);
    addi    (addr, addr, 16);
    vec_perm(ini_a, ini_c, vRb);

    lvx     (ini_e, addr);
    addi    (addr, addr, 16);
    vec_perm(ini_c, ini_e, vRb);

    lvx     (ini_g, addr);
    addi    (addr, addr, 16);
    vec_perm(ini_e, ini_g, vRb);

    lvx     (aux, addr);
    vec_perm(ini_g, aux, vRb);

#if defined(VM_LITTLE_ENDIAN)
    xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr());
    xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr());
    xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr());
    xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr());
#else
    xxmrgld(b_->to_vsr(), a->to_vsr(), b_->to_vsr());
    xxmrgld(d->to_vsr(), c->to_vsr(), d->to_vsr());
    xxmrgld(f->to_vsr(), e->to_vsr(), f->to_vsr());
    xxmrgld(h->to_vsr(), g->to_vsr(), h->to_vsr());
#endif

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur   = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      mfvrd  (tmp, h_cur);
#if defined(VM_LITTLE_ENDIAN)
      std    (tmp, 8*n + 8, state);
#else
      std    (tmp, 8*n - 8, state);
#endif
      vsldoi (aux, h_cur, h_cur, 8);
      mfvrd  (tmp, aux);
      std    (tmp, 8*n + 0, state);
    }

    b      (after_state_save_aligned);
  }

  bind(state_save_aligned);
  {
    mr(addr, state);
    for (int n = 0; n < total_hs; n += 2) {
#if defined(VM_LITTLE_ENDIAN)
      VectorRegister h_cur  = hs[n];
      VectorRegister h_next = hs[n+1];
#else
      VectorRegister h_cur  = hs[n+1];
      VectorRegister h_next = hs[n];
#endif
      VectorRegister ini_cur = inis[n/2];

      lvx(ini_cur, addr);
      addi(addr, addr, 16);
      xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr());
    }

    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur   = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    mr(addr, state);
    for (int n = start_idx; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      stvx(h_cur, addr);
      addi(addr, addr, 16);
    }
  }

  bind(after_state_save_aligned);
}

// Use h_cnt to cycle through hs elements but also increment it at the end
void MacroAssembler::sha512_round(const VectorRegister* hs,
                                  const int total_hs, int& h_cnt,
                                  const VectorRegister kpw) {

  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  const VectorRegister Ch   = VR20;
  const VectorRegister Maj  = VR21;
  const VectorRegister bsa  = VR22;
  const VectorRegister bse  = VR23;
  const VectorRegister tmp1 = VR24;
  const VectorRegister tmp2 = VR25;

  vsel      (Ch,   g,    f,    e);
  vxor      (Maj,  a,    b);
  vshasigmad(bse,  e,    1,    0xf);
  vaddudm   (tmp2, Ch,   kpw);
  vaddudm   (tmp1, h,    bse);
  vsel      (Maj,  b,    c,    Maj);
  vaddudm   (tmp1, tmp1, tmp2);
  vshasigmad(bsa,  a,    1,    0);
  vaddudm   (tmp2, bsa,  Maj);
  vaddudm   (d,    d,    tmp1);
  vaddudm   (h,    tmp1, tmp2);

  // advance vector pointer to the next iteration
  h_cnt++;
}

void MacroAssembler::sha512_calc_2w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister w4,
                                    const VectorRegister w5,
                                    const VectorRegister w6,
                                    const VectorRegister w7,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const Register j,
                                    const VectorRegister vRb,
                                    const Register k) {
  // Temporaries
  const VectorRegister VR_a = VR20;
  const VectorRegister VR_b = VR21;
  const VectorRegister VR_c = VR22;
  const VectorRegister VR_d = VR23;

  // load to k[j]
  lvx        (VR_a, j, k);
  // advance j
  addi       (j, j, 16); // 16 bytes were read

#if defined(VM_LITTLE_ENDIAN)
  // v6 = w[j-15], w[j-14]
  vperm      (VR_b, w1, w0, vRb);
  // v12 = w[j-7], w[j-6]
  vperm      (VR_c, w5, w4, vRb);
#else
  // v6 = w[j-15], w[j-14]
  vperm      (VR_b, w0, w1, vRb);
  // v12 = w[j-7], w[j-6]
  vperm      (VR_c, w4, w5, vRb);
#endif

  // v6 = s0(w[j-15]) , s0(w[j-14])
  vshasigmad (VR_b, VR_b, 0, 0);
  // v5 = s1(w[j-2]) , s1(w[j-1])
  vshasigmad (VR_d, w7, 0, 0xf);
  // v6 = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6]
  vaddudm    (VR_b, VR_b, VR_c);
  // v8 = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15]
  vaddudm    (VR_d, VR_d, w0);
  // v9 = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //      s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  vaddudm    (VR_c, VR_d, VR_b);
  // Updating w0 to w7 to hold the new previous 16 values from w.
  vmr (w0, w1);
  vmr (w1, w2);
  vmr (w2, w3);
  vmr (w3, w4);
  vmr (w4, w5);
  vmr (w5, w6);
  vmr (w6, w7);
  vmr (w7, VR_c);

#if defined(VM_LITTLE_ENDIAN)
  // store k + w to kpw0 (2 values at once)
  vaddudm    (kpw0, VR_c, VR_a);
  // kpw1 holds (k + w)[1]
  vsldoi     (kpw1, kpw0, kpw0, 8);
#else
  // store k + w to kpw0 (2 values at once)
  vaddudm    (kpw1, VR_c, VR_a);
  // kpw1 holds (k + w)[1]
  vsldoi     (kpw0, kpw1, kpw1, 8);
#endif
}

void MacroAssembler::sha512_load_h_vec(const Register state,
                                       const VectorRegister* hs,
                                       const int total_hs) {
#if defined(VM_LITTLE_ENDIAN)
  VectorRegister a = hs[0];
  VectorRegister g = hs[6];
  int start_idx = 0;
#else
  VectorRegister a = hs[1];
  VectorRegister g = hs[7];
  int start_idx = 1;
#endif

  Register addr      = R7;
  VectorRegister vRb = VR8;
  Register tmp       = R8;
  Label state_aligned, after_state_aligned;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_aligned);

  // deal with unaligned addresses
  VectorRegister aux = VR9;

  lvx     (a, state);
  addi    (addr, state, 16);
  lvsr    (vRb, addr);

  for (int n = start_idx + 2; n < total_hs; n += 2) {
    VectorRegister h_cur   = hs[n];
    VectorRegister h_prev2 = hs[n - 2];

    lvx     (h_cur, addr);
    addi    (addr, addr, 16);
    vec_perm(h_prev2, h_cur, vRb);
  }
  lvx     (aux, addr);
  vec_perm(g, aux, vRb);

  b       (after_state_aligned);

  bind(state_aligned);

  // deal with aligned addresses
  mr(addr, state);
  for (int n = start_idx; n < total_hs; n += 2) {
    VectorRegister h_cur = hs[n];

    lvx  (h_cur, addr);
    addi (addr, addr, 16);
  }

  bind(after_state_aligned);
}

//   R3_ARG1   - byte[]  Input string with padding but in Big Endian
//   R4_ARG2   - int[]   SHA.state (at first, the root of primes)
//   R5_ARG3   - int     offset
//   R6_ARG4   - int     limit
//
//   Internal Register usage:
//   R7 R8 R9  - volatile temporaries
//   VR0-VR7   - a-h
//   VR8       - vRb
//   VR9       - aux (highly volatile, use with care)
//   VR10-VR17 - w0-w7 | ini_a-ini_h
//   VR18      - vsp16 | kplusw0
//   VR19      - vsp32 | kplusw1
//   VR20-VR25 - sha512_calc_2w and sha512_round temporaries
void MacroAssembler::sha512(bool multi_block) {
  static const ssize_t base_size = sizeof(uint64_t);
  static const ssize_t buf_size = 128;
  static uint64_t waux[buf_size / base_size] __attribute((aligned (16)));
  static const uint64_t round_consts[80] __attribute((aligned (16))) = {
    0x428a2f98d728ae22, 0x7137449123ef65cd,
    0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
    0x3956c25bf348b538, 0x59f111f1b605d019,
    0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
    0xd807aa98a3030242, 0x12835b0145706fbe,
    0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
    0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
    0x9bdc06a725c71235, 0xc19bf174cf692694,
    0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
    0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
    0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
    0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
    0x983e5152ee66dfab, 0xa831c66d2db43210,
    0xb00327c898fb213f, 0xbf597fc7beef0ee4,
    0xc6e00bf33da88fc2, 0xd5a79147930aa725,
    0x06ca6351e003826f, 0x142929670a0e6e70,
    0x27b70a8546d22ffc, 0x2e1b21385c26c926,
    0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
    0x650a73548baf63de, 0x766a0abb3c77b2a8,
    0x81c2c92e47edaee6, 0x92722c851482353b,
    0xa2bfe8a14cf10364, 0xa81a664bbc423001,
    0xc24b8b70d0f89791, 0xc76c51a30654be30,
    0xd192e819d6ef5218, 0xd69906245565a910,
    0xf40e35855771202a, 0x106aa07032bbd1b8,
    0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
    0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
    0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
    0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
    0x748f82ee5defb2fc, 0x78a5636f43172f60,
    0x84c87814a1f0ab72, 0x8cc702081a6439ec,
    0x90befffa23631e28, 0xa4506cebde82bde9,
    0xbef9a3f7b2c67915, 0xc67178f2e372532b,
    0xca273eceea26619c, 0xd186b8c721c0c207,
    0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
    0x06f067aa72176fba, 0x0a637dc5a2c898a6,
    0x113f9804bef90dae, 0x1b710b35131c471b,
    0x28db77f523047d84, 0x32caab7b40c72493,
    0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
    0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
    0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
  };
  static const uint8_t w_size = sizeof(round_consts)/sizeof(uint64_t);

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, bsw_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li  (idx, (c - (nv_size)) * 16);
    stvx(nv[c], idx, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR0;
  VectorRegister b = VR1;
  VectorRegister c = VR2;
  VectorRegister d = VR3;
  VectorRegister e = VR4;
  VectorRegister f = VR5;
  VectorRegister g = VR6;
  VectorRegister h = VR7;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed by state
  sha512_load_h_vec(state, hs, total_hs);

  if (multi_block) {
    align(OptoLoopAlignment);
  }
  bind(sha_loop);

  for (int n = 0; n < total_hs; n += 2) {
#if defined(VM_LITTLE_ENDIAN)
    VectorRegister h_cur  = hs[n];
    VectorRegister h_next = hs[n + 1];
#else
    VectorRegister h_cur  = hs[n + 1];
    VectorRegister h_next = hs[n];
#endif
    vsldoi (h_next, h_cur, h_cur, 8);
  }

  Register k = R9;
  load_const_optimized(k, const_cast<uint64_t *>(round_consts), R0);

  // Load 16 elements from w out of the loop.
  // Order of the long values is Endianness specific.
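  // w0-w7 hold the sliding window of the 16 most recent message words
  // (two 64-bit words per vector); kplusw0 and kplusw1 carry the
  // K[t] + W[t] sums consumed by sha512_round below.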
  VectorRegister w0 = VR10;
  VectorRegister w1 = VR11;
  VectorRegister w2 = VR12;
  VectorRegister w3 = VR13;
  VectorRegister w4 = VR14;
  VectorRegister w5 = VR15;
  VectorRegister w6 = VR16;
  VectorRegister w7 = VR17;
  static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  // Load 16 w into vectors and setup vsl for vperm
  sha512_load_w_vec(buf_in, ws, total_ws);

#if defined(VM_LITTLE_ENDIAN)
  VectorRegister vsp16 = VR18;
  VectorRegister vsp32 = VR19;
  VectorRegister shiftarg = VR9;

  vspltisw(vsp16,    8);
  vspltisw(shiftarg, 1);
  vsl     (vsp16,    vsp16, shiftarg);
  vsl     (vsp32,    vsp16, shiftarg);

  VectorRegister vsp8 = VR9;
  vspltish(vsp8,     8);

  // Convert input from Big Endian to Little Endian
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlh  (w, w, vsp8);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlw  (w, w, vsp16);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrld  (w, w, vsp32);
  }
#endif

  Register Rb        = R10;
  VectorRegister vRb = VR8;
  li      (Rb, 8);
  lvsr    (vRb, Rb);

  VectorRegister kplusw0 = VR18;
  VectorRegister kplusw1 = VR19;

  Register addr = R7;
  mr      (addr, k);

  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    lvx     (kplusw0, addr);
    addi    (addr, addr, 16);
#if defined(VM_LITTLE_ENDIAN)
    vaddudm (kplusw0, kplusw0, w);
    vsldoi  (kplusw1, kplusw0, kplusw0, 8);
#else
    vaddudm (kplusw1, kplusw0, w);
    vsldoi  (kplusw0, kplusw1, kplusw1, 8);
#endif

    sha512_round(hs, total_hs, h_cnt, kplusw0);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  Register tmp = R8;
  li    (tmp, (w_size-16)/total_hs);
  mtctr (tmp);
  // j will be aligned to 4 for loading words.
  // Whenever read, advance the pointer (e.g: when j is used in a function)
  Register j = tmp;
  li    (j, 8*16);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/2; n++) {
    sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k);
    sha512_round(hs, total_hs, h_cnt, kplusw0);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  bdnz (core_loop);

  sha512_update_sha_state(state, hs, total_hs);

  if (multi_block) {
    // process next 1024 bit block (buf_in already updated)
    addi(ofs, ofs, buf_size);
    cmpd(CCR0, ofs, limit);
    blt(CCR0, sha_loop);

    // return ofs
    mr(R3_ARG1, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li  (idx, (c - (nv_size)) * 16);
    lvx(nv[c], idx, R1);
  }
}