// Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_ppc.hpp"

/**********************************************************************
 * SHA 256
 *********************************************************************/

void MacroAssembler::sha256_deque(const VectorRegister src,
                                  const VectorRegister dst1,
                                  const VectorRegister dst2,
                                  const VectorRegister dst3) {
  vsldoi (dst1, src, src, 12);
  vsldoi (dst2, src, src, 8);
  vsldoi (dst3, src, src, 4);
}

void MacroAssembler::sha256_round(const VectorRegister* hs,
                                  const int total_hs,
                                  int& h_cnt,
                                  const VectorRegister kpw) {
  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  VectorRegister ch  = VR0;
  VectorRegister maj = VR1;
  VectorRegister bsa = VR2;
  VectorRegister bse = VR3;
  VectorRegister vt0 = VR4;
  VectorRegister vt1 = VR5;
  VectorRegister vt2 = VR6;
  VectorRegister vt3 = VR7;

  vsel       (ch, g, f, e);
  vxor       (maj, a, b);
  vshasigmaw (bse, e, 1, 0xf);
  vadduwm    (vt2, ch, kpw);
  vadduwm    (vt1, h, bse);
  vsel       (maj, b, c, maj);
  vadduwm    (vt3, vt1, vt2);
  vshasigmaw (bsa, a, 1, 0);
  vadduwm    (vt0, bsa, maj);

  vadduwm    (d, d, vt3);
  vadduwm    (h, vt3, vt0);

  // advance vector pointer to the next iteration
  h_cnt++;
}

void MacroAssembler::sha256_load_h_vec(const VectorRegister a,
                                       const VectorRegister e,
                                       const Register hptr) {
  // temporaries
  Register tmp = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vRb = VR6;
  // labels
  Label sha256_aligned, sha256_load_end;

  andi_ (tmp, hptr, 0xf);
  addi  (tmp, hptr, 16);
  beq   (CCR0, sha256_aligned);

  // handle unaligned accesses
  lvx  (a, hptr);
  lvsr (vRb, hptr);

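  // Unaligned-load idiom: lvx ignores the low four address bits and
  // returns the enclosing aligned quadword, so the 16 unaligned bytes
  // are reassembled from two adjacent aligned loads with vperm, steered
  // by the lvsr control vector derived from hptr's low nibble. For
  // example, with hptr ending in 0x8, the two loads cover
  // [hptr-8, hptr+8) and [hptr+8, hptr+24), and vperm extracts
  // bytes 8..23.
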
  lvx   (e, tmp);
  addi  (tmp, tmp, 16);
  vperm (a, e, a, vRb);

  lvx   (vt0, tmp);
  vperm (e, vt0, e, vRb);
  b     (sha256_load_end);

  // aligned accesses
  bind(sha256_aligned);
  lvx  (a, hptr);
  addi (tmp, hptr, 16);
  lvx  (e, tmp);

  bind(sha256_load_end);
}

void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in,
                                              const VectorRegister* ws,
                                              const int total_ws,
                                              const Register k,
                                              const VectorRegister* kpws,
                                              const int total_kpws) {
  Label w_aligned, after_w_load;

  Register tmp = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vt1 = VR1;
  VectorRegister vRb = VR6;

  andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, w_aligned); // address ends with 0x0, not 0x8

  // deal with unaligned addresses
  lvx  (ws[0], buf_in);
  addi (buf_in, buf_in, 16);
  lvsl (vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur = ws[n];
    VectorRegister w_prev = ws[n-1];

    lvx  (w_cur, buf_in);
    addi (buf_in, buf_in, 16);
    vperm(w_prev, w_cur, w_prev, vRb);
  }

  lvx   (vt0, buf_in);
  vperm (ws[total_ws-1], vt0, ws[total_ws-1], vRb);

  b (after_w_load);

  bind(w_aligned);

  // deal with aligned addresses
  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    lvx  (w, buf_in);
    addi (buf_in, buf_in, 16);
  }

  bind(after_w_load);

  // Byte swapping on little endian
  li       (tmp, 8);
  lvsl     (vt0, tmp);
  vspltisb (vt1, 0xb);
  vxor     (vt1, vt0, vt1);
  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];
    vperm (w, w, w, vt1);
  }

  // Loading k, which is always aligned to 16 bytes
  lvx  (kpws[0], k);
  addi (tmp, k, 16);
  for (int n = 1; n < total_kpws-1; n++) {
    VectorRegister kpw = kpws[n];

    lvx  (kpw, tmp);
    addi (tmp, tmp, 16);
  }
  lvx (kpws[total_kpws-1], tmp);

  // Add w to K
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_kpws; n++) {
    VectorRegister kpw = kpws[n];
    VectorRegister w = ws[n];

    vadduwm (kpw, kpw, w);
  }
}

void MacroAssembler::sha256_calc_4w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const VectorRegister kpw2,
                                    const VectorRegister kpw3,
                                    const Register j,
                                    const Register k) {
  // Temporaries
  const VectorRegister vt0 = VR0;
  const VectorRegister vt1 = VR1;
  const VectorSRegister vsrt1 = vt1->to_vsr();
  const VectorRegister vt2 = VR2;
  const VectorRegister vt3 = VR3;
  const VectorSRegister vst3 = vt3->to_vsr();
  const VectorRegister vt4 = VR4;

  // load to k[j]
  lvx (vt0, j, k);

  // advance j
  addi (j, j, 16); // 16 bytes were read

  // b = w[j-15], w[j-14], w[j-13], w[j-12]
  vsldoi (vt1, w1, w0, 12);

  // c = w[j-7], w[j-6], w[j-5], w[j-4]
  vsldoi (vt2, w3, w2, 12);

  // d = w[j-2], w[j-1], w[j-4], w[j-3]
  vsldoi (vt3, w3, w3, 8);

  // b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
  vshasigmaw (vt1, vt1, 0, 0);

  // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
  vshasigmaw (vt3, vt3, 0, 0xf);

  // c = s0(w[j-15]) + w[j-7],
  //     s0(w[j-14]) + w[j-6],
  //     s0(w[j-13]) + w[j-5],
  //     s0(w[j-12]) + w[j-4]
  vadduwm (vt2, vt1, vt2);

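  // Scalar recurrence being vectorized here (FIPS 180-4, sketch only):
  //   s0(x) = ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)
  //   s1(x) = ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)
  //   w[j]  = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16]
  // Four w values are produced per call. Lanes 2 and 3 depend on the
  // w[j] and w[j+1] computed in lanes 0 and 1, so they are patched up
  // below with a second vshasigmaw plus xxmrgld.
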
  // c = s0(w[j-15]) + w[j-7] + w[j-16],
  //     s0(w[j-14]) + w[j-6] + w[j-15],
  //     s0(w[j-13]) + w[j-5] + w[j-14],
  //     s0(w[j-12]) + w[j-4] + w[j-13]
  vadduwm (vt2, vt2, w0);

  // e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])  // UNDEFINED
  vadduwm (vt4, vt2, vt3);

  // At this point, e[0] and e[1] are the correct values to be stored at w[j]
  // and w[j+1].
  // e[2] and e[3] are not considered.
  // b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
  vshasigmaw (vt1, vt4, 0, 0xf);

  // d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
  xxmrgld (vst3, vsrt1, vst3);

  // c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),   // w[j+2]
  //     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])  // w[j+3]
  vadduwm (vt2, vt2, vt3);

  // Updating w0 to w3 to hold the new previous 16 values from w.
  vmr (w0, w1);
  vmr (w1, w2);
  vmr (w2, w3);
  vmr (w3, vt2);

  // store k + w to kpw0 (4 values at once)
  vadduwm (kpw0, vt2, vt0);

  vsldoi (kpw1, kpw0, kpw0, 12);
  vsldoi (kpw2, kpw0, kpw0, 8);
  vsldoi (kpw3, kpw0, kpw0, 4);
}

void MacroAssembler::sha256_update_sha_state(const VectorRegister a,
                                             const VectorRegister b_,
                                             const VectorRegister c,
                                             const VectorRegister d,
                                             const VectorRegister e,
                                             const VectorRegister f,
                                             const VectorRegister g,
                                             const VectorRegister h,
                                             const Register hptr) {
  // temporaries
  VectorRegister vt0  = VR0;
  VectorRegister vt1  = VR1;
  VectorRegister vt2  = VR2;
  VectorRegister vt3  = VR3;
  VectorRegister vt4  = VR4;
  VectorRegister vt5  = VR5;
  VectorRegister vaux = VR6;
  VectorRegister vRb  = VR6;
  Register tmp  = R8;
  Register of16 = R8;
  Register of32 = R9;
  Label state_load_aligned, after_state_load_aligned;

  // Load hptr
  andi_ (tmp, hptr, 0xf);
  li    (of16, 16);
  beq   (CCR0, state_load_aligned);

  // handle unaligned accesses
  li   (of32, 32);
  lvx  (vt0, hptr);
  lvsr (vRb, hptr);

  lvx   (vt5, hptr, of16);
  vperm (vt0, vt5, vt0, vRb); // vt0 = hptr[0]..hptr[3]

  lvx   (vt1, hptr, of32);
  vperm (vt5, vt1, vt5, vRb); // vt5 = hptr[4]..hptr[7]
  b     (after_state_load_aligned);

  // aligned accesses
  bind(state_load_aligned);
  lvx (vt0, hptr);
  lvx (vt5, of16, hptr);

  bind(after_state_load_aligned);

  vmrglw  (vt1, b_, a); // vt1 = {a, b, ?, ?}
  vmrglw  (vt2, d, c);  // vt2 = {c, d, ?, ?}
  vmrglw  (vt3, f, e);  // vt3 = {e, f, ?, ?}
  vmrglw  (vt4, h, g);  // vt4 = {g, h, ?, ?}
  xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d}
  xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h}
  vadduwm (a, vt0, vt1); // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
  vadduwm (e, vt5, vt3); // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}

  // Save hptr back, works for any alignment
  xxswapd (vt0->to_vsr(), a->to_vsr());
  stxvd2x (vt0->to_vsr(), hptr);
  xxswapd (vt5->to_vsr(), e->to_vsr());
  stxvd2x (vt5->to_vsr(), of16, hptr);
}

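// For reference, the scalar SHA-256 round that sha256_round implements
// with vsel/vshasigmaw (FIPS 180-4, sketch only):
//   T1 = h + S1(e) + Ch(e, f, g) + k[t] + w[t]
//   T2 = S0(a) + Maj(a, b, c)
//   h = g; g = f; f = e; e = d + T1;
//   d = c; c = b; b = a; a = T1 + T2;
// with S0(x) = ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22),
//      S1(x) = ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25),
//      Ch(x, y, z) = (x & y) ^ (~x & z),
//      Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z).
// The final shuffle is not performed explicitly: sha256_round renames
// a-h through the hs[]/h_cnt rotation instead of moving registers.
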
// R3_ARG1 - byte[] Input string with padding but in Big Endian
// R4_ARG2 - int[] SHA.state (initially, fractional bits of the square
//           roots of the first eight primes)
// R5_ARG3 - int offset
// R6_ARG4 - int limit
//
// Internal Register usage:
// R7 - k
// R8 - tmp | j | of16
// R9 - of32
// VR0-VR7 - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb
// VR9-VR16 - a-h
// VR17-VR20 - w0-w3
// VR21-VR23 - vRb | vaux0-vaux2
// VR24-VR27 - kpw0-kpw3
void MacroAssembler::sha256(bool multi_block) {
  static const ssize_t base_size = sizeof(uint32_t);
  static const ssize_t buf_size = 64;
  static uint32_t waux[buf_size / base_size] __attribute__((aligned(16)));
  static const uint32_t round_consts[64] __attribute__((aligned(16))) = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
  };
  static const uint8_t w_size = sizeof(round_consts)/sizeof(uint32_t);

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, bsw_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof(VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li  (tmp, (c - nv_size) * 16);
    stvx(nv[c], tmp, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR9;
  VectorRegister b = VR10;
  VectorRegister c = VR11;
  VectorRegister d = VR12;
  VectorRegister e = VR13;
  VectorRegister f = VR14;
  VectorRegister g = VR15;
  VectorRegister h = VR16;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed by state
  sha256_load_h_vec(a, e, state);

  // keep k loaded also during MultiBlock loops
  Register k = R7;
  load_const(k, const_cast<uint32_t *>(round_consts));

  // Avoiding redundant loads
  bind(sha_loop);
  sha256_deque(a, b, c, d);
  sha256_deque(e, f, g, h);

  align(OptoLoopAlignment);

  // Load 16 elements from w out of the loop
  VectorRegister w0 = VR17;
  VectorRegister w1 = VR18;
  VectorRegister w2 = VR19;
  VectorRegister w3 = VR20;
  static const VectorRegister ws[] = {w0, w1, w2, w3};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  VectorRegister kpw0 = VR24;
  VectorRegister kpw1 = VR25;
  VectorRegister kpw2 = VR26;
  VectorRegister kpw3 = VR27;
  static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3};
  static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister);

  sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws);

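  // Each kpws[n] now holds {k+w}[4n .. 4n+3]. In the loop below,
  // sha256_deque produces the three byte-rotations of that vector so
  // that each of the four rounds sees its k[j]+w[j] term in the lane
  // the rotating a-h state currently occupies; the state itself is
  // renamed, not moved, via hs[]/h_cnt (see sha256_round).
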
  // Cycle through the first 16 elements
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_ws; n++) {
    VectorRegister vaux0 = VR21;
    VectorRegister vaux1 = VR22;
    VectorRegister vaux2 = VR23;

    sha256_deque(kpws[n], vaux0, vaux1, vaux2);

    sha256_round(hs, total_hs, h_cnt, kpws[n]);
    sha256_round(hs, total_hs, h_cnt, vaux0);
    sha256_round(hs, total_hs, h_cnt, vaux1);
    sha256_round(hs, total_hs, h_cnt, vaux2);
  }

  Register tmp = R8;
  // loop from the 16th to the 64th round, eight rounds per iteration
  li   (tmp, (w_size - 16) / total_hs);
  mtctr(tmp);

  // j is a byte offset into the k[] table; whenever it is read, it is also
  // advanced (e.g. when j is used in a function)
  Register j = R8;
  li (j, 16*4);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/4; n++) {
    sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k);
    sha256_round(hs, total_hs, h_cnt, kpw0);
    sha256_round(hs, total_hs, h_cnt, kpw1);
    sha256_round(hs, total_hs, h_cnt, kpw2);
    sha256_round(hs, total_hs, h_cnt, kpw3);
  }

  bdnz (core_loop);

  // Update hash state
  sha256_update_sha_state(a, b, c, d, e, f, g, h, state);

  if (multi_block) {
    // process next 512 bit block (buf_in already updated)
    addi(ofs, ofs, buf_size);
    cmpd(CCR0, ofs, limit);
    blt (CCR0, sha_loop);

    // return ofs
    mr(R3_ARG1, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li (tmp, (c - nv_size) * 16);
    lvx(nv[c], tmp, R1);
  }
}

/**********************************************************************
 * SHA 512
 *********************************************************************/

void MacroAssembler::sha512_load_w_vec(const Register buf_in,
                                       const VectorRegister* ws,
                                       const int total_ws) {
  Register tmp = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;
  Label is_aligned, after_alignment;

  andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, is_aligned); // address ends with 0x0, not 0x8

  // deal with unaligned addresses
  lvx  (ws[0], buf_in);
  addi (buf_in, buf_in, 16);
  lvsl (vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur = ws[n];
    VectorRegister w_prev = ws[n-1];

    lvx  (w_cur, buf_in);
    addi (buf_in, buf_in, 16);
    vperm(w_prev, w_cur, w_prev, vRb);
  }

  lvx   (aux, buf_in);
  vperm (ws[total_ws-1], aux, ws[total_ws-1], vRb);

  b (after_alignment);

  bind(is_aligned);

  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    lvx  (w, buf_in);
    addi (buf_in, buf_in, 16);
  }

  bind(after_alignment);
}

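// SHA-512 packs two 64-bit state words into each 128-bit vector
// register, so the state helpers below operate on the pairs {a,b},
// {c,d}, {e,f} and {g,h} held in hs[0], hs[2], hs[4] and hs[6];
// xxmrgld re-pairs the even/odd registers before the initial hash is
// added back in sha512_update_sha_state.
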
// Update hash state
void MacroAssembler::sha512_update_sha_state(const Register state,
                                             const VectorRegister* hs,
                                             const int total_hs) {

  // load initial hash from the memory pointed by state
  VectorRegister ini_a = VR10;
  VectorRegister ini_c = VR12;
  VectorRegister ini_e = VR14;
  VectorRegister ini_g = VR16;
  static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g};
  static const int total_inis = sizeof(inis)/sizeof(VectorRegister);

  Label state_save_aligned, after_state_save_aligned;

  Register addr = R7;
  Register tmp  = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;

  andi_(tmp, state, 0xf);
  beq  (CCR0, state_save_aligned);

  // deal with unaligned addresses
  {
    VectorRegister a  = hs[0];
    VectorRegister b_ = hs[1];
    VectorRegister c  = hs[2];
    VectorRegister d  = hs[3];
    VectorRegister e  = hs[4];
    VectorRegister f  = hs[5];
    VectorRegister g  = hs[6];
    VectorRegister h  = hs[7];
    lvsr (vRb, state);
    lvx  (ini_a, state);
    addi (addr, state, 16);

    lvx   (ini_c, addr);
    addi  (addr, addr, 16);
    vperm (ini_a, ini_c, ini_a, vRb);

    lvx   (ini_e, addr);
    addi  (addr, addr, 16);
    vperm (ini_c, ini_e, ini_c, vRb);

    lvx   (ini_g, addr);
    addi  (addr, addr, 16);
    vperm (ini_e, ini_g, ini_e, vRb);

    lvx   (aux, addr);
    vperm (ini_g, aux, ini_g, vRb);

    xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr());
    xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr());
    xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr());
    xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr());

    for (int n = 0; n < total_hs; n += 2) {
      VectorRegister h_cur   = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    for (int n = 0; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      mfvrd  (tmp, h_cur);
      std    (tmp, 8*n + 8, state);
      vsldoi (aux, h_cur, h_cur, 8);
      mfvrd  (tmp, aux);
      std    (tmp, 8*n + 0, state);
    }

    b (after_state_save_aligned);
  }

  bind(state_save_aligned);

  {
    mr(addr, state);
    for (int n = 0; n < total_hs; n += 2) {
      VectorRegister h_cur   = hs[n];
      VectorRegister h_next  = hs[n+1];
      VectorRegister ini_cur = inis[n/2];

      lvx(ini_cur, addr);
      addi(addr, addr, 16);
      xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr());
    }

    for (int n = 0; n < total_hs; n += 2) {
      VectorRegister h_cur   = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    mr(addr, state);
    for (int n = 0; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      stvx(h_cur, addr);
      addi(addr, addr, 16);
    }
  }

  bind(after_state_save_aligned);
}

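// Worked example of the hs[]/h_cnt renaming used by the *_round helpers:
// with total_hs == 8 and h_cnt == 0, a..h map to hs[0]..hs[7]. After one
// round (h_cnt == 1) the mapping rotates: a = hs[7], b = hs[0], ...,
// h = hs[6]. The index expression
//   hs[(total_hs + i - (h_cnt % total_hs)) % total_hs]
// therefore implements "h = g; g = f; ...;" by renaming instead of
// issuing register moves.
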
// Use h_cnt to cycle through hs elements but also increment it at the end
void MacroAssembler::sha512_round(const VectorRegister* hs,
                                  const int total_hs, int& h_cnt,
                                  const VectorRegister kpw) {

  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  const VectorRegister Ch   = VR20;
  const VectorRegister Maj  = VR21;
  const VectorRegister bsa  = VR22;
  const VectorRegister bse  = VR23;
  const VectorRegister tmp1 = VR24;
  const VectorRegister tmp2 = VR25;

  vsel       (Ch, g, f, e);
  vxor       (Maj, a, b);
  vshasigmad (bse, e, 1, 0xf);
  vaddudm    (tmp2, Ch, kpw);
  vaddudm    (tmp1, h, bse);
  vsel       (Maj, b, c, Maj);
  vaddudm    (tmp1, tmp1, tmp2);
  vshasigmad (bsa, a, 1, 0);
  vaddudm    (tmp2, bsa, Maj);
  vaddudm    (d, d, tmp1);
  vaddudm    (h, tmp1, tmp2);

  // advance vector pointer to the next iteration
  h_cnt++;
}

void MacroAssembler::sha512_calc_2w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister w4,
                                    const VectorRegister w5,
                                    const VectorRegister w6,
                                    const VectorRegister w7,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const Register j,
                                    const VectorRegister vRb,
                                    const Register k) {
  // Temporaries
  const VectorRegister VR_a = VR20;
  const VectorRegister VR_b = VR21;
  const VectorRegister VR_c = VR22;
  const VectorRegister VR_d = VR23;

  // load to k[j]
  lvx        (VR_a, j, k);
  // advance j
  addi       (j, j, 16); // 16 bytes were read
  // VR_b = w[j-15], w[j-14]
  vperm      (VR_b, w1, w0, vRb);
  // VR_c = w[j-7], w[j-6]
  vperm      (VR_c, w5, w4, vRb);
  // VR_b = s0(w[j-15]) , s0(w[j-14])
  vshasigmad (VR_b, VR_b, 0, 0);
  // VR_d = s1(w[j-2]) , s1(w[j-1])
  vshasigmad (VR_d, w7, 0, 0xf);
  // VR_b = s0(w[j-15]) + w[j-7] , s0(w[j-14]) + w[j-6]
  vaddudm    (VR_b, VR_b, VR_c);
  // VR_d = s1(w[j-2]) + w[j-16] , s1(w[j-1]) + w[j-15]
  vaddudm    (VR_d, VR_d, w0);
  // VR_c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //        s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1])  // w[j+1]
  vaddudm    (VR_c, VR_d, VR_b);
  // Updating w0 to w7 to hold the new previous 16 values from w.
  vmr (w0, w1);
  vmr (w1, w2);
  vmr (w2, w3);
  vmr (w3, w4);
  vmr (w4, w5);
  vmr (w5, w6);
  vmr (w6, w7);
  vmr (w7, VR_c);
  // store k + w to kpw0 (2 values at once)
  vaddudm (kpw0, VR_c, VR_a);
  // kpw1 holds (k + w)[1]
  vsldoi  (kpw1, kpw0, kpw0, 8);
}

void MacroAssembler::sha512_load_h_vec(const Register state,
                                       const VectorRegister* hs,
                                       const int total_hs) {
  VectorRegister a = hs[0];
  VectorRegister g = hs[6];

  Register addr = R7;
  VectorRegister vRb = VR8;
  Register tmp = R8;
  Label state_aligned, after_state_aligned;

  andi_(tmp, state, 0xf);
  beq  (CCR0, state_aligned);

  // deal with unaligned addresses
  VectorRegister aux = VR9;

  lvx  (a, state);
  addi (addr, state, 16);
  lvsl (vRb, addr);

  for (int n = 2; n < total_hs; n += 2) {
    VectorRegister h_cur   = hs[n];
    VectorRegister h_prev2 = hs[n - 2];

    lvx  (h_cur, addr);
    addi (addr, addr, 16);
    vperm(h_prev2, h_cur, h_prev2, vRb);
  }
  lvx   (aux, addr);
  vperm (g, aux, g, vRb);

  b (after_state_aligned);

  bind(state_aligned);

  // deal with aligned addresses
  mr(addr, state);
  for (int n = 0; n < total_hs; n += 2) {
    VectorRegister h_cur = hs[n];

    lvx  (h_cur, addr);
    addi (addr, addr, 16);
  }

  bind(after_state_aligned);
}

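// SHA-512 message schedule computed by sha512_calc_2w (FIPS 180-4,
// sketch only), two w values per call:
//   s0(x) = ROTR(x, 1) ^ ROTR(x, 8) ^ (x >> 7)
//   s1(x) = ROTR(x, 19) ^ ROTR(x, 61) ^ (x >> 6)
//   w[j]  = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16]
// Because each vector register holds two 64-bit w values, the pairs
// {w[j-15], w[j-14]} and {w[j-7], w[j-6]} straddle register boundaries
// and are gathered with vperm using the vRb control vector (lvsl with
// an offset of 8, set up in sha512()).
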
// R3_ARG1 - byte[] Input string with padding but in Big Endian
// R4_ARG2 - long[] SHA.state (initially, fractional bits of the square
//           roots of the first eight primes)
// R5_ARG3 - int offset
// R6_ARG4 - int limit
//
// Internal Register usage:
// R7 R8 R9 - volatile temporaries
// VR0-VR7 - a-h
// VR8 - vRb
// VR9 - aux (highly volatile, use with care)
// VR10-VR17 - w0-w7 | ini_a-ini_h
// VR18 - vsp16 | kplusw0
// VR19 - vsp32 | kplusw1
// VR20-VR25 - sha512_calc_2w and sha512_round temporaries
void MacroAssembler::sha512(bool multi_block) {
  static const ssize_t base_size = sizeof(uint64_t);
  static const ssize_t buf_size = 128;
  static uint64_t waux[buf_size / base_size] __attribute__((aligned(16)));
  static const uint64_t round_consts[80] __attribute__((aligned(16))) = {
    0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f,
    0xe9b5dba58189dbbc, 0x3956c25bf348b538, 0x59f111f1b605d019,
    0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 0xd807aa98a3030242,
    0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
    0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235,
    0xc19bf174cf692694, 0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
    0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 0x2de92c6f592b0275,
    0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
    0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f,
    0xbf597fc7beef0ee4, 0xc6e00bf33da88fc2, 0xd5a79147930aa725,
    0x06ca6351e003826f, 0x142929670a0e6e70, 0x27b70a8546d22ffc,
    0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
    0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6,
    0x92722c851482353b, 0xa2bfe8a14cf10364, 0xa81a664bbc423001,
    0xc24b8b70d0f89791, 0xc76c51a30654be30, 0xd192e819d6ef5218,
    0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
    0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99,
    0x34b0bcb5e19b48a8, 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
    0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, 0x748f82ee5defb2fc,
    0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
    0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915,
    0xc67178f2e372532b, 0xca273eceea26619c, 0xd186b8c721c0c207,
    0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 0x06f067aa72176fba,
    0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
    0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc,
    0x431d67c49c100d4c, 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
    0x5fcb6fab3ad6faec, 0x6c44198c4a475817
  };
  static const uint8_t w_size = sizeof(round_consts)/sizeof(uint64_t);

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, bsw_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof(VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li  (idx, (c - nv_size) * 16);
    stvx(nv[c], idx, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR0;
  VectorRegister b = VR1;
  VectorRegister c = VR2;
  VectorRegister d = VR3;
  VectorRegister e = VR4;
  VectorRegister f = VR5;
  VectorRegister g = VR6;
  VectorRegister h = VR7;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed by state
  sha512_load_h_vec(state, hs, total_hs);

  align(OptoLoopAlignment);
  bind(sha_loop);

  for (int n = 0; n < total_hs; n += 2) {
    VectorRegister h_cur  = hs[n];
    VectorRegister h_next = hs[n + 1];

    vsldoi (h_next, h_cur, h_cur, 8);
  }

  Register k = R9;
  load_const(k, const_cast<uint64_t *>(round_consts));

  // Load 16 elements from w out of the loop
  VectorRegister w0 = VR10;
  VectorRegister w1 = VR11;
  VectorRegister w2 = VR12;
  VectorRegister w3 = VR13;
  VectorRegister w4 = VR14;
  VectorRegister w5 = VR15;
  VectorRegister w6 = VR16;
  VectorRegister w7 = VR17;
  static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  // Load 16 w into vectors; unaligned input is handled inside via lvsl/vperm
  sha512_load_w_vec(buf_in, ws, total_ws);

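  // Big-endian input is byte-reversed with three rotate stages: vrlh by
  // 8 swaps the bytes within each halfword, vrlw by 16 swaps the
  // halfwords within each word, and vrld by 32 swaps the words within
  // each doubleword, composing a full 8-byte byte reversal per lane.
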
  VectorRegister vsp16 = VR18;
  VectorRegister vsp32 = VR19;
  VectorRegister shiftarg = VR9;

  vspltisw(vsp16, 8);
  vspltisw(shiftarg, 1);
  vsl (vsp16, vsp16, shiftarg);
  vsl (vsp32, vsp16, shiftarg);

  VectorRegister vsp8 = VR9;
  vspltish(vsp8, 8);

  // Convert input from Big Endian to Little Endian
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlh (w, w, vsp8);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlw (w, w, vsp16);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrld (w, w, vsp32);
  }

  Register Rb = R10;
  VectorRegister vRb = VR8;
  li   (Rb, 8);
  lvsl (vRb, Rb);

  VectorRegister kplusw0 = VR18;
  VectorRegister kplusw1 = VR19;

  Register addr = R7;
  mr (addr, k);

  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    lvx  (kplusw0, addr);
    addi (addr, addr, 16);
    vaddudm(kplusw0, kplusw0, w);

    sha512_round(hs, total_hs, h_cnt, kplusw0);
    vsldoi (kplusw1, kplusw0, kplusw0, 8);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  Register tmp = R8;
  li    (tmp, (w_size-16)/total_hs);
  mtctr (tmp);
  // j is a byte offset into the k[] table; whenever it is read, it is also
  // advanced (e.g. when j is used in a function)
  Register j = tmp;
  li (j, 8*16);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/2; n++) {
    sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k);
    sha512_round(hs, total_hs, h_cnt, kplusw0);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  bdnz (core_loop);

  sha512_update_sha_state(state, hs, total_hs);

  if (multi_block) {
    // process next 1024 bit block (buf_in already updated)
    addi(ofs, ofs, buf_size);
    cmpd(CCR0, ofs, limit);
    blt (CCR0, sha_loop);

    // return ofs
    mr(R3_ARG1, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li (idx, (c - nv_size) * 16);
    lvx(nv[c], idx, R1);
  }
}