// Copyright (c) 2017 Instituto de Pesquisas Eldorado. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_ppc.hpp"

/**********************************************************************
 * SHA 256
 *********************************************************************/

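// Produce the three non-trivial byte rotations of src: dst1 = src rotated
// left by 12 bytes, dst2 = src rotated left by 8 bytes, dst3 = src rotated
// left by 4 bytes. This is used both to spread the packed SHA state words
// across separate registers and to expose each of the four (k + w) lanes
// packed in src to sha256_round.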
void MacroAssembler::sha256_deque(const VectorRegister src,
                                  const VectorRegister dst1,
                                  const VectorRegister dst2,
                                  const VectorRegister dst3) {
  vsldoi (dst1, src, src, 12);
  vsldoi (dst2, src, src, 8);
  vsldoi (dst3, src, src, 4);
}

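// For reference, one scalar SHA-256 round (FIPS 180-4) computes:
//
//   T1 = h + Sigma1(e) + Ch(e, f, g) + k[j] + w[j]
//   T2 = Sigma0(a) + Maj(a, b, c)
//   h = g; g = f; f = e; e = d + T1;
//   d = c; c = b; b = a; a = T1 + T2;
//
// where Ch(e, f, g) = (e & f) ^ (~e & g) and
// Maj(a, b, c) = (a & b) ^ (a & c) ^ (b & c).
//
// The vector code below evaluates the same expressions with vsel/vxor and
// the vshasigmaw instruction (kpw already holds k[j] + w[j]), and avoids
// the a..h shuffle by renaming registers through hs[]/h_cnt instead.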
void MacroAssembler::sha256_round(const VectorRegister* hs,
                                  const int total_hs,
                                  int& h_cnt,
                                  const VectorRegister kpw) {
  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  VectorRegister ch  = VR0;
  VectorRegister maj = VR1;
  VectorRegister bsa = VR2;
  VectorRegister bse = VR3;
  VectorRegister vt0 = VR4;
  VectorRegister vt1 = VR5;
  VectorRegister vt2 = VR6;
  VectorRegister vt3 = VR7;

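  // vshasigmaw(t, s, st, six) (Power ISA 2.07) applies the SHA-256 sigma
  // functions lane-wise: st = 1 selects the "big" Sigma variants, and each
  // bit of the 4-bit six mask picks Sigma1 (bit set) or Sigma0 (bit clear)
  // for the corresponding word. Hence the two uses below compute Sigma1(e)
  // and Sigma0(a) in all four lanes.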
  vsel       (ch,  g, f, e);
  vxor       (maj, a, b);
  vshasigmaw (bse, e, 1, 0xf);
  vadduwm    (vt2, ch, kpw);
  vadduwm    (vt1, h, bse);
  vsel       (maj, b, c, maj);
  vadduwm    (vt3, vt1, vt2);
  vshasigmaw (bsa, a, 1, 0);
  vadduwm    (vt0, bsa, maj);

  vadduwm    (d, d, vt3);
  vadduwm    (h, vt3, vt0);

  // advance vector pointer to the next iteration
  h_cnt++;
}

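// Load the 8-word SHA-256 state from hptr into two vectors: a = {a, b, c, d}
// and e = {e, f, g, h}. lvx ignores the low four address bits, so when hptr
// is not 16-byte aligned three quadwords are read and lvsr + vperm splice
// the desired 32 bytes out of them.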
void MacroAssembler::sha256_load_h_vec(const VectorRegister a,
                                       const VectorRegister e,
                                       const Register hptr) {
  // temporaries
  Register tmp = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vRb = VR6;
  // labels
  Label sha256_aligned, sha256_load_end;

  andi_ (tmp, hptr, 0xf);
  addi  (tmp, hptr, 16);
  beq   (CCR0, sha256_aligned);

  // handle unaligned accesses
  lvx   (a, hptr);
  lvsr  (vRb, hptr);

  lvx   (e, tmp);
  addi  (tmp, tmp, 16);
  vperm (a, e, a, vRb);

  lvx   (vt0, tmp);
  vperm (e, vt0, e, vRb);
  b     (sha256_load_end);

  // aligned accesses
  bind(sha256_aligned);
  lvx  (a, hptr);
  addi (tmp, hptr, 16);
  lvx  (e, tmp);

  bind(sha256_load_end);
}

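// Load one 64-byte message block from buf_in into ws[0..total_ws-1]
// (handling unaligned input), byte-swap the words (the input is big-endian),
// load the first 16 round constants from k, and leave
// kpws[n] = k[n] + w[n] ready for the first 16 rounds.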
void MacroAssembler::sha256_load_w_plus_k_vec(const Register buf_in,
                                              const VectorRegister* ws,
                                              const int total_ws,
                                              const Register k,
                                              const VectorRegister* kpws,
                                              const int total_kpws) {
  Label w_aligned, after_w_load;

  Register tmp = R8;
  VectorRegister vt0 = VR0;
  VectorRegister vt1 = VR1;
  VectorRegister vRb = VR6;

  andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, w_aligned); // address ends with 0x0, not 0x8

  // deal with unaligned addresses
  lvx  (ws[0], buf_in);
  addi (buf_in, buf_in, 16);
  lvsl (vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur  = ws[n];
    VectorRegister w_prev = ws[n-1];

    lvx  (w_cur, buf_in);
    addi (buf_in, buf_in, 16);
    vperm(w_prev, w_cur, w_prev, vRb);
  }

  lvx   (vt0, buf_in);
  vperm (ws[total_ws-1], vt0, ws[total_ws-1], vRb);

  b (after_w_load);

  bind(w_aligned);

  // deal with aligned addresses
  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    lvx  (w, buf_in);
    addi (buf_in, buf_in, 16);
  }

  bind(after_w_load);

  // Byte swapping on little endian
  li       (tmp, 8);
  lvsl     (vt0, tmp);
  vspltisb (vt1, 0xb);
  vxor     (vt1, vt0, vt1);
  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];
    vperm (w, w, w, vt1);
  }

  // Loading k, which is always aligned to 16 bytes
  lvx  (kpws[0], k);
  addi (tmp, k, 16);
  for (int n = 1; n < total_kpws-1; n++) {
    VectorRegister kpw = kpws[n];

    lvx  (kpw, tmp);
    addi (tmp, tmp, 16);
  }
  lvx (kpws[total_kpws-1], tmp);

  // Add w to K
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_kpws; n++) {
    VectorRegister kpw = kpws[n];
    VectorRegister w   = ws[n];

    vadduwm (kpw, kpw, w);
  }
}

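// For reference, the SHA-256 message schedule extends the 16 message words
// with
//
//   w[j] = sigma1(w[j-2]) + w[j-7] + sigma0(w[j-15]) + w[j-16]
//
// Because w[j+2] and w[j+3] depend on w[j] and w[j+1], a four-lane vector
// cannot be produced in one shot: the code below first computes all four
// lanes using sigma1 of stale values, then recomputes sigma1(w[j]) and
// sigma1(w[j+1]) and merges them in for the upper two lanes.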
void MacroAssembler::sha256_calc_4w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const VectorRegister kpw2,
                                    const VectorRegister kpw3,
                                    const Register j,
                                    const Register k) {
  // Temporaries
  const VectorRegister  vt0   = VR0;
  const VectorRegister  vt1   = VR1;
  const VectorSRegister vsrt1 = vt1->to_vsr();
  const VectorRegister  vt2   = VR2;
  const VectorRegister  vt3   = VR3;
  const VectorSRegister vsrt3 = vt3->to_vsr();
  const VectorRegister  vt4   = VR4;

  // load k[j] (4 round constants at once)
  lvx (vt0, j, k);

  // advance j
  addi (j, j, 16); // 16 bytes were read

  // vt1 = w[j-15], w[j-14], w[j-13], w[j-12]
  vsldoi (vt1, w1, w0, 12);

  // vt2 = w[j-7], w[j-6], w[j-5], w[j-4]
  vsldoi (vt2, w3, w2, 12);

  // vt3 = w[j-2], w[j-1], w[j-4], w[j-3]
  vsldoi (vt3, w3, w3, 8);

  // vt1 = s0(w[j-15]), s0(w[j-14]), s0(w[j-13]), s0(w[j-12])
  vshasigmaw (vt1, vt1, 0, 0);

  // vt3 = s1(w[j-2]), s1(w[j-1]), s1(w[j-4]), s1(w[j-3])
  vshasigmaw (vt3, vt3, 0, 0xf);

  // vt2 = s0(w[j-15]) + w[j-7],
  //       s0(w[j-14]) + w[j-6],
  //       s0(w[j-13]) + w[j-5],
  //       s0(w[j-12]) + w[j-4]
  vadduwm (vt2, vt1, vt2);

  // vt2 = s0(w[j-15]) + w[j-7] + w[j-16],
  //       s0(w[j-14]) + w[j-6] + w[j-15],
  //       s0(w[j-13]) + w[j-5] + w[j-14],
  //       s0(w[j-12]) + w[j-4] + w[j-13]
  vadduwm (vt2, vt2, w0);

  // vt4 = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //       s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //       s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]), // UNDEFINED
  //       s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])  // UNDEFINED
  vadduwm (vt4, vt2, vt3);

  // At this point, vt4[0] and vt4[1] are the correct values to be stored at
  // w[j] and w[j+1]; vt4[2] and vt4[3] are not considered.
  // vt1 = s1(w[j]), s1(w[j+1]), UNDEFINED, UNDEFINED
  vshasigmaw (vt1, vt4, 0, 0xf);

  // vt3 = s1(w[j-2]), s1(w[j-1]), s1(w[j]), s1(w[j+1])
  xxmrgld (vsrt3, vsrt1, vsrt3);

  // vt2 = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //       s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]), // w[j+1]
  //       s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),   // w[j+2]
  //       s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])  // w[j+3]
  vadduwm (vt2, vt2, vt3);

  // Updating w0 to w3 to hold the new previous 16 values from w.
  vmr (w0, w1);
  vmr (w1, w2);
  vmr (w2, w3);
  vmr (w3, vt2);

  // store k + w to kpw0 (4 values at once)
  vadduwm (kpw0, vt2, vt0);

  vsldoi (kpw1, kpw0, kpw0, 12);
  vsldoi (kpw2, kpw0, kpw0, 8);
  vsldoi (kpw3, kpw0, kpw0, 4);
}

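// Merge the rotated working variables back into two vectors ({a, b, c, d}
// and {e, f, g, h}), add the previous hash values loaded from hptr, and
// store the updated 8-word state. a and e receive the sums, matching the
// layout produced by sha256_load_h_vec.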
void MacroAssembler::sha256_update_sha_state(const VectorRegister a,
                                             const VectorRegister b_,
                                             const VectorRegister c,
                                             const VectorRegister d,
                                             const VectorRegister e,
                                             const VectorRegister f,
                                             const VectorRegister g,
                                             const VectorRegister h,
                                             const Register hptr) {
  // temporaries
  VectorRegister vt0  = VR0;
  VectorRegister vt1  = VR1;
  VectorRegister vt2  = VR2;
  VectorRegister vt3  = VR3;
  VectorRegister vt4  = VR4;
  VectorRegister vt5  = VR5;
  VectorRegister vaux = VR6;
  VectorRegister vRb  = VR6;
  Register tmp  = R8;
  Register of16 = R8;
  Register of32 = R9;
  Label state_load_aligned, after_state_load_aligned;

  // Load hptr
  andi_ (tmp, hptr, 0xf);
  li    (of16, 16);
  beq   (CCR0, state_load_aligned);

  // handle unaligned accesses
  li    (of32, 32);
  lvx   (vt0, hptr);
  lvsr  (vRb, hptr);

  lvx   (vt5, hptr, of16);
  vperm (vt0, vt5, vt0, vRb);   // vt0 = hptr[0]..hptr[3]

  lvx   (vt1, hptr, of32);
  vperm (vt5, vt1, vt5, vRb);   // vt5 = hptr[4]..hptr[7]
  b     (after_state_load_aligned);

  // aligned accesses
  bind(state_load_aligned);
  lvx (vt0, hptr);
  lvx (vt5, of16, hptr);

  bind(after_state_load_aligned);

  vmrglw  (vt1, b_, a);         // vt1 = {a, b, ?, ?}
  vmrglw  (vt2, d, c);          // vt2 = {c, d, ?, ?}
  vmrglw  (vt3, f, e);          // vt3 = {e, f, ?, ?}
  vmrglw  (vt4, h, g);          // vt4 = {g, h, ?, ?}
  xxmrgld (vt1->to_vsr(), vt2->to_vsr(), vt1->to_vsr()); // vt1 = {a, b, c, d}
  xxmrgld (vt3->to_vsr(), vt4->to_vsr(), vt3->to_vsr()); // vt3 = {e, f, g, h}
  vadduwm (a, vt0, vt1); // a = {a+hptr[0], b+hptr[1], c+hptr[2], d+hptr[3]}
  vadduwm (e, vt5, vt3); // e = {e+hptr[4], f+hptr[5], g+hptr[6], h+hptr[7]}

  // Store the state back to hptr; stxvd2x works for any alignment
  xxswapd (vt0->to_vsr(), a->to_vsr());
  stxvd2x (vt0->to_vsr(), hptr);
  xxswapd (vt5->to_vsr(), e->to_vsr());
  stxvd2x (vt5->to_vsr(), of16, hptr);
}


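// How a stub generator might wire this up (a sketch under the assumption of
// the usual HotSpot stub boilerplate; not the actual stub code):
//
//   address start = __ function_entry();
//   __ sha256(multi_block);  // consumes R3_ARG1..R6_ARG4 as documented below
//   __ blr();
//
// When multi_block is true, ofs/limit drive the block loop and the updated
// offset is left in R3 as the return value.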
// R3_ARG1 - byte[] input, already padded, in big-endian byte order
// R4_ARG2 - int[]  SHA.state (on first use, the SHA-256 IV: the fractional
//           parts of the square roots of the first eight primes)
// R5_ARG3 - int    offset
// R6_ARG4 - int    limit
//
// Internal register usage:
// R7        - k
// R8        - tmp | j | of16
// R9        - of32
// VR0-VR8   - ch, maj, bsa, bse, vt0-vt3 | vt0-vt5, vaux/vRb
// VR9-VR16  - a-h
// VR17-VR20 - w0-w3
// VR21-VR23 - vRb | vaux0-vaux2
// VR24-VR27 - kpw0-kpw3
void MacroAssembler::sha256(bool multi_block) {
  static const ssize_t base_size = sizeof(uint32_t);
  static const ssize_t buf_size = 64;
  static uint32_t waux[buf_size / base_size] __attribute((aligned (16)));
  static const uint32_t round_consts[64] __attribute((aligned (16))) = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
  };
  static const uint8_t w_size = sizeof(round_consts)/sizeof(uint32_t);

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, bsw_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25, VR26, VR27/*, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

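  // Note: the PPC64 ELF ABI reserves a 288-byte red zone below the stack
  // pointer, so the negative offsets from R1 used by the loop below (at most
  // nv_size * 16 = 128 bytes) stay within storage the callee may use without
  // allocating a frame.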
  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li  (tmp, (c - nv_size) * 16);
    stvx(nv[c], tmp, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR9;
  VectorRegister b = VR10;
  VectorRegister c = VR11;
  VectorRegister d = VR12;
  VectorRegister e = VR13;
  VectorRegister f = VR14;
  VectorRegister g = VR15;
  VectorRegister h = VR16;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between
  // iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed to by state
  sha256_load_h_vec(a, e, state);

  // keep k loaded also during multi-block loops
  Register k = R7;
  load_const(k, const_cast<uint32_t *>(round_consts));

  // Avoiding redundant loads
  bind(sha_loop);
  sha256_deque(a, b, c, d);
  sha256_deque(e, f, g, h);

  align(OptoLoopAlignment);

  // Load 16 elements from w out of the loop
  VectorRegister w0 = VR17;
  VectorRegister w1 = VR18;
  VectorRegister w2 = VR19;
  VectorRegister w3 = VR20;
  static const VectorRegister ws[] = {w0, w1, w2, w3};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  VectorRegister kpw0 = VR24;
  VectorRegister kpw1 = VR25;
  VectorRegister kpw2 = VR26;
  VectorRegister kpw3 = VR27;
  static const VectorRegister kpws[] = {kpw0, kpw1, kpw2, kpw3};
  static const int total_kpws = sizeof(kpws)/sizeof(VectorRegister);

  sha256_load_w_plus_k_vec(buf_in, ws, total_ws, k, kpws, total_kpws);

  // Cycle through the first 16 elements
  assert(total_ws == total_kpws, "Redesign the loop below");
  for (int n = 0; n < total_ws; n++) {
    VectorRegister vaux0 = VR21;
    VectorRegister vaux1 = VR22;
    VectorRegister vaux2 = VR23;

    sha256_deque(kpws[n], vaux0, vaux1, vaux2);

    sha256_round(hs, total_hs, h_cnt, kpws[n]);
    sha256_round(hs, total_hs, h_cnt, vaux0);
    sha256_round(hs, total_hs, h_cnt, vaux1);
    sha256_round(hs, total_hs, h_cnt, vaux2);
  }

  Register tmp = R8;
  // loop from the 16th to the 64th round, 8 rounds per counter step
  li   (tmp, (w_size - 16) / total_hs);
  mtctr(tmp);

  // j will be aligned to 4 for loading words.
  // Whenever read, advance the pointer (e.g. when j is used in a function)
  Register j = R8;
  li (j, 16*4);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/4; n++) {
    sha256_calc_4w(w0, w1, w2, w3, kpw0, kpw1, kpw2, kpw3, j, k);
    sha256_round(hs, total_hs, h_cnt, kpw0);
    sha256_round(hs, total_hs, h_cnt, kpw1);
    sha256_round(hs, total_hs, h_cnt, kpw2);
    sha256_round(hs, total_hs, h_cnt, kpw3);
  }

  bdnz (core_loop);

  // Update hash state
  sha256_update_sha_state(a, b, c, d, e, f, g, h, state);

  if (multi_block) {
    // process next 512-bit block (buf_in already updated)
    addi(ofs, ofs, buf_size);
    cmpd(CCR0, ofs, limit);
    blt (CCR0, sha_loop);

    // return ofs
    mr(R3_ARG1, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register tmp = R8;
    li (tmp, (c - nv_size) * 16);
    lvx(nv[c], tmp, R1);
  }
}

/**********************************************************************
 * SHA 512
 *********************************************************************/

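// Load one 128-byte message block from buf_in into ws[0..total_ws-1], using
// the same lvsl/vperm splicing as the SHA-256 path when buf_in is not
// 16-byte aligned.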
void MacroAssembler::sha512_load_w_vec(const Register buf_in,
                                       const VectorRegister* ws,
                                       const int total_ws) {
  Register tmp = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;
  Label is_aligned, after_alignment;

  andi_ (tmp, buf_in, 0xF);
  beq   (CCR0, is_aligned); // address ends with 0x0, not 0x8

  // deal with unaligned addresses
  lvx  (ws[0], buf_in);
  addi (buf_in, buf_in, 16);
  lvsl (vRb, buf_in);

  for (int n = 1; n < total_ws; n++) {
    VectorRegister w_cur  = ws[n];
    VectorRegister w_prev = ws[n-1];

    lvx  (w_cur, buf_in);
    addi (buf_in, buf_in, 16);
    vperm(w_prev, w_cur, w_prev, vRb);
  }

  lvx   (aux, buf_in);
  vperm (ws[total_ws-1], aux, ws[total_ws-1], vRb);

  b (after_alignment);

  bind(is_aligned);

  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    lvx  (w, buf_in);
    addi (buf_in, buf_in, 16);
  }

  bind(after_alignment);
}

// Update hash state: add the current working variables to the previous hash
// values pointed to by state and store the result back, handling both
// aligned and unaligned state pointers.
void MacroAssembler::sha512_update_sha_state(const Register state,
                                             const VectorRegister* hs,
                                             const int total_hs) {

  // load initial hash from the memory pointed to by state
  VectorRegister ini_a = VR10;
  VectorRegister ini_c = VR12;
  VectorRegister ini_e = VR14;
  VectorRegister ini_g = VR16;
  static const VectorRegister inis[] = {ini_a, ini_c, ini_e, ini_g};
  static const int total_inis = sizeof(inis)/sizeof(VectorRegister);

  Label state_save_aligned, after_state_save_aligned;

  Register addr = R7;
  Register tmp  = R8;
  VectorRegister vRb = VR8;
  VectorRegister aux = VR9;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_save_aligned);

  // deal with unaligned addresses
  {
    VectorRegister a  = hs[0];
    VectorRegister b_ = hs[1];
    VectorRegister c  = hs[2];
    VectorRegister d  = hs[3];
    VectorRegister e  = hs[4];
    VectorRegister f  = hs[5];
    VectorRegister g  = hs[6];
    VectorRegister h  = hs[7];
    lvsr (vRb, state);
    lvx  (ini_a, state);
    addi (addr, state, 16);

    lvx   (ini_c, addr);
    addi  (addr, addr, 16);
    vperm (ini_a, ini_c, ini_a, vRb);

    lvx   (ini_e, addr);
    addi  (addr, addr, 16);
    vperm (ini_c, ini_e, ini_c, vRb);

    lvx   (ini_g, addr);
    addi  (addr, addr, 16);
    vperm (ini_e, ini_g, ini_e, vRb);

    lvx   (aux, addr);
    vperm (ini_g, aux, ini_g, vRb);

    xxmrgld(a->to_vsr(), b_->to_vsr(), a->to_vsr());
    xxmrgld(c->to_vsr(), d->to_vsr(), c->to_vsr());
    xxmrgld(e->to_vsr(), f->to_vsr(), e->to_vsr());
    xxmrgld(g->to_vsr(), h->to_vsr(), g->to_vsr());

    for (int n = 0; n < total_hs; n += 2) {
      VectorRegister h_cur   = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    for (int n = 0; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      mfvrd  (tmp, h_cur);
      std    (tmp, 8*n + 8, state);
      vsldoi (aux, h_cur, h_cur, 8);
      mfvrd  (tmp, aux);
      std    (tmp, 8*n + 0, state);
    }

    b (after_state_save_aligned);
  }

  bind(state_save_aligned);

  {
    mr(addr, state);
    for (int n = 0; n < total_hs; n += 2) {
      VectorRegister h_cur   = hs[n];
      VectorRegister h_next  = hs[n+1];
      VectorRegister ini_cur = inis[n/2];

      lvx(ini_cur, addr);
      addi(addr, addr, 16);
      xxmrgld(h_cur->to_vsr(), h_next->to_vsr(), h_cur->to_vsr());
    }

    for (int n = 0; n < total_hs; n += 2) {
      VectorRegister h_cur   = hs[n];
      VectorRegister ini_cur = inis[n/2];

      vaddudm(h_cur, ini_cur, h_cur);
    }

    mr(addr, state);
    for (int n = 0; n < total_hs; n += 2) {
      VectorRegister h_cur = hs[n];

      stvx(h_cur, addr);
      addi(addr, addr, 16);
    }
  }

  bind(after_state_save_aligned);
}

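// The SHA-512 round is structurally identical to the SHA-256 round
// documented above, operating on 64-bit words: vshasigmad is the doubleword
// analogue of vshasigmaw, and vaddudm replaces vadduwm.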
// Use h_cnt to cycle through hs elements and increment it at the end
void MacroAssembler::sha512_round(const VectorRegister* hs,
                                  const int total_hs, int& h_cnt,
                                  const VectorRegister kpw) {

  // convenience registers: cycle from 0-7 downwards
  const VectorRegister a = hs[(total_hs + 0 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister b = hs[(total_hs + 1 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister c = hs[(total_hs + 2 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister d = hs[(total_hs + 3 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister e = hs[(total_hs + 4 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister f = hs[(total_hs + 5 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister g = hs[(total_hs + 6 - (h_cnt % total_hs)) % total_hs];
  const VectorRegister h = hs[(total_hs + 7 - (h_cnt % total_hs)) % total_hs];
  // temporaries
  const VectorRegister Ch   = VR20;
  const VectorRegister Maj  = VR21;
  const VectorRegister bsa  = VR22;
  const VectorRegister bse  = VR23;
  const VectorRegister tmp1 = VR24;
  const VectorRegister tmp2 = VR25;

  vsel      (Ch,   g,    f,    e);
  vxor      (Maj,  a,    b);
  vshasigmad(bse,  e,    1,    0xf);
  vaddudm   (tmp2, Ch,   kpw);
  vaddudm   (tmp1, h,    bse);
  vsel      (Maj,  b,    c,    Maj);
  vaddudm   (tmp1, tmp1, tmp2);
  vshasigmad(bsa,  a,    1,    0);
  vaddudm   (tmp2, bsa,  Maj);
  vaddudm   (d,    d,    tmp1);
  vaddudm   (h,    tmp1, tmp2);

  // advance vector pointer to the next iteration
  h_cnt++;
}

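// The SHA-512 message schedule uses the same recurrence as SHA-256 (with
// the 64-bit sigma rotations):
//
//   w[j] = sigma1(w[j-2]) + w[j-7] + sigma0(w[j-15]) + w[j-16]
//
// Each vector holds two 64-bit words, so one call computes only w[j] and
// w[j+1]; both lanes of w7 = {w[j-2], w[j-1]} are valid inputs, so no
// merge step is needed, unlike sha256_calc_4w.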
void MacroAssembler::sha512_calc_2w(const VectorRegister w0,
                                    const VectorRegister w1,
                                    const VectorRegister w2,
                                    const VectorRegister w3,
                                    const VectorRegister w4,
                                    const VectorRegister w5,
                                    const VectorRegister w6,
                                    const VectorRegister w7,
                                    const VectorRegister kpw0,
                                    const VectorRegister kpw1,
                                    const Register j,
                                    const VectorRegister vRb,
                                    const Register k) {
  // Temporaries
  const VectorRegister VR_a = VR20;
  const VectorRegister VR_b = VR21;
  const VectorRegister VR_c = VR22;
  const VectorRegister VR_d = VR23;

  // load k[j] (2 round constants at once)
  lvx        (VR_a, j, k);
  // advance j
  addi       (j, j, 16); // 16 bytes were read
  // VR_b = w[j-15], w[j-14]
  vperm      (VR_b, w1, w0, vRb);
  // VR_c = w[j-7], w[j-6]
  vperm      (VR_c, w5, w4, vRb);
  // VR_b = s0(w[j-15]), s0(w[j-14])
  vshasigmad (VR_b, VR_b, 0, 0);
  // VR_d = s1(w[j-2]), s1(w[j-1])
  vshasigmad (VR_d, w7, 0, 0xf);
  // VR_b = s0(w[j-15]) + w[j-7], s0(w[j-14]) + w[j-6]
  vaddudm    (VR_b, VR_b, VR_c);
  // VR_d = s1(w[j-2]) + w[j-16], s1(w[j-1]) + w[j-15]
  vaddudm    (VR_d, VR_d, w0);
  // VR_c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]), // w[j]
  //        s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1])  // w[j+1]
  vaddudm    (VR_c, VR_d, VR_b);
  // Updating w0 to w7 to hold the new previous 16 values from w.
  vmr        (w0, w1);
  vmr        (w1, w2);
  vmr        (w2, w3);
  vmr        (w3, w4);
  vmr        (w4, w5);
  vmr        (w5, w6);
  vmr        (w6, w7);
  vmr        (w7, VR_c);
  // store k + w to kpw0 (2 values at once)
  vaddudm    (kpw0, VR_c, VR_a);
  // kpw1 holds (k + w)[1]
  vsldoi     (kpw1, kpw0, kpw0, 8);
}

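// Load the 8-doubleword SHA-512 state into hs[0], hs[2], hs[4] and hs[6]
// (two words per vector), splicing with lvsl/vperm when state is not
// 16-byte aligned. The odd hs[] entries are filled later, in sha512's
// sha_loop, by rotating each even entry.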
void MacroAssembler::sha512_load_h_vec(const Register state,
                                       const VectorRegister* hs,
                                       const int total_hs) {
  VectorRegister a = hs[0];
  VectorRegister g = hs[6];

  Register addr = R7;
  VectorRegister vRb = VR8;
  Register tmp = R8;
  Label state_aligned, after_state_aligned;

  andi_(tmp, state, 0xf);
  beq(CCR0, state_aligned);

  // deal with unaligned addresses
  VectorRegister aux = VR9;

  lvx  (a, state);
  addi (addr, state, 16);
  lvsl (vRb, addr);

  for (int n = 2; n < total_hs; n += 2) {
    VectorRegister h_cur   = hs[n];
    VectorRegister h_prev2 = hs[n - 2];

    lvx   (h_cur, addr);
    addi  (addr, addr, 16);
    vperm (h_prev2, h_cur, h_prev2, vRb);
  }
  lvx   (aux, addr);
  vperm (g, aux, g, vRb);

  b (after_state_aligned);

  bind(state_aligned);

  // deal with aligned addresses
  mr(addr, state);
  for (int n = 0; n < total_hs; n += 2) {
    VectorRegister h_cur = hs[n];

    lvx  (h_cur, addr);
    addi (addr, addr, 16);
  }

  bind(after_state_aligned);
}

// R3_ARG1 - byte[] input, already padded, in big-endian byte order
// R4_ARG2 - long[] SHA.state (on first use, the SHA-512 IV: the fractional
//           parts of the square roots of the first eight primes)
// R5_ARG3 - int    offset
// R6_ARG4 - int    limit
//
// Internal register usage:
// R7 R8 R9  - volatile temporaries
// VR0-VR7   - a-h
// VR8       - vRb
// VR9       - aux (highly volatile, use with care)
// VR10-VR17 - w0-w7 | ini_a-ini_h
// VR18      - vsp16 | kplusw0
// VR19      - vsp32 | kplusw1
// VR20-VR25 - sha512_calc_2w and sha512_round temporaries
void MacroAssembler::sha512(bool multi_block) {
  static const ssize_t base_size = sizeof(uint64_t);
  static const ssize_t buf_size = 128;
  static uint64_t waux[buf_size / base_size] __attribute((aligned (16)));
  static const uint64_t round_consts[80] __attribute((aligned (16))) = {
    0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f,
    0xe9b5dba58189dbbc, 0x3956c25bf348b538, 0x59f111f1b605d019,
    0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 0xd807aa98a3030242,
    0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
    0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235,
    0xc19bf174cf692694, 0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
    0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 0x2de92c6f592b0275,
    0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
    0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f,
    0xbf597fc7beef0ee4, 0xc6e00bf33da88fc2, 0xd5a79147930aa725,
    0x06ca6351e003826f, 0x142929670a0e6e70, 0x27b70a8546d22ffc,
    0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
    0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6,
    0x92722c851482353b, 0xa2bfe8a14cf10364, 0xa81a664bbc423001,
    0xc24b8b70d0f89791, 0xc76c51a30654be30, 0xd192e819d6ef5218,
    0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
    0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99,
    0x34b0bcb5e19b48a8, 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
    0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, 0x748f82ee5defb2fc,
    0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
    0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915,
    0xc67178f2e372532b, 0xca273eceea26619c, 0xd186b8c721c0c207,
    0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 0x06f067aa72176fba,
    0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
    0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc,
    0x431d67c49c100d4c, 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
    0x5fcb6fab3ad6faec, 0x6c44198c4a475817
  };
  static const uint8_t w_size = sizeof(round_consts)/sizeof(uint64_t);

  Register buf_in = R3_ARG1;
  Register state  = R4_ARG2;
  Register ofs    = R5_ARG3;
  Register limit  = R6_ARG4;

  Label sha_loop, bsw_loop, core_loop;

  // Save non-volatile vector registers in the red zone
  static const VectorRegister nv[] = {
    VR20, VR21, VR22, VR23, VR24, VR25/*, VR26, VR27, VR28, VR29, VR30, VR31*/
  };
  static const uint8_t nv_size = sizeof(nv) / sizeof (VectorRegister);

  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li (idx, (c - nv_size) * 16);
    stvx(nv[c], idx, R1);
  }

  // Load hash state to registers
  VectorRegister a = VR0;
  VectorRegister b = VR1;
  VectorRegister c = VR2;
  VectorRegister d = VR3;
  VectorRegister e = VR4;
  VectorRegister f = VR5;
  VectorRegister g = VR6;
  VectorRegister h = VR7;
  static const VectorRegister hs[] = {a, b, c, d, e, f, g, h};
  static const int total_hs = sizeof(hs)/sizeof(VectorRegister);
  // counter for cycling through hs vector to avoid register moves between
  // iterations
  int h_cnt = 0;

  // Load a-h registers from the memory pointed to by state
  sha512_load_h_vec(state, hs, total_hs);

  align(OptoLoopAlignment);
  bind(sha_loop);

  for (int n = 0; n < total_hs; n += 2) {
    VectorRegister h_cur  = hs[n];
    VectorRegister h_next = hs[n + 1];

    vsldoi (h_next, h_cur, h_cur, 8);
  }

  Register k = R9;
  load_const(k, const_cast<uint64_t *>(round_consts));

  // Load 16 elements from w out of the loop
  VectorRegister w0 = VR10;
  VectorRegister w1 = VR11;
  VectorRegister w2 = VR12;
  VectorRegister w3 = VR13;
  VectorRegister w4 = VR14;
  VectorRegister w5 = VR15;
  VectorRegister w6 = VR16;
  VectorRegister w7 = VR17;
  static const VectorRegister ws[] = {w0, w1, w2, w3, w4, w5, w6, w7};
  static const int total_ws = sizeof(ws)/sizeof(VectorRegister);

  // Load 16 w into vectors and setup vsl for vperm
  sha512_load_w_vec(buf_in, ws, total_ws);

  VectorRegister vsp16 = VR18;
  VectorRegister vsp32 = VR19;
  VectorRegister shiftarg = VR9;

  vspltisw(vsp16, 8);
  vspltisw(shiftarg, 1);
  vsl     (vsp16, vsp16, shiftarg);
  vsl     (vsp32, vsp16, shiftarg);

  VectorRegister vsp8 = VR9;
  vspltish(vsp8, 8);

  // Convert input from Big Endian to Little Endian: rotating halfwords by 8
  // bits swaps the bytes within each halfword, rotating words by 16 swaps
  // the halfwords, and rotating doublewords by 32 swaps the words; composed,
  // the three rotations byte-reverse each 64-bit element.
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlh (w, w, vsp8);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrlw (w, w, vsp16);
  }
  for (int c = 0; c < total_ws; c++) {
    VectorRegister w = ws[c];
    vrld (w, w, vsp32);
  }

  Register Rb = R10;
  VectorRegister vRb = VR8;
  li   (Rb, 8);
  lvsl (vRb, Rb);

  VectorRegister kplusw0 = VR18;
  VectorRegister kplusw1 = VR19;

  Register addr = R7;
  mr (addr, k);

  for (int n = 0; n < total_ws; n++) {
    VectorRegister w = ws[n];

    lvx    (kplusw0, addr);
    addi   (addr, addr, 16);
    vaddudm(kplusw0, kplusw0, w);

    sha512_round(hs, total_hs, h_cnt, kplusw0);
    vsldoi      (kplusw1, kplusw0, kplusw0, 8);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  Register tmp = R8;
  li    (tmp, (w_size-16)/total_hs);
  mtctr (tmp);
  // j will be aligned to 4 for loading words.
  // Whenever read, advance the pointer (e.g. when j is used in a function)
  Register j = tmp;
  li (j, 8*16);

  align(OptoLoopAlignment);
  bind(core_loop);

  // due to VectorRegister rotate, always iterate in multiples of total_hs
  for (int n = 0; n < total_hs/2; n++) {
    sha512_calc_2w(w0, w1, w2, w3, w4, w5, w6, w7, kplusw0, kplusw1, j, vRb, k);
    sha512_round(hs, total_hs, h_cnt, kplusw0);
    sha512_round(hs, total_hs, h_cnt, kplusw1);
  }

  bdnz (core_loop);

  sha512_update_sha_state(state, hs, total_hs);

  if (multi_block) {
    // process next 1024-bit block (buf_in already updated)
    addi(ofs, ofs, buf_size);
    cmpd(CCR0, ofs, limit);
    blt (CCR0, sha_loop);

    // return ofs
    mr(R3_ARG1, ofs);
  }

  // Restore non-volatile registers
  for (int c = 0; c < nv_size; c++) {
    Register idx = R7;
    li (idx, (c - nv_size) * 16);
    lvx(nv[c], idx, R1);
  }
}