1 /*
2 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2012, 2017, SAP SE. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "precompiled.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "compiler/disassembler.hpp"
29 #include "gc/shared/cardTableModRefBS.hpp"
30 #include "gc/shared/collectedHeap.inline.hpp"
31 #include "interpreter/interpreter.hpp"
32 #include "memory/resourceArea.hpp"
33 #include "nativeInst_ppc.hpp"
34 #include "prims/methodHandles.hpp"
35 #include "runtime/biasedLocking.hpp"
36 #include "runtime/icache.hpp"
37 #include "runtime/interfaceSupport.hpp"
38 #include "runtime/objectMonitor.hpp"
39 #include "runtime/os.hpp"
40 #include "runtime/sharedRuntime.hpp"
41 #include "runtime/stubRoutines.hpp"
42 #include "utilities/macros.hpp"
43 #if INCLUDE_ALL_GCS
44 #include "gc/g1/g1CollectedHeap.inline.hpp"
45 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
46 #include "gc/g1/heapRegion.hpp"
47 #endif // INCLUDE_ALL_GCS
48 #ifdef COMPILER2
49 #include "opto/intrinsicnode.hpp"
50 #endif
51
52 #ifdef PRODUCT
53 #define BLOCK_COMMENT(str) // nothing
54 #else
55 #define BLOCK_COMMENT(str) block_comment(str)
56 #endif
57 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
58
59 #ifdef ASSERT
60 // On RISC, there's no benefit to verifying instruction boundaries.
61 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
62 #endif
63
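// Load a doubleword at a non-negative offset of up to 31 bits from register a.
// Offsets that fit into 16 bits use a single ld (optionally followed by a filler
// nop so that both cases occupy two instructions); larger offsets are split into
// an addis/ld pair, using d as the temporary.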
64 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
65 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
66 if (Assembler::is_simm(si31, 16)) {
67 ld(d, si31, a);
68 if (emit_filler_nop) nop();
69 } else {
70 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
71 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
72 addis(d, a, hi);
73 ld(d, lo, d);
74 }
75 }
76
77 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
78 assert_different_registers(d, a);
79 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
80 }
81
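// Load/store a value of 1, 2, 4 or 8 bytes; sub-doubleword loads are sign- or
// zero-extended as requested by is_signed.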
82 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
83 size_t size_in_bytes, bool is_signed) {
84 switch (size_in_bytes) {
85 case 8: ld(dst, offs, base); break;
86 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
87 case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
88 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
89 default: ShouldNotReachHere();
90 }
91 }
92
93 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
94 size_t size_in_bytes) {
95 switch (size_in_bytes) {
96 case 8: std(dst, offs, base); break;
97 case 4: stw(dst, offs, base); break;
98 case 2: sth(dst, offs, base); break;
99 case 1: stb(dst, offs, base); break;
100 default: ShouldNotReachHere();
101 }
102 }
103
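// Pad with nops until the code offset is congruent to 'rem' modulo 'modulus',
// but only if at most 'max' bytes of padding are required.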
104 void MacroAssembler::align(int modulus, int max, int rem) {
105 int padding = (rem + modulus - (offset() % modulus)) % modulus;
106 if (padding > max) return;
107 for (int c = (padding >> 2); c > 0; --c) { nop(); }
108 }
109
110 // Issue instructions that calculate the given address relative to the global TOC.

111 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
112 bool add_relocation, bool emit_dummy_addr) {
113 int offset = -1;
114 if (emit_dummy_addr) {
115 offset = -128; // dummy address
116 } else if (addr != (address)(intptr_t)-1) {
117 offset = MacroAssembler::offset_to_global_toc(addr);
118 }
119
120 if (hi16) {
121 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
122 }
123 if (lo16) {
124 if (add_relocation) {
125 // Relocate at the addi to avoid confusion with a load from the method's TOC.
126 relocate(internal_word_Relocation::spec(addr));
127 }
128 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
129 }
130 }
131
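// Patch an addis/addi pair emitted by calculate_address_from_global_toc so that
// it produces 'addr'. 'a' points at the addi (that is where the relocation sits);
// the matching addis is searched backwards, but not past 'bound'. Returns the
// distance from the addis to 'addr'.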
132 int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
133 const int offset = MacroAssembler::offset_to_global_toc(addr);
134
135 const address inst2_addr = a;
136 const int inst2 = *(int *)inst2_addr;
137
138 // The relocation points to the second instruction, the addi,
139 // and the addi reads and writes the same register dst.
140 const int dst = inv_rt_field(inst2);
141 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
142
143 // Now, find the preceding addis which writes to dst.
144 int inst1 = 0;
145 address inst1_addr = inst2_addr - BytesPerInstWord;
146 while (inst1_addr >= bound) {
147 inst1 = *(int *) inst1_addr;
148 if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
149 // Stop, found the addis which writes dst.
150 break;
151 }
152 inst1_addr -= BytesPerInstWord;
153 }
154
155 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
156 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
157 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
158 return (int)((intptr_t)addr - (intptr_t)inst1_addr);
159 }
160
161 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
162 const address inst2_addr = a;
163 const int inst2 = *(int *)inst2_addr;
164
165 // The relocation points to the second instruction, the addi,
166 // and the addi reads and writes the same register dst.
167 const int dst = inv_rt_field(inst2);
168 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
169
170 // Now, find the preceding addis which writes to dst.
171 int inst1 = 0;
172 address inst1_addr = inst2_addr - BytesPerInstWord;
173 while (inst1_addr >= bound) {
174 inst1 = *(int *) inst1_addr;
175 if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
176 // stop, found the addis which writes dst
177 break;
178 }
179 inst1_addr -= BytesPerInstWord;
180 }
181
182 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
183
184 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
185 // -1 is a special case
186 if (offset == -1) {
187 return (address)(intptr_t)-1;
188 } else {
189 return global_toc() + offset;
190 }
191 }
192
193 #ifdef _LP64
194 // Patch compressed oops or klass constants.
195 // Assembler sequence is
196 // 1) compressed oops:
197 // lis rx = const.hi
198 // ori rx = rx | const.lo
199 // 2) compressed klass:
200 // lis rx = const.hi
201 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
202 // ori rx = rx | const.lo
203 // The optional clrldi is skipped over; it is not patched.
204 int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
205 assert(UseCompressedOops, "Should only patch compressed oops");
206
207 const address inst2_addr = a;
208 const int inst2 = *(int *)inst2_addr;
209
210 // The relocation points to the second instruction, the ori,
211 // and the ori reads and writes the same register dst.
212 const int dst = inv_rta_field(inst2);
213 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
214 // Now, find the preceding addis which writes to dst.
215 int inst1 = 0;
216 address inst1_addr = inst2_addr - BytesPerInstWord;
217 bool inst1_found = false;
218 while (inst1_addr >= bound) {
219 inst1 = *(int *)inst1_addr;
220 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
221 inst1_addr -= BytesPerInstWord;
222 }
223 assert(inst1_found, "inst is not lis");
224
225 int xc = (data >> 16) & 0xffff;
226 int xd = (data >> 0) & 0xffff;
227
228 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
229 set_imm((int *)inst2_addr, (xd)); // unsigned int
230 return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
231 }
232
233 // Get compressed oop or klass constant.
234 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
235 assert(UseCompressedOops, "Should only patch compressed oops");
236
237 const address inst2_addr = a;
238 const int inst2 = *(int *)inst2_addr;
239
240 // The relocation points to the second instruction, the ori,
241 // and the ori reads and writes the same register dst.
242 const int dst = inv_rta_field(inst2);
243 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
244 // Now, find the preceding lis which writes to dst.
245 int inst1 = 0;
246 address inst1_addr = inst2_addr - BytesPerInstWord;
247 bool inst1_found = false;
248
249 while (inst1_addr >= bound) {
250 inst1 = *(int *) inst1_addr;
251 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
252 inst1_addr -= BytesPerInstWord;
253 }
254 assert(inst1_found, "inst is not lis");
255
256 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
257 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
258
259 return (int) (xl | xh);
260 }
261 #endif // _LP64
262
263 // Returns true if successful.
264 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
265 Register toc, bool fixed_size) {
266 int toc_offset = 0;
267 // Use RelocationHolder::none for the constant pool entry, otherwise
268 // we will end up with a failing NativeCall::verify(x) where x is
269 // the address of the constant pool entry.
270 // FIXME: We should insert relocation information for oops at the constant
271 // pool entries instead of inserting it at the loads; patching of a constant
272 // pool entry should be less expensive.
273 address const_address = address_constant((address)a.value(), RelocationHolder::none);
274 if (const_address == NULL) { return false; } // allocation failure
275 // Relocate at the pc of the load.
276 relocate(a.rspec());
277 toc_offset = (int)(const_address - code()->consts()->start());
278 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
279 return true;
280 }
281
282 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
283 const address inst1_addr = a;
284 const int inst1 = *(int *)inst1_addr;
285
286 // The relocation points to the ld or the addis.
287 return (is_ld(inst1)) ||
288 (is_addis(inst1) && inv_ra_field(inst1) != 0);
289 }
290
291 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
292 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
293
294 const address inst1_addr = a;
295 const int inst1 = *(int *)inst1_addr;
296
297 if (is_ld(inst1)) {
298 return inv_d1_field(inst1);
299 } else if (is_addis(inst1)) {
300 const int dst = inv_rt_field(inst1);
301
302 // Now, find the succeeding ld which reads and writes to dst.
303 address inst2_addr = inst1_addr + BytesPerInstWord;
304 int inst2 = 0;
305 while (true) {
306 inst2 = *(int *) inst2_addr;
307 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
308 // Stop, found the ld which reads and writes dst.
309 break;
310 }
311 inst2_addr += BytesPerInstWord;
312 }
313 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
314 }
315 ShouldNotReachHere();
316 return 0;
317 }
318
319 // Get the constant from a `load_const' sequence.
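// The two accepted sequences differ in their second instruction:
//   - is_ori(p[1]): the single-register form (cf. load_const); the four 16-bit
//     immediates live at instruction indices 0, 1, 3 and 4.
//   - is_lis(p[1]): a two-register form that builds both halves in parallel;
//     the immediates live at indices 0, 2, 1 and 3.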
320 long MacroAssembler::get_const(address a) {
321 assert(is_load_const_at(a), "not a load of a constant");
322 const int *p = (const int*) a;
323 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
324 if (is_ori(*(p+1))) {
325 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
326 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
327 x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
328 } else if (is_lis(*(p+1))) {
329 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
330 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
331 x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
332 } else {
333 ShouldNotReachHere();
334 return (long) 0;
335 }
336 return (long) x;
337 }
338
339 // Patch the 64-bit constant of a `load_const' sequence. This is a low
340 // level procedure. It neither flushes the instruction cache nor is it
341 // MT-safe.
342 void MacroAssembler::patch_const(address a, long x) {
343 assert(is_load_const_at(a), "not a load of a constant");
344 int *p = (int*) a;
345 if (is_ori(*(p+1))) {
346 set_imm(0 + p, (x >> 48) & 0xffff);
347 set_imm(1 + p, (x >> 32) & 0xffff);
348 set_imm(3 + p, (x >> 16) & 0xffff);
349 set_imm(4 + p, x & 0xffff);
350 } else if (is_lis(*(p+1))) {
351 set_imm(0 + p, (x >> 48) & 0xffff);
352 set_imm(2 + p, (x >> 32) & 0xffff);
353 set_imm(1 + p, (x >> 16) & 0xffff);
354 set_imm(3 + p, x & 0xffff);
355 } else {
356 ShouldNotReachHere();
357 }
358 }
359
360 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
361 assert(oop_recorder() != NULL, "this assembler needs a Recorder");
362 int index = oop_recorder()->allocate_metadata_index(obj);
363 RelocationHolder rspec = metadata_Relocation::spec(index);
364 return AddressLiteral((address)obj, rspec);
365 }
366
367 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
368 assert(oop_recorder() != NULL, "this assembler needs a Recorder");
369 int index = oop_recorder()->find_index(obj);
370 RelocationHolder rspec = metadata_Relocation::spec(index);
371 return AddressLiteral((address)obj, rspec);
372 }
373
374 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
375 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
376 int oop_index = oop_recorder()->allocate_oop_index(obj);
377 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
378 }
379
380 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
381 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
382 int oop_index = oop_recorder()->find_index(obj);
383 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
384 }
385
386 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
387 Register tmp, int offset) {
388 intptr_t value = *delayed_value_addr;
389 if (value != 0) {
390 return RegisterOrConstant(value + offset);
391 }
392
393 // Load indirectly to solve generation ordering problem.
394 // static address, no relocation
395 int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
396 ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
397
398 if (offset != 0) {
399 addi(tmp, tmp, offset);
400 }
401
402 return RegisterOrConstant(tmp);
403 }
404
405 #ifndef PRODUCT
406 void MacroAssembler::pd_print_patched_instruction(address branch) {
407 Unimplemented(); // TODO: PPC port
408 }
409 #endif // ndef PRODUCT
410
411 // Conditional far branch for destinations encodable in 24+2 bits.
412 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
413
414 // If requested by flag optimize, relocate the bc_far as a
415 // runtime_call and prepare for optimizing it when the code gets
416 // relocated.
417 if (optimize == bc_far_optimize_on_relocate) {
418 relocate(relocInfo::runtime_call_type);
419 }
420
421 // variant 2:
422 //
423 // b!cxx SKIP
424 // bxx DEST
425 // SKIP:
426 //
427
428 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
429 opposite_bcond(inv_boint_bcond(boint)));
430
431 // We emit two branches.
432 // First, a conditional branch which jumps around the far branch.
433 const address not_taken_pc = pc() + 2 * BytesPerInstWord;
434 const address bc_pc = pc();
435 bc(opposite_boint, biint, not_taken_pc);
436
437 const int bc_instr = *(int*)bc_pc;
438 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
439 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
440 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
441 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
442 "postcondition");
443 assert(biint == inv_bi_field(bc_instr), "postcondition");
444
445 // Second, an unconditional far branch which jumps to dest.
446 // Note: target(dest) remembers the current pc (see CodeSection::target)
447 // and returns the current pc if the label is not bound yet; when
448 // the label gets bound, the unconditional far branch will be patched.
449 const address target_pc = target(dest);
450 const address b_pc = pc();
451 b(target_pc);
452
453 assert(not_taken_pc == pc(), "postcondition");
454 assert(dest.is_bound() || target_pc == b_pc, "postcondition");
455 }
456
457 // 1 or 2 instructions
458 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
459 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
460 bc(boint, biint, dest);
461 } else {
462 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
463 }
464 }
465
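// bc_far sites exist in one of three shapes (see set_dest_of_bc_far_at below):
//   variant 1:  bcxx  DEST        // destination reachable by a conditional branch
//               nop
//   variant 2:  b!cxx SKIP        // inverted condition jumps over an unconditional
//               bxx   DEST        // far branch to the destination
//         SKIP:
//   variant 3:  nop               // branch to the next instruction, patched away
//               endgroup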
466 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
467 return is_bc_far_variant1_at(instruction_addr) ||
468 is_bc_far_variant2_at(instruction_addr) ||
469 is_bc_far_variant3_at(instruction_addr);
470 }
471
472 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
473 if (is_bc_far_variant1_at(instruction_addr)) {
474 const address instruction_1_addr = instruction_addr;
475 const int instruction_1 = *(int*)instruction_1_addr;
476 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
477 } else if (is_bc_far_variant2_at(instruction_addr)) {
478 const address instruction_2_addr = instruction_addr + 4;
479 return bxx_destination(instruction_2_addr);
480 } else if (is_bc_far_variant3_at(instruction_addr)) {
481 return instruction_addr + 8;
482 }
483 // variant 4 ???
484 ShouldNotReachHere();
485 return NULL;
486 }
487 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
488
489 if (is_bc_far_variant3_at(instruction_addr)) {
490 // variant 3, far cond branch to the next instruction, already patched to nops:
491 //
492 // nop
493 // endgroup
494 // SKIP/DEST:
495 //
496 return;
497 }
498
499 // first, extract boint and biint from the current branch
500 int boint = 0;
501 int biint = 0;
502
503 ResourceMark rm;
504 const int code_size = 2 * BytesPerInstWord;
505 CodeBuffer buf(instruction_addr, code_size);
506 MacroAssembler masm(&buf);
507 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
508 // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
509 masm.nop();
510 masm.endgroup();
511 } else {
512 if (is_bc_far_variant1_at(instruction_addr)) {
513 // variant 1, the 1st instruction contains the destination address:
514 //
515 // bcxx DEST
516 // nop
517 //
518 const int instruction_1 = *(int*)(instruction_addr);
519 boint = inv_bo_field(instruction_1);
520 biint = inv_bi_field(instruction_1);
521 } else if (is_bc_far_variant2_at(instruction_addr)) {
522 // variant 2, the 2nd instruction contains the destination address:
523 //
524 // b!cxx SKIP
525 // bxx DEST
526 // SKIP:
527 //
528 const int instruction_1 = *(int*)(instruction_addr);
529 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
530 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
531 biint = inv_bi_field(instruction_1);
532 } else {
533 // variant 4???
534 ShouldNotReachHere();
535 }
536
537 // second, set the new branch destination and optimize the code
538 if (dest != instruction_addr + 4 && // the bc_far is still unbound!
539 masm.is_within_range_of_bcxx(dest, instruction_addr)) {
540 // variant 1:
541 //
542 // bcxx DEST
543 // nop
544 //
545 masm.bc(boint, biint, dest);
546 masm.nop();
547 } else {
548 // variant 2:
549 //
550 // b!cxx SKIP
551 // bxx DEST
552 // SKIP:
553 //
554 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
555 opposite_bcond(inv_boint_bcond(boint)));
556 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
557 masm.bc(opposite_boint, biint, not_taken_pc);
558 masm.b(dest);
559 }
560 }
561 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
562 }
563
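// bxx64_patchable and its helpers below always deal with a fixed-size,
// seven-instruction sequence (see bxx64_patchable_size), so a site can be
// repatched between the two shapes:
//   variant 2  (pc-relative):   b dest followed by six nops, or six nops
//                               followed by bl dest,
//   variant 1b (TOC-relative):  mr r0,r11; addis/addi r11,<toc offset>;
//                               mtctr r11; mr r11,r0; nop; bctr[l].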
564 // Emit a patchable 64-bit absolute call/jump. Patching it is NOT MT-safe.
565 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
566 // get current pc
567 uint64_t start_pc = (uint64_t) pc();
568
569 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
570 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first
571
572 // relocate here
573 if (rt != relocInfo::none) {
574 relocate(rt);
575 }
576
577 if ( ReoptimizeCallSequences &&
578 (( link && is_within_range_of_b(dest, pc_of_bl)) ||
579 (!link && is_within_range_of_b(dest, pc_of_b)))) {
580 // variant 2:
581 // Emit an optimized, pc-relative call/jump.
582
583 if (link) {
584 // some padding
585 nop();
586 nop();
587 nop();
588 nop();
589 nop();
590 nop();
591
592 // do the call
593 assert(pc() == pc_of_bl, "just checking");
594 bl(dest, relocInfo::none);
595 } else {
596 // do the jump
597 assert(pc() == pc_of_b, "just checking");
598 b(dest, relocInfo::none);
599
600 // some padding
601 nop();
602 nop();
603 nop();
604 nop();
605 nop();
606 nop();
607 }
608
609 // Assert that we can identify the emitted call/jump.
610 assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
611 "can't identify emitted call");
612 } else {
613 // variant 1:
614 mr(R0, R11); // spill R11 -> R0.
615
616 // Load the destination address into CTR,
617 // calculate destination relative to global toc.
618 calculate_address_from_global_toc(R11, dest, true, true, false);
619
620 mtctr(R11);
621 mr(R11, R0); // restore R11 <- R0.
622 nop();
623
624 // do the call/jump
625 if (link) {
626 bctrl();
627 } else {
628 bctr();
629 }
630 // Assert that we can identify the emitted call/jump.
631 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
632 "can't identify emitted call");
633 }
634
635 // Assert that we can identify the emitted call/jump.
636 assert(is_bxx64_patchable_at((address)start_pc, link),
637 "can't identify emitted call");
638 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
639 "wrong encoding of dest address");
640 }
641
642 // Identify a bxx64_patchable instruction.
643 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
644 return is_bxx64_patchable_variant1b_at(instruction_addr, link)
645 //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
646 || is_bxx64_patchable_variant2_at(instruction_addr, link);
647 }
648
649 // Does the bxx64_patchable sequence use a pc-relative encoding of
650 // the branch destination?
651 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
652 // variant 2 is pc-relative
653 return is_bxx64_patchable_variant2_at(instruction_addr, link);
654 }
655
656 // Identify variant 1.
657 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
658 unsigned int* instr = (unsigned int*) instruction_addr;
659 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
660 && is_mtctr(instr[5]) // mtctr
661 && is_load_const_at(instruction_addr);
662 }
663
664 // Identify variant 1b: load destination relative to global toc.
665 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
666 unsigned int* instr = (unsigned int*) instruction_addr;
667 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
668 && is_mtctr(instr[3]) // mtctr
669 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
670 }
671
672 // Identify variant 2.
673 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
674 unsigned int* instr = (unsigned int*) instruction_addr;
675 if (link) {
676 return is_bl (instr[6]) // bl dest is last
677 && is_nop(instr[0]) // nop
678 && is_nop(instr[1]) // nop
679 && is_nop(instr[2]) // nop
680 && is_nop(instr[3]) // nop
681 && is_nop(instr[4]) // nop
682 && is_nop(instr[5]); // nop
683 } else {
684 return is_b (instr[0]) // b dest is first
685 && is_nop(instr[1]) // nop
686 && is_nop(instr[2]) // nop
687 && is_nop(instr[3]) // nop
688 && is_nop(instr[4]) // nop
689 && is_nop(instr[5]) // nop
690 && is_nop(instr[6]); // nop
691 }
692 }
693
694 // Set dest address of a bxx64_patchable instruction.
695 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
696 ResourceMark rm;
697 int code_size = MacroAssembler::bxx64_patchable_size;
698 CodeBuffer buf(instruction_addr, code_size);
699 MacroAssembler masm(&buf);
700 masm.bxx64_patchable(dest, relocInfo::none, link);
701 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
702 }
703
704 // Get dest address of a bxx64_patchable instruction.
705 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
706 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
707 return (address) (unsigned long) get_const(instruction_addr);
708 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
709 unsigned int* instr = (unsigned int*) instruction_addr;
710 if (link) {
711 const int instr_idx = 6; // bl is last
712 int branchoffset = branch_destination(instr[instr_idx], 0);
713 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
714 } else {
715 const int instr_idx = 0; // b is first
716 int branchoffset = branch_destination(instr[instr_idx], 0);
717 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
718 }
719 // Load dest relative to global toc.
720 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
721 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
722 instruction_addr);
723 } else {
724 ShouldNotReachHere();
725 return NULL;
726 }
727 }
728
729 // Uses ordering which corresponds to ABI:
730 // _savegpr0_14: std r14,-144(r1)
731 // _savegpr0_15: std r15,-136(r1)
732 // _savegpr0_16: std r16,-128(r1)
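// R14..R31 and F14..F31 are 18 registers each, so the area at dst+offset must
// provide 36 * 8 = 288 bytes.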
733 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
734 std(R14, offset, dst); offset += 8;
735 std(R15, offset, dst); offset += 8;
736 std(R16, offset, dst); offset += 8;
737 std(R17, offset, dst); offset += 8;
738 std(R18, offset, dst); offset += 8;
739 std(R19, offset, dst); offset += 8;
740 std(R20, offset, dst); offset += 8;
741 std(R21, offset, dst); offset += 8;
742 std(R22, offset, dst); offset += 8;
743 std(R23, offset, dst); offset += 8;
744 std(R24, offset, dst); offset += 8;
745 std(R25, offset, dst); offset += 8;
746 std(R26, offset, dst); offset += 8;
747 std(R27, offset, dst); offset += 8;
748 std(R28, offset, dst); offset += 8;
749 std(R29, offset, dst); offset += 8;
750 std(R30, offset, dst); offset += 8;
751 std(R31, offset, dst); offset += 8;
752
753 stfd(F14, offset, dst); offset += 8;
754 stfd(F15, offset, dst); offset += 8;
755 stfd(F16, offset, dst); offset += 8;
756 stfd(F17, offset, dst); offset += 8;
757 stfd(F18, offset, dst); offset += 8;
758 stfd(F19, offset, dst); offset += 8;
759 stfd(F20, offset, dst); offset += 8;
760 stfd(F21, offset, dst); offset += 8;
761 stfd(F22, offset, dst); offset += 8;
762 stfd(F23, offset, dst); offset += 8;
763 stfd(F24, offset, dst); offset += 8;
764 stfd(F25, offset, dst); offset += 8;
765 stfd(F26, offset, dst); offset += 8;
766 stfd(F27, offset, dst); offset += 8;
767 stfd(F28, offset, dst); offset += 8;
768 stfd(F29, offset, dst); offset += 8;
769 stfd(F30, offset, dst); offset += 8;
770 stfd(F31, offset, dst);
771 }
772
773 // Uses ordering which corresponds to ABI:
774 // _restgpr0_14: ld r14,-144(r1)
775 // _restgpr0_15: ld r15,-136(r1)
776 // _restgpr0_16: ld r16,-128(r1)
777 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
778 ld(R14, offset, src); offset += 8;
779 ld(R15, offset, src); offset += 8;
780 ld(R16, offset, src); offset += 8;
781 ld(R17, offset, src); offset += 8;
782 ld(R18, offset, src); offset += 8;
783 ld(R19, offset, src); offset += 8;
784 ld(R20, offset, src); offset += 8;
785 ld(R21, offset, src); offset += 8;
786 ld(R22, offset, src); offset += 8;
787 ld(R23, offset, src); offset += 8;
788 ld(R24, offset, src); offset += 8;
789 ld(R25, offset, src); offset += 8;
790 ld(R26, offset, src); offset += 8;
791 ld(R27, offset, src); offset += 8;
792 ld(R28, offset, src); offset += 8;
793 ld(R29, offset, src); offset += 8;
794 ld(R30, offset, src); offset += 8;
795 ld(R31, offset, src); offset += 8;
796
797 // FP registers
798 lfd(F14, offset, src); offset += 8;
799 lfd(F15, offset, src); offset += 8;
800 lfd(F16, offset, src); offset += 8;
801 lfd(F17, offset, src); offset += 8;
802 lfd(F18, offset, src); offset += 8;
803 lfd(F19, offset, src); offset += 8;
804 lfd(F20, offset, src); offset += 8;
805 lfd(F21, offset, src); offset += 8;
806 lfd(F22, offset, src); offset += 8;
807 lfd(F23, offset, src); offset += 8;
808 lfd(F24, offset, src); offset += 8;
809 lfd(F25, offset, src); offset += 8;
810 lfd(F26, offset, src); offset += 8;
811 lfd(F27, offset, src); offset += 8;
812 lfd(F28, offset, src); offset += 8;
813 lfd(F29, offset, src); offset += 8;
814 lfd(F30, offset, src); offset += 8;
815 lfd(F31, offset, src);
816 }
817
818 // For verify_oops.
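// Saves the volatile GPRs R2..R12 and FPRs F0..F13; the area at dst+offset must
// provide (11 + 14) * 8 = 200 bytes.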
819 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
820 std(R2, offset, dst); offset += 8;
821 std(R3, offset, dst); offset += 8;
822 std(R4, offset, dst); offset += 8;
823 std(R5, offset, dst); offset += 8;
824 std(R6, offset, dst); offset += 8;
825 std(R7, offset, dst); offset += 8;
826 std(R8, offset, dst); offset += 8;
827 std(R9, offset, dst); offset += 8;
828 std(R10, offset, dst); offset += 8;
829 std(R11, offset, dst); offset += 8;
830 std(R12, offset, dst); offset += 8;
831
832 stfd(F0, offset, dst); offset += 8;
833 stfd(F1, offset, dst); offset += 8;
834 stfd(F2, offset, dst); offset += 8;
835 stfd(F3, offset, dst); offset += 8;
836 stfd(F4, offset, dst); offset += 8;
837 stfd(F5, offset, dst); offset += 8;
838 stfd(F6, offset, dst); offset += 8;
839 stfd(F7, offset, dst); offset += 8;
840 stfd(F8, offset, dst); offset += 8;
841 stfd(F9, offset, dst); offset += 8;
842 stfd(F10, offset, dst); offset += 8;
843 stfd(F11, offset, dst); offset += 8;
844 stfd(F12, offset, dst); offset += 8;
845 stfd(F13, offset, dst);
846 }
847
848 // For verify_oops.
849 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
850 ld(R2, offset, src); offset += 8;
851 ld(R3, offset, src); offset += 8;
852 ld(R4, offset, src); offset += 8;
853 ld(R5, offset, src); offset += 8;
854 ld(R6, offset, src); offset += 8;
855 ld(R7, offset, src); offset += 8;
856 ld(R8, offset, src); offset += 8;
857 ld(R9, offset, src); offset += 8;
858 ld(R10, offset, src); offset += 8;
859 ld(R11, offset, src); offset += 8;
860 ld(R12, offset, src); offset += 8;
861
862 lfd(F0, offset, src); offset += 8;
863 lfd(F1, offset, src); offset += 8;
864 lfd(F2, offset, src); offset += 8;
865 lfd(F3, offset, src); offset += 8;
866 lfd(F4, offset, src); offset += 8;
867 lfd(F5, offset, src); offset += 8;
868 lfd(F6, offset, src); offset += 8;
869 lfd(F7, offset, src); offset += 8;
870 lfd(F8, offset, src); offset += 8;
871 lfd(F9, offset, src); offset += 8;
872 lfd(F10, offset, src); offset += 8;
873 lfd(F11, offset, src); offset += 8;
874 lfd(F12, offset, src); offset += 8;
875 lfd(F13, offset, src);
876 }
877
878 void MacroAssembler::save_LR_CR(Register tmp) {
879 mfcr(tmp);
880 std(tmp, _abi(cr), R1_SP);
881 mflr(tmp);
882 std(tmp, _abi(lr), R1_SP);
883 // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
884 }
885
886 void MacroAssembler::restore_LR_CR(Register tmp) {
887 assert(tmp != R1_SP, "must be distinct");
888 ld(tmp, _abi(lr), R1_SP);
889 mtlr(tmp);
890 ld(tmp, _abi(cr), R1_SP);
891 mtcr(tmp);
892 }
893
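// Materialize the current PC: branch-and-link to the immediately following
// instruction and read it back from LR. The previous LR value is destroyed
// (hence "trash_LR"); the returned address equals the value left in 'result'.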
894 address MacroAssembler::get_PC_trash_LR(Register result) {
895 Label L;
896 bl(L);
897 bind(L);
898 address lr_pc = pc();
899 mflr(result);
900 return lr_pc;
901 }
902
903 void MacroAssembler::resize_frame(Register offset, Register tmp) {
904 #ifdef ASSERT
905 assert_different_registers(offset, tmp, R1_SP);
906 andi_(tmp, offset, frame::alignment_in_bytes-1);
907 asm_assert_eq("resize_frame: unaligned", 0x204);
908 #endif
909
910 // tmp <- *(SP)
911 ld(tmp, _abi(callers_sp), R1_SP);
912 // addr <- SP + offset;
913 // *(addr) <- tmp;
914 // SP <- addr
915 stdux(tmp, R1_SP, offset);
916 }
917
918 void MacroAssembler::resize_frame(int offset, Register tmp) {
919 assert(is_simm(offset, 16), "too big an offset");
920 assert_different_registers(tmp, R1_SP);
921 assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
922 // tmp <- *(SP)
923 ld(tmp, _abi(callers_sp), R1_SP);
924 // addr <- SP + offset;
925 // *(addr) <- tmp;
926 // SP <- addr
927 stdu(tmp, offset, R1_SP);
928 }
929
930 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
931 // (addr == tmp1) || (addr == tmp2) is allowed here!
932 assert(tmp1 != tmp2, "must be distinct");
933
934 // compute offset w.r.t. current stack pointer
935 // tmp1 <- addr - SP (!)
936 subf(tmp1, R1_SP, addr);
937
938 // atomically update SP keeping back link.
939 resize_frame(tmp1/* offset */, tmp2/* tmp */);
940 }
941
942 void MacroAssembler::push_frame(Register bytes, Register tmp) {
943 #ifdef ASSERT
944 assert(bytes != R0, "r0 not allowed here");
945 andi_(R0, bytes, frame::alignment_in_bytes-1);
946 asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
947 #endif
948 neg(tmp, bytes);
949 stdux(R1_SP, R1_SP, tmp);
950 }
951
952 // Push a frame of size `bytes'.
953 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
954 long offset = align_addr(bytes, frame::alignment_in_bytes);
955 if (is_simm(-offset, 16)) {
956 stdu(R1_SP, -offset, R1_SP);
957 } else {
958 load_const_optimized(tmp, -offset);
959 stdux(R1_SP, R1_SP, tmp);
960 }
961 }
962
963 // Push a frame of size `bytes' plus abi_reg_args on top.
964 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
965 push_frame(bytes + frame::abi_reg_args_size, tmp);
966 }
967
968 // Set up a new C frame with a spill area for non-volatile GPRs and
969 // additional space for local variables.
970 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
971 Register tmp) {
972 push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
973 }
974
975 // Pop current C frame.
976 void MacroAssembler::pop_frame() {
977 ld(R1_SP, _abi(callers_sp), R1_SP);
978 }
979
980 #if defined(ABI_ELFv2)
981 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
982 // TODO(asmundak): make sure the caller uses R12 as function descriptor
983 // most of the time.
984 if (R12 != r_function_entry) {
985 mr(R12, r_function_entry);
986 }
987 mtctr(R12);
988 // Do a call or a branch.
989 if (and_link) {
990 bctrl();
991 } else {
992 bctr();
993 }
994 _last_calls_return_pc = pc();
995
996 return _last_calls_return_pc;
997 }
998
999 // Call a C function via a function descriptor and use full C
1000 // calling conventions. Updates and returns _last_calls_return_pc.
1001 address MacroAssembler::call_c(Register r_function_entry) {
1002 return branch_to(r_function_entry, /*and_link=*/true);
1003 }
1004
1005 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1006 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1007 return branch_to(r_function_entry, /*and_link=*/false);
1008 }
1009
1010 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1011 load_const(R12, function_entry, R0);
1012 return branch_to(R12, /*and_link=*/true);
1013 }
1014
1015 #else
1016 // Generic version of a call to C function via a function descriptor
1017 // with variable support for C calling conventions (TOC, ENV, etc.).
1018 // Updates and returns _last_calls_return_pc.
1019 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1020 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1021 // we emit standard ptrgl glue code here
1022 assert((function_descriptor != R0), "function_descriptor cannot be R0");
1023
1024 // retrieve necessary entries from the function descriptor
1025 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1026 mtctr(R0);
1027
1028 if (load_toc_of_callee) {
1029 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1030 }
1031 if (load_env_of_callee) {
1032 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1033 } else if (load_toc_of_callee) {
1034 li(R11, 0);
1035 }
1036
1037 // do a call or a branch
1038 if (and_link) {
1039 bctrl();
1040 } else {
1041 bctr();
1042 }
1043 _last_calls_return_pc = pc();
1044
1045 return _last_calls_return_pc;
1046 }
1047
1048 // Call a C function via a function descriptor and use full C calling
1049 // conventions.
1050 // We don't use the TOC in generated code, so there is no need to save
1051 // and restore its value.
1052 address MacroAssembler::call_c(Register fd) {
1053 return branch_to(fd, /*and_link=*/true,
1054 /*save toc=*/false,
1055 /*restore toc=*/false,
1056 /*load toc=*/true,
1057 /*load env=*/true);
1058 }
1059
1060 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1061 return branch_to(fd, /*and_link=*/false,
1062 /*save toc=*/false,
1063 /*restore toc=*/false,
1064 /*load toc=*/true,
1065 /*load env=*/true);
1066 }
1067
1068 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1069 if (rt != relocInfo::none) {
1070 // this call needs to be relocatable
1071 if (!ReoptimizeCallSequences
1072 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1073 || fd == NULL // support code-size estimation
1074 || !fd->is_friend_function()
1075 || fd->entry() == NULL) {
1076 // it's not a friend function as defined by class FunctionDescriptor,
1077 // so do a full call-c here.
1078 load_const(R11, (address)fd, R0);
1079
1080 bool has_env = (fd != NULL && fd->env() != NULL);
1081 return branch_to(R11, /*and_link=*/true,
1082 /*save toc=*/false,
1083 /*restore toc=*/false,
1084 /*load toc=*/true,
1085 /*load env=*/has_env);
1086 } else {
1087 // It's a friend function. Load the entry point and don't care about
1088 // toc and env. Use an optimizable call instruction, but ensure the
1089 // same code-size as in the case of a non-friend function.
1090 nop();
1091 nop();
1092 nop();
1093 bl64_patchable(fd->entry(), rt);
1094 _last_calls_return_pc = pc();
1095 return _last_calls_return_pc;
1096 }
1097 } else {
1098 // This call does not need to be relocatable, do more aggressive
1099 // optimizations.
1100 if (!ReoptimizeCallSequences
1101 || !fd->is_friend_function()) {
1102 // It's not a friend function as defined by class FunctionDescriptor,
1103 // so do a full call-c here.
1104 load_const(R11, (address)fd, R0);
1105 return branch_to(R11, /*and_link=*/true,
1106 /*save toc=*/false,
1107 /*restore toc=*/false,
1108 /*load toc=*/true,
1109 /*load env=*/true);
1110 } else {
1111 // it's a friend function, load the entry point and don't care about
1112 // toc and env.
1113 address dest = fd->entry();
1114 if (is_within_range_of_b(dest, pc())) {
1115 bl(dest);
1116 } else {
1117 bl64_patchable(dest, rt);
1118 }
1119 _last_calls_return_pc = pc();
1120 return _last_calls_return_pc;
1121 }
1122 }
1123 }
1124
1125 // Call a C function. All constants needed reside in TOC.
1126 //
1127 // Read the address to call from the TOC.
1128 // Read env from TOC, if fd specifies an env.
1129 // Read new TOC from TOC.
1130 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1131 relocInfo::relocType rt, Register toc) {
1132 if (!ReoptimizeCallSequences
1133 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1134 || !fd->is_friend_function()) {
1135 // It's not a friend function as defined by class FunctionDescriptor,
1136 // so do a full call-c here.
1137 assert(fd->entry() != NULL, "function must be linked");
1138
1139 AddressLiteral fd_entry(fd->entry());
1140 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1141 mtctr(R11);
1142 if (fd->env() == NULL) {
1143 li(R11, 0);
1144 nop();
1145 } else {
1146 AddressLiteral fd_env(fd->env());
1147 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1148 }
1149 AddressLiteral fd_toc(fd->toc());
1150 // Set R2_TOC (load from toc)
1151 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1152 bctrl();
1153 _last_calls_return_pc = pc();
1154 if (!success) { return NULL; }
1155 } else {
1156 // It's a friend function, load the entry point and don't care about
1157 // toc and env. Use an optimizable call instruction, but ensure the
1158 // same code-size as in the case of a non-friend function.
1159 nop();
1160 bl64_patchable(fd->entry(), rt);
1161 _last_calls_return_pc = pc();
1162 }
1163 return _last_calls_return_pc;
1164 }
1165 #endif // ABI_ELFv2
1166
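// Common code for the call_VM variants: record the last Java frame (last_java_sp
// defaults to R1_SP), pass the current thread in R3_ARG1, call the runtime entry
// point using the C calling convention, reset the last Java frame and, if
// requested, fetch the oop result from the thread. Exception checking is not
// supported on this path.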
1167 void MacroAssembler::call_VM_base(Register oop_result,
1168 Register last_java_sp,
1169 address entry_point,
1170 bool check_exceptions) {
1171 BLOCK_COMMENT("call_VM {");
1172 // Determine last_java_sp register.
1173 if (!last_java_sp->is_valid()) {
1174 last_java_sp = R1_SP;
1175 }
1176 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1177
1178 // ARG1 must hold thread address.
1179 mr(R3_ARG1, R16_thread);
1180 #if defined(ABI_ELFv2)
1181 address return_pc = call_c(entry_point, relocInfo::none);
1182 #else
1183 address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1184 #endif
1185
1186 reset_last_Java_frame();
1187
1188 // Check for pending exceptions.
1189 if (check_exceptions) {
1190 // We don't check for exceptions here.
1191 ShouldNotReachHere();
1192 }
1193
1194 // Get oop result if there is one and reset the value in the thread.
1195 if (oop_result->is_valid()) {
1196 get_vm_result(oop_result);
1197 }
1198
1199 _last_calls_return_pc = return_pc;
1200 BLOCK_COMMENT("} call_VM");
1201 }
1202
1203 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1204 BLOCK_COMMENT("call_VM_leaf {");
1205 #if defined(ABI_ELFv2)
1206 call_c(entry_point, relocInfo::none);
1207 #else
1208 call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1209 #endif
1210 BLOCK_COMMENT("} call_VM_leaf");
1211 }
1212
1213 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1214 call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1215 }
1216
1217 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1218 bool check_exceptions) {
1219 // R3_ARG1 is reserved for the thread.
1220 mr_if_needed(R4_ARG2, arg_1);
1221 call_VM(oop_result, entry_point, check_exceptions);
1222 }
1223
1224 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1225 bool check_exceptions) {
1226 // R3_ARG1 is reserved for the thread
1227 mr_if_needed(R4_ARG2, arg_1);
1228 assert(arg_2 != R4_ARG2, "smashed argument");
1229 mr_if_needed(R5_ARG3, arg_2);
1230 call_VM(oop_result, entry_point, check_exceptions);
1231 }
1232
1233 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1234 bool check_exceptions) {
1235 // R3_ARG1 is reserved for the thread
1236 mr_if_needed(R4_ARG2, arg_1);
1237 assert(arg_2 != R4_ARG2, "smashed argument");
1238 mr_if_needed(R5_ARG3, arg_2);
1239 mr_if_needed(R6_ARG4, arg_3);
1240 call_VM(oop_result, entry_point, check_exceptions);
1241 }
1242
1243 void MacroAssembler::call_VM_leaf(address entry_point) {
1244 call_VM_leaf_base(entry_point);
1245 }
1246
1247 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1248 mr_if_needed(R3_ARG1, arg_1);
1249 call_VM_leaf(entry_point);
1250 }
1251
1252 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1253 mr_if_needed(R3_ARG1, arg_1);
1254 assert(arg_2 != R3_ARG1, "smashed argument");
1255 mr_if_needed(R4_ARG2, arg_2);
1256 call_VM_leaf(entry_point);
1257 }
1258
1259 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1260 mr_if_needed(R3_ARG1, arg_1);
1261 assert(arg_2 != R3_ARG1, "smashed argument");
1262 mr_if_needed(R4_ARG2, arg_2);
1263 assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1264 mr_if_needed(R5_ARG3, arg_3);
1265 call_VM_leaf(entry_point);
1266 }
1267
1268 // Check whether instruction is a read access to the polling page
1269 // which was emitted by load_from_polling_page(..).
1270 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1271 address* polling_address_ptr) {
1272 if (!is_ld(instruction))
1273 return false; // It's not a ld. Fail.
1274
1275 int rt = inv_rt_field(instruction);
1276 int ra = inv_ra_field(instruction);
1277 int ds = inv_ds_field(instruction);
1278 if (!(ds == 0 && ra != 0 && rt == 0)) {
1279 return false; // It's not a ld(r0, X, ra). Fail.
1280 }
1281
1282 if (!ucontext) {
1283 // Set polling address.
1284 if (polling_address_ptr != NULL) {
1285 *polling_address_ptr = NULL;
1286 }
1287 return true; // No ucontext given. Can't check value of ra. Assume true.
1288 }
1289
1290 #ifdef LINUX
1291 // Ucontext given. Check that register ra contains the address of
1292 // the safepoint polling page.
1293 ucontext_t* uc = (ucontext_t*) ucontext;
1294 // Set polling address.
1295 address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1296 if (polling_address_ptr != NULL) {
1297 *polling_address_ptr = addr;
1298 }
1299 return os::is_poll_address(addr);
1300 #else
1301 // Not on Linux, ucontext must be NULL.
1302 ShouldNotReachHere();
1303 return false;
1304 #endif
1305 }
1306
1307 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1308 #ifdef LINUX
1309 ucontext_t* uc = (ucontext_t*) ucontext;
1310
1311 if (is_stwx(instruction) || is_stwux(instruction)) {
1312 int ra = inv_ra_field(instruction);
1313 int rb = inv_rb_field(instruction);
1314
1315 // look up content of ra and rb in ucontext
1316 address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
1317 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1318 return os::is_memory_serialize_page(thread, ra_val+rb_val);
1319 } else if (is_stw(instruction) || is_stwu(instruction)) {
1320 int ra = inv_ra_field(instruction);
1321 int d1 = inv_d1_field(instruction);
1322
1323 // look up content of ra in ucontext
1324 address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
1325 return os::is_memory_serialize_page(thread, ra_val+d1);
1326 } else {
1327 return false;
1328 }
1329 #else
1330 // workaround not needed on !LINUX :-)
1331 ShouldNotCallThis();
1332 return false;
1333 #endif
1334 }
1335
1336 void MacroAssembler::bang_stack_with_offset(int offset) {
1337 // When increasing the stack, the old stack pointer will be written
1338 // to the new top of stack according to the PPC64 ABI.
1339 // Therefore, stack banging is not necessary when increasing
1340 // the stack by <= os::vm_page_size() bytes.
1341 // When increasing the stack by a larger amount, this method is
1342 // called repeatedly to bang the intermediate pages.
1343
1344 // Stack grows down, caller passes positive offset.
1345 assert(offset > 0, "must bang with positive offset");
1346
1347 long stdoffset = -offset;
1348
1349 if (is_simm(stdoffset, 16)) {
1350 // Signed 16 bit offset, a simple std is ok.
1351 if (UseLoadInstructionsForStackBangingPPC64) {
1352 ld(R0, (int)(signed short)stdoffset, R1_SP);
1353 } else {
1354 std(R0,(int)(signed short)stdoffset, R1_SP);
1355 }
1356 } else if (is_simm(stdoffset, 31)) {
1357 const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1358 const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1359
1360 Register tmp = R11;
1361 addis(tmp, R1_SP, hi);
1362 if (UseLoadInstructionsForStackBangingPPC64) {
1363 ld(R0, lo, tmp);
1364 } else {
1365 std(R0, lo, tmp);
1366 }
1367 } else {
1368 ShouldNotReachHere();
1369 }
1370 }
1371
1372 // If instruction is a stack bang of the form
1373 // std R0, x(Ry), (see bang_stack_with_offset())
1374 // stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame())
1375 // or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame())
1376 // return the banged address. Otherwise, return 0.
1377 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1378 #ifdef LINUX
1379 ucontext_t* uc = (ucontext_t*) ucontext;
1380 int rs = inv_rs_field(instruction);
1381 int ra = inv_ra_field(instruction);
1382 if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64)
1383 || (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1384 || (is_stdu(instruction) && rs == 1)) {
1385 int ds = inv_ds_field(instruction);
1386 // return banged address
1387 return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1388 } else if (is_stdux(instruction) && rs == 1) {
1389 int rb = inv_rb_field(instruction);
1390 address sp = (address)uc->uc_mcontext.regs->gpr[1];
1391 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1392 return ra != 1 || rb_val >= 0 ? NULL // not a stack bang
1393 : sp + rb_val; // banged address
1394 }
1395 return NULL; // not a stack bang
1396 #else
1397 // workaround not needed on !LINUX :-)
1398 ShouldNotCallThis();
1399 return NULL;
1400 #endif
1401 }
1402
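// Compare SP against JavaThread::reserved_stack_activation. Once SP is at or
// above the activation address, re-enable the reserved stack zone in the VM and
// jump to the delayed StackOverflowError stub, with 'return_pc' installed in LR.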
1403 void MacroAssembler::reserved_stack_check(Register return_pc) {
1404 // Test if reserved zone needs to be enabled.
1405 Label no_reserved_zone_enabling;
1406
1407 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1408 cmpld(CCR0, R1_SP, R0);
1409 blt_predict_taken(CCR0, no_reserved_zone_enabling);
1410
1411 // Enable reserved zone again, throw stack overflow exception.
1412 push_frame_reg_args(0, R0);
1413 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1414 pop_frame();
1415 mtlr(return_pc);
1416 load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1417 mtctr(R0);
1418 bctr();
1419
1420 should_not_reach_here();
1421
1422 bind(no_reserved_zone_enabling);
1423 }
1424
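// Atomic 64-bit exchange implemented as a ldarx/stdcx_ (load-reserve /
// store-conditional) retry loop; getandaddd below additionally adds inc_value
// before storing. Both clobber CCR0.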
1425 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1426 bool cmpxchgx_hint) {
1427 Label retry;
1428 bind(retry);
1429 ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1430 stdcx_(exchange_value, addr_base);
1431 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1432 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1433 } else {
1434 bne( CCR0, retry); // StXcx_ sets CCR0.
1435 }
1436 }
1437
1438 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1439 Register tmp, bool cmpxchgx_hint) {
1440 Label retry;
1441 bind(retry);
1442 ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1443 add(tmp, dest_current_value, inc_value);
1444 stdcx_(tmp, addr_base);
1445 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1446 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1447 } else {
1448 bne( CCR0, retry); // StXcx_ sets CCR0.
1449 }
1450 }
1451
1452 // Word/sub-word atomic helper functions
1453
1454 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1455 // Only signed types are supported with size < 4.
1456 // Atomic add always kills tmp1.
1457 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1458 Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1459 bool cmpxchgx_hint, bool is_add, int size) {
1460 // Sub-word instructions are available since Power 8.
1461 // For older processors, instruction_type != size holds, and we
1462 // emulate the sub-word instructions by constructing a 4-byte value
1463 // that leaves the other bytes unchanged.
1464 const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1465
1466 Label retry;
1467 Register shift_amount = noreg,
1468 val32 = dest_current_value,
1469 modval = is_add ? tmp1 : exchange_value;
1470
1471 if (instruction_type != size) {
1472 assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1473 modval = tmp1;
1474 shift_amount = tmp2;
1475 val32 = tmp3;
1476 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2-byte aligned.
1477 #ifdef VM_LITTLE_ENDIAN
1478 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1479 clrrdi(addr_base, addr_base, 2);
1480 #else
1481 xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1482 clrrdi(addr_base, addr_base, 2);
1483 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1484 #endif
1485 }
1486
1487 // atomic emulation loop
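// Each iteration loads the (possibly widened) location with load-and-reserve,
// extracts and modifies the requested sub-word, splices it back, and retries
// the store-conditional until the reservation holds.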
1488 bind(retry);
1489
1490 switch (instruction_type) {
1491 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1492 case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1493 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1494 default: ShouldNotReachHere();
1495 }
1496
1497 if (instruction_type != size) {
1498 srw(dest_current_value, val32, shift_amount);
1499 }
1500
1501 if (is_add) { add(modval, dest_current_value, exchange_value); }
1502
1503 if (instruction_type != size) {
1504 // Transform exchange value such that the replacement can be done by one xor instruction.
1505 xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1506 clrldi(modval, modval, (size == 1) ? 56 : 48);
1507 slw(modval, modval, shift_amount);
1508 xorr(modval, val32, modval);
1509 }
1510
1511 switch (instruction_type) {
1512 case 4: stwcx_(modval, addr_base); break;
1513 case 2: sthcx_(modval, addr_base); break;
1514 case 1: stbcx_(modval, addr_base); break;
1515 default: ShouldNotReachHere();
1516 }
1517
1518 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1519 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1520 } else {
1521 bne( CCR0, retry); // StXcx_ sets CCR0.
1522 }
1523
1524 // l?arx zero-extends, but Java wants byte/short values sign-extended.
1525 if (size == 1) {
1526 extsb(dest_current_value, dest_current_value);
1527 } else if (size == 2) {
1528 extsh(dest_current_value, dest_current_value);
1529 }
1530 }
1531
1532 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1533 // Only signed types are supported with size < 4.
1534 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1535 Register compare_value, Register exchange_value,
1536 Register addr_base, Register tmp1, Register tmp2,
1537 Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1538 // Sub-word instructions are available since Power 8.
1539 // For older processors, instruction_type != size holds, and we
1540 // emulate the sub-word instructions by constructing a 4-byte value
1541 // that leaves the other bytes unchanged.
1542 const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1543
1544 Register shift_amount = noreg,
1545 val32 = dest_current_value,
1546 modval = exchange_value;
1547
1548 if (instruction_type != size) {
1549 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1550 shift_amount = tmp1;
1551 val32 = tmp2;
1552 modval = tmp2;
1553 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2-byte aligned.
1554 #ifdef VM_LITTLE_ENDIAN
1555 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1556 clrrdi(addr_base, addr_base, 2);
1557 #else
1558 xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1559 clrrdi(addr_base, addr_base, 2);
1560 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1561 #endif
1562 // Transform exchange value such that the replacement can be done by one xor instruction.
1563 xorr(exchange_value, compare_value, exchange_value);
1564 clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1565 slw(exchange_value, exchange_value, shift_amount);
1566 }
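  // Added note (comments only): with the transform above, the loop body only needs
  // a single xor to build the store value,
  //   new32 = old32 ^ ((compare ^ exchange) << shift),
  // which replaces the selected byte/short exactly when it currently equals compare_value.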
1567
1568 // atomic emulation loop
1569 bind(retry);
1570
1571 switch (instruction_type) {
1572 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1573 case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1574 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1575 default: ShouldNotReachHere();
1576 }
1577
1578 if (instruction_type != size) {
1579 srw(dest_current_value, val32, shift_amount);
1580 }
1581 if (size == 1) {
1582 extsb(dest_current_value, dest_current_value);
1583 } else if (size == 2) {
1584 extsh(dest_current_value, dest_current_value);
1585 }
1586
1587 cmpw(flag, dest_current_value, compare_value);
1588 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1589 bne_predict_not_taken(flag, failed);
1590 } else {
1591 bne( flag, failed);
1592 }
1593 // branch to failed => (flag == ne), (dest_current_value != compare_value)
1594 // fall through => (flag == eq), (dest_current_value == compare_value)
1595
1596 if (instruction_type != size) {
1597 xorr(modval, val32, exchange_value);
1598 }
1599
1600 switch (instruction_type) {
1601 case 4: stwcx_(modval, addr_base); break;
1602 case 2: sthcx_(modval, addr_base); break;
1603 case 1: stbcx_(modval, addr_base); break;
1604 default: ShouldNotReachHere();
1605 }
1606 }
1607
1608 // CmpxchgX sets condition register to cmpX(current, compare).
1609 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1610 Register compare_value, Register exchange_value,
1611 Register addr_base, Register tmp1, Register tmp2,
1612 int semantics, bool cmpxchgx_hint,
1613 Register int_flag_success, bool contention_hint, bool weak, int size) {
1614 Label retry;
1615 Label failed;
1616 Label done;
1617
1618 // Save one branch if result is returned via register and
1619 // result register is different from the other ones.
1620 bool use_result_reg = (int_flag_success != noreg);
1621 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1622 int_flag_success != exchange_value && int_flag_success != addr_base &&
1623 int_flag_success != tmp1 && int_flag_success != tmp2);
1624 assert(!weak || flag == CCR0, "weak only supported with CCR0");
1625 assert(size == 1 || size == 2 || size == 4, "unsupported");
1626
1627 if (use_result_reg && preset_result_reg) {
1628 li(int_flag_success, 0); // preset (assume cas failed)
1629 }
1630
1631 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1632 if (contention_hint) { // Don't try to reserve if cmp fails.
1633 switch (size) {
1634 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1635 case 2: lha(dest_current_value, 0, addr_base); break;
1636 case 4: lwz(dest_current_value, 0, addr_base); break;
1637 default: ShouldNotReachHere();
1638 }
1639 cmpw(flag, dest_current_value, compare_value);
1640 bne(flag, failed);
1641 }
1642
1643 // release/fence semantics
1644 if (semantics & MemBarRel) {
1645 release();
1646 }
1647
1648 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1649 retry, failed, cmpxchgx_hint, size);
1650 if (!weak || use_result_reg) {
1651 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1652 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1653 } else {
1654 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1655 }
1656 }
1657 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped)
1658
1659 // Result in register (must do this at the end because int_flag_success can be the
1660 // same register as one above).
1661 if (use_result_reg) {
1662 li(int_flag_success, 1);
1663 }
1664
1665 if (semantics & MemBarFenceAfter) {
1666 fence();
1667 } else if (semantics & MemBarAcq) {
1668 isync();
1669 }
1670
1671 if (use_result_reg && !preset_result_reg) {
1672 b(done);
1673 }
1674
1675 bind(failed);
1676 if (use_result_reg && !preset_result_reg) {
1677 li(int_flag_success, 0);
1678 }
1679
1680 bind(done);
1681 // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1682 // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1683 }
1684
1685 // Performs an atomic compare-and-exchange:
1686 // if (compare_value == *addr_base)
1687 // *addr_base = exchange_value
1688 // int_flag_success = 1;
1689 // else
1690 // int_flag_success = 0;
1691 //
1692 // ConditionRegister flag = cmp(compare_value, *addr_base)
1693 // Register dest_current_value = *addr_base
1694 // Register compare_value Used to compare with value in memory
1695 // Register exchange_value Written to memory if compare_value == *addr_base
1696 // Register addr_base The memory location to compareXChange
1697 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base
1698 //
1699 // To avoid the costly compare-and-exchange, the value can be tested beforehand (contention hint).
1700 // Several special cases exist to avoid generating unnecessary code.
1701 //
1702 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1703 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1704 Register addr_base, int semantics, bool cmpxchgx_hint,
1705 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1706 Label retry;
1707 Label failed_int;
1708 Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1709 Label done;
1710
1711 // Save one branch if result is returned via register and result register is different from the other ones.
1712 bool use_result_reg = (int_flag_success!=noreg);
1713 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1714 int_flag_success!=exchange_value && int_flag_success!=addr_base);
1715 assert(!weak || flag == CCR0, "weak only supported with CCR0");
1716 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1717
1718 if (use_result_reg && preset_result_reg) {
1719 li(int_flag_success, 0); // preset (assume cas failed)
1720 }
1721
1722 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1723 if (contention_hint) { // Don't try to reserve if cmp fails.
1724 ld(dest_current_value, 0, addr_base);
1725 cmpd(flag, compare_value, dest_current_value);
1726 bne(flag, failed);
1727 }
1728
1729 // release/fence semantics
1730 if (semantics & MemBarRel) {
1731 release();
1732 }
1733
1734 // atomic emulation loop
1735 bind(retry);
1736
1737 ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1738 cmpd(flag, compare_value, dest_current_value);
1739 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1740 bne_predict_not_taken(flag, failed);
1741 } else {
1742 bne( flag, failed);
1743 }
1744
1745 stdcx_(exchange_value, addr_base);
1746 if (!weak || use_result_reg || failed_ext) {
1747 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1748 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1749 } else {
1750 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1751 }
1752 }
1753
1754 // result in register (must do this at the end because int_flag_success can be the same register as one above)
1755 if (use_result_reg) {
1756 li(int_flag_success, 1);
1757 }
1758
1759 if (semantics & MemBarFenceAfter) {
1760 fence();
1761 } else if (semantics & MemBarAcq) {
1762 isync();
1763 }
1764
1765 if (use_result_reg && !preset_result_reg) {
1766 b(done);
1767 }
1768
1769 bind(failed_int);
1770 if (use_result_reg && !preset_result_reg) {
1771 li(int_flag_success, 0);
1772 }
1773
1774 bind(done);
1775 // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1776 // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1777 }
1778
1779 // Look up the method for a megamorphic invokeinterface call.
1780 // The target method is determined by <intf_klass, itable_index>.
1781 // The receiver klass is in recv_klass.
1782 // On success, the result will be in method_result, and execution falls through.
1783 // On failure, execution transfers to the given label.
1784 void MacroAssembler::lookup_interface_method(Register recv_klass,
1785 Register intf_klass,
1786 RegisterOrConstant itable_index,
1787 Register method_result,
1788 Register scan_temp,
1789 Register sethi_temp,
1790 Label& L_no_such_interface) {
1791 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1792 assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1793 "caller must use same register for non-constant itable index as for method");
1794
1795 // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1796 int vtable_base = in_bytes(Klass::vtable_start_offset());
1797 int itentry_off = itableMethodEntry::method_offset_in_bytes();
1798 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize);
1799 int scan_step = itableOffsetEntry::size() * wordSize;
1800 int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1801
1802 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1803 // %%% We should store the aligned, prescaled offset in the klassoop.
1804 // Then the next several instructions would fold away.
1805
1806 sldi(scan_temp, scan_temp, log_vte_size);
1807 addi(scan_temp, scan_temp, vtable_base);
1808 add(scan_temp, recv_klass, scan_temp);
1809
1810 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1811 if (itable_index.is_register()) {
1812 Register itable_offset = itable_index.as_register();
1813 sldi(itable_offset, itable_offset, logMEsize);
1814 if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
1815 add(recv_klass, itable_offset, recv_klass);
1816 } else {
1817 long itable_offset = (long)itable_index.as_constant();
1818 load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
1819 add(recv_klass, sethi_temp, recv_klass);
1820 }
1821
1822 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1823 // if (scan->interface() == intf) {
1824 // result = (klass + scan->offset() + itable_index);
1825 // }
1826 // }
1827 Label search, found_method;
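  // Added note: the scan loop below is peeled once (peel == 1 handles the first
  // itable entry), so the common case of a hit on the first entry falls straight
  // through to found_method without taking a backward branch.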
1828
1829 for (int peel = 1; peel >= 0; peel--) {
1830 // %%%% Could load both offset and interface in one ldx, if they were
1831 // in the opposite order. This would save a load.
1832 ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1833
1834 // Check that this entry is non-null. A null entry means that
1835 // the receiver class doesn't implement the interface, and wasn't the
1836 // same as when the caller was compiled.
1837 cmpd(CCR0, method_result, intf_klass);
1838
1839 if (peel) {
1840 beq(CCR0, found_method);
1841 } else {
1842 bne(CCR0, search);
1843 // (invert the test to fall through to found_method...)
1844 }
1845
1846 if (!peel) break;
1847
1848 bind(search);
1849
1850 cmpdi(CCR0, method_result, 0);
1851 beq(CCR0, L_no_such_interface);
1852 addi(scan_temp, scan_temp, scan_step);
1853 }
1854
1855 bind(found_method);
1856
1857 // Got a hit.
1858 int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1859 lwz(scan_temp, ito_offset, scan_temp);
1860 ldx(method_result, scan_temp, recv_klass);
1861 }
1862
1863 // virtual method calling
1864 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1865 RegisterOrConstant vtable_index,
1866 Register method_result) {
1867
1868 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1869
1870 const int base = in_bytes(Klass::vtable_start_offset());
1871 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1872
1873 if (vtable_index.is_register()) {
1874 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1875 add(recv_klass, vtable_index.as_register(), recv_klass);
1876 } else {
1877 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1878 }
1879 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1880 }
1881
1882 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1883 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1884 Register super_klass,
1885 Register temp1_reg,
1886 Register temp2_reg,
1887 Label* L_success,
1888 Label* L_failure,
1889 Label* L_slow_path,
1890 RegisterOrConstant super_check_offset) {
1891
1892 const Register check_cache_offset = temp1_reg;
1893 const Register cached_super = temp2_reg;
1894
1895 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1896
1897 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1898 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1899
1900 bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1901 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1902
1903 Label L_fallthrough;
1904 int label_nulls = 0;
1905 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
1906 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
1907 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1908 assert(label_nulls <= 1 ||
1909 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1910 "at most one NULL in the batch, usually");
1911
1912 // If the pointers are equal, we are done (e.g., String[] elements).
1913 // This self-check enables sharing of secondary supertype arrays among
1914 // non-primary types such as array-of-interface. Otherwise, each such
1915 // type would need its own customized SSA.
1916 // We move this check to the front of the fast path because many
1917 // type checks are in fact trivially successful in this manner,
1918 // so we get a nicely predicted branch right at the start of the check.
1919 cmpd(CCR0, sub_klass, super_klass);
1920 beq(CCR0, *L_success);
1921
1922 // Check the supertype display:
1923 if (must_load_sco) {
1924 // The super check offset is always positive...
1925 lwz(check_cache_offset, sco_offset, super_klass);
1926 super_check_offset = RegisterOrConstant(check_cache_offset);
1927 // super_check_offset is register.
1928 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1929 }
1930 // The loaded value is the offset from KlassOopDesc.
1931
1932 ld(cached_super, super_check_offset, sub_klass);
1933 cmpd(CCR0, cached_super, super_klass);
1934
1935 // This check has worked decisively for primary supers.
1936 // Secondary supers are sought in the super_cache ('super_cache_addr').
1937 // (Secondary supers are interfaces and very deeply nested subtypes.)
1938 // This works in the same check above because of a tricky aliasing
1939 // between the super_cache and the primary super display elements.
1940 // (The 'super_check_addr' can address either, as the case requires.)
1941 // Note that the cache is updated below if it does not help us find
1942 // what we need immediately.
1943 // So if it was a primary super, we can just fail immediately.
1944 // Otherwise, it's the slow path for us (no success at this point).
1945
1946 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
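// Added note: FINAL_JUMP only emits an unconditional branch when the target is a
// real label; if the target is the local fall-through label, no code is needed.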
1947
1948 if (super_check_offset.is_register()) {
1949 beq(CCR0, *L_success);
1950 cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1951 if (L_failure == &L_fallthrough) {
1952 beq(CCR0, *L_slow_path);
1953 } else {
1954 bne(CCR0, *L_failure);
1955 FINAL_JUMP(*L_slow_path);
1956 }
1957 } else {
1958 if (super_check_offset.as_constant() == sc_offset) {
1959 // Need a slow path; fast failure is impossible.
1960 if (L_slow_path == &L_fallthrough) {
1961 beq(CCR0, *L_success);
1962 } else {
1963 bne(CCR0, *L_slow_path);
1964 FINAL_JUMP(*L_success);
1965 }
1966 } else {
1967 // No slow path; it's a fast decision.
1968 if (L_failure == &L_fallthrough) {
1969 beq(CCR0, *L_success);
1970 } else {
1971 bne(CCR0, *L_failure);
1972 FINAL_JUMP(*L_success);
1973 }
1974 }
1975 }
1976
1977 bind(L_fallthrough);
1978 #undef FINAL_JUMP
1979 }
1980
1981 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1982 Register super_klass,
1983 Register temp1_reg,
1984 Register temp2_reg,
1985 Label* L_success,
1986 Register result_reg) {
1987 const Register array_ptr = temp1_reg; // current value from cache array
1988 const Register temp = temp2_reg;
1989
1990 assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1991
1992 int source_offset = in_bytes(Klass::secondary_supers_offset());
1993 int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1994
1995 int length_offset = Array<Klass*>::length_offset_in_bytes();
1996 int base_offset = Array<Klass*>::base_offset_in_bytes();
1997
1998 Label hit, loop, failure, fallthru;
1999
2000 ld(array_ptr, source_offset, sub_klass);
2001
2002 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2003 lwz(temp, length_offset, array_ptr);
2004 cmpwi(CCR0, temp, 0);
2005 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
2006
2007 mtctr(temp); // load ctr
2008
2009 bind(loop);
2010 // Klass pointers in the table are no longer compressed.
2011 ld(temp, base_offset, array_ptr);
2012 cmpd(CCR0, temp, super_klass);
2013 beq(CCR0, hit);
2014 addi(array_ptr, array_ptr, BytesPerWord);
2015 bdnz(loop);
2016
2017 bind(failure);
2018 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2019 b(fallthru);
2020
2021 bind(hit);
2022 std(super_klass, target_offset, sub_klass); // save result to cache
2023 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2024 if (L_success != NULL) { b(*L_success); }
2025 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2026
2027 bind(fallthru);
2028 }
2029
2030 // Try fast path, then go to slow one if not successful
2031 void MacroAssembler::check_klass_subtype(Register sub_klass,
2032 Register super_klass,
2033 Register temp1_reg,
2034 Register temp2_reg,
2035 Label& L_success) {
2036 Label L_failure;
2037 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2038 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2039 bind(L_failure); // Fallthru if not successful.
2040 }
2041
2042 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
2043 Register temp_reg,
2044 Label& wrong_method_type) {
2045 assert_different_registers(mtype_reg, mh_reg, temp_reg);
2046 // Compare method type against that of the receiver.
2047 load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
2048 cmpd(CCR0, temp_reg, mtype_reg);
2049 bne(CCR0, wrong_method_type);
2050 }
2051
2052 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2053 Register temp_reg,
2054 int extra_slot_offset) {
2055 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2056 int stackElementSize = Interpreter::stackElementSize;
2057 int offset = extra_slot_offset * stackElementSize;
2058 if (arg_slot.is_constant()) {
2059 offset += arg_slot.as_constant() * stackElementSize;
2060 return offset;
2061 } else {
2062 assert(temp_reg != noreg, "must specify");
2063 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2064 if (offset != 0)
2065 addi(temp_reg, temp_reg, offset);
2066 return temp_reg;
2067 }
2068 }
2069
2070 // Supports temp2_reg = R0.
2071 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2072 Register mark_reg, Register temp_reg,
2073 Register temp2_reg, Label& done, Label* slow_case) {
2074 assert(UseBiasedLocking, "why call this otherwise?");
2075
2076 #ifdef ASSERT
2077 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2078 #endif
2079
2080 Label cas_label;
2081
2082 // Branch to done if fast path fails and no slow_case provided.
2083 Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2084
2085 // Biased locking
2086 // See whether the lock is currently biased toward our thread and
2087 // whether the epoch is still valid
2088 // Note that the runtime guarantees sufficient alignment of JavaThread
2089 // pointers to allow age to be placed into low bits
2090 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
2091 "biased locking makes assumptions about bit layout");
2092
2093 if (PrintBiasedLockingStatistics) {
2094 load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2095 lwzx(temp_reg, temp2_reg);
2096 addi(temp_reg, temp_reg, 1);
2097 stwx(temp_reg, temp2_reg);
2098 }
2099
2100 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
2101 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2102 bne(cr_reg, cas_label);
2103
2104 load_klass(temp_reg, obj_reg);
2105
2106 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
2107 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2108 orr(temp_reg, R16_thread, temp_reg);
2109 xorr(temp_reg, mark_reg, temp_reg);
2110 andr(temp_reg, temp_reg, temp2_reg);
2111 cmpdi(cr_reg, temp_reg, 0);
2112 if (PrintBiasedLockingStatistics) {
2113 Label l;
2114 bne(cr_reg, l);
2115 load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2116 lwzx(mark_reg, temp2_reg);
2117 addi(mark_reg, mark_reg, 1);
2118 stwx(mark_reg, temp2_reg);
2119 // restore mark_reg
2120 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2121 bind(l);
2122 }
2123 beq(cr_reg, done);
2124
2125 Label try_revoke_bias;
2126 Label try_rebias;
2127
2128 // At this point we know that the header has the bias pattern and
2129 // that we are not the bias owner in the current epoch. We need to
2130 // figure out more details about the state of the header in order to
2131 // know what operations can be legally performed on the object's
2132 // header.
2133
2134 // If the low three bits in the xor result aren't clear, that means
2135 // the prototype header is no longer biased and we have to revoke
2136 // the bias on this object.
2137 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2138 cmpwi(cr_reg, temp2_reg, 0);
2139 bne(cr_reg, try_revoke_bias);
2140
2141 // Biasing is still enabled for this data type. See whether the
2142 // epoch of the current bias is still valid, meaning that the epoch
2143 // bits of the mark word are equal to the epoch bits of the
2144 // prototype header. (Note that the prototype header's epoch bits
2145 // only change at a safepoint.) If not, attempt to rebias the object
2146 // toward the current thread. Note that we must be absolutely sure
2147 // that the current epoch is invalid in order to do this because
2148 // otherwise the manipulations it performs on the mark word are
2149 // illegal.
2150
2151 int shift_amount = 64 - markOopDesc::epoch_shift;
2152 // rotate epoch bits to right (little) end and set other bits to 0
2153 // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2154 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
2155 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
2156 bne(CCR0, try_rebias);
2157
2158 // The epoch of the current bias is still valid but we know nothing
2159 // about the owner; it might be set or it might be clear. Try to
2160 // acquire the bias of the object using an atomic operation. If this
2161 // fails we will go in to the runtime to revoke the object's bias.
2162 // Note that we first construct the presumed unbiased header so we
2163 // don't accidentally blow away another thread's valid bias.
2164 andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
2165 markOopDesc::age_mask_in_place |
2166 markOopDesc::epoch_mask_in_place));
2167 orr(temp_reg, R16_thread, mark_reg);
2168
2169 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2170
2171 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2172 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2173 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2174 /*where=*/obj_reg,
2175 MacroAssembler::MemBarAcq,
2176 MacroAssembler::cmpxchgx_hint_acquire_lock(),
2177 noreg, slow_case_int); // bail out if failed
2178
2179 // If the biasing toward our thread failed, this means that
2180 // another thread succeeded in biasing it toward itself and we
2181 // need to revoke that bias. The revocation will occur in the
2182 // interpreter runtime in the slow case.
2183 if (PrintBiasedLockingStatistics) {
2184 load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2185 lwzx(temp_reg, temp2_reg);
2186 addi(temp_reg, temp_reg, 1);
2187 stwx(temp_reg, temp2_reg);
2188 }
2189 b(done);
2190
2191 bind(try_rebias);
2192 // At this point we know the epoch has expired, meaning that the
2193 // current "bias owner", if any, is actually invalid. Under these
2194 // circumstances _only_, we are allowed to use the current header's
2195 // value as the comparison value when doing the cas to acquire the
2196 // bias in the current epoch. In other words, we allow transfer of
2197 // the bias from one thread to another directly in this situation.
2198 load_klass(temp_reg, obj_reg);
2199 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2200 orr(temp2_reg, R16_thread, temp2_reg);
2201 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2202 orr(temp_reg, temp2_reg, temp_reg);
2203
2204 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2205
2206 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2207 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2208 /*where=*/obj_reg,
2209 MacroAssembler::MemBarAcq,
2210 MacroAssembler::cmpxchgx_hint_acquire_lock(),
2211 noreg, slow_case_int); // bail out if failed
2212
2213 // If the biasing toward our thread failed, this means that
2214 // another thread succeeded in biasing it toward itself and we
2215 // need to revoke that bias. The revocation will occur in the
2216 // interpreter runtime in the slow case.
2217 if (PrintBiasedLockingStatistics) {
2218 load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2219 lwzx(temp_reg, temp2_reg);
2220 addi(temp_reg, temp_reg, 1);
2221 stwx(temp_reg, temp2_reg);
2222 }
2223 b(done);
2224
2225 bind(try_revoke_bias);
2226 // The prototype mark in the klass doesn't have the bias bit set any
2227 // more, indicating that objects of this data type are not supposed
2228 // to be biased any more. We are going to try to reset the mark of
2229 // this object to the prototype value and fall through to the
2230 // CAS-based locking scheme. Note that if our CAS fails, it means
2231 // that another thread raced us for the privilege of revoking the
2232 // bias of this particular object, so it's okay to continue in the
2233 // normal locking code.
2234 load_klass(temp_reg, obj_reg);
2235 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2236 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2237 orr(temp_reg, temp_reg, temp2_reg);
2238
2239 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2240
2241 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2242 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2243 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2244 /*where=*/obj_reg,
2245 MacroAssembler::MemBarAcq,
2246 MacroAssembler::cmpxchgx_hint_acquire_lock());
2247
2248 // reload markOop in mark_reg before continuing with lightweight locking
2249 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2250
2251 // Fall through to the normal CAS-based lock, because no matter what
2252 // the result of the above CAS, some thread must have succeeded in
2253 // removing the bias bit from the object's header.
2254 if (PrintBiasedLockingStatistics) {
2255 Label l;
2256 bne(cr_reg, l);
2257 load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2258 lwzx(temp_reg, temp2_reg);
2259 addi(temp_reg, temp_reg, 1);
2260 stwx(temp_reg, temp2_reg);
2261 bind(l);
2262 }
2263
2264 bind(cas_label);
2265 }
2266
2267 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2268 // Check for biased locking unlock case, which is a no-op
2269 // Note: we do not have to check the thread ID for two reasons.
2270 // First, the interpreter checks for IllegalMonitorStateException at
2271 // a higher level. Second, if the bias was revoked while we held the
2272 // lock, the object could not be rebiased toward another thread, so
2273 // the bias bit would be clear.
2274
2275 ld(temp_reg, 0, mark_addr);
2276 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2277
2278 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2279 beq(cr_reg, done);
2280 }
2281
2282 // allocation (for C1)
2283 void MacroAssembler::eden_allocate(
2284 Register obj, // result: pointer to object after successful allocation
2285 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
2286 int con_size_in_bytes, // object size in bytes if known at compile time
2287 Register t1, // temp register
2288 Register t2, // temp register
2289 Label& slow_case // continuation point if fast allocation fails
2290 ) {
2291 b(slow_case);
2292 }
2293
2294 void MacroAssembler::tlab_allocate(
2295 Register obj, // result: pointer to object after successful allocation
2296 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
2297 int con_size_in_bytes, // object size in bytes if known at compile time
2298 Register t1, // temp register
2299 Label& slow_case // continuation point if fast allocation fails
2300 ) {
2301 // make sure arguments make sense
2302 assert_different_registers(obj, var_size_in_bytes, t1);
2303 assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
2304 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2305
2306 const Register new_top = t1;
2307 //verify_tlab(); not implemented
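  // Added note: an illustrative sketch of the bump-pointer allocation emitted below
  // (comments only):
  //   obj     = thread->tlab_top;
  //   new_top = obj + size;                   // con_size_in_bytes or var_size_in_bytes
  //   if (new_top > thread->tlab_end) goto slow_case;
  //   thread->tlab_top = new_top;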
2308
2309 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2310 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2311 if (var_size_in_bytes == noreg) {
2312 addi(new_top, obj, con_size_in_bytes);
2313 } else {
2314 add(new_top, obj, var_size_in_bytes);
2315 }
2316 cmpld(CCR0, new_top, R0);
2317 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2318
2319 #ifdef ASSERT
2320 // make sure new free pointer is properly aligned
2321 {
2322 Label L;
2323 andi_(R0, new_top, MinObjAlignmentInBytesMask);
2324 beq(CCR0, L);
2325 stop("updated TLAB free is not properly aligned", 0x934);
2326 bind(L);
2327 }
2328 #endif // ASSERT
2329
2330 // update the tlab top pointer
2331 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2332 //verify_tlab(); not implemented
2333 }
2334 void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) {
2335 unimplemented("tlab_refill");
2336 }
2337 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2338 unimplemented("incr_allocated_bytes");
2339 }
2340
2341 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2342 int insts_call_instruction_offset, Register Rtoc) {
2343 // Start the stub.
2344 address stub = start_a_stub(64);
2345 if (stub == NULL) { return NULL; } // CodeCache full: bail out
2346
2347 // Create a trampoline stub relocation which relates this trampoline stub
2348 // with the call instruction at insts_call_instruction_offset in the
2349 // instructions code-section.
2350 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2351 const int stub_start_offset = offset();
2352
2353 // For java_to_interp stubs we use R11_scratch1 as scratch register
2354 // and in call trampoline stubs we use R12_scratch2. This way we
2355 // can distinguish them (see is_NativeCallTrampolineStub_at()).
2356 Register reg_scratch = R12_scratch2;
2357
2358 // Now, create the trampoline stub's code:
2359 // - load the TOC
2360 // - load the call target from the constant pool
2361 // - call
2362 if (Rtoc == noreg) {
2363 calculate_address_from_global_toc(reg_scratch, method_toc());
2364 Rtoc = reg_scratch;
2365 }
2366
2367 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2368 mtctr(reg_scratch);
2369 bctr();
2370
2371 const address stub_start_addr = addr_at(stub_start_offset);
2372
2373 // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2374 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2375 "encoded offset into the constant pool must match");
2376 // Trampoline_stub_size should be good.
2377 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2378 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2379
2380 // End the stub.
2381 end_a_stub();
2382 return stub;
2383 }
2384
2385 // TM on PPC64.
2386 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2387 Label retry;
2388 bind(retry);
2389 ldarx(result, addr, /*hint*/ false);
2390 addi(result, result, simm16);
2391 stdcx_(result, addr);
2392 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2393 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2394 } else {
2395 bne( CCR0, retry); // stXcx_ sets CCR0
2396 }
2397 }
2398
2399 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2400 Label retry;
2401 bind(retry);
2402 lwarx(result, addr, /*hint*/ false);
2403 ori(result, result, uimm16);
2404 stwcx_(result, addr);
2405 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2406 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2407 } else {
2408 bne( CCR0, retry); // stXcx_ sets CCR0
2409 }
2410 }
2411
2412 #if INCLUDE_RTM_OPT
2413
2414 // Update rtm_counters based on abort status
2415 // input: abort_status
2416 // rtm_counters (RTMLockingCounters*)
2417 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2418 // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2419 // x86 ppc (! means inverted, ? means not the same)
2420 // 0 31 Set if abort caused by XABORT instruction.
2421 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2422 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2423 // 3 10 Set if an internal buffer overflowed.
2424 // 4 ?12 Set if a debug breakpoint was hit.
2425 // 5 ?32 Set if an abort occurred during execution of a nested transaction.
2426 const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2427 Assembler::tm_failure_persistent, // inverted: transient
2428 Assembler::tm_trans_cf,
2429 Assembler::tm_footprint_of,
2430 Assembler::tm_non_trans_cf,
2431 Assembler::tm_suspended};
2432 const bool tm_failure_inv[] = {false, true, false, false, false, false};
2433 assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2434
2435 const Register addr_Reg = R0;
2436 // Keep track of the offset from where rtm_counters_Reg originally pointed.
2437 int counters_offs = RTMLockingCounters::abort_count_offset();
2438 addi(addr_Reg, rtm_counters_Reg, counters_offs);
2439 const Register temp_Reg = rtm_counters_Reg;
2440
2441 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2442 ldx(temp_Reg, addr_Reg);
2443 addi(temp_Reg, temp_Reg, 1);
2444 stdx(temp_Reg, addr_Reg);
2445
2446 if (PrintPreciseRTMLockingStatistics) {
2447 int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2448
2449 //mftexasr(abort_status); done by caller
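    // Added note: rldicr_ in the loop below rotates the failure bit under test into
    // the most significant position and clears all other bits, so CR0 directly
    // reflects whether that particular abort-status bit is set.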
2450 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2451 counters_offs += counters_offs_delta;
2452 li(temp_Reg, counters_offs_delta); // can't use addi with R0
2453 add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2454 counters_offs_delta = sizeof(uintx);
2455
2456 Label check_abort;
2457 rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2458 if (tm_failure_inv[i]) {
2459 bne(CCR0, check_abort);
2460 } else {
2461 beq(CCR0, check_abort);
2462 }
2463 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2464 ldx(temp_Reg, addr_Reg);
2465 addi(temp_Reg, temp_Reg, 1);
2466 stdx(temp_Reg, addr_Reg);
2467 bind(check_abort);
2468 }
2469 }
2470 li(temp_Reg, -counters_offs); // can't use addi with R0
2471 add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2472 }
2473
2474 // Branch if (random & (count-1) != 0), count is 2^n
2475 // tmp and CR0 are killed
2476 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2477 mftb(tmp);
2478 andi_(tmp, tmp, count-1);
2479 bne(CCR0, brLabel);
2480 }
2481
2482 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2483 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2484 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2485 RTMLockingCounters* rtm_counters,
2486 Metadata* method_data) {
2487 Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2488
2489 if (RTMLockingCalculationDelay > 0) {
2490 // Delay calculation.
2491 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2492 cmpdi(CCR0, rtm_counters_Reg, 0);
2493 beq(CCR0, L_done);
2494 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2495 }
2496 // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2497 // Aborted transactions = abort_count * 100
2498 // All transactions = total_count * RTMTotalCountIncrRate
2499 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
2500 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2501 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only.
2502 cmpdi(CCR0, R0, RTMAbortThreshold);
2503 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary
2504 } else {
2505 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2506 cmpd(CCR0, R0, rtm_counters_Reg);
2507 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required
2508 }
2509 mulli(R0, R0, 100);
2510
2511 const Register tmpReg = rtm_counters_Reg;
2512 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2513 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2514 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16
2515 cmpd(CCR0, R0, tmpReg);
2516 blt(CCR0, L_check_always_rtm1); // jump to reload
2517 if (method_data != NULL) {
2518 // Set rtm_state to "no rtm" in MDO.
2519 // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2520 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2521 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2522 atomic_ori_int(R0, tmpReg, NoRTM);
2523 }
2524 b(L_done);
2525
2526 bind(L_check_always_rtm1);
2527 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2528 bind(L_check_always_rtm2);
2529 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2530 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2531 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only.
2532 cmpdi(CCR0, tmpReg, thresholdValue);
2533 } else {
2534 load_const_optimized(R0, thresholdValue);
2535 cmpd(CCR0, tmpReg, R0);
2536 }
2537 blt(CCR0, L_done);
2538 if (method_data != NULL) {
2539 // Set rtm_state to "always rtm" in MDO.
2540 // Not using a metadata relocation. See above.
2541 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2542 atomic_ori_int(R0, tmpReg, UseRTM);
2543 }
2544 bind(L_done);
2545 }
2546
2547 // Update counters and perform abort ratio calculation.
2548 // input: abort_status_Reg
2549 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2550 RTMLockingCounters* rtm_counters,
2551 Metadata* method_data,
2552 bool profile_rtm) {
2553
2554 assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2555 // Update rtm counters based on state at abort.
2556 // Reads abort_status_Reg, updates flags.
2557 assert_different_registers(abort_status_Reg, temp_Reg);
2558 load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2559 rtm_counters_update(abort_status_Reg, temp_Reg);
2560 if (profile_rtm) {
2561 assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2562 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2563 }
2564 }
2565
2566 // Retry on abort if abort's status indicates non-persistent failure.
2567 // inputs: retry_count_Reg
2568 // : abort_status_Reg
2569 // output: retry_count_Reg decremented by 1
2570 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2571 Label& retryLabel, Label* checkRetry) {
2572 Label doneRetry;
2573 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2574 bne(CCR0, doneRetry);
2575 if (checkRetry) { bind(*checkRetry); }
2576 addic_(retry_count_Reg, retry_count_Reg, -1);
2577 blt(CCR0, doneRetry);
2578 smt_yield(); // Can't use wait(). No permission (SIGILL).
2579 b(retryLabel);
2580 bind(doneRetry);
2581 }
2582
2583 // Spin and retry if lock is busy.
2584 // inputs: owner_addr_Reg (monitor address)
2585 // : retry_count_Reg
2586 // output: retry_count_Reg decremented by 1
2587 // CTR is killed
2588 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2589 Label SpinLoop, doneRetry;
2590 addic_(retry_count_Reg, retry_count_Reg, -1);
2591 blt(CCR0, doneRetry);
2592
2593 if (RTMSpinLoopCount > 1) {
2594 li(R0, RTMSpinLoopCount);
2595 mtctr(R0);
2596 }
2597
2598 bind(SpinLoop);
2599 smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
2600
2601 if (RTMSpinLoopCount > 1) {
2602 bdz(retryLabel);
2603 ld(R0, 0, owner_addr_Reg);
2604 cmpdi(CCR0, R0, 0);
2605 bne(CCR0, SpinLoop);
2606 }
2607
2608 b(retryLabel);
2609
2610 bind(doneRetry);
2611 }
2612
2613 // Use RTM for normal stack locks.
2614 // Input: objReg (object to lock)
2615 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2616 Register obj, Register mark_word, Register tmp,
2617 Register retry_on_abort_count_Reg,
2618 RTMLockingCounters* stack_rtm_counters,
2619 Metadata* method_data, bool profile_rtm,
2620 Label& DONE_LABEL, Label& IsInflated) {
2621 assert(UseRTMForStackLocks, "why call this otherwise?");
2622 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2623 Label L_rtm_retry, L_decrement_retry, L_on_abort;
2624
2625 if (RTMRetryCount > 0) {
2626 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2627 bind(L_rtm_retry);
2628 }
2629 andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
2630 bne(CCR0, IsInflated);
2631
2632 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2633 Label L_noincrement;
2634 if (RTMTotalCountIncrRate > 1) {
2635 branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2636 }
2637 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2638 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2639 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2640 ldx(mark_word, tmp);
2641 addi(mark_word, mark_word, 1);
2642 stdx(mark_word, tmp);
2643 bind(L_noincrement);
2644 }
2645 tbegin_();
2646 beq(CCR0, L_on_abort);
2647 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked.
2648 andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2649 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked
2650 beq(flag, DONE_LABEL); // all done if unlocked
2651
2652 if (UseRTMXendForLockBusy) {
2653 tend_();
2654 b(L_decrement_retry);
2655 } else {
2656 tabort_();
2657 }
2658 bind(L_on_abort);
2659 const Register abort_status_Reg = tmp;
2660 mftexasr(abort_status_Reg);
2661 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2662 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2663 }
2664 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2665 if (RTMRetryCount > 0) {
2666 // Retry on lock abort if abort status is not permanent.
2667 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2668 } else {
2669 bind(L_decrement_retry);
2670 }
2671 }
2672
2673 // Use RTM for inflating locks
2674 // inputs: obj (object to lock)
2675 // mark_word (current header - KILLED)
2676 // boxReg (on-stack box address (displaced header location) - KILLED)
2677 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2678 Register obj, Register mark_word, Register boxReg,
2679 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2680 RTMLockingCounters* rtm_counters,
2681 Metadata* method_data, bool profile_rtm,
2682 Label& DONE_LABEL) {
2683 assert(UseRTMLocking, "why call this otherwise?");
2684 Label L_rtm_retry, L_decrement_retry, L_on_abort;
2685 // Clean monitor_value bit to get valid pointer.
2686 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2687
2688 // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2689 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2690 const Register tmpReg = boxReg;
2691 const Register owner_addr_Reg = mark_word;
2692 addi(owner_addr_Reg, mark_word, owner_offset);
2693
2694 if (RTMRetryCount > 0) {
2695 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy.
2696 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2697 bind(L_rtm_retry);
2698 }
2699 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2700 Label L_noincrement;
2701 if (RTMTotalCountIncrRate > 1) {
2702 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2703 }
2704 assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2705 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2706 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2707 ldx(tmpReg, R0);
2708 addi(tmpReg, tmpReg, 1);
2709 stdx(tmpReg, R0);
2710 bind(L_noincrement);
2711 }
2712 tbegin_();
2713 beq(CCR0, L_on_abort);
2714 // We don't reload mark word. Will only be reset at safepoint.
2715 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2716 cmpdi(flag, R0, 0);
2717 beq(flag, DONE_LABEL);
2718
2719 if (UseRTMXendForLockBusy) {
2720 tend_();
2721 b(L_decrement_retry);
2722 } else {
2723 tabort_();
2724 }
2725 bind(L_on_abort);
2726 const Register abort_status_Reg = tmpReg;
2727 mftexasr(abort_status_Reg);
2728 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2729 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2730 // Restore owner_addr_Reg
2731 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2732 #ifdef ASSERT
2733 andi_(R0, mark_word, markOopDesc::monitor_value);
2734 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2735 #endif
2736 addi(owner_addr_Reg, mark_word, owner_offset);
2737 }
2738 if (RTMRetryCount > 0) {
2739 // Retry on lock abort if abort status is not permanent.
2740 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2741 }
2742
2743 // Appears unlocked - try to swing _owner from null to non-null.
2744 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2745 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2746 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2747
2748 if (RTMRetryCount > 0) {
2749 // success done else retry
2750 b(DONE_LABEL);
2751 bind(L_decrement_retry);
2752 // Spin and retry if lock is busy.
2753 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2754 } else {
2755 bind(L_decrement_retry);
2756 }
2757 }
2758
2759 #endif // INCLUDE_RTM_OPT
2760
2761 // "The box" is the space on the stack where we copy the object mark.
2762 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2763 Register temp, Register displaced_header, Register current_header,
2764 bool try_bias,
2765 RTMLockingCounters* rtm_counters,
2766 RTMLockingCounters* stack_rtm_counters,
2767 Metadata* method_data,
2768 bool use_rtm, bool profile_rtm) {
2769 assert_different_registers(oop, box, temp, displaced_header, current_header);
2770 assert(flag != CCR0, "bad condition register");
2771 Label cont;
2772 Label object_has_monitor;
2773 Label cas_failed;
2774
2775 // Load markOop from object into displaced_header.
2776 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2777
2778
2779 // Always do locking in runtime.
2780 if (EmitSync & 0x01) {
2781 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2782 return;
2783 }
2784
2785 if (try_bias) {
2786 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2787 }
2788
2789 #if INCLUDE_RTM_OPT
2790 if (UseRTMForStackLocks && use_rtm) {
2791 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2792 stack_rtm_counters, method_data, profile_rtm,
2793 cont, object_has_monitor);
2794 }
2795 #endif // INCLUDE_RTM_OPT
2796
2797 // Handle existing monitor.
2798 if ((EmitSync & 0x02) == 0) {
2799 // The object has an existing monitor iff (mark & monitor_value) != 0.
2800 andi_(temp, displaced_header, markOopDesc::monitor_value);
2801 bne(CCR0, object_has_monitor);
2802 }
2803
2804 // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2805 ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2806
2807 // Load Compare Value application register.
2808
2809 // Initialize the box. (Must happen before we update the object mark!)
2810 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2811
2812 // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2813 // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2814 cmpxchgd(/*flag=*/flag,
2815 /*current_value=*/current_header,
2816 /*compare_value=*/displaced_header,
2817 /*exchange_value=*/box,
2818 /*where=*/oop,
2819 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2820 MacroAssembler::cmpxchgx_hint_acquire_lock(),
2821 noreg,
2822 &cas_failed,
2823 /*check without membar and ldarx first*/true);
2824 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2825
2826 // If the compare-and-exchange succeeded, then we found an unlocked
2827 // object and we have now locked it.
2828 b(cont);
2829
2830 bind(cas_failed);
2831 // We did not see an unlocked object so try the fast recursive case.
2832
2833 // Check if the owner is self by comparing the value in the markOop of object
2834 // (current_header) with the stack pointer.
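  // Added note (comments only): the test below computes
  //   (mark - SP) & (~(page_size - 1) | lock_mask)
  // which is zero (flag EQ) iff the mark is a stack-locked pointer within one page
  // above the current SP, i.e. the lock is already held by a frame of this thread.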
2835 sub(current_header, current_header, R1_SP);
2836 load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2837
2838 and_(R0/*==0?*/, current_header, temp);
2839 // If the condition is true we are done (flag is EQ) and hence we can store 0 as the
2840 // displaced header in the box, which indicates that it is a recursive lock.
2841 mcrf(flag, CCR0);
2842 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2843
2844 // Handle existing monitor.
2845 if ((EmitSync & 0x02) == 0) {
2846 b(cont);
2847
2848 bind(object_has_monitor);
2849 // The object's monitor m is unlocked iff m->owner == NULL,
2850 // otherwise m->owner may contain a thread or a stack address.
2851
2852 #if INCLUDE_RTM_OPT
2853 // Use the same RTM locking code in 32- and 64-bit VM.
2854 if (use_rtm) {
2855 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2856 rtm_counters, method_data, profile_rtm, cont);
2857 } else {
2858 #endif // INCLUDE_RTM_OPT
2859
2860 // Try to CAS m->owner from NULL to current thread.
2861 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2862 cmpxchgd(/*flag=*/flag,
2863 /*current_value=*/current_header,
2864 /*compare_value=*/(intptr_t)0,
2865 /*exchange_value=*/R16_thread,
2866 /*where=*/temp,
2867 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2868 MacroAssembler::cmpxchgx_hint_acquire_lock());
2869
2870 // Store a non-null value into the box.
2871 std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2872
2873 # ifdef ASSERT
2874 bne(flag, cont);
2875 // We have acquired the monitor, check some invariants.
2876 addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2877 // Invariant 1: _recursions should be 0.
2878 //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2879 asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2880 "monitor->_recursions should be 0", -1);
2881 // Invariant 2: OwnerIsThread shouldn't be 0.
2882 //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2883 //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2884 // "monitor->OwnerIsThread shouldn't be 0", -1);
2885 # endif
2886
2887 #if INCLUDE_RTM_OPT
2888 } // use_rtm()
2889 #endif
2890 }
2891
2892 bind(cont);
2893 // flag == EQ indicates success
2894 // flag == NE indicates failure
2895 }
2896
2897 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2898 Register temp, Register displaced_header, Register current_header,
2899 bool try_bias, bool use_rtm) {
2900 assert_different_registers(oop, box, temp, displaced_header, current_header);
2901 assert(flag != CCR0, "bad condition register");
2902 Label cont;
2903 Label object_has_monitor;
2904
2905 // Always do locking in runtime.
2906 if (EmitSync & 0x01) {
2907 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2908 return;
2909 }
2910
2911 if (try_bias) {
2912 biased_locking_exit(flag, oop, current_header, cont);
2913 }
2914
2915 #if INCLUDE_RTM_OPT
2916 if (UseRTMForStackLocks && use_rtm) {
2917 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2918 Label L_regular_unlock;
2919 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword
2920 andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2921 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked
2922 bne(flag, L_regular_unlock); // else RegularLock
2923 tend_(); // otherwise end...
2924 b(cont); // ... and we're done
2925 bind(L_regular_unlock);
2926 }
2927 #endif
2928
2929 // Find the lock address and load the displaced header from the stack.
2930 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2931
2932 // If the displaced header is 0, we have a recursive unlock.
2933 cmpdi(flag, displaced_header, 0);
2934 beq(flag, cont);
2935
2936 // Handle existing monitor.
2937 if ((EmitSync & 0x02) == 0) {
2938 // The object has an existing monitor iff (mark & monitor_value) != 0.
2939 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2940 ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2941 andi_(R0, current_header, markOopDesc::monitor_value);
2942 bne(CCR0, object_has_monitor);
2943 }
2944
2945 // Check if it is still a lightweight lock; this is true if we see
2946 // the stack address of the basicLock in the markOop of the object.
2947 // Cmpxchg sets flag to cmpd(current_header, box).
2948 cmpxchgd(/*flag=*/flag,
2949 /*current_value=*/current_header,
2950 /*compare_value=*/box,
2951 /*exchange_value=*/displaced_header,
2952 /*where=*/oop,
2953 MacroAssembler::MemBarRel,
2954 MacroAssembler::cmpxchgx_hint_release_lock(),
2955 noreg,
2956 &cont);
2957
2958 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2959
2960 // Handle existing monitor.
2961 if ((EmitSync & 0x02) == 0) {
2962 b(cont);
2963
2964 bind(object_has_monitor);
2965 addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2966 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2967
2968 // It's inflated.
2969 #if INCLUDE_RTM_OPT
2970 if (use_rtm) {
2971 Label L_regular_inflated_unlock;
2972 // Clean monitor_value bit to get valid pointer
2973 cmpdi(flag, temp, 0);
2974 bne(flag, L_regular_inflated_unlock);
2975 tend_();
2976 b(cont);
2977 bind(L_regular_inflated_unlock);
2978 }
2979 #endif
2980
2981 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2982 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner.
2983 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2984 cmpdi(flag, temp, 0);
2985 bne(flag, cont);
2986
2987 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2988 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2989 orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2990 cmpdi(flag, temp, 0);
2991 bne(flag, cont);
2992 release();
2993 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2994 }
2995
2996 bind(cont);
2997 // flag == EQ indicates success
2998 // flag == NE indicates failure
2999 }
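// Illustrative C sketch (comment only): the inflated-monitor exit above only
// succeeds in the completely uncontended case. Field names follow ObjectMonitor;
// 'release_store' is pseudo-code for the release(); std() pair:
//
//   if (m->_owner == thread && m->_recursions == 0 &&
//       m->_EntryList == NULL && m->_cxq == NULL) {
//     release_store(&m->_owner, NULL);   // fast exit, nobody is waiting
//   } else {
//     // flag stays NE and the caller falls back to the runtime monitorexit.
//   }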
3000
// Write the serialization page so the VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread-specific
// offset to write to within the page. This minimizes bus traffic
// due to cache line collisions.
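// Roughly, in C (sketch only; the accessor names below mirror, but are not
// guaranteed to match, the exact os:: API used in the code):
//   uintptr_t ofs = ((uintptr_t)thread >> get_serialize_page_shift_count())
//                   & (vm_page_size() - sizeof(int));
//   *(volatile int*)(get_memory_serialize_page() + ofs) = 0;  // released store; the
//                                                             // stored value is irrelevant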
3005 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
3006 srdi(tmp2, thread, os::get_serialize_page_shift_count());
3007
3008 int mask = os::vm_page_size() - sizeof(int);
3009 if (Assembler::is_simm(mask, 16)) {
3010 andi(tmp2, tmp2, mask);
3011 } else {
3012 lis(tmp1, (int)((signed short) (mask >> 16)));
3013 ori(tmp1, tmp1, mask & 0x0000ffff);
3014 andr(tmp2, tmp2, tmp1);
3015 }
3016
3017 load_const(tmp1, (long) os::get_memory_serialize_page());
3018 release();
3019 stwx(R0, tmp1, tmp2);
3020 }
3021
3022
3023 // GC barrier helper macros
3024
3025 // Write the card table byte if needed.
3026 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
3027 CardTableModRefBS* bs =
3028 barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
3029 assert(bs->kind() == BarrierSet::CardTableForRS ||
3030 bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
3031 #ifdef ASSERT
3032 cmpdi(CCR0, Rnew_val, 0);
3033 asm_assert_ne("null oop not allowed", 0x321);
3034 #endif
3035 card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
3036 }
3037
3038 // Write the card table byte.
3039 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
3040 assert_different_registers(Robj, Rtmp, R0);
3041 load_const_optimized(Rtmp, (address)byte_map_base, R0);
3042 srdi(Robj, Robj, CardTableModRefBS::card_shift);
3043 li(R0, 0); // dirty
3044 if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
3045 stbx(R0, Rtmp, Robj);
3046 }
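// Conceptually (sketch only) the two helpers above implement:
//   jbyte* card = byte_map_base + ((uintptr_t)store_addr >> CardTableModRefBS::card_shift);
//   *card = 0;   // 0 is the dirty value used here
// with an extra StoreStore membar before the card store when running CMS.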
3047
3048 // Kills R31 if value is a volatile register.
3049 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
3050 Label done;
3051 cmpdi(CCR0, value, 0);
3052 beq(CCR0, done); // Use NULL as-is.
3053
3054 clrrdi(tmp1, value, JNIHandles::weak_tag_size);
3055 #if INCLUDE_ALL_GCS
3056 if (UseG1GC) { andi_(tmp2, value, JNIHandles::weak_tag_mask); }
3057 #endif
3058 ld(value, 0, tmp1); // Resolve (untagged) jobject.
3059
3060 #if INCLUDE_ALL_GCS
3061 if (UseG1GC) {
3062 Label not_weak;
3063 beq(CCR0, not_weak); // Test for jweak tag.
3064 verify_oop(value);
3065 g1_write_barrier_pre(noreg, // obj
3066 noreg, // offset
3067 value, // pre_val
3068 tmp1, tmp2, needs_frame);
3069 bind(not_weak);
3070 }
3071 #endif // INCLUDE_ALL_GCS
3072 verify_oop(value);
3073 bind(done);
3074 }
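// Sketch of the logic above (comment only). jweak handles are distinguished from
// strong jobjects by a low tag bit (JNIHandles::weak_tag_mask):
//   if (handle == NULL) return NULL;
//   oop obj = *(oop*)(handle & ~(uintptr_t)JNIHandles::weak_tag_mask);  // untag, then load
//   if (UseG1GC && (handle & JNIHandles::weak_tag_mask)) {
//     g1_write_barrier_pre(obj);   // SATB pre-barrier keeps the weak referent alive
//   }
//   return obj;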
3075
3076 #if INCLUDE_ALL_GCS
3077 // General G1 pre-barrier generator.
3078 // Goal: record the previous value if it is not null.
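// Sketch of the filtering and enqueue logic (comment only; the queue accessors are
// written informally, the real code goes through JavaThread/SATBMarkQueue offsets):
//   if (!satb_queue_is_active(thread)) return;          // concurrent marking off
//   oop pre_val = (Robj != noreg) ? *field : Rpre_val;  // previous field value
//   if (pre_val == NULL) return;                        // nothing worth recording
//   if (queue_index == 0) {
//     call g1_wb_pre(pre_val, thread);                  // buffer full or not allocated
//   } else {
//     queue_index -= wordSize;
//     *(oop*)(queue_buf + queue_index) = pre_val;       // append to SATB buffer
//   }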
3079 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
3080 Register Rtmp1, Register Rtmp2, bool needs_frame) {
3081 Label runtime, filtered;
3082
3083 // Is marking active?
3084 if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
3085 lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
3086 } else {
3087 guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
3088 lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
3089 }
3090 cmpdi(CCR0, Rtmp1, 0);
3091 beq(CCR0, filtered);
3092
3093 // Do we need to load the previous value?
3094 if (Robj != noreg) {
3095 // Load the previous value...
3096 if (UseCompressedOops) {
3097 lwz(Rpre_val, offset, Robj);
3098 } else {
3099 ld(Rpre_val, offset, Robj);
3100 }
3101 // Previous value has been loaded into Rpre_val.
3102 }
3103 assert(Rpre_val != noreg, "must have a real register");
3104
3105 // Is the previous value null?
3106 cmpdi(CCR0, Rpre_val, 0);
3107 beq(CCR0, filtered);
3108
3109 if (Robj != noreg && UseCompressedOops) {
3110 decode_heap_oop_not_null(Rpre_val);
3111 }
3112
3113 // OK, it's not filtered, so we'll need to call enqueue. In the normal
3114 // case, pre_val will be a scratch G-reg, but there are some cases in
// which it's an O-reg. In the former case, do a normal call; in the
// latter, do a save here and call the frameless version.
3117
3118 // Can we store original value in the thread's buffer?
3119 // Is index == 0?
3120 // (The index field is typed as size_t.)
3121 const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
3122
3123 ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
3124 cmpdi(CCR0, Rindex, 0);
3125 beq(CCR0, runtime); // If index == 0, goto runtime.
3126 ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);
3127
3128 addi(Rindex, Rindex, -wordSize); // Decrement index.
3129 std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
3130
3131 // Record the previous value.
3132 stdx(Rpre_val, Rbuffer, Rindex);
3133 b(filtered);
3134
3135 bind(runtime);
3136
3137 // May need to preserve LR. Also needed if current frame is not compatible with C calling convention.
3138 if (needs_frame) {
3139 save_LR_CR(Rtmp1);
3140 push_frame_reg_args(0, Rtmp2);
3141 }
3142
3143 if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
3144 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
3145 if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
3146
3147 if (needs_frame) {
3148 pop_frame();
3149 restore_LR_CR(Rtmp1);
3150 }
3151
3152 bind(filtered);
3153 }
3154
3155 // General G1 post-barrier generator
3156 // Store cross-region card.
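// Sketch of the generated filtering chain (comment only):
//   if (((store_addr ^ new_val) >> LogOfHRGrainBytes) == 0) return;  // same region
//   jbyte* card = byte_map_base + (store_addr >> card_shift);
//   if (*card == g1_young_card_val()) return;                        // young gen card
//   StoreLoad_membar();
//   if (*card == dirty_card_val()) return;                           // already dirty
//   *card = dirty_card_val();
//   // then enqueue 'card' into the thread's dirty card queue, or call
//   // g1_wb_post when the queue buffer is full (index == 0).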
3157 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
3158 Label runtime, filtered_int;
3159 Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
3160 assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
3161
3162 G1SATBCardTableLoggingModRefBS* bs =
3163 barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
3164
3165 // Does store cross heap regions?
3166 if (G1RSBarrierRegionFilter) {
3167 xorr(Rtmp1, Rstore_addr, Rnew_val);
3168 srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
3169 beq(CCR0, filtered);
3170 }
3171
3172 // Crosses regions, storing NULL?
3173 #ifdef ASSERT
3174 cmpdi(CCR0, Rnew_val, 0);
3175 asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
3176 //beq(CCR0, filtered);
3177 #endif
3178
3179 // Storing region crossing non-NULL, is card already dirty?
3180 assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
3181 const Register Rcard_addr = Rtmp1;
3182 Register Rbase = Rtmp2;
3183 load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
3184
3185 srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
3186
3187 // Get the address of the card.
3188 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
3189 cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
3190 beq(CCR0, filtered);
3191
3192 membar(Assembler::StoreLoad);
3193 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); // Reload after membar.
3194 cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
3195 beq(CCR0, filtered);
3196
3197 // Storing a region crossing, non-NULL oop, card is clean.
3198 // Dirty card and log.
3199 li(Rtmp3, CardTableModRefBS::dirty_card_val());
3200 //release(); // G1: oops are allowed to get visible after dirty marking.
3201 stbx(Rtmp3, Rbase, Rcard_addr);
3202
3203 add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
3204 Rbase = noreg; // end of lifetime
3205
3206 const Register Rqueue_index = Rtmp2,
3207 Rqueue_buf = Rtmp3;
3208 ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
3209 cmpdi(CCR0, Rqueue_index, 0);
3210 beq(CCR0, runtime); // index == 0 then jump to runtime
3211 ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);
3212
3213 addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
3214 std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
3215
3216 stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
3217 b(filtered);
3218
3219 bind(runtime);
3220
3221 // Save the live input values.
3222 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
3223
3224 bind(filtered_int);
3225 }
3226 #endif // INCLUDE_ALL_GCS
3227
3228 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3229 // in frame_ppc.hpp.
3230 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
// Always set last_Java_pc and flags first because once last_Java_sp
// is visible, has_last_Java_frame is true and users will look at the
// rest of the fields. (Note: flags should always be zero before we
// get here, so it doesn't need to be set.)
3235
3236 // Verify that last_Java_pc was zeroed on return to Java
3237 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3238 "last_Java_pc not zeroed before leaving Java", 0x200);
3239
// When returning from calling out from Java mode, the frame anchor's
// last_Java_pc will always be set to NULL. It is set here so that,
// if we are doing a call to native (not VM), we capture the
// known pc and don't have to rely on the native call having a
// standard frame linkage where we can find the pc.
3245 if (last_Java_pc != noreg)
3246 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3247
3248 // Set last_Java_sp last.
3249 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3250 }
3251
3252 void MacroAssembler::reset_last_Java_frame(void) {
3253 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3254 R16_thread, "SP was not set, still zero", 0x202);
3255
3256 BLOCK_COMMENT("reset_last_Java_frame {");
3257 li(R0, 0);
3258
3259 // _last_Java_sp = 0
3260 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3261
3262 // _last_Java_pc = 0
3263 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3264 BLOCK_COMMENT("} reset_last_Java_frame");
3265 }
3266
3267 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3268 assert_different_registers(sp, tmp1);
3269
3270 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3271 // TOP_IJAVA_FRAME_ABI.
3272 // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3273 address entry = pc();
3274 load_const_optimized(tmp1, entry);
3275
3276 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3277 }
3278
3279 void MacroAssembler::get_vm_result(Register oop_result) {
3280 // Read:
3281 // R16_thread
3282 // R16_thread->in_bytes(JavaThread::vm_result_offset())
3283 //
3284 // Updated:
3285 // oop_result
3286 // R16_thread->in_bytes(JavaThread::vm_result_offset())
3287
3288 verify_thread();
3289
3290 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3291 li(R0, 0);
3292 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3293
3294 verify_oop(oop_result);
3295 }
3296
3297 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3298 // Read:
3299 // R16_thread
3300 // R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3301 //
3302 // Updated:
3303 // metadata_result
3304 // R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3305
3306 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3307 li(R0, 0);
3308 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3309 }
3310
3311 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3312 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3313 if (Universe::narrow_klass_base() != 0) {
3314 // Use dst as temp if it is free.
3315 sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
3316 current = dst;
3317 }
3318 if (Universe::narrow_klass_shift() != 0) {
3319 srdi(dst, current, Universe::narrow_klass_shift());
3320 current = dst;
3321 }
3322 return current;
3323 }
3324
3325 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3326 if (UseCompressedClassPointers) {
3327 Register compressedKlass = encode_klass_not_null(ck, klass);
3328 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3329 } else {
3330 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3331 }
3332 }
3333
3334 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3335 if (UseCompressedClassPointers) {
3336 if (val == noreg) {
3337 val = R0;
3338 li(val, 0);
3339 }
3340 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3341 }
3342 }
3343
3344 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3345 if (!UseCompressedClassPointers) return 0;
3346 int num_instrs = 1; // shift or move
3347 if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add
3348 return num_instrs * BytesPerInstWord;
3349 }
3350
3351 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3352 assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3353 if (src == noreg) src = dst;
3354 Register shifted_src = src;
if (Universe::narrow_klass_shift() != 0 ||
(Universe::narrow_klass_base() == 0 && src != dst)) { // Move required.
3357 shifted_src = dst;
3358 sldi(shifted_src, src, Universe::narrow_klass_shift());
3359 }
3360 if (Universe::narrow_klass_base() != 0) {
3361 add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3362 }
3363 }
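// Compressed class pointer model implemented by encode/decode above (sketch only),
// with base == Universe::narrow_klass_base() and shift == Universe::narrow_klass_shift():
//   narrowKlass encode(Klass* k)       { return (narrowKlass)(((uintptr_t)k - base) >> shift); }
//   Klass*      decode(narrowKlass nk) { return (Klass*)(((uintptr_t)nk << shift) + base); }
// Either base or shift may be zero, in which case the corresponding step is skipped.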
3364
3365 void MacroAssembler::load_klass(Register dst, Register src) {
3366 if (UseCompressedClassPointers) {
3367 lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3368 // Attention: no null check here!
3369 decode_klass_not_null(dst, dst);
3370 } else {
3371 ld(dst, oopDesc::klass_offset_in_bytes(), src);
3372 }
3373 }
3374
3375 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
3376 ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
3377 ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
3378 ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
3379 }
3380
3381 // Clear Array
3382 // For very short arrays. tmp == R0 is allowed.
3383 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3384 if (cnt_dwords > 0) { li(tmp, 0); }
3385 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3386 }
3387
3388 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3389 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3390 if (cnt_dwords < 8) {
3391 clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3392 return;
3393 }
3394
3395 Label loop;
3396 const long loopcnt = cnt_dwords >> 1,
3397 remainder = cnt_dwords & 1;
3398
3399 li(tmp, loopcnt);
3400 mtctr(tmp);
3401 li(tmp, 0);
3402 bind(loop);
3403 std(tmp, 0, base_ptr);
3404 std(tmp, 8, base_ptr);
3405 addi(base_ptr, base_ptr, 16);
3406 bdnz(loop);
3407 if (remainder) { std(tmp, 0, base_ptr); }
3408 }
3409
3410 // Kills both input registers. tmp == R0 is allowed.
3411 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3412 // Procedure for large arrays (uses data cache block zero instruction).
3413 Label startloop, fast, fastloop, small_rest, restloop, done;
3414 const int cl_size = VM_Version::L1_data_cache_line_size(),
3415 cl_dwords = cl_size >> 3,
3416 cl_dw_addr_bits = exact_log2(cl_dwords),
3417 dcbz_min = 1, // Min count of dcbz executions, needs to be >0.
3418 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3419
3420 if (const_cnt >= 0) {
3421 // Constant case.
3422 if (const_cnt < min_cnt) {
3423 clear_memory_constlen(base_ptr, const_cnt, tmp);
3424 return;
3425 }
3426 load_const_optimized(cnt_dwords, const_cnt, tmp);
3427 } else {
3428 // cnt_dwords already loaded in register. Need to check size.
3429 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3430 blt(CCR1, small_rest);
3431 }
3432 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3433 beq(CCR0, fast); // Already 128byte aligned.
3434
3435 subfic(tmp, tmp, cl_dwords);
3436 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3437 subf(cnt_dwords, tmp, cnt_dwords); // rest.
3438 li(tmp, 0);
3439
3440 bind(startloop); // Clear at the beginning to reach 128byte boundary.
3441 std(tmp, 0, base_ptr); // Clear 8byte aligned block.
3442 addi(base_ptr, base_ptr, 8);
3443 bdnz(startloop);
3444
3445 bind(fast); // Clear 128byte blocks.
3446 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0).
3447 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3448 mtctr(tmp); // Load counter.
3449
3450 bind(fastloop);
3451 dcbz(base_ptr); // Clear 128byte aligned block.
3452 addi(base_ptr, base_ptr, cl_size);
3453 bdnz(fastloop);
3454
3455 bind(small_rest);
3456 cmpdi(CCR0, cnt_dwords, 0); // size 0?
3457 beq(CCR0, done); // rest == 0
3458 li(tmp, 0);
3459 mtctr(cnt_dwords); // Load counter.
3460
3461 bind(restloop); // Clear rest.
3462 std(tmp, 0, base_ptr); // Clear 8byte aligned block.
3463 addi(base_ptr, base_ptr, 8);
3464 bdnz(restloop);
3465
3466 bind(done);
3467 }
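// Structure of the routine above as a C sketch (comment only; counts in 8-byte dwords):
//   while (!cache_line_aligned(base)) { *base++ = 0; cnt--; }                              // startloop
//   while (cnt >= cl_dwords)          { dcbz(base); base += cl_dwords; cnt -= cl_dwords; } // fastloop
//   while (cnt-- > 0)                 { *base++ = 0; }                                     // restloop
// where dcbz zeroes one whole L1 data cache line (cl_size bytes) per instruction.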
3468
3469 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3470
3471 #ifdef COMPILER2
3472 // Intrinsics for CompactStrings
3473
3474 // Compress char[] to byte[] by compressing 16 bytes at once.
3475 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
3476 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
3477 Label& Lfailure) {
3478
3479 const Register tmp0 = R0;
3480 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3481 Label Lloop, Lslow;
3482
3483 // Check if cnt >= 8 (= 16 bytes)
3484 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF00FF00FF
3485 srwi_(tmp2, cnt, 3);
3486 beq(CCR0, Lslow);
3487 ori(tmp1, tmp1, 0xFF);
3488 rldimi(tmp1, tmp1, 32, 0);
3489 mtctr(tmp2);
3490
3491 // 2x unrolled loop
3492 bind(Lloop);
3493 ld(tmp2, 0, src); // _0_1_2_3 (Big Endian)
3494 ld(tmp4, 8, src); // _4_5_6_7
3495
3496 orr(tmp0, tmp2, tmp4);
3497 rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
3498 rldimi(tmp2, tmp2, 2*8, 2*8); // _0_2_3_3
3499 rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
3500 rldimi(tmp4, tmp4, 2*8, 2*8); // _4_6_7_7
3501
3502 andc_(tmp0, tmp0, tmp1);
3503 bne(CCR0, Lfailure); // Not latin1.
3504 addi(src, src, 16);
3505
3506 rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
3507 srdi(tmp2, tmp2, 3*8); // ____0_2_
3508 rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
3509 srdi(tmp4, tmp4, 3*8); // ____4_6_
3510
3511 orr(tmp2, tmp2, tmp3); // ____0123
3512 orr(tmp4, tmp4, tmp5); // ____4567
3513
3514 stw(tmp2, 0, dst);
3515 stw(tmp4, 4, dst);
3516 addi(dst, dst, 8);
3517 bdnz(Lloop);
3518
3519 bind(Lslow); // Fallback to slow version
3520 }
3521
3522 // Compress char[] to byte[]. cnt must be positive int.
3523 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
3524 Label Lloop;
3525 mtctr(cnt);
3526
3527 bind(Lloop);
3528 lhz(tmp, 0, src);
3529 cmplwi(CCR0, tmp, 0xff);
3530 bgt(CCR0, Lfailure); // Not latin1.
3531 addi(src, src, 2);
3532 stb(tmp, 0, dst);
3533 addi(dst, dst, 1);
3534 bdnz(Lloop);
3535 }
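// Scalar model of the compression performed above (sketch only):
//   for (int i = 0; i < cnt; i++) {
//     jchar c = src[i];
//     if (c > 0xff) goto Lfailure;   // not latin1: bail out to the caller's label
//     dst[i] = (jbyte)c;
//   }
// string_compress_16 does the same 8 characters at a time, testing all high bytes
// at once against the 0x00FF00FF00FF00FF mask.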
3536
3537 // Inflate byte[] to char[] by inflating 16 bytes at once.
3538 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
3539 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
3540 const Register tmp0 = R0;
3541 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3542 Label Lloop, Lslow;
3543
3544 // Check if cnt >= 8
3545 srwi_(tmp2, cnt, 3);
3546 beq(CCR0, Lslow);
3547 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF
3548 ori(tmp1, tmp1, 0xFF);
3549 mtctr(tmp2);
3550
3551 // 2x unrolled loop
3552 bind(Lloop);
3553 lwz(tmp2, 0, src); // ____0123 (Big Endian)
3554 lwz(tmp4, 4, src); // ____4567
3555 addi(src, src, 8);
3556
3557 rldicl(tmp3, tmp2, 7*8, 64-8); // _______2
3558 rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
3559 rldicl(tmp5, tmp4, 7*8, 64-8); // _______6
3560 rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557
3561
3562 andc(tmp0, tmp2, tmp1); // ____0_1_
3563 rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
3564 andc(tmp3, tmp4, tmp1); // ____4_5_
3565 rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7
3566
3567 rldimi(tmp2, tmp0, 3*8, 0*8); // _0_1_2_3
3568 rldimi(tmp4, tmp3, 3*8, 0*8); // _4_5_6_7
3569
3570 std(tmp2, 0, dst);
3571 std(tmp4, 8, dst);
3572 addi(dst, dst, 16);
3573 bdnz(Lloop);
3574
3575 bind(Lslow); // Fallback to slow version
3576 }
3577
3578 // Inflate byte[] to char[]. cnt must be positive int.
3579 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
3580 Label Lloop;
3581 mtctr(cnt);
3582
3583 bind(Lloop);
3584 lbz(tmp, 0, src);
3585 addi(src, src, 1);
3586 sth(tmp, 0, dst);
3587 addi(dst, dst, 2);
3588 bdnz(Lloop);
3589 }
3590
3591 void MacroAssembler::string_compare(Register str1, Register str2,
3592 Register cnt1, Register cnt2,
3593 Register tmp1, Register result, int ae) {
3594 const Register tmp0 = R0,
3595 diff = tmp1;
3596
3597 assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
3598 Label Ldone, Lslow, Lloop, Lreturn_diff;
3599
// Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a),
// we interchange str1 and str2 in the UL case and negate the result.
// This way, str1 is always latin1 encoded, except for the UU case.
// In addition, the counts need to be zero-extended (sign extension gives the
// same result here since they are non-negative ints).
3604
3605 if (ae == StrIntrinsicNode::UU) {
3606 srwi(cnt1, cnt1, 1);
3607 } else {
3608 clrldi(cnt1, cnt1, 32);
3609 }
3610
3611 if (ae != StrIntrinsicNode::LL) {
3612 srwi(cnt2, cnt2, 1);
3613 } else {
3614 clrldi(cnt2, cnt2, 32);
3615 }
3616
3617 // See if the lengths are different, and calculate min in cnt1.
3618 // Save diff in case we need it for a tie-breaker.
3619 subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
3620 // if (diff > 0) { cnt1 = cnt2; }
3621 if (VM_Version::has_isel()) {
3622 isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
3623 } else {
3624 Label Lskip;
3625 blt(CCR0, Lskip);
3626 mr(cnt1, cnt2);
3627 bind(Lskip);
3628 }
3629
3630 // Rename registers
3631 Register chr1 = result;
3632 Register chr2 = tmp0;
3633
3634 // Compare multiple characters in fast loop (only implemented for same encoding).
3635 int stride1 = 8, stride2 = 8;
3636 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3637 int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
3638 Label Lfastloop, Lskipfast;
3639
3640 srwi_(tmp0, cnt1, log2_chars_per_iter);
3641 beq(CCR0, Lskipfast);
3642 rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
3643 li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
3644 mtctr(tmp0);
3645
3646 bind(Lfastloop);
3647 ld(chr1, 0, str1);
3648 ld(chr2, 0, str2);
3649 cmpd(CCR0, chr1, chr2);
3650 bne(CCR0, Lslow);
3651 addi(str1, str1, stride1);
3652 addi(str2, str2, stride2);
3653 bdnz(Lfastloop);
3654 mr(cnt1, cnt2); // Remaining characters.
3655 bind(Lskipfast);
3656 }
3657
3658 // Loop which searches the first difference character by character.
3659 cmpwi(CCR0, cnt1, 0);
3660 beq(CCR0, Lreturn_diff);
3661 bind(Lslow);
3662 mtctr(cnt1);
3663
3664 switch (ae) {
3665 case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
3666 case StrIntrinsicNode::UL: // fallthru (see comment above)
3667 case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
3668 case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
3669 default: ShouldNotReachHere(); break;
3670 }
3671
3672 bind(Lloop);
3673 if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
3674 if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
3675 subf_(result, chr2, chr1); // result = chr1 - chr2
3676 bne(CCR0, Ldone);
3677 addi(str1, str1, stride1);
3678 addi(str2, str2, stride2);
3679 bdnz(Lloop);
3680
3681 // If strings are equal up to min length, return the length difference.
3682 bind(Lreturn_diff);
3683 mr(result, diff);
3684
3685 // Otherwise, return the difference between the first mismatched chars.
3686 bind(Ldone);
3687 if (ae == StrIntrinsicNode::UL) {
3688 neg(result, result); // Negate result (see note above).
3689 }
3690 }
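// Overall semantics (sketch only), matching String.compareTo:
//   int min = MIN2(len1, len2);
//   for (int i = 0; i < min; i++) {
//     if (s1[i] != s2[i]) return s1[i] - s2[i];  // first mismatching characters
//   }
//   return len1 - len2;                          // tie-breaker: length difference
// In the UL case the operands arrive swapped (see the note at the top), so the
// result is negated at Ldone.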
3691
3692 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
3693 Register limit, Register tmp1, Register result, bool is_byte) {
3694 const Register tmp0 = R0;
3695 assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
3696 Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
3697 bool limit_needs_shift = false;
3698
3699 if (is_array_equ) {
3700 const int length_offset = arrayOopDesc::length_offset_in_bytes();
3701 const int base_offset = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
3702
3703 // Return true if the same array.
3704 cmpd(CCR0, ary1, ary2);
3705 beq(CCR0, Lskiploop);
3706
3707 // Return false if one of them is NULL.
3708 cmpdi(CCR0, ary1, 0);
3709 cmpdi(CCR1, ary2, 0);
3710 li(result, 0);
3711 cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
3712 beq(CCR0, Ldone);
3713
3714 // Load the lengths of arrays.
3715 lwz(limit, length_offset, ary1);
3716 lwz(tmp0, length_offset, ary2);
3717
3718 // Return false if the two arrays are not equal length.
3719 cmpw(CCR0, limit, tmp0);
3720 bne(CCR0, Ldone);
3721
3722 // Load array addresses.
3723 addi(ary1, ary1, base_offset);
3724 addi(ary2, ary2, base_offset);
3725 } else {
3726 limit_needs_shift = !is_byte;
3727 li(result, 0); // Assume not equal.
3728 }
3729
3730 // Rename registers
3731 Register chr1 = tmp0;
3732 Register chr2 = tmp1;
3733
3734 // Compare 8 bytes per iteration in fast loop.
3735 const int log2_chars_per_iter = is_byte ? 3 : 2;
3736
3737 srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
3738 beq(CCR0, Lskipfast);
3739 mtctr(tmp0);
3740
3741 bind(Lfastloop);
3742 ld(chr1, 0, ary1);
3743 ld(chr2, 0, ary2);
3744 addi(ary1, ary1, 8);
3745 addi(ary2, ary2, 8);
3746 cmpd(CCR0, chr1, chr2);
3747 bne(CCR0, Ldone);
3748 bdnz(Lfastloop);
3749
3750 bind(Lskipfast);
3751 rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
3752 beq(CCR0, Lskiploop);
3753 mtctr(limit);
3754
3755 // Character by character.
3756 bind(Lloop);
3757 if (is_byte) {
3758 lbz(chr1, 0, ary1);
3759 lbz(chr2, 0, ary2);
3760 addi(ary1, ary1, 1);
3761 addi(ary2, ary2, 1);
3762 } else {
3763 lhz(chr1, 0, ary1);
3764 lhz(chr2, 0, ary2);
3765 addi(ary1, ary1, 2);
3766 addi(ary2, ary2, 2);
3767 }
3768 cmpw(CCR0, chr1, chr2);
3769 bne(CCR0, Ldone);
3770 bdnz(Lloop);
3771
3772 bind(Lskiploop);
3773 li(result, 1); // All characters are equal.
3774 bind(Ldone);
3775 }
3776
3777 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3778 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3779 Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {
3780
3781 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3782 Label L_TooShort, L_Found, L_NotFound, L_End;
3783 Register last_addr = haycnt, // Kill haycnt at the beginning.
3784 addr = tmp1,
3785 n_start = tmp2,
3786 ch1 = tmp3,
3787 ch2 = R0;
3788
3789 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3790 const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;
3791 const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;
3792
3793 // **************************************************************************************************
3794 // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3795 // **************************************************************************************************
3796
3797 // Compute last haystack addr to use if no match gets found.
3798 clrldi(haycnt, haycnt, 32); // Ensure positive int is valid as 64 bit value.
3799 addi(addr, haystack, -h_csize); // Accesses use pre-increment.
3800 if (needlecntval == 0) { // variable needlecnt
3801 cmpwi(CCR6, needlecnt, 2);
3802 clrldi(needlecnt, needlecnt, 32); // Ensure positive int is valid as 64 bit value.
3803 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately.
3804 }
3805
3806 if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.
3807
3808 if (needlecntval == 0) { // variable needlecnt
3809 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt.
3810 addi(needlecnt, needlecnt, -2); // Rest of needle.
3811 } else { // constant needlecnt
3812 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3813 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3814 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt.
3815 if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
3816 }
3817
3818 if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.
3819
if (ae == StrIntrinsicNode::UL) {
3821 srwi(tmp4, n_start, 1*8); // ___0
3822 rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
3823 }
3824
3825 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3826
3827 // Main Loop (now we have at least 2 characters).
3828 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
3829 bind(L_OuterLoop); // Search for 1st 2 characters.
3830 Register addr_diff = tmp4;
3831 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
3832 addi(addr, addr, h_csize); // This is the new address we want to use for comparing.
3833 srdi_(ch2, addr_diff, h_csize);
3834 beq(CCR0, L_FinalCheck); // 2 characters left?
3835 mtctr(ch2); // num of characters / 2
3836 bind(L_InnerLoop); // Main work horse (2x unrolled search loop)
3837 if (h_csize == 2) { // Load 2 characters of haystack (ignore alignment).
3838 lwz(ch1, 0, addr);
3839 lwz(ch2, 2, addr);
3840 } else {
3841 lhz(ch1, 0, addr);
3842 lhz(ch2, 1, addr);
3843 }
3844 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3845 cmpw(CCR1, ch2, n_start);
3846 beq(CCR0, L_Comp1); // Did we find the needle start?
3847 beq(CCR1, L_Comp2);
3848 addi(addr, addr, 2 * h_csize);
3849 bdnz(L_InnerLoop);
3850 bind(L_FinalCheck);
3851 andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
3852 beq(CCR0, L_NotFound);
3853 if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
3854 cmpw(CCR1, ch1, n_start);
3855 beq(CCR1, L_Comp1);
3856 bind(L_NotFound);
3857 li(result, -1); // not found
3858 b(L_End);
3859
3860 // **************************************************************************************************
3861 // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3862 // **************************************************************************************************
3863 if (needlecntval == 0) { // We have to handle these cases separately.
3864 Label L_OneCharLoop;
3865 bind(L_TooShort);
3866 mtctr(haycnt);
3867 if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
3868 bind(L_OneCharLoop);
3869 if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
3870 cmpw(CCR1, ch1, n_start);
3871 beq(CCR1, L_Found); // Did we find the one character needle?
3872 bdnz(L_OneCharLoop);
3873 li(result, -1); // Not found.
3874 b(L_End);
3875 }
3876
3877 // **************************************************************************************************
3878 // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3879 // **************************************************************************************************
3880
3881 // Compare the rest
3882 bind(L_Comp2);
3883 addi(addr, addr, h_csize); // First comparison has failed, 2nd one hit.
3884 bind(L_Comp1); // Addr points to possible needle start.
3885 if (needlecntval != 2) { // Const needlecnt==2?
3886 if (needlecntval != 3) {
3887 if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
3888 Register n_ind = tmp4,
3889 h_ind = n_ind;
3890 li(n_ind, 2 * n_csize); // First 2 characters are already compared, use index 2.
3891 mtctr(needlecnt); // Decremented by 2, still > 0.
3892 Label L_CompLoop;
3893 bind(L_CompLoop);
if (ae == StrIntrinsicNode::UL) {
3895 h_ind = ch1;
3896 sldi(h_ind, n_ind, 1);
3897 }
3898 if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
3899 if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
3900 cmpw(CCR1, ch1, ch2);
3901 bne(CCR1, L_OuterLoop);
3902 addi(n_ind, n_ind, n_csize);
3903 bdnz(L_CompLoop);
3904 } else { // No loop required if there's only one needle character left.
3905 if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
3906 if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
3907 cmpw(CCR1, ch1, ch2);
3908 bne(CCR1, L_OuterLoop);
3909 }
3910 }
3911 // Return index ...
3912 bind(L_Found);
3913 subf(result, haystack, addr); // relative to haystack, ...
3914 if (h_csize == 2) { srdi(result, result, 1); } // in characters.
3915 bind(L_End);
3916 } // string_indexof
3917
3918 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
3919 Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
3920 assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
3921
3922 Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
3923 Register addr = tmp1,
3924 ch1 = tmp2,
3925 ch2 = R0;
3926
3927 const int h_csize = is_byte ? 1 : 2;
3928
3929 //4:
3930 srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR).
3931 mr(addr, haystack);
3932 beq(CCR0, L_FinalCheck);
3933 mtctr(tmp2); // Move to count register.
3934 //8:
3935 bind(L_InnerLoop); // Main work horse (2x unrolled search loop).
3936 if (!is_byte) {
3937 lhz(ch1, 0, addr);
3938 lhz(ch2, 2, addr);
3939 } else {
3940 lbz(ch1, 0, addr);
3941 lbz(ch2, 1, addr);
3942 }
3943 (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
3944 (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
3945 beq(CCR0, L_Found1); // Did we find the needle?
3946 beq(CCR1, L_Found2);
3947 addi(addr, addr, 2 * h_csize);
3948 bdnz(L_InnerLoop);
3949 //16:
3950 bind(L_FinalCheck);
3951 andi_(R0, haycnt, 1);
3952 beq(CCR0, L_NotFound);
3953 if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
3954 (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
3955 beq(CCR1, L_Found1);
3956 //21:
3957 bind(L_NotFound);
3958 li(result, -1); // Not found.
3959 b(L_End);
3960
3961 bind(L_Found2);
3962 addi(addr, addr, h_csize);
3963 //24:
3964 bind(L_Found1); // Return index ...
3965 subf(result, haystack, addr); // relative to haystack, ...
3966 if (!is_byte) { srdi(result, result, 1); } // in characters.
3967 bind(L_End);
3968 } // string_indexof_char
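// Scalar model of the 2x unrolled search above (sketch only):
//   for (int i = 0; i < haycnt; i++) {
//     if (haystack[i] == needle_char) return i;
//   }
//   return -1;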
3969
3970
3971 void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
3972 Register tmp1, Register tmp2) {
3973 const Register tmp0 = R0;
3974 assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
3975 Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
3976
3977 // Check if cnt >= 8 (= 16 bytes)
3978 lis(tmp1, (int)(short)0x8080); // tmp1 = 0x8080808080808080
3979 srwi_(tmp2, cnt, 4);
3980 li(result, 1); // Assume there's a negative byte.
3981 beq(CCR0, Lslow);
3982 ori(tmp1, tmp1, 0x8080);
3983 rldimi(tmp1, tmp1, 32, 0);
3984 mtctr(tmp2);
3985
3986 // 2x unrolled loop
3987 bind(Lfastloop);
3988 ld(tmp2, 0, src);
3989 ld(tmp0, 8, src);
3990
3991 orr(tmp0, tmp2, tmp0);
3992
3993 and_(tmp0, tmp0, tmp1);
3994 bne(CCR0, Ldone); // Found negative byte.
3995 addi(src, src, 16);
3996
3997 bdnz(Lfastloop);
3998
3999 bind(Lslow); // Fallback to slow version
4000 rldicl_(tmp0, cnt, 0, 64-4);
4001 beq(CCR0, Lnoneg);
4002 mtctr(tmp0);
4003 bind(Lloop);
4004 lbz(tmp0, 0, src);
4005 addi(src, src, 1);
4006 andi_(tmp0, tmp0, 0x80);
4007 bne(CCR0, Ldone); // Found negative byte.
4008 bdnz(Lloop);
4009 bind(Lnoneg);
4010 li(result, 0);
4011
4012 bind(Ldone);
4013 }
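// Model of the check above (sketch only): result is 1 iff any byte has bit 0x80 set.
//   for each 16-byte chunk: if (((dw0 | dw1) & 0x8080808080808080) != 0) return 1;
//   for each remaining byte: if (b & 0x80) return 1;
//   return 0;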
4014
#endif // COMPILER2
4016
4017 // Helpers for Intrinsic Emitters
4018 //
// Reverse the byte order of a 32-bit value in a register.
4020 // src: 0x44556677
4021 // dst: 0x77665544
4022 // Three steps to obtain the result:
4023 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
4024 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
4025 // This value initializes dst.
4026 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
4027 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
4028 // This value is mask inserted into dst with a [0..23] mask of 1s.
4029 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
4030 // This value is mask inserted into dst with a [8..15] mask of 1s.
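// In plain C the whole transformation is just a 32-bit byte swap (sketch only):
//   uint32_t load_reverse_32(uint32_t src) {
//     return ((src >> 24) & 0x000000ff) | ((src >>  8) & 0x0000ff00) |
//            ((src <<  8) & 0x00ff0000) | ((src << 24) & 0xff000000);
//   }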
4031 void MacroAssembler::load_reverse_32(Register dst, Register src) {
4032 assert_different_registers(dst, src);
4033
4034 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
4035 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
4036 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
4037 }
4038
4039 // Calculate the column addresses of the crc32 lookup table into distinct registers.
4040 // This loop-invariant calculation is moved out of the loop body, reducing the loop
4041 // body size from 20 to 16 instructions.
4042 // Returns the offset that was used to calculate the address of column tc3.
4043 // Due to register shortage, setting tc3 may overwrite table. With the return offset
4044 // at hand, the original table address can be easily reconstructed.
4045 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
4046
4047 #ifdef VM_LITTLE_ENDIAN
4048 // This is what we implement (the DOLIT4 part):
// =========================================================================
4050 // #define DOLIT4 c ^= *buf4++; \
4051 // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
4052 // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
4053 // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
// =========================================================================
4055 const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
4056 const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
4057 const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
4058 const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
4059 #else
4060 // This is what we implement (the DOBIG4 part):
4061 // =========================================================================
4062 // #define DOBIG4 c ^= *++buf4; \
4063 // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
4064 // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
4065 // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
4066 // =========================================================================
4067 const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
4068 const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
4069 const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
4070 const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
4071 #endif
4072 assert_different_registers(table, tc0, tc1, tc2);
4073 assert(table == tc3, "must be!");
4074
4075 addi(tc0, table, ix0);
4076 addi(tc1, table, ix1);
4077 addi(tc2, table, ix2);
4078 if (ix3 != 0) addi(tc3, table, ix3);
4079
4080 return ix3;
4081 }
4082
4083 /**
4084 * uint32_t crc;
4085 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4086 */
4087 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
4088 assert_different_registers(crc, table, tmp);
4089 assert_different_registers(val, table);
4090
4091 if (crc == val) { // Must rotate first to use the unmodified value.
4092 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4093 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
4094 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
4095 } else {
4096 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
4097 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4098 }
4099 lwzx(tmp, table, tmp);
4100 xorr(crc, crc, tmp);
4101 }
4102
4103 /**
4104 * uint32_t crc;
4105 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4106 */
4107 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
4108 fold_byte_crc32(crc, crc, table, tmp);
4109 }
4110
4111 /**
4112 * Emits code to update CRC-32 with a byte value according to constants in table.
4113 *
4114 * @param [in,out]crc Register containing the crc.
4115 * @param [in]val Register containing the byte to fold into the CRC.
4116 * @param [in]table Register containing the table of crc constants.
4117 *
4118 * uint32_t crc;
4119 * val = crc_table[(val ^ crc) & 0xFF];
4120 * crc = val ^ (crc >> 8);
4121 */
4122 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4123 BLOCK_COMMENT("update_byte_crc32:");
4124 xorr(val, val, crc);
4125 fold_byte_crc32(crc, val, table, val);
4126 }
4127
4128 /**
4129 * @param crc register containing existing CRC (32-bit)
4130 * @param buf register pointing to input byte buffer (byte*)
4131 * @param len register containing number of bytes
4132 * @param table register pointing to CRC table
4133 */
4134 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4135 Register data, bool loopAlignment) {
4136 assert_different_registers(crc, buf, len, table, data);
4137
4138 Label L_mainLoop, L_done;
4139 const int mainLoop_stepping = 1;
4140 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4141
4142 // Process all bytes in a single-byte loop.
4143 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
4144 beq(CCR0, L_done);
4145
4146 mtctr(len);
4147 align(mainLoop_alignment);
4148 BIND(L_mainLoop);
4149 lbz(data, 0, buf); // Byte from buffer, zero-extended.
4150 addi(buf, buf, mainLoop_stepping); // Advance buffer position.
4151 update_byte_crc32(crc, data, table);
4152 bdnz(L_mainLoop); // Iterate.
4153
4154 bind(L_done);
4155 }
4156
4157 /**
4158 * Emits code to update CRC-32 with a 4-byte value according to constants in table
4159 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4160 */
// A note on the lookup table address(es):
4162 // The lookup table consists of two sets of four columns each.
4163 // The columns {0..3} are used for little-endian machines.
4164 // The columns {4..7} are used for big-endian machines.
4165 // To save the effort of adding the column offset to the table address each time
4166 // a table element is looked up, it is possible to pass the pre-calculated
4167 // column addresses.
// Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
4169 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4170 Register t0, Register t1, Register t2, Register t3,
4171 Register tc0, Register tc1, Register tc2, Register tc3) {
4172 assert_different_registers(crc, t3);
4173
4174 // XOR crc with next four bytes of buffer.
4175 lwz(t3, bufDisp, buf);
4176 if (bufInc != 0) {
4177 addi(buf, buf, bufInc);
4178 }
4179 xorr(t3, t3, crc);
4180
// Chop the intermediate value in t3 (crc ^ buffer word) into 4 single-byte
// pieces, each shifted left 2 bits, to form the table indices.
rlwinm(t0, t3, 2, 24-2, 31-2); // ((t3 >> 0) & 0xff) << 2
rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t3 >> 8) & 0xff) << 2
rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t3 >> 16) & 0xff) << 2
rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t3 >> 24) & 0xff) << 2
4186
4187 // Use the pre-calculated column addresses.
4188 // Load pre-calculated table values.
4189 lwzx(t0, tc0, t0);
4190 lwzx(t1, tc1, t1);
4191 lwzx(t2, tc2, t2);
4192 lwzx(t3, tc3, t3);
4193
4194 // Calculate new crc from table values.
4195 xorr(t0, t0, t1);
4196 xorr(t2, t2, t3);
4197 xorr(crc, t0, t2); // Now crc contains the final checksum value.
4198 }
4199
4200 /**
4201 * @param crc register containing existing CRC (32-bit)
4202 * @param buf register pointing to input byte buffer (byte*)
4203 * @param len register containing number of bytes
4204 * @param table register pointing to CRC table
4205 *
* Uses R9..R12 as work registers. Must be saved/restored by caller!
4207 */
4208 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4209 Register t0, Register t1, Register t2, Register t3,
4210 Register tc0, Register tc1, Register tc2, Register tc3,
4211 bool invertCRC) {
4212 assert_different_registers(crc, buf, len, table);
4213
4214 Label L_mainLoop, L_tail;
4215 Register tmp = t0;
4216 Register data = t0;
4217 Register tmp2 = t1;
4218 const int mainLoop_stepping = 8;
4219 const int tailLoop_stepping = 1;
4220 const int log_stepping = exact_log2(mainLoop_stepping);
4221 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4222 const int complexThreshold = 2*mainLoop_stepping;
4223
4224 // Don't test for len <= 0 here. This pathological case should not occur anyway.
4225 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4226 // for all well-behaved cases. The situation itself is detected and handled correctly
4227 // within update_byteLoop_crc32.
4228 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4229
4230 BLOCK_COMMENT("kernel_crc32_2word {");
4231
4232 if (invertCRC) {
4233 nand(crc, crc, crc); // 1s complement of crc
4234 }
4235
4236 // Check for short (<mainLoop_stepping) buffer.
4237 cmpdi(CCR0, len, complexThreshold);
4238 blt(CCR0, L_tail);
4239
4240 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4241 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4242 {
4243 // Align buf addr to mainLoop_stepping boundary.
4244 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
rldicl(tmp2, tmp2, 0, 64-log_stepping); // Keep only the low log_stepping bits of -buf (1s mask in bit positions 61..63): bytes needed to reach alignment.
4246
4247 if (complexThreshold > mainLoop_stepping) {
4248 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4249 } else {
4250 sub(tmp, len, tmp2); // Remaining bytes for main loop.
4251 cmpdi(CCR0, tmp, mainLoop_stepping);
4252 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
4253 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4254 }
4255 update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4256 }
4257
4258 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
4259 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
4260 mtctr(tmp2);
4261
4262 #ifdef VM_LITTLE_ENDIAN
4263 Register crc_rv = crc;
4264 #else
4265 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
4266 // Occupies tmp, but frees up crc.
load_reverse_32(crc_rv, crc); // Reverse byte order because we are dealing with big-endian data.
4268 tmp = crc;
4269 #endif
4270
4271 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4272
4273 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
4274 BIND(L_mainLoop);
4275 update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4276 update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4277 bdnz(L_mainLoop);
4278
4279 #ifndef VM_LITTLE_ENDIAN
load_reverse_32(crc, crc_rv); // Reverse byte order because we are dealing with big-endian data.
tmp = crc_rv; // Tmp uses its original register again.
4282 #endif
4283
4284 // Restore original table address for tailLoop.
4285 if (reconstructTableOffset != 0) {
4286 addi(table, table, -reconstructTableOffset);
4287 }
4288
4289 // Process last few (<complexThreshold) bytes of buffer.
4290 BIND(L_tail);
4291 update_byteLoop_crc32(crc, buf, len, table, data, false);
4292
4293 if (invertCRC) {
4294 nand(crc, crc, crc); // 1s complement of crc
4295 }
4296 BLOCK_COMMENT("} kernel_crc32_2word");
4297 }
4298
4299 /**
4300 * @param crc register containing existing CRC (32-bit)
4301 * @param buf register pointing to input byte buffer (byte*)
4302 * @param len register containing number of bytes
4303 * @param table register pointing to CRC table
4304 *
* Uses R9..R12 as work registers. Must be saved/restored by caller!
4306 */
4307 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4308 Register t0, Register t1, Register t2, Register t3,
4309 Register tc0, Register tc1, Register tc2, Register tc3,
4310 bool invertCRC) {
4311 assert_different_registers(crc, buf, len, table);
4312
4313 Label L_mainLoop, L_tail;
4314 Register tmp = t0;
4315 Register data = t0;
4316 Register tmp2 = t1;
4317 const int mainLoop_stepping = 4;
4318 const int tailLoop_stepping = 1;
4319 const int log_stepping = exact_log2(mainLoop_stepping);
4320 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4321 const int complexThreshold = 2*mainLoop_stepping;
4322
4323 // Don't test for len <= 0 here. This pathological case should not occur anyway.
4324 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4325 // for all well-behaved cases. The situation itself is detected and handled correctly
4326 // within update_byteLoop_crc32.
4327 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4328
4329 BLOCK_COMMENT("kernel_crc32_1word {");
4330
4331 if (invertCRC) {
4332 nand(crc, crc, crc); // 1s complement of crc
4333 }
4334
4335 // Check for short (<mainLoop_stepping) buffer.
4336 cmpdi(CCR0, len, complexThreshold);
4337 blt(CCR0, L_tail);
4338
4339 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4340 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4341 {
4342 // Align buf addr to mainLoop_stepping boundary.
4343 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
rldicl(tmp2, tmp2, 0, 64-log_stepping); // Keep only the low log_stepping bits of -buf (1s mask in bit positions 62..63): bytes needed to reach alignment.
4345
4346 if (complexThreshold > mainLoop_stepping) {
4347 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4348 } else {
4349 sub(tmp, len, tmp2); // Remaining bytes for main loop.
4350 cmpdi(CCR0, tmp, mainLoop_stepping);
4351 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
4352 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4353 }
4354 update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4355 }
4356
4357 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
4358 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
4359 mtctr(tmp2);
4360
4361 #ifdef VM_LITTLE_ENDIAN
4362 Register crc_rv = crc;
4363 #else
4364 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
4365 // Occupies tmp, but frees up crc.
load_reverse_32(crc_rv, crc); // Reverse byte order because we are dealing with big-endian data.
4367 tmp = crc;
4368 #endif
4369
4370 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4371
4372 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
4373 BIND(L_mainLoop);
4374 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4375 bdnz(L_mainLoop);
4376
4377 #ifndef VM_LITTLE_ENDIAN
load_reverse_32(crc, crc_rv); // Reverse byte order because we are dealing with big-endian data.
tmp = crc_rv; // Tmp uses its original register again.
4380 #endif
4381
4382 // Restore original table address for tailLoop.
4383 if (reconstructTableOffset != 0) {
4384 addi(table, table, -reconstructTableOffset);
4385 }
4386
4387 // Process last few (<complexThreshold) bytes of buffer.
4388 BIND(L_tail);
4389 update_byteLoop_crc32(crc, buf, len, table, data, false);
4390
4391 if (invertCRC) {
4392 nand(crc, crc, crc); // 1s complement of crc
4393 }
4394 BLOCK_COMMENT("} kernel_crc32_1word");
4395 }
4396
4397 /**
4398 * @param crc register containing existing CRC (32-bit)
4399 * @param buf register pointing to input byte buffer (byte*)
4400 * @param len register containing number of bytes
4401 * @param table register pointing to CRC table
4402 *
4403 * Uses R7_ARG5, R8_ARG6 as work registers.
4404 */
4405 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4406 Register t0, Register t1, Register t2, Register t3,
4407 bool invertCRC) {
4408 assert_different_registers(crc, buf, len, table);
4409
4410 Register data = t0; // Holds the current byte to be folded into crc.
4411
4412 BLOCK_COMMENT("kernel_crc32_1byte {");
4413
4414 if (invertCRC) {
4415 nand(crc, crc, crc); // 1s complement of crc
4416 }
4417
4418 // Process all bytes in a single-byte loop.
4419 update_byteLoop_crc32(crc, buf, len, table, data, true);
4420
4421 if (invertCRC) {
4422 nand(crc, crc, crc); // 1s complement of crc
4423 }
4424 BLOCK_COMMENT("} kernel_crc32_1byte");
4425 }
4426
4427 /**
4428 * @param crc register containing existing CRC (32-bit)
4429 * @param buf register pointing to input byte buffer (byte*)
4430 * @param len register containing number of bytes
4431 * @param table register pointing to CRC table
4432 * @param constants register pointing to CRC table for 128-bit aligned memory
 * @param barretConstants register pointing to table for Barrett reduction
4434 * @param t0 volatile register
4435 * @param t1 volatile register
4436 * @param t2 volatile register
4437 * @param t3 volatile register
4438 */
4439 void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
4440 Register constants, Register barretConstants,
4441 Register t0, Register t1, Register t2, Register t3, Register t4,
4442 bool invertCRC) {
4443 assert_different_registers(crc, buf, len, table);
4444
4445 Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
4446
4447 Register prealign = t0;
4448 Register postalign = t0;
4449
4450 BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {");
4451
  // 1. use kernel_crc32_1word for buffers shorter than 384 bytes
4453 clrldi(len, len, 32);
4454 cmpdi(CCR0, len, 384);
4455 bge(CCR0, L_start);
4456
4457 Register tc0 = t4;
4458 Register tc1 = constants;
4459 Register tc2 = barretConstants;
  kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
4461 b(L_end);
4462
4463 BIND(L_start);
4464
4465 // 2. ~c
4466 if (invertCRC) {
4467 nand(crc, crc, crc); // 1s complement of crc
4468 }
4469
  // 3. calculate from 0 to first 128-byte-aligned address
4471 clrldi_(prealign, buf, 57);
4472 beq(CCR0, L_alignedHead);
4473
4474 subfic(prealign, prealign, 128);
4475
4476 subf(len, prealign, len);
4477 update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4478
  // 4. calculate from first 128-byte-aligned address to last 128-byte-aligned address
4480 BIND(L_alignedHead);
4481
4482 clrldi(postalign, len, 57);
4483 subf(len, postalign, len);
4484
  // len is now a multiple of 128 and at least 256 bytes
4486 kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
4487
4488 // 5. calculate remaining
4489 cmpdi(CCR0, postalign, 0);
4490 beq(CCR0, L_tail);
4491
4492 update_byteLoop_crc32(crc, buf, postalign, table, t2, false);
4493
4494 BIND(L_tail);
4495
4496 // 6. ~c
4497 if (invertCRC) {
4498 nand(crc, crc, crc); // 1s complement of crc
4499 }
4500
4501 BIND(L_end);
4502
4503 BLOCK_COMMENT("} kernel_crc32_1word_vpmsumb");
4504 }
4505
4506 /**
4507 * @param crc register containing existing CRC (32-bit)
4508 * @param buf register pointing to input byte buffer (byte*)
4509 * @param len register containing number of bytes
4510 * @param constants register pointing to CRC table for 128-bit aligned memory
 * @param barretConstants register pointing to table for Barrett reduction
4512 * @param t0 volatile register
4513 * @param t1 volatile register
4514 * @param t2 volatile register
4515 */
4516 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4517 Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
4518 Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
4519 Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
4520 Label L_1, L_2, L_3, L_4;
4521
4522 Register rLoaded = t0;
4523 Register rTmp1 = t1;
4524 Register rTmp2 = t2;
4525 Register off16 = R22;
4526 Register off32 = R23;
4527 Register off48 = R24;
4528 Register off64 = R25;
4529 Register off80 = R26;
4530 Register off96 = R27;
4531 Register off112 = R28;
4532 Register rIdx = R29;
4533 Register rMax = R30;
4534 Register constantsPos = R31;
4535
4536 VectorRegister mask_32bit = VR24;
4537 VectorRegister mask_64bit = VR25;
4538 VectorRegister zeroes = VR26;
4539 VectorRegister const1 = VR27;
4540 VectorRegister const2 = VR28;
4541
  // Save non-volatile vector registers and GPRs R22-R31 (frameless, below SP).
4543 Register offset = t1; int offsetInt = 0;
4544 offsetInt -= 16; li(offset, -16); stvx(VR20, offset, R1_SP);
4545 offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
4546 offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
4547 offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
4548 offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
4549 offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
4550 offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
4551 offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
4552 offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
4553 offsetInt -= 8; std(R22, offsetInt, R1_SP);
4554 offsetInt -= 8; std(R23, offsetInt, R1_SP);
4555 offsetInt -= 8; std(R24, offsetInt, R1_SP);
4556 offsetInt -= 8; std(R25, offsetInt, R1_SP);
4557 offsetInt -= 8; std(R26, offsetInt, R1_SP);
4558 offsetInt -= 8; std(R27, offsetInt, R1_SP);
4559 offsetInt -= 8; std(R28, offsetInt, R1_SP);
4560 offsetInt -= 8; std(R29, offsetInt, R1_SP);
4561 offsetInt -= 8; std(R30, offsetInt, R1_SP);
4562 offsetInt -= 8; std(R31, offsetInt, R1_SP);
4563
4564 // Set constants
4565 li(off16, 16);
4566 li(off32, 32);
4567 li(off48, 48);
4568 li(off64, 64);
4569 li(off80, 80);
4570 li(off96, 96);
4571 li(off112, 112);
4572
  clrldi(crc, crc, 32); // Zero-extend the 32-bit crc to 64 bits.
4574
4575 vxor(zeroes, zeroes, zeroes);
  vspltisw(VR0, -1);                    // VR0 = all ones.

  vsldoi(mask_32bit, zeroes, VR0, 4);   // Ones in the bottom 32 bits of the vector.
  vsldoi(mask_64bit, zeroes, VR0, -8);  // Ones in the bottom 64 bits of the vector.
4580
  // Get the initial value into VR8
4582 vxor(VR8, VR8, VR8);
4583 mtvrd(VR8, crc);
4584 vsldoi(VR8, zeroes, VR8, -8); // shift into bottom 32 bits
4585
4586 li (rLoaded, 0);
4587
  rldicr(rIdx, len, 0, 56); // rIdx = len & ~127, i.e. the number of bytes processed in 128-byte blocks.
4589
4590 {
4591 BIND(L_1);
4592 // Checksum in blocks of MAX_SIZE (32768)
4593 lis(rMax, 0);
4594 ori(rMax, rMax, 32768);
4595 mr(rTmp2, rMax);
4596 cmpd(CCR0, rIdx, rMax);
4597 bgt(CCR0, L_2);
4598 mr(rMax, rIdx);
4599
4600 BIND(L_2);
4601 subf(rIdx, rMax, rIdx);
4602
4603 // our main loop does 128 bytes at a time
4604 srdi(rMax, rMax, 7);
4605
4606 /*
4607 * Work out the offset into the constants table to start at. Each
4608 * constant is 16 bytes, and it is used against 128 bytes of input
4609 * data - 128 / 16 = 8
4610 */
4611 sldi(rTmp1, rMax, 4);
4612 srdi(rTmp2, rTmp2, 3);
4613 subf(rTmp1, rTmp1, rTmp2);
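    // Worked example (illustrative numbers): a full 32768-byte chunk gives
    // rMax = 32768/128 = 256 iterations, so the offset is 32768/8 - 256*16 = 0,
    // i.e. we start at the beginning of the table. A final 1024-byte chunk gives
    // 8 iterations and an offset of 4096 - 128 = 3968, i.e. we skip the constants
    // belonging to the 248 blocks we do not process.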
4614
4615 // We reduce our final 128 bytes in a separate step
4616 addi(rMax, rMax, -1);
4617 mtctr(rMax);
4618
4619 // Find the start of our constants
4620 add(constantsPos, constants, rTmp1);
4621
    // zero VR0-VR7 which will contain our checksums
4623 vxor(VR0, VR0, VR0);
4624 vxor(VR1, VR1, VR1);
4625 vxor(VR2, VR2, VR2);
4626 vxor(VR3, VR3, VR3);
4627 vxor(VR4, VR4, VR4);
4628 vxor(VR5, VR5, VR5);
4629 vxor(VR6, VR6, VR6);
4630 vxor(VR7, VR7, VR7);
4631
4632 lvx(const1, constantsPos);
4633
4634 /*
4635 * If we are looping back to consume more data we use the values
     * already in VR16-VR23.
4637 */
4638 cmpdi(CCR0, rLoaded, 1);
4639 beq(CCR0, L_3);
4640 {
4641
4642 // First warm up pass
4643 lvx(VR16, buf);
4644 lvx(VR17, off16, buf);
4645 lvx(VR18, off32, buf);
4646 lvx(VR19, off48, buf);
4647 lvx(VR20, off64, buf);
4648 lvx(VR21, off80, buf);
4649 lvx(VR22, off96, buf);
4650 lvx(VR23, off112, buf);
4651 addi(buf, buf, 8*16);
4652
4653 // xor in initial value
4654 vxor(VR16, VR16, VR8);
4655 }
4656
4657 BIND(L_3);
4658 bdz(L_first_warm_up_done);
4659
4660 addi(constantsPos, constantsPos, 16);
4661 lvx(const2, constantsPos);
4662
4663 // Second warm up pass
4664 vpmsumd(VR8, VR16, const1);
4665 lvx(VR16, buf);
4666
4667 vpmsumd(VR9, VR17, const1);
4668 lvx(VR17, off16, buf);
4669
4670 vpmsumd(VR10, VR18, const1);
4671 lvx(VR18, off32, buf);
4672
4673 vpmsumd(VR11, VR19, const1);
4674 lvx(VR19, off48, buf);
4675
4676 vpmsumd(VR12, VR20, const1);
4677 lvx(VR20, off64, buf);
4678
4679 vpmsumd(VR13, VR21, const1);
4680 lvx(VR21, off80, buf);
4681
4682 vpmsumd(VR14, VR22, const1);
4683 lvx(VR22, off96, buf);
4684
4685 vpmsumd(VR15, VR23, const1);
4686 lvx(VR23, off112, buf);
4687
4688 addi(buf, buf, 8 * 16);
4689
4690 bdz(L_first_cool_down);
4691
4692 /*
4693 * main loop. We modulo schedule it such that it takes three iterations
4694 * to complete - first iteration load, second iteration vpmsum, third
4695 * iteration xor.
4696 */
4697 {
4698 BIND(L_4);
4699 lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);
4700
4701 vxor(VR0, VR0, VR8);
4702 vpmsumd(VR8, VR16, const2);
4703 lvx(VR16, buf);
4704
4705 vxor(VR1, VR1, VR9);
4706 vpmsumd(VR9, VR17, const2);
4707 lvx(VR17, off16, buf);
4708
4709 vxor(VR2, VR2, VR10);
4710 vpmsumd(VR10, VR18, const2);
4711 lvx(VR18, off32, buf);
4712
4713 vxor(VR3, VR3, VR11);
4714 vpmsumd(VR11, VR19, const2);
4715 lvx(VR19, off48, buf);
4716 lvx(const2, constantsPos);
4717
4718 vxor(VR4, VR4, VR12);
4719 vpmsumd(VR12, VR20, const1);
4720 lvx(VR20, off64, buf);
4721
4722 vxor(VR5, VR5, VR13);
4723 vpmsumd(VR13, VR21, const1);
4724 lvx(VR21, off80, buf);
4725
4726 vxor(VR6, VR6, VR14);
4727 vpmsumd(VR14, VR22, const1);
4728 lvx(VR22, off96, buf);
4729
4730 vxor(VR7, VR7, VR15);
4731 vpmsumd(VR15, VR23, const1);
4732 lvx(VR23, off112, buf);
4733
4734 addi(buf, buf, 8 * 16);
4735
4736 bdnz(L_4);
4737 }
4738
4739 BIND(L_first_cool_down);
4740
4741 // First cool down pass
4742 lvx(const1, constantsPos);
4743 addi(constantsPos, constantsPos, 16);
4744
4745 vxor(VR0, VR0, VR8);
4746 vpmsumd(VR8, VR16, const1);
4747
4748 vxor(VR1, VR1, VR9);
4749 vpmsumd(VR9, VR17, const1);
4750
4751 vxor(VR2, VR2, VR10);
4752 vpmsumd(VR10, VR18, const1);
4753
4754 vxor(VR3, VR3, VR11);
4755 vpmsumd(VR11, VR19, const1);
4756
4757 vxor(VR4, VR4, VR12);
4758 vpmsumd(VR12, VR20, const1);
4759
4760 vxor(VR5, VR5, VR13);
4761 vpmsumd(VR13, VR21, const1);
4762
4763 vxor(VR6, VR6, VR14);
4764 vpmsumd(VR14, VR22, const1);
4765
4766 vxor(VR7, VR7, VR15);
4767 vpmsumd(VR15, VR23, const1);
4768
4769 BIND(L_second_cool_down);
4770 // Second cool down pass
4771 vxor(VR0, VR0, VR8);
4772 vxor(VR1, VR1, VR9);
4773 vxor(VR2, VR2, VR10);
4774 vxor(VR3, VR3, VR11);
4775 vxor(VR4, VR4, VR12);
4776 vxor(VR5, VR5, VR13);
4777 vxor(VR6, VR6, VR14);
4778 vxor(VR7, VR7, VR15);
4779
4780 /*
4781 * vpmsumd produces a 96 bit result in the least significant bits
4782 * of the register. Since we are bit reflected we have to shift it
4783 * left 32 bits so it occupies the least significant bits in the
4784 * bit reflected domain.
4785 */
4786 vsldoi(VR0, VR0, zeroes, 4);
4787 vsldoi(VR1, VR1, zeroes, 4);
4788 vsldoi(VR2, VR2, zeroes, 4);
4789 vsldoi(VR3, VR3, zeroes, 4);
4790 vsldoi(VR4, VR4, zeroes, 4);
4791 vsldoi(VR5, VR5, zeroes, 4);
4792 vsldoi(VR6, VR6, zeroes, 4);
4793 vsldoi(VR7, VR7, zeroes, 4);
4794
4795 // xor with last 1024 bits
4796 lvx(VR8, buf);
4797 lvx(VR9, off16, buf);
4798 lvx(VR10, off32, buf);
4799 lvx(VR11, off48, buf);
4800 lvx(VR12, off64, buf);
4801 lvx(VR13, off80, buf);
4802 lvx(VR14, off96, buf);
4803 lvx(VR15, off112, buf);
4804 addi(buf, buf, 8 * 16);
4805
4806 vxor(VR16, VR0, VR8);
4807 vxor(VR17, VR1, VR9);
4808 vxor(VR18, VR2, VR10);
4809 vxor(VR19, VR3, VR11);
4810 vxor(VR20, VR4, VR12);
4811 vxor(VR21, VR5, VR13);
4812 vxor(VR22, VR6, VR14);
4813 vxor(VR23, VR7, VR15);
4814
4815 li(rLoaded, 1);
4816 cmpdi(CCR0, rIdx, 0);
4817 addi(rIdx, rIdx, 128);
4818 bne(CCR0, L_1);
4819 }
4820
4821 // Work out how many bytes we have left
4822 andi_(len, len, 127);
4823
4824 // Calculate where in the constant table we need to start
4825 subfic(rTmp1, len, 128);
4826 add(constantsPos, constantsPos, rTmp1);
4827
4828 // How many 16 byte chunks are in the tail
4829 srdi(rIdx, len, 4);
4830 mtctr(rIdx);
4831
4832 /*
4833 * Reduce the previously calculated 1024 bits to 64 bits, shifting
4834 * 32 bits to include the trailing 32 bits of zeros
4835 */
4836 lvx(VR0, constantsPos);
4837 lvx(VR1, off16, constantsPos);
4838 lvx(VR2, off32, constantsPos);
4839 lvx(VR3, off48, constantsPos);
4840 lvx(VR4, off64, constantsPos);
4841 lvx(VR5, off80, constantsPos);
4842 lvx(VR6, off96, constantsPos);
4843 lvx(VR7, off112, constantsPos);
4844 addi(constantsPos, constantsPos, 8 * 16);
4845
4846 vpmsumw(VR0, VR16, VR0);
4847 vpmsumw(VR1, VR17, VR1);
4848 vpmsumw(VR2, VR18, VR2);
4849 vpmsumw(VR3, VR19, VR3);
4850 vpmsumw(VR4, VR20, VR4);
4851 vpmsumw(VR5, VR21, VR5);
4852 vpmsumw(VR6, VR22, VR6);
4853 vpmsumw(VR7, VR23, VR7);
4854
4855 // Now reduce the tail (0 - 112 bytes)
4856 cmpdi(CCR0, rIdx, 0);
4857 beq(CCR0, L_XOR);
4858
4859 lvx(VR16, buf); addi(buf, buf, 16);
4860 lvx(VR17, constantsPos);
4861 vpmsumw(VR16, VR16, VR17);
4862 vxor(VR0, VR0, VR16);
4863 beq(CCR0, L_XOR);
4864
4865 lvx(VR16, buf); addi(buf, buf, 16);
4866 lvx(VR17, off16, constantsPos);
4867 vpmsumw(VR16, VR16, VR17);
4868 vxor(VR0, VR0, VR16);
4869 beq(CCR0, L_XOR);
4870
4871 lvx(VR16, buf); addi(buf, buf, 16);
4872 lvx(VR17, off32, constantsPos);
4873 vpmsumw(VR16, VR16, VR17);
4874 vxor(VR0, VR0, VR16);
4875 beq(CCR0, L_XOR);
4876
4877 lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off48, constantsPos);
4879 vpmsumw(VR16, VR16, VR17);
4880 vxor(VR0, VR0, VR16);
4881 beq(CCR0, L_XOR);
4882
4883 lvx(VR16, buf); addi(buf, buf, 16);
4884 lvx(VR17, off64, constantsPos);
4885 vpmsumw(VR16, VR16, VR17);
4886 vxor(VR0, VR0, VR16);
4887 beq(CCR0, L_XOR);
4888
4889 lvx(VR16, buf); addi(buf, buf, 16);
4890 lvx(VR17, off80, constantsPos);
4891 vpmsumw(VR16, VR16, VR17);
4892 vxor(VR0, VR0, VR16);
4893 beq(CCR0, L_XOR);
4894
4895 lvx(VR16, buf); addi(buf, buf, 16);
4896 lvx(VR17, off96, constantsPos);
4897 vpmsumw(VR16, VR16, VR17);
4898 vxor(VR0, VR0, VR16);
4899
4900 // Now xor all the parallel chunks together
4901 BIND(L_XOR);
4902 vxor(VR0, VR0, VR1);
4903 vxor(VR2, VR2, VR3);
4904 vxor(VR4, VR4, VR5);
4905 vxor(VR6, VR6, VR7);
4906
4907 vxor(VR0, VR0, VR2);
4908 vxor(VR4, VR4, VR6);
4909
4910 vxor(VR0, VR0, VR4);
4911
4912 b(L_barrett_reduction);
4913
4914 BIND(L_first_warm_up_done);
4915 lvx(const1, constantsPos);
4916 addi(constantsPos, constantsPos, 16);
4917 vpmsumd(VR8, VR16, const1);
4918 vpmsumd(VR9, VR17, const1);
4919 vpmsumd(VR10, VR18, const1);
4920 vpmsumd(VR11, VR19, const1);
4921 vpmsumd(VR12, VR20, const1);
4922 vpmsumd(VR13, VR21, const1);
4923 vpmsumd(VR14, VR22, const1);
4924 vpmsumd(VR15, VR23, const1);
4925 b(L_second_cool_down);
4926
4927 BIND(L_barrett_reduction);
4928
4929 lvx(const1, barretConstants);
4930 addi(barretConstants, barretConstants, 16);
4931 lvx(const2, barretConstants);
4932
4933 vsldoi(VR1, VR0, VR0, -8);
4934 vxor(VR0, VR0, VR1); // xor two 64 bit results together
4935
4936 // shift left one bit
4937 vspltisb(VR1, 1);
4938 vsl(VR0, VR0, VR1);
4939
4940 vand(VR0, VR0, mask_64bit);
4941
4942 /*
4943 * The reflected version of Barrett reduction. Instead of bit
4944 * reflecting our data (which is expensive to do), we bit reflect our
4945 * constants and our algorithm, which means the intermediate data in
4946 * our vector registers goes from 0-63 instead of 63-0. We can reflect
4947 * the algorithm because we don't carry in mod 2 arithmetic.
4948 */
4949 vand(VR1, VR0, mask_32bit); // bottom 32 bits of a
4950 vpmsumd(VR1, VR1, const1); // ma
4951 vand(VR1, VR1, mask_32bit); // bottom 32bits of ma
  vpmsumd(VR1, VR1, const2);   // qn
4953 vxor(VR0, VR0, VR1); // a - qn, subtraction is xor in GF(2)
4954
4955 /*
4956 * Since we are bit reflected, the result (ie the low 32 bits) is in
4957 * the high 32 bits. We just need to shift it left 4 bytes
4958 * V0 [ 0 1 X 3 ]
4959 * V0 [ 0 X 2 3 ]
4960 */
4961 vsldoi(VR0, VR0, zeroes, 4); // shift result into top 64 bits of
4962
  // Move the result into the crc register.
4964 mfvrd(crc, VR0);
4965
4966 BIND(L_end);
4967
4968 offsetInt = 0;
  // Restore non-volatile vector registers and GPRs (frameless).
4970 offsetInt -= 16; li(offset, -16); lvx(VR20, offset, R1_SP);
4971 offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
4972 offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
4973 offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
4974 offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
4975 offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
4976 offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
4977 offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
4978 offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
4979 offsetInt -= 8; ld(R22, offsetInt, R1_SP);
4980 offsetInt -= 8; ld(R23, offsetInt, R1_SP);
4981 offsetInt -= 8; ld(R24, offsetInt, R1_SP);
4982 offsetInt -= 8; ld(R25, offsetInt, R1_SP);
4983 offsetInt -= 8; ld(R26, offsetInt, R1_SP);
4984 offsetInt -= 8; ld(R27, offsetInt, R1_SP);
4985 offsetInt -= 8; ld(R28, offsetInt, R1_SP);
4986 offsetInt -= 8; ld(R29, offsetInt, R1_SP);
4987 offsetInt -= 8; ld(R30, offsetInt, R1_SP);
4988 offsetInt -= 8; ld(R31, offsetInt, R1_SP);
4989 }
4990
4991 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
4992 assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
4993
4994 BLOCK_COMMENT("kernel_crc32_singleByte:");
4995 if (invertCRC) {
4996 nand(crc, crc, crc); // 1s complement of crc
4997 }
4998
4999 lbz(tmp, 0, buf); // Byte from buffer, zero-extended.
5000 update_byte_crc32(crc, tmp, table);
5001
5002 if (invertCRC) {
5003 nand(crc, crc, crc); // 1s complement of crc
5004 }
5005 }
5006
5007 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
5008 assert_different_registers(crc, val, table);
5009
5010 BLOCK_COMMENT("kernel_crc32_singleByteReg:");
5011 if (invertCRC) {
5012 nand(crc, crc, crc); // 1s complement of crc
5013 }
5014
5015 update_byte_crc32(crc, val, table);
5016
5017 if (invertCRC) {
5018 nand(crc, crc, crc); // 1s complement of crc
5019 }
5020 }
5021
// dest_lo += src1 + src2
// dest_hi += the carries produced by the two additions above
5024 void MacroAssembler::add2_with_carry(Register dest_hi,
5025 Register dest_lo,
5026 Register src1, Register src2) {
5027 li(R0, 0);
5028 addc(dest_lo, dest_lo, src1);
5029 adde(dest_hi, dest_hi, R0);
5030 addc(dest_lo, dest_lo, src2);
5031 adde(dest_hi, dest_hi, R0);
5032 }
5033
5034 // Multiply 64 bit by 64 bit first loop.
5035 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
5036 Register x_xstart,
5037 Register y, Register y_idx,
5038 Register z,
5039 Register carry,
5040 Register product_high, Register product,
5041 Register idx, Register kdx,
5042 Register tmp) {
5043 // jlong carry, x[], y[], z[];
5044 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5045 // huge_128 product = y[idx] * x[xstart] + carry;
5046 // z[kdx] = (jlong)product;
5047 // carry = (jlong)(product >>> 64);
5048 // }
5049 // z[xstart] = carry;
5050
5051 Label L_first_loop, L_first_loop_exit;
5052 Label L_one_x, L_one_y, L_multiply;
5053
5054 addic_(xstart, xstart, -1);
5055 blt(CCR0, L_one_x); // Special case: length of x is 1.
5056
5057 // Load next two integers of x.
5058 sldi(tmp, xstart, LogBytesPerInt);
5059 ldx(x_xstart, x, tmp);
5060 #ifdef VM_LITTLE_ENDIAN
5061 rldicl(x_xstart, x_xstart, 32, 0);
5062 #endif
5063
5064 align(32, 16);
5065 bind(L_first_loop);
5066
5067 cmpdi(CCR0, idx, 1);
5068 blt(CCR0, L_first_loop_exit);
5069 addi(idx, idx, -2);
5070 beq(CCR0, L_one_y);
5071
5072 // Load next two integers of y.
5073 sldi(tmp, idx, LogBytesPerInt);
5074 ldx(y_idx, y, tmp);
5075 #ifdef VM_LITTLE_ENDIAN
5076 rldicl(y_idx, y_idx, 32, 0);
5077 #endif
5078
5079
5080 bind(L_multiply);
5081 multiply64(product_high, product, x_xstart, y_idx);
5082
5083 li(tmp, 0);
5084 addc(product, product, carry); // Add carry to result.
5085 adde(product_high, product_high, tmp); // Add carry of the last addition.
5086 addi(kdx, kdx, -2);
5087
5088 // Store result.
5089 #ifdef VM_LITTLE_ENDIAN
5090 rldicl(product, product, 32, 0);
5091 #endif
5092 sldi(tmp, kdx, LogBytesPerInt);
5093 stdx(product, z, tmp);
5094 mr_if_needed(carry, product_high);
5095 b(L_first_loop);
5096
5097
5098 bind(L_one_y); // Load one 32 bit portion of y as (0,value).
5099
5100 lwz(y_idx, 0, y);
5101 b(L_multiply);
5102
5103
5104 bind(L_one_x); // Load one 32 bit portion of x as (0,value).
5105
5106 lwz(x_xstart, 0, x);
5107 b(L_first_loop);
5108
5109 bind(L_first_loop_exit);
5110 }
5111
5112 // Multiply 64 bit by 64 bit and add 128 bit.
5113 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
5114 Register z, Register yz_idx,
5115 Register idx, Register carry,
5116 Register product_high, Register product,
5117 Register tmp, int offset) {
5118
5119 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
5120 // z[kdx] = (jlong)product;
5121
5122 sldi(tmp, idx, LogBytesPerInt);
5123 if (offset) {
5124 addi(tmp, tmp, offset);
5125 }
5126 ldx(yz_idx, y, tmp);
5127 #ifdef VM_LITTLE_ENDIAN
5128 rldicl(yz_idx, yz_idx, 32, 0);
5129 #endif
5130
5131 multiply64(product_high, product, x_xstart, yz_idx);
5132 ldx(yz_idx, z, tmp);
5133 #ifdef VM_LITTLE_ENDIAN
5134 rldicl(yz_idx, yz_idx, 32, 0);
5135 #endif
5136
5137 add2_with_carry(product_high, product, carry, yz_idx);
5138
5139 sldi(tmp, idx, LogBytesPerInt);
5140 if (offset) {
5141 addi(tmp, tmp, offset);
5142 }
5143 #ifdef VM_LITTLE_ENDIAN
5144 rldicl(product, product, 32, 0);
5145 #endif
5146 stdx(product, z, tmp);
5147 }
5148
5149 // Multiply 128 bit by 128 bit. Unrolled inner loop.
5150 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
5151 Register y, Register z,
5152 Register yz_idx, Register idx, Register carry,
5153 Register product_high, Register product,
5154 Register carry2, Register tmp) {
5155
5156 // jlong carry, x[], y[], z[];
5157 // int kdx = ystart+1;
5158 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5159 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
5160 // z[kdx+idx+1] = (jlong)product;
5161 // jlong carry2 = (jlong)(product >>> 64);
5162 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
5163 // z[kdx+idx] = (jlong)product;
5164 // carry = (jlong)(product >>> 64);
5165 // }
5166 // idx += 2;
5167 // if (idx > 0) {
5168 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
5169 // z[kdx+idx] = (jlong)product;
5170 // carry = (jlong)(product >>> 64);
5171 // }
5172
5173 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5174 const Register jdx = R0;
5175
5176 // Scale the index.
5177 srdi_(jdx, idx, 2);
5178 beq(CCR0, L_third_loop_exit);
5179 mtctr(jdx);
5180
5181 align(32, 16);
5182 bind(L_third_loop);
5183
5184 addi(idx, idx, -4);
5185
5186 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
5187 mr_if_needed(carry2, product_high);
5188
5189 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
5190 mr_if_needed(carry, product_high);
5191 bdnz(L_third_loop);
5192
5193 bind(L_third_loop_exit); // Handle any left-over operand parts.
5194
5195 andi_(idx, idx, 0x3);
5196 beq(CCR0, L_post_third_loop_done);
5197
5198 Label L_check_1;
5199
5200 addic_(idx, idx, -2);
5201 blt(CCR0, L_check_1);
5202
5203 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
5204 mr_if_needed(carry, product_high);
5205
5206 bind(L_check_1);
5207
5208 addi(idx, idx, 0x2);
5209 andi_(idx, idx, 0x1);
5210 addic_(idx, idx, -1);
5211 blt(CCR0, L_post_third_loop_done);
5212
5213 sldi(tmp, idx, LogBytesPerInt);
5214 lwzx(yz_idx, y, tmp);
5215 multiply64(product_high, product, x_xstart, yz_idx);
5216 lwzx(yz_idx, z, tmp);
5217
5218 add2_with_carry(product_high, product, yz_idx, carry);
5219
5220 sldi(tmp, idx, LogBytesPerInt);
5221 stwx(product, z, tmp);
5222 srdi(product, product, 32);
5223
5224 sldi(product_high, product_high, 32);
5225 orr(product, product, product_high);
5226 mr_if_needed(carry, product);
5227
5228 bind(L_post_third_loop_done);
5229 } // multiply_128_x_128_loop
5230
5231 void MacroAssembler::multiply_to_len(Register x, Register xlen,
5232 Register y, Register ylen,
5233 Register z, Register zlen,
5234 Register tmp1, Register tmp2,
5235 Register tmp3, Register tmp4,
5236 Register tmp5, Register tmp6,
5237 Register tmp7, Register tmp8,
5238 Register tmp9, Register tmp10,
5239 Register tmp11, Register tmp12,
5240 Register tmp13) {
5241
5242 ShortBranchVerifier sbv(this);
5243
5244 assert_different_registers(x, xlen, y, ylen, z, zlen,
5245 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
5246 assert_different_registers(x, xlen, y, ylen, z, zlen,
5247 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
5248 assert_different_registers(x, xlen, y, ylen, z, zlen,
5249 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
5250
5251 const Register idx = tmp1;
5252 const Register kdx = tmp2;
5253 const Register xstart = tmp3;
5254
5255 const Register y_idx = tmp4;
5256 const Register carry = tmp5;
5257 const Register product = tmp6;
5258 const Register product_high = tmp7;
5259 const Register x_xstart = tmp8;
5260 const Register tmp = tmp9;
5261
5262 // First Loop.
5263 //
5264 // final static long LONG_MASK = 0xffffffffL;
5265 // int xstart = xlen - 1;
5266 // int ystart = ylen - 1;
5267 // long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5269 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
5270 // z[kdx] = (int)product;
5271 // carry = product >>> 32;
5272 // }
5273 // z[xstart] = (int)carry;
5274
5275 mr_if_needed(idx, ylen); // idx = ylen
5276 mr_if_needed(kdx, zlen); // kdx = xlen + ylen
5277 li(carry, 0); // carry = 0
5278
5279 Label L_done;
5280
5281 addic_(xstart, xlen, -1);
5282 blt(CCR0, L_done);
5283
5284 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
5285 carry, product_high, product, idx, kdx, tmp);
5286
5287 Label L_second_loop;
5288
5289 cmpdi(CCR0, kdx, 0);
5290 beq(CCR0, L_second_loop);
5291
5292 Label L_carry;
5293
5294 addic_(kdx, kdx, -1);
5295 beq(CCR0, L_carry);
5296
5297 // Store lower 32 bits of carry.
5298 sldi(tmp, kdx, LogBytesPerInt);
5299 stwx(carry, z, tmp);
5300 srdi(carry, carry, 32);
5301 addi(kdx, kdx, -1);
5302
5303
5304 bind(L_carry);
5305
5306 // Store upper 32 bits of carry.
5307 sldi(tmp, kdx, LogBytesPerInt);
5308 stwx(carry, z, tmp);
5309
5310 // Second and third (nested) loops.
5311 //
5312 // for (int i = xstart-1; i >= 0; i--) { // Second loop
5313 // carry = 0;
5314 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5315 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5316 // (z[k] & LONG_MASK) + carry;
5317 // z[k] = (int)product;
5318 // carry = product >>> 32;
5319 // }
5320 // z[i] = (int)carry;
5321 // }
5322 //
5323 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
5324
5325 bind(L_second_loop);
5326
5327 li(carry, 0); // carry = 0;
5328
5329 addic_(xstart, xstart, -1); // i = xstart-1;
5330 blt(CCR0, L_done);
5331
5332 Register zsave = tmp10;
5333
5334 mr(zsave, z);
5335
5336
5337 Label L_last_x;
5338
5339 sldi(tmp, xstart, LogBytesPerInt);
5340 add(z, z, tmp); // z = z + k - j
5341 addi(z, z, 4);
5342 addic_(xstart, xstart, -1); // i = xstart-1;
5343 blt(CCR0, L_last_x);
5344
5345 sldi(tmp, xstart, LogBytesPerInt);
5346 ldx(x_xstart, x, tmp);
5347 #ifdef VM_LITTLE_ENDIAN
5348 rldicl(x_xstart, x_xstart, 32, 0);
5349 #endif
5350
5351
5352 Label L_third_loop_prologue;
5353
5354 bind(L_third_loop_prologue);
5355
5356 Register xsave = tmp11;
5357 Register xlensave = tmp12;
5358 Register ylensave = tmp13;
5359
5360 mr(xsave, x);
5361 mr(xlensave, xstart);
5362 mr(ylensave, ylen);
5363
5364
5365 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
5366 carry, product_high, product, x, tmp);
5367
5368 mr(z, zsave);
5369 mr(x, xsave);
5370 mr(xlen, xlensave); // This is the decrement of the loop counter!
5371 mr(ylen, ylensave);
5372
5373 addi(tmp3, xlen, 1);
5374 sldi(tmp, tmp3, LogBytesPerInt);
5375 stwx(carry, z, tmp);
5376 addic_(tmp3, tmp3, -1);
5377 blt(CCR0, L_done);
5378
5379 srdi(carry, carry, 32);
5380 sldi(tmp, tmp3, LogBytesPerInt);
5381 stwx(carry, z, tmp);
5382 b(L_second_loop);
5383
5384 // Next infrequent code is moved outside loops.
5385 bind(L_last_x);
5386
5387 lwz(x_xstart, 0, x);
5388 b(L_third_loop_prologue);
5389
5390 bind(L_done);
5391 } // multiply_to_len
5392
5393 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
5394 #ifdef ASSERT
5395 Label ok;
5396 if (check_equal) {
5397 beq(CCR0, ok);
5398 } else {
5399 bne(CCR0, ok);
5400 }
5401 stop(msg, id);
5402 bind(ok);
5403 #endif
5404 }
5405
5406 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
5407 Register mem_base, const char* msg, int id) {
5408 #ifdef ASSERT
5409 switch (size) {
5410 case 4:
5411 lwz(R0, mem_offset, mem_base);
5412 cmpwi(CCR0, R0, 0);
5413 break;
5414 case 8:
5415 ld(R0, mem_offset, mem_base);
5416 cmpdi(CCR0, R0, 0);
5417 break;
5418 default:
5419 ShouldNotReachHere();
5420 }
5421 asm_assert(check_equal, msg, id);
5422 #endif // ASSERT
5423 }
5424
5425 void MacroAssembler::verify_thread() {
5426 if (VerifyThread) {
5427 unimplemented("'VerifyThread' currently not implemented on PPC");
5428 }
5429 }
5430
// READ: oop. KILL: R0. Possibly volatile float registers as well.
5432 void MacroAssembler::verify_oop(Register oop, const char* msg) {
5433 if (!VerifyOops) {
5434 return;
5435 }
5436
5437 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5438 const Register tmp = R11; // Will be preserved.
5439 const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5440 save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5441
5442 mr_if_needed(R4_ARG2, oop);
5443 save_LR_CR(tmp); // save in old frame
5444 push_frame_reg_args(nbytes_save, tmp);
5445 // load FunctionDescriptor** / entry_address *
5446 load_const_optimized(tmp, fd, R0);
5447 // load FunctionDescriptor* / entry_address
5448 ld(tmp, 0, tmp);
5449 load_const_optimized(R3_ARG1, (address)msg, R0);
5450 // Call destination for its side effect.
5451 call_c(tmp);
5452
5453 pop_frame();
5454 restore_LR_CR(tmp);
5455 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5456 }
5457
5458 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
5459 if (!VerifyOops) {
5460 return;
5461 }
5462
5463 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5464 const Register tmp = R11; // Will be preserved.
5465 const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5466 save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5467
5468 ld(R4_ARG2, offs, base);
5469 save_LR_CR(tmp); // save in old frame
5470 push_frame_reg_args(nbytes_save, tmp);
5471 // load FunctionDescriptor** / entry_address *
5472 load_const_optimized(tmp, fd, R0);
5473 // load FunctionDescriptor* / entry_address
5474 ld(tmp, 0, tmp);
5475 load_const_optimized(R3_ARG1, (address)msg, R0);
5476 // Call destination for its side effect.
5477 call_c(tmp);
5478
5479 pop_frame();
5480 restore_LR_CR(tmp);
5481 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5482 }
5483
5484 const char* stop_types[] = {
5485 "stop",
5486 "untested",
5487 "unimplemented",
5488 "shouldnotreachhere"
5489 };
5490
5491 static void stop_on_request(int tp, const char* msg) {
5492 tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
5493 guarantee(false, "PPC assembly code requires stop: %s", msg);
5494 }
5495
5496 // Call a C-function that prints output.
5497 void MacroAssembler::stop(int type, const char* msg, int id) {
5498 #ifndef PRODUCT
5499 block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
5500 #else
5501 block_comment("stop {");
5502 #endif
5503
5504 // setup arguments
5505 load_const_optimized(R3_ARG1, type);
5506 load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
5507 call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
5508 illtrap();
5509 emit_int32(id);
5510 block_comment("} stop;");
5511 }
5512
5513 #ifndef PRODUCT
// Write pattern 0x0101010101010101 to memory region [low - before*BytesPerWord, high + after*BytesPerWord].
5515 // Val, addr are temp registers.
5516 // If low == addr, addr is killed.
5517 // High is preserved.
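// Hypothetical usage sketch (register choices are illustrative only):
//   zap_from_to(R1_SP, 0, R21_sender_SP, 0, R11_scratch1, R12_scratch2);
// would fill the words from R1_SP up to R21_sender_SP with the 0x01 pattern
// when ZapMemory is enabled.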
5518 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
5519 if (!ZapMemory) return;
5520
5521 assert_different_registers(low, val);
5522
5523 BLOCK_COMMENT("zap memory region {");
5524 load_const_optimized(val, 0x0101010101010101);
5525 int size = before + after;
5526 if (low == high && size < 5 && size > 0) {
5527 int offset = -before*BytesPerWord;
5528 for (int i = 0; i < size; ++i) {
5529 std(val, offset, low);
5530 offset += (1*BytesPerWord);
5531 }
5532 } else {
5533 addi(addr, low, -before*BytesPerWord);
5534 assert_different_registers(high, val);
5535 if (after) addi(high, high, after * BytesPerWord);
5536 Label loop;
5537 bind(loop);
5538 std(val, 0, addr);
5539 addi(addr, addr, 8);
5540 cmpd(CCR6, addr, high);
5541 ble(CCR6, loop);
5542 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
5543 }
5544 BLOCK_COMMENT("} zap memory region");
5545 }
5546
5547 #endif // !PRODUCT
5548
5549 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
5550 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
5551 assert(sizeof(bool) == 1, "PowerPC ABI");
5552 masm->lbz(temp, simm16_offset, temp);
5553 masm->cmpwi(CCR0, temp, 0);
5554 masm->beq(CCR0, _label);
5555 }
5556
5557 SkipIfEqualZero::~SkipIfEqualZero() {
5558 _masm->bind(_label);
5559 }