1 /*
   2  * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "interpreter/interpreter.hpp"
  29 #include "nativeInst_ppc.hpp"
  30 #include "oops/instanceOop.hpp"
  31 #include "oops/method.hpp"
  32 #include "oops/objArrayKlass.hpp"
  33 #include "oops/oop.inline.hpp"
  34 #include "prims/methodHandles.hpp"
  35 #include "runtime/frame.inline.hpp"
  36 #include "runtime/handles.inline.hpp"
  37 #include "runtime/sharedRuntime.hpp"
  38 #include "runtime/stubCodeGenerator.hpp"
  39 #include "runtime/stubRoutines.hpp"
  40 #include "runtime/thread.inline.hpp"
  41 
  42 #define __ _masm->
  43 
  44 #ifdef PRODUCT
  45 #define BLOCK_COMMENT(str) // nothing
  46 #else
  47 #define BLOCK_COMMENT(str) __ block_comment(str)
  48 #endif
  49 
  50 #if defined(ABI_ELFv2)
  51 #define STUB_ENTRY(name) StubRoutines::name()
  52 #else
  53 #define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name())->entry()
  54 #endif
  55 
  56 class StubGenerator: public StubCodeGenerator {
  57  private:
  58 
  59   // Call stubs are used to call Java from C
  60   //
  61   // Arguments:
  62   //
  63   //   R3  - call wrapper address     : address
  64   //   R4  - result                   : intptr_t*
  65   //   R5  - result type              : BasicType
  66   //   R6  - method                   : Method
  67   //   R7  - frame mgr entry point    : address
  68   //   R8  - parameter block          : intptr_t*
  69   //   R9  - parameter count in words : int
  70   //   R10 - thread                   : Thread*
  71   //
  72   address generate_call_stub(address& return_address) {
  73     // Setup a new c frame, copy java arguments, call frame manager or
  74     // native_entry, and process result.
  75 
  76     StubCodeMark mark(this, "StubRoutines", "call_stub");
  77 
  78     address start = __ function_entry();
  79 
  80     // some sanity checks
  81     assert((sizeof(frame::abi_minframe) % 16) == 0,           "unaligned");
  82     assert((sizeof(frame::abi_reg_args) % 16) == 0,           "unaligned");
  83     assert((sizeof(frame::spill_nonvolatiles) % 16) == 0,     "unaligned");
  84     assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
  85     assert((sizeof(frame::entry_frame_locals) % 16) == 0,     "unaligned");
  86 
  87     Register r_arg_call_wrapper_addr        = R3;
  88     Register r_arg_result_addr              = R4;
  89     Register r_arg_result_type              = R5;
  90     Register r_arg_method                   = R6;
  91     Register r_arg_entry                    = R7;
  92     Register r_arg_thread                   = R10;
  93 
  94     Register r_temp                         = R24;
  95     Register r_top_of_arguments_addr        = R25;
  96     Register r_entryframe_fp                = R26;
  97 
  98     {
  99       // Stack on entry to call_stub:
 100       //
 101       //      F1      [C_FRAME]
 102       //              ...
 103 
 104       Register r_arg_argument_addr          = R8;
 105       Register r_arg_argument_count         = R9;
 106       Register r_frame_alignment_in_bytes   = R27;
 107       Register r_argument_addr              = R28;
 108       Register r_argumentcopy_addr          = R29;
 109       Register r_argument_size_in_bytes     = R30;
 110       Register r_frame_size                 = R23;
 111 
 112       Label arguments_copied;
 113 
 114       // Save LR/CR to caller's C_FRAME.
 115       __ save_LR_CR(R0);
 116 
 117       // Zero extend arg_argument_count.
 118       __ clrldi(r_arg_argument_count, r_arg_argument_count, 32);
 119 
 120       // Save non-volatiles GPRs to ENTRY_FRAME (not yet pushed, but it's safe).
 121       __ save_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
 122 
 123       // Keep copy of our frame pointer (caller's SP).
 124       __ mr(r_entryframe_fp, R1_SP);
 125 
 126       BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
 127       // Push ENTRY_FRAME including arguments:
 128       //
 129       //      F0      [TOP_IJAVA_FRAME_ABI]
 130       //              alignment (optional)
 131       //              [outgoing Java arguments]
 132       //              [ENTRY_FRAME_LOCALS]
 133       //      F1      [C_FRAME]
 134       //              ...
 135 
 136       // calculate frame size
 137 
 138       // unaligned size of arguments
 139       __ sldi(r_argument_size_in_bytes,
 140                   r_arg_argument_count, Interpreter::logStackElementSize);
 141       // arguments alignment (max 1 slot)
 142       // FIXME: use round_to() here
 143       __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1);
 144       __ sldi(r_frame_alignment_in_bytes,
 145               r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
 146 
 147       // size = unaligned size of arguments + top abi's size
 148       __ addi(r_frame_size, r_argument_size_in_bytes,
 149               frame::top_ijava_frame_abi_size);
 150       // size += arguments alignment
 151       __ add(r_frame_size,
 152              r_frame_size, r_frame_alignment_in_bytes);
 153       // size += size of call_stub locals
 154       __ addi(r_frame_size,
 155               r_frame_size, frame::entry_frame_locals_size);
 156 
 157       // push ENTRY_FRAME
 158       __ push_frame(r_frame_size, r_temp);
 159 
 160       // initialize call_stub locals (step 1)
 161       __ std(r_arg_call_wrapper_addr,
 162              _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
 163       __ std(r_arg_result_addr,
 164              _entry_frame_locals_neg(result_address), r_entryframe_fp);
 165       __ std(r_arg_result_type,
 166              _entry_frame_locals_neg(result_type), r_entryframe_fp);
 167       // we will save arguments_tos_address later
 168 
 169 
 170       BLOCK_COMMENT("Copy Java arguments");
 171       // copy Java arguments
 172 
 173       // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
 174       // FIXME: why not simply use SP+frame::top_ijava_frame_size?
 175       __ addi(r_top_of_arguments_addr,
 176               R1_SP, frame::top_ijava_frame_abi_size);
 177       __ add(r_top_of_arguments_addr,
 178              r_top_of_arguments_addr, r_frame_alignment_in_bytes);
 179 
 180       // any arguments to copy?
 181       __ cmpdi(CCR0, r_arg_argument_count, 0);
 182       __ beq(CCR0, arguments_copied);
 183 
 184       // prepare loop and copy arguments in reverse order
 185       {
 186         // init CTR with arg_argument_count
 187         __ mtctr(r_arg_argument_count);
 188 
 189         // let r_argumentcopy_addr point to last outgoing Java arguments P
 190         __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
 191 
 192         // let r_argument_addr point to last incoming java argument
 193         __ add(r_argument_addr,
 194                    r_arg_argument_addr, r_argument_size_in_bytes);
 195         __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 196 
 197         // now loop while CTR > 0 and copy arguments
 198         {
 199           Label next_argument;
 200           __ bind(next_argument);
 201 
 202           __ ld(r_temp, 0, r_argument_addr);
 203           // argument_addr--;
 204           __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 205           __ std(r_temp, 0, r_argumentcopy_addr);
 206           // argumentcopy_addr++;
 207           __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
 208 
 209           __ bdnz(next_argument);
 210         }
 211       }
 212 
 213       // Arguments copied, continue.
 214       __ bind(arguments_copied);
 215     }
 216 
 217     {
 218       BLOCK_COMMENT("Call frame manager or native entry.");
 219       // Call frame manager or native entry.
 220       Register r_new_arg_entry = R14;
 221       assert_different_registers(r_new_arg_entry, r_top_of_arguments_addr,
 222                                  r_arg_method, r_arg_thread);
 223 
 224       __ mr(r_new_arg_entry, r_arg_entry);
 225 
 226       // Register state on entry to frame manager / native entry:
 227       //
 228       //   tos         -  intptr_t*    sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
 229       //   R19_method  -  Method
 230       //   R16_thread  -  JavaThread*
 231 
 232       // Tos must point to last argument - element_size.
 233       const Register tos = R15_esp;
 234 
 235       __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
 236 
 237       // initialize call_stub locals (step 2)
 238       // now save tos as arguments_tos_address
 239       __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
 240 
 241       // load argument registers for call
 242       __ mr(R19_method, r_arg_method);
 243       __ mr(R16_thread, r_arg_thread);
 244       assert(tos != r_arg_method, "trashed r_arg_method");
 245       assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
 246 
 247       // Set R15_prev_state to 0 for simplifying checks in callee.
 248       __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R11_scratch1);
 249       // Stack on entry to frame manager / native entry:
 250       //
 251       //      F0      [TOP_IJAVA_FRAME_ABI]
 252       //              alignment (optional)
 253       //              [outgoing Java arguments]
 254       //              [ENTRY_FRAME_LOCALS]
 255       //      F1      [C_FRAME]
 256       //              ...
 257       //
 258 
 259       // global toc register
 260       __ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R11_scratch1);
 261       // Remember the senderSP so we interpreter can pop c2i arguments off of the stack
 262       // when called via a c2i.
 263 
 264       // Pass initial_caller_sp to framemanager.
 265       __ mr(R21_tmp1, R1_SP);
 266 
 267       // Zero the register that caches zero (optimization)
 268       __ li(R30_zero, 0);
 269 
 270       // Do a light-weight C-call here, r_new_arg_entry holds the address
 271       // of the interpreter entry point (frame manager or native entry)
 272       // and save runtime-value of LR in return_address.
 273       assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
 274              "trashed r_new_arg_entry");
 275       return_address = __ call_stub(r_new_arg_entry);
 276     }
 277 
 278     {
 279       BLOCK_COMMENT("Returned from frame manager or native entry.");
 280       // Returned from frame manager or native entry.
 281       // Now pop frame, process result, and return to caller.
 282 
 283       // Stack on exit from frame manager / native entry:
 284       //
 285       //      F0      [ABI]
 286       //              ...
 287       //              [ENTRY_FRAME_LOCALS]
 288       //      F1      [C_FRAME]
 289       //              ...
 290       //
 291       // Just pop the topmost frame ...
 292       //
 293 
 294       Label ret_is_object;
 295       Label ret_is_long;
 296       Label ret_is_float;
 297       Label ret_is_double;
 298 
 299       Register r_entryframe_fp = R30;
 300       Register r_lr            = R7_ARG5;
 301       Register r_cr            = R8_ARG6;
 302 
 303       // Reload some volatile registers which we've spilled before the call
 304       // to frame manager / native entry.
 305       // Access all locals via frame pointer, because we know nothing about
 306       // the topmost frame's size.
 307       __ ld(r_entryframe_fp, _abi(callers_sp), R1_SP);
 308       assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
 309       __ ld(r_arg_result_addr,
 310             _entry_frame_locals_neg(result_address), r_entryframe_fp);
 311       __ ld(r_arg_result_type,
 312             _entry_frame_locals_neg(result_type), r_entryframe_fp);
 313       __ ld(r_cr, _abi(cr), r_entryframe_fp);
 314       __ ld(r_lr, _abi(lr), r_entryframe_fp);
 315 
 316       // pop frame and restore non-volatiles, LR and CR
 317       __ mr(R1_SP, r_entryframe_fp);
 318       __ mtcr(r_cr);
 319       __ mtlr(r_lr);
 320 
 321       // Store result depending on type. Everything that is not
 322       // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
 323       __ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
 324       __ cmpwi(CCR1, r_arg_result_type, T_LONG);
 325       __ cmpwi(CCR5, r_arg_result_type, T_FLOAT);
 326       __ cmpwi(CCR6, r_arg_result_type, T_DOUBLE);
 327 
 328       // restore non-volatile registers
 329       __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
 330 
 331 
 332       // Stack on exit from call_stub:
 333       //
 334       //      0       [C_FRAME]
 335       //              ...
 336       //
 337       //  no call_stub frames left.
 338 
 339       // All non-volatiles have been restored at this point!!
 340       assert(R3_RET == R3, "R3_RET should be R3");
 341 
 342       __ beq(CCR0, ret_is_object);
 343       __ beq(CCR1, ret_is_long);
 344       __ beq(CCR5, ret_is_float);
 345       __ beq(CCR6, ret_is_double);
 346 
 347       // default:
 348       __ stw(R3_RET, 0, r_arg_result_addr);
 349       __ blr(); // return to caller
 350 
 351       // case T_OBJECT:
 352       __ bind(ret_is_object);
 353       __ std(R3_RET, 0, r_arg_result_addr);
 354       __ blr(); // return to caller
 355 
 356       // case T_LONG:
 357       __ bind(ret_is_long);
 358       __ std(R3_RET, 0, r_arg_result_addr);
 359       __ blr(); // return to caller
 360 
 361       // case T_FLOAT:
 362       __ bind(ret_is_float);
 363       __ stfs(F1_RET, 0, r_arg_result_addr);
 364       __ blr(); // return to caller
 365 
 366       // case T_DOUBLE:
 367       __ bind(ret_is_double);
 368       __ stfd(F1_RET, 0, r_arg_result_addr);
 369       __ blr(); // return to caller
 370     }
 371 
 372     return start;
 373   }
 374 
 375   // Return point for a Java call if there's an exception thrown in
 376   // Java code.  The exception is caught and transformed into a
 377   // pending exception stored in JavaThread that can be tested from
 378   // within the VM.
 379   //
 380   address generate_catch_exception() {
 381     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 382 
 383     address start = __ pc();
 384 
 385     // Registers alive
 386     //
 387     //  R16_thread
 388     //  R3_ARG1 - address of pending exception
 389     //  R4_ARG2 - return address in call stub
 390 
 391     const Register exception_file = R21_tmp1;
 392     const Register exception_line = R22_tmp2;
 393 
 394     __ load_const(exception_file, (void*)__FILE__);
 395     __ load_const(exception_line, (void*)__LINE__);
 396 
 397     __ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread);
 398     // store into `char *'
 399     __ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread);
 400     // store into `int'
 401     __ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread);
 402 
 403     // complete return to VM
 404     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
 405 
 406     __ mtlr(R4_ARG2);
 407     // continue in call stub
 408     __ blr();
 409 
 410     return start;
 411   }
 412 
 413   // Continuation point for runtime calls returning with a pending
 414   // exception.  The pending exception check happened in the runtime
 415   // or native call stub.  The pending exception in Thread is
 416   // converted into a Java-level exception.
 417   //
 418   // Read:
 419   //
 420   //   LR:     The pc the runtime library callee wants to return to.
 421   //           Since the exception occurred in the callee, the return pc
 422   //           from the point of view of Java is the exception pc.
 423   //   thread: Needed for method handles.
 424   //
 425   // Invalidate:
 426   //
 427   //   volatile registers (except below).
 428   //
 429   // Update:
 430   //
 431   //   R4_ARG2: exception
 432   //
 433   // (LR is unchanged and is live out).
 434   //
 435   address generate_forward_exception() {
 436     StubCodeMark mark(this, "StubRoutines", "forward_exception");
 437     address start = __ pc();
 438 
 439 #if !defined(PRODUCT)
 440     if (VerifyOops) {
 441       // Get pending exception oop.
 442       __ ld(R3_ARG1,
 443                 in_bytes(Thread::pending_exception_offset()),
 444                 R16_thread);
 445       // Make sure that this code is only executed if there is a pending exception.
 446       {
 447         Label L;
 448         __ cmpdi(CCR0, R3_ARG1, 0);
 449         __ bne(CCR0, L);
 450         __ stop("StubRoutines::forward exception: no pending exception (1)");
 451         __ bind(L);
 452       }
 453       __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
 454     }
 455 #endif
 456 
 457     // Save LR/CR and copy exception pc (LR) into R4_ARG2.
 458     __ save_LR_CR(R4_ARG2);
 459     __ push_frame_reg_args(0, R0);
 460     // Find exception handler.
 461     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 462                      SharedRuntime::exception_handler_for_return_address),
 463                     R16_thread,
 464                     R4_ARG2);
 465     // Copy handler's address.
 466     __ mtctr(R3_RET);
 467     __ pop_frame();
 468     __ restore_LR_CR(R0);
 469 
 470     // Set up the arguments for the exception handler:
 471     //  - R3_ARG1: exception oop
 472     //  - R4_ARG2: exception pc.
 473 
 474     // Load pending exception oop.
 475     __ ld(R3_ARG1,
 476               in_bytes(Thread::pending_exception_offset()),
 477               R16_thread);
 478 
 479     // The exception pc is the return address in the caller.
 480     // Must load it into R4_ARG2.
 481     __ mflr(R4_ARG2);
 482 
 483 #ifdef ASSERT
 484     // Make sure exception is set.
 485     {
 486       Label L;
 487       __ cmpdi(CCR0, R3_ARG1, 0);
 488       __ bne(CCR0, L);
 489       __ stop("StubRoutines::forward exception: no pending exception (2)");
 490       __ bind(L);
 491     }
 492 #endif
 493 
 494     // Clear the pending exception.
 495     __ li(R0, 0);
 496     __ std(R0,
 497                in_bytes(Thread::pending_exception_offset()),
 498                R16_thread);
 499     // Jump to exception handler.
 500     __ bctr();
 501 
 502     return start;
 503   }
 504 
 505 #undef __
 506 #define __ masm->
 507   // Continuation point for throwing of implicit exceptions that are
 508   // not handled in the current activation. Fabricates an exception
 509   // oop and initiates normal exception dispatching in this
 510   // frame. Only callee-saved registers are preserved (through the
 511   // normal register window / RegisterMap handling).  If the compiler
 512   // needs all registers to be preserved between the fault point and
 513   // the exception handler then it must assume responsibility for that
 514   // in AbstractCompiler::continuation_for_implicit_null_exception or
 515   // continuation_for_implicit_division_by_zero_exception. All other
 516   // implicit exceptions (e.g., NullPointerException or
 517   // AbstractMethodError on entry) are either at call sites or
 518   // otherwise assume that stack unwinding will be initiated, so
 519   // caller saved registers were assumed volatile in the compiler.
 520   //
 521   // Note that we generate only this stub into a RuntimeStub, because
 522   // it needs to be properly traversed and ignored during GC, so we
 523   // change the meaning of the "__" macro within this method.
 524   //
 525   // Note: the routine set_pc_not_at_call_for_caller in
 526   // SharedRuntime.cpp requires that this code be generated into a
 527   // RuntimeStub.
 528   address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
 529                                    Register arg1 = noreg, Register arg2 = noreg) {
 530     CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
 531     MacroAssembler* masm = new MacroAssembler(&code);
 532 
 533     OopMapSet* oop_maps  = new OopMapSet();
 534     int frame_size_in_bytes = frame::abi_reg_args_size;
 535     OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
 536 
 537     address start = __ pc();
 538 
 539     __ save_LR_CR(R11_scratch1);
 540 
 541     // Push a frame.
 542     __ push_frame_reg_args(0, R11_scratch1);
 543 
 544     address frame_complete_pc = __ pc();
 545 
 546     if (restore_saved_exception_pc) {
 547       __ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc", 74);
 548     }
 549 
 550     // Note that we always have a runtime stub frame on the top of
 551     // stack by this point. Remember the offset of the instruction
 552     // whose address will be moved to R11_scratch1.
 553     address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
 554 
 555     __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);
 556 
 557     __ mr(R3_ARG1, R16_thread);
 558     if (arg1 != noreg) {
 559       __ mr(R4_ARG2, arg1);
 560     }
 561     if (arg2 != noreg) {
 562       __ mr(R5_ARG3, arg2);
 563     }
 564 #if defined(ABI_ELFv2)
 565     __ call_c(runtime_entry, relocInfo::none);
 566 #else
 567     __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, runtime_entry), relocInfo::none);
 568 #endif
 569 
 570     // Set an oopmap for the call site.
 571     oop_maps->add_gc_map((int)(gc_map_pc - start), map);
 572 
 573     __ reset_last_Java_frame();
 574 
 575 #ifdef ASSERT
 576     // Make sure that this code is only executed if there is a pending
 577     // exception.
 578     {
 579       Label L;
 580       __ ld(R0,
 581                 in_bytes(Thread::pending_exception_offset()),
 582                 R16_thread);
 583       __ cmpdi(CCR0, R0, 0);
 584       __ bne(CCR0, L);
 585       __ stop("StubRoutines::throw_exception: no pending exception");
 586       __ bind(L);
 587     }
 588 #endif
 589 
 590     // Pop frame.
 591     __ pop_frame();
 592 
 593     __ restore_LR_CR(R11_scratch1);
 594 
 595     __ load_const(R11_scratch1, StubRoutines::forward_exception_entry());
 596     __ mtctr(R11_scratch1);
 597     __ bctr();
 598 
 599     // Create runtime stub with OopMap.
 600     RuntimeStub* stub =
 601       RuntimeStub::new_runtime_stub(name, &code,
 602                                     /*frame_complete=*/ (int)(frame_complete_pc - start),
 603                                     frame_size_in_bytes/wordSize,
 604                                     oop_maps,
 605                                     false);
 606     return stub->entry_point();
 607   }
 608 #undef __
 609 #define __ _masm->
 610 
 611   //  Generate G1 pre-write barrier for array.
 612   //
 613   //  Input:
 614   //     from     - register containing src address (only needed for spilling)
 615   //     to       - register containing starting address
 616   //     count    - register containing element count
 617   //     tmp      - scratch register
 618   //
 619   //  Kills:
 620   //     nothing
 621   //
 622   void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1,
 623                                        Register preserve1 = noreg, Register preserve2 = noreg) {
 624     BarrierSet* const bs = Universe::heap()->barrier_set();
 625     switch (bs->kind()) {
 626       case BarrierSet::G1SATBCTLogging:
 627         // With G1, don't generate the call if we statically know that the target in uninitialized
 628         if (!dest_uninitialized) {
 629           int spill_slots = 3;
 630           if (preserve1 != noreg) { spill_slots++; }
 631           if (preserve2 != noreg) { spill_slots++; }
 632           const int frame_size = align_size_up(frame::abi_reg_args_size + spill_slots * BytesPerWord, frame::alignment_in_bytes);
 633           Label filtered;
 634 
 635           // Is marking active?
 636           if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
 637             __ lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
 638           } else {
 639             guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
 640             __ lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
 641           }
 642           __ cmpdi(CCR0, Rtmp1, 0);
 643           __ beq(CCR0, filtered);
 644 
 645           __ save_LR_CR(R0);
 646           __ push_frame(frame_size, R0);
 647           int slot_nr = 0;
 648           __ std(from,  frame_size - (++slot_nr) * wordSize, R1_SP);
 649           __ std(to,    frame_size - (++slot_nr) * wordSize, R1_SP);
 650           __ std(count, frame_size - (++slot_nr) * wordSize, R1_SP);
 651           if (preserve1 != noreg) { __ std(preserve1, frame_size - (++slot_nr) * wordSize, R1_SP); }
 652           if (preserve2 != noreg) { __ std(preserve2, frame_size - (++slot_nr) * wordSize, R1_SP); }
 653 
 654           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count);
 655 
 656           slot_nr = 0;
 657           __ ld(from,  frame_size - (++slot_nr) * wordSize, R1_SP);
 658           __ ld(to,    frame_size - (++slot_nr) * wordSize, R1_SP);
 659           __ ld(count, frame_size - (++slot_nr) * wordSize, R1_SP);
 660           if (preserve1 != noreg) { __ ld(preserve1, frame_size - (++slot_nr) * wordSize, R1_SP); }
 661           if (preserve2 != noreg) { __ ld(preserve2, frame_size - (++slot_nr) * wordSize, R1_SP); }
 662           __ addi(R1_SP, R1_SP, frame_size); // pop_frame()
 663           __ restore_LR_CR(R0);
 664 
 665           __ bind(filtered);
 666         }
 667         break;
 668       case BarrierSet::CardTableForRS:
 669       case BarrierSet::CardTableExtension:
 670       case BarrierSet::ModRef:
 671         break;
 672       default:
 673         ShouldNotReachHere();
 674     }
 675   }
 676 
 677   //  Generate CMS/G1 post-write barrier for array.
 678   //
 679   //  Input:
 680   //     addr     - register containing starting address
 681   //     count    - register containing element count
 682   //     tmp      - scratch register
 683   //
 684   //  The input registers and R0 are overwritten.
 685   //
 686   void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp, Register preserve = noreg) {
 687     BarrierSet* const bs = Universe::heap()->barrier_set();
 688 
 689     switch (bs->kind()) {
 690       case BarrierSet::G1SATBCTLogging:
 691         {
 692           int spill_slots = (preserve != noreg) ? 1 : 0;
 693           const int frame_size = align_size_up(frame::abi_reg_args_size + spill_slots * BytesPerWord, frame::alignment_in_bytes);
 694 
 695           __ save_LR_CR(R0);
 696           __ push_frame(frame_size, R0);
 697           if (preserve != noreg) { __ std(preserve, frame_size - 1 * wordSize, R1_SP); }
 698           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
 699           if (preserve != noreg) { __ ld(preserve, frame_size - 1 * wordSize, R1_SP); }
 700           __ addi(R1_SP, R1_SP, frame_size); // pop_frame();
 701           __ restore_LR_CR(R0);
 702         }
 703         break;
 704       case BarrierSet::CardTableForRS:
 705       case BarrierSet::CardTableExtension:
 706         {
 707           Label Lskip_loop, Lstore_loop;
 708           if (UseConcMarkSweepGC) {
 709             // TODO PPC port: contribute optimization / requires shared changes
 710             __ release();
 711           }
 712 
 713           CardTableModRefBS* const ct = barrier_set_cast<CardTableModRefBS>(bs);
 714           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 715           assert_different_registers(addr, count, tmp);
 716 
 717           __ sldi(count, count, LogBytesPerHeapOop);
 718           __ addi(count, count, -BytesPerHeapOop);
 719           __ add(count, addr, count);
 720           // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
 721           __ srdi(addr, addr, CardTableModRefBS::card_shift);
 722           __ srdi(count, count, CardTableModRefBS::card_shift);
 723           __ subf(count, addr, count);
 724           assert_different_registers(R0, addr, count, tmp);
 725           __ load_const(tmp, (address)ct->byte_map_base);
 726           __ addic_(count, count, 1);
 727           __ beq(CCR0, Lskip_loop);
 728           __ li(R0, 0);
 729           __ mtctr(count);
 730           // Byte store loop
 731           __ bind(Lstore_loop);
 732           __ stbx(R0, tmp, addr);
 733           __ addi(addr, addr, 1);
 734           __ bdnz(Lstore_loop);
 735           __ bind(Lskip_loop);
 736         }
 737       break;
 738       case BarrierSet::ModRef:
 739         break;
 740       default:
 741         ShouldNotReachHere();
 742     }
 743   }
 744 
 745   // Support for void zero_words_aligned8(HeapWord* to, size_t count)
 746   //
 747   // Arguments:
 748   //   to:
 749   //   count:
 750   //
 751   // Destroys:
 752   //
 753   address generate_zero_words_aligned8() {
 754     StubCodeMark mark(this, "StubRoutines", "zero_words_aligned8");
 755 
 756     // Implemented as in ClearArray.
 757     address start = __ function_entry();
 758 
 759     Register base_ptr_reg   = R3_ARG1; // tohw (needs to be 8b aligned)
 760     Register cnt_dwords_reg = R4_ARG2; // count (in dwords)
 761     Register tmp1_reg       = R5_ARG3;
 762     Register tmp2_reg       = R6_ARG4;
 763     Register zero_reg       = R7_ARG5;
 764 
 765     // Procedure for large arrays (uses data cache block zero instruction).
 766     Label dwloop, fast, fastloop, restloop, lastdword, done;
 767     int cl_size = VM_Version::L1_data_cache_line_size();
 768     int cl_dwords = cl_size >> 3;
 769     int cl_dwordaddr_bits = exact_log2(cl_dwords);
 770     int min_dcbz = 2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
 771 
 772     // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
 773     __ dcbtst(base_ptr_reg);                    // Indicate write access to first cache line ...
 774     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if number of dwords is even.
 775     __ srdi_(tmp1_reg, cnt_dwords_reg, 1);      // number of double dwords
 776     __ load_const_optimized(zero_reg, 0L);      // Use as zero register.
 777 
 778     __ cmpdi(CCR1, tmp2_reg, 0);                // cnt_dwords even?
 779     __ beq(CCR0, lastdword);                    // size <= 1
 780     __ mtctr(tmp1_reg);                         // Speculatively preload counter for rest loop (>0).
 781     __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
 782     __ neg(tmp1_reg, base_ptr_reg);             // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
 783 
 784     __ blt(CCR0, restloop);                     // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
 785     __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
 786 
 787     __ beq(CCR0, fast);                         // already 128byte aligned
 788     __ mtctr(tmp1_reg);                         // Set ctr to hit 128byte boundary (0<ctr<cnt).
 789     __ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
 790 
 791     // Clear in first cache line dword-by-dword if not already 128byte aligned.
 792     __ bind(dwloop);
 793       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
 794       __ addi(base_ptr_reg, base_ptr_reg, 8);
 795     __ bdnz(dwloop);
 796 
 797     // clear 128byte blocks
 798     __ bind(fast);
 799     __ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
 800     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if rest even
 801 
 802     __ mtctr(tmp1_reg);                         // load counter
 803     __ cmpdi(CCR1, tmp2_reg, 0);                // rest even?
 804     __ rldicl_(tmp1_reg, cnt_dwords_reg, 63, 65-cl_dwordaddr_bits); // rest in double dwords
 805 
 806     __ bind(fastloop);
 807       __ dcbz(base_ptr_reg);                    // Clear 128byte aligned block.
 808       __ addi(base_ptr_reg, base_ptr_reg, cl_size);
 809     __ bdnz(fastloop);
 810 
 811     //__ dcbtst(base_ptr_reg);                  // Indicate write access to last cache line.
 812     __ beq(CCR0, lastdword);                    // rest<=1
 813     __ mtctr(tmp1_reg);                         // load counter
 814 
 815     // Clear rest.
 816     __ bind(restloop);
 817       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
 818       __ std(zero_reg, 8, base_ptr_reg);        // Clear 8byte aligned block.
 819       __ addi(base_ptr_reg, base_ptr_reg, 16);
 820     __ bdnz(restloop);
 821 
 822     __ bind(lastdword);
 823     __ beq(CCR1, done);
 824     __ std(zero_reg, 0, base_ptr_reg);
 825     __ bind(done);
 826     __ blr();                                   // return
 827 
 828     return start;
 829   }
 830 
 831 #if !defined(PRODUCT)
 832   // Wrapper which calls oopDesc::is_oop_or_null()
 833   // Only called by MacroAssembler::verify_oop
 834   static void verify_oop_helper(const char* message, oop o) {
 835     if (!o->is_oop_or_null()) {
 836       fatal("%s", message);
 837     }
 838     ++ StubRoutines::_verify_oop_count;
 839   }
 840 #endif
 841 
 842   // Return address of code to be called from code generated by
 843   // MacroAssembler::verify_oop.
 844   //
 845   // Don't generate, rather use C++ code.
 846   address generate_verify_oop() {
 847     // this is actually a `FunctionDescriptor*'.
 848     address start = 0;
 849 
 850 #if !defined(PRODUCT)
 851     start = CAST_FROM_FN_PTR(address, verify_oop_helper);
 852 #endif
 853 
 854     return start;
 855   }
 856 
 857   // Fairer handling of safepoints for native methods.
 858   //
 859   // Generate code which reads from the polling page. This special handling is needed as the
 860   // linux-ppc64 kernel before 2.6.6 doesn't set si_addr on some segfaults in 64bit mode
 861   // (cf. http://www.kernel.org/pub/linux/kernel/v2.6/ChangeLog-2.6.6), especially when we try
 862   // to read from the safepoint polling page.
 863   address generate_load_from_poll() {
 864     StubCodeMark mark(this, "StubRoutines", "generate_load_from_poll");
 865     address start = __ function_entry();
 866     __ unimplemented("StubRoutines::verify_oop", 95);  // TODO PPC port
 867     return start;
 868   }
 869 
 870   // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
 871   //
 872   // The code is implemented(ported from sparc) as we believe it benefits JVM98, however
 873   // tracing(-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
 874   //
 875   // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
 876   // for turning on loop predication optimization, and hence the behavior of "array range check"
 877   // and "loop invariant check" could be influenced, which potentially boosted JVM98.
 878   //
 879   // Generate stub for disjoint short fill. If "aligned" is true, the
 880   // "to" address is assumed to be heapword aligned.
 881   //
 882   // Arguments for generated stub:
 883   //   to:    R3_ARG1
 884   //   value: R4_ARG2
 885   //   count: R5_ARG3 treated as signed
 886   //
 887   address generate_fill(BasicType t, bool aligned, const char* name) {
 888     StubCodeMark mark(this, "StubRoutines", name);
 889     address start = __ function_entry();
 890 
 891     const Register to    = R3_ARG1;   // source array address
 892     const Register value = R4_ARG2;   // fill value
 893     const Register count = R5_ARG3;   // elements count
 894     const Register temp  = R6_ARG4;   // temp register
 895 
 896     //assert_clean_int(count, O3);    // Make sure 'count' is clean int.
 897 
 898     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
 899     Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
 900 
 901     int shift = -1;
 902     switch (t) {
 903        case T_BYTE:
 904         shift = 2;
 905         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 906         __ rldimi(value, value, 8, 48);     // 8 bit -> 16 bit
 907         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 908         __ blt(CCR0, L_fill_elements);
 909         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 910         break;
 911        case T_SHORT:
 912         shift = 1;
 913         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 914         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 915         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 916         __ blt(CCR0, L_fill_elements);
 917         break;
 918       case T_INT:
 919         shift = 0;
 920         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 921         __ blt(CCR0, L_fill_4_bytes);
 922         break;
 923       default: ShouldNotReachHere();
 924     }
 925 
 926     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
 927       // Align source address at 4 bytes address boundary.
 928       if (t == T_BYTE) {
 929         // One byte misalignment happens only for byte arrays.
 930         __ andi_(temp, to, 1);
 931         __ beq(CCR0, L_skip_align1);
 932         __ stb(value, 0, to);
 933         __ addi(to, to, 1);
 934         __ addi(count, count, -1);
 935         __ bind(L_skip_align1);
 936       }
 937       // Two bytes misalignment happens only for byte and short (char) arrays.
 938       __ andi_(temp, to, 2);
 939       __ beq(CCR0, L_skip_align2);
 940       __ sth(value, 0, to);
 941       __ addi(to, to, 2);
 942       __ addi(count, count, -(1 << (shift - 1)));
 943       __ bind(L_skip_align2);
 944     }
 945 
 946     if (!aligned) {
 947       // Align to 8 bytes, we know we are 4 byte aligned to start.
 948       __ andi_(temp, to, 7);
 949       __ beq(CCR0, L_fill_32_bytes);
 950       __ stw(value, 0, to);
 951       __ addi(to, to, 4);
 952       __ addi(count, count, -(1 << shift));
 953       __ bind(L_fill_32_bytes);
 954     }
 955 
 956     __ li(temp, 8<<shift);                  // Prepare for 32 byte loop.
 957     // Clone bytes int->long as above.
 958     __ rldimi(value, value, 32, 0);         // 32 bit -> 64 bit
 959 
 960     Label L_check_fill_8_bytes;
 961     // Fill 32-byte chunks.
 962     __ subf_(count, temp, count);
 963     __ blt(CCR0, L_check_fill_8_bytes);
 964 
 965     Label L_fill_32_bytes_loop;
 966     __ align(32);
 967     __ bind(L_fill_32_bytes_loop);
 968 
 969     __ std(value, 0, to);
 970     __ std(value, 8, to);
 971     __ subf_(count, temp, count);           // Update count.
 972     __ std(value, 16, to);
 973     __ std(value, 24, to);
 974 
 975     __ addi(to, to, 32);
 976     __ bge(CCR0, L_fill_32_bytes_loop);
 977 
 978     __ bind(L_check_fill_8_bytes);
 979     __ add_(count, temp, count);
 980     __ beq(CCR0, L_exit);
 981     __ addic_(count, count, -(2 << shift));
 982     __ blt(CCR0, L_fill_4_bytes);
 983 
 984     //
 985     // Length is too short, just fill 8 bytes at a time.
 986     //
 987     Label L_fill_8_bytes_loop;
 988     __ bind(L_fill_8_bytes_loop);
 989     __ std(value, 0, to);
 990     __ addic_(count, count, -(2 << shift));
 991     __ addi(to, to, 8);
 992     __ bge(CCR0, L_fill_8_bytes_loop);
 993 
 994     // Fill trailing 4 bytes.
 995     __ bind(L_fill_4_bytes);
 996     __ andi_(temp, count, 1<<shift);
 997     __ beq(CCR0, L_fill_2_bytes);
 998 
 999     __ stw(value, 0, to);
1000     if (t == T_BYTE || t == T_SHORT) {
1001       __ addi(to, to, 4);
1002       // Fill trailing 2 bytes.
1003       __ bind(L_fill_2_bytes);
1004       __ andi_(temp, count, 1<<(shift-1));
1005       __ beq(CCR0, L_fill_byte);
1006       __ sth(value, 0, to);
1007       if (t == T_BYTE) {
1008         __ addi(to, to, 2);
1009         // Fill trailing byte.
1010         __ bind(L_fill_byte);
1011         __ andi_(count, count, 1);
1012         __ beq(CCR0, L_exit);
1013         __ stb(value, 0, to);
1014       } else {
1015         __ bind(L_fill_byte);
1016       }
1017     } else {
1018       __ bind(L_fill_2_bytes);
1019     }
1020     __ bind(L_exit);
1021     __ blr();
1022 
1023     // Handle copies less than 8 bytes. Int is handled elsewhere.
1024     if (t == T_BYTE) {
1025       __ bind(L_fill_elements);
1026       Label L_fill_2, L_fill_4;
1027       __ andi_(temp, count, 1);
1028       __ beq(CCR0, L_fill_2);
1029       __ stb(value, 0, to);
1030       __ addi(to, to, 1);
1031       __ bind(L_fill_2);
1032       __ andi_(temp, count, 2);
1033       __ beq(CCR0, L_fill_4);
1034       __ stb(value, 0, to);
1035       __ stb(value, 0, to);
1036       __ addi(to, to, 2);
1037       __ bind(L_fill_4);
1038       __ andi_(temp, count, 4);
1039       __ beq(CCR0, L_exit);
1040       __ stb(value, 0, to);
1041       __ stb(value, 1, to);
1042       __ stb(value, 2, to);
1043       __ stb(value, 3, to);
1044       __ blr();
1045     }
1046 
1047     if (t == T_SHORT) {
1048       Label L_fill_2;
1049       __ bind(L_fill_elements);
1050       __ andi_(temp, count, 1);
1051       __ beq(CCR0, L_fill_2);
1052       __ sth(value, 0, to);
1053       __ addi(to, to, 2);
1054       __ bind(L_fill_2);
1055       __ andi_(temp, count, 2);
1056       __ beq(CCR0, L_exit);
1057       __ sth(value, 0, to);
1058       __ sth(value, 2, to);
1059       __ blr();
1060     }
1061     return start;
1062   }
1063 
1064   inline void assert_positive_int(Register count) {
1065 #ifdef ASSERT
1066     __ srdi_(R0, count, 31);
1067     __ asm_assert_eq("missing zero extend", 0xAFFE);
1068 #endif
1069   }
1070 
1071   // Generate overlap test for array copy stubs.
1072   //
1073   // Input:
1074   //   R3_ARG1    -  from
1075   //   R4_ARG2    -  to
1076   //   R5_ARG3    -  element count
1077   //
1078   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
1079     Register tmp1 = R6_ARG4;
1080     Register tmp2 = R7_ARG5;
1081 
1082     assert_positive_int(R5_ARG3);
1083 
1084     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
1085     __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
1086     __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
1087     __ cmpld(CCR1, tmp1, tmp2);
1088     __ crnand(CCR0, Assembler::less, CCR1, Assembler::less);
1089     // Overlaps if Src before dst and distance smaller than size.
1090     // Branch to forward copy routine otherwise (within range of 32kB).
1091     __ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CCR0, Assembler::less), no_overlap_target);
1092 
1093     // need to copy backwards
1094   }
1095 
1096   // The guideline in the implementations of generate_disjoint_xxx_copy
1097   // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
1098   // single instructions, but to avoid alignment interrupts (see subsequent
1099   // comment). Furthermore, we try to minimize misaligned access, even
1100   // though they cause no alignment interrupt.
1101   //
1102   // In Big-Endian mode, the PowerPC architecture requires implementations to
1103   // handle automatically misaligned integer halfword and word accesses,
1104   // word-aligned integer doubleword accesses, and word-aligned floating-point
1105   // accesses. Other accesses may or may not generate an Alignment interrupt
1106   // depending on the implementation.
1107   // Alignment interrupt handling may require on the order of hundreds of cycles,
1108   // so every effort should be made to avoid misaligned memory values.
1109   //
1110   //
1111   // Generate stub for disjoint byte copy.  If "aligned" is true, the
1112   // "from" and "to" addresses are assumed to be heapword aligned.
1113   //
1114   // Arguments for generated stub:
1115   //      from:  R3_ARG1
1116   //      to:    R4_ARG2
1117   //      count: R5_ARG3 treated as signed
1118   //
1119   address generate_disjoint_byte_copy(bool aligned, const char * name) {
1120     StubCodeMark mark(this, "StubRoutines", name);
1121     address start = __ function_entry();
1122     assert_positive_int(R5_ARG3);
1123 
1124     Register tmp1 = R6_ARG4;
1125     Register tmp2 = R7_ARG5;
1126     Register tmp3 = R8_ARG6;
1127     Register tmp4 = R9_ARG7;
1128 
1129     VectorSRegister tmp_vsr1  = VSR1;
1130     VectorSRegister tmp_vsr2  = VSR2;
1131 
1132     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
1133 
1134     // Don't try anything fancy if arrays don't have many elements.
1135     __ li(tmp3, 0);
1136     __ cmpwi(CCR0, R5_ARG3, 17);
1137     __ ble(CCR0, l_6); // copy 4 at a time
1138 
1139     if (!aligned) {
1140       __ xorr(tmp1, R3_ARG1, R4_ARG2);
1141       __ andi_(tmp1, tmp1, 3);
1142       __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
1143 
1144       // Copy elements if necessary to align to 4 bytes.
1145       __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
1146       __ andi_(tmp1, tmp1, 3);
1147       __ beq(CCR0, l_2);
1148 
1149       __ subf(R5_ARG3, tmp1, R5_ARG3);
1150       __ bind(l_9);
1151       __ lbz(tmp2, 0, R3_ARG1);
1152       __ addic_(tmp1, tmp1, -1);
1153       __ stb(tmp2, 0, R4_ARG2);
1154       __ addi(R3_ARG1, R3_ARG1, 1);
1155       __ addi(R4_ARG2, R4_ARG2, 1);
1156       __ bne(CCR0, l_9);
1157 
1158       __ bind(l_2);
1159     }
1160 
1161     // copy 8 elements at a time
1162     __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
1163     __ andi_(tmp1, tmp2, 7);
1164     __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
1165 
1166     // copy a 2-element word if necessary to align to 8 bytes
1167     __ andi_(R0, R3_ARG1, 7);
1168     __ beq(CCR0, l_7);
1169 
1170     __ lwzx(tmp2, R3_ARG1, tmp3);
1171     __ addi(R5_ARG3, R5_ARG3, -4);
1172     __ stwx(tmp2, R4_ARG2, tmp3);
1173     { // FasterArrayCopy
1174       __ addi(R3_ARG1, R3_ARG1, 4);
1175       __ addi(R4_ARG2, R4_ARG2, 4);
1176     }
1177     __ bind(l_7);
1178 
1179     { // FasterArrayCopy
1180       __ cmpwi(CCR0, R5_ARG3, 31);
1181       __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain
1182 
1183       __ srdi(tmp1, R5_ARG3, 5);
1184       __ andi_(R5_ARG3, R5_ARG3, 31);
1185       __ mtctr(tmp1);
1186 
1187      if (!VM_Version::has_vsx()) {
1188 
1189       __ bind(l_8);
1190       // Use unrolled version for mass copying (copy 32 elements a time)
1191       // Load feeding store gets zero latency on Power6, however not on Power5.
1192       // Therefore, the following sequence is made for the good of both.
1193       __ ld(tmp1, 0, R3_ARG1);
1194       __ ld(tmp2, 8, R3_ARG1);
1195       __ ld(tmp3, 16, R3_ARG1);
1196       __ ld(tmp4, 24, R3_ARG1);
1197       __ std(tmp1, 0, R4_ARG2);
1198       __ std(tmp2, 8, R4_ARG2);
1199       __ std(tmp3, 16, R4_ARG2);
1200       __ std(tmp4, 24, R4_ARG2);
1201       __ addi(R3_ARG1, R3_ARG1, 32);
1202       __ addi(R4_ARG2, R4_ARG2, 32);
1203       __ bdnz(l_8);
1204 
1205     } else { // Processor supports VSX, so use it to mass copy.
1206 
1207       // Prefetch the data into the L2 cache.
1208       __ dcbt(R3_ARG1, 0);
1209 
1210       // If supported set DSCR pre-fetch to deepest.
1211       if (VM_Version::has_mfdscr()) {
1212         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1213         __ mtdscr(tmp2);
1214       }
1215 
1216       __ li(tmp1, 16);
1217 
1218       // Backbranch target aligned to 32-byte. Not 16-byte align as
1219       // loop contains < 8 instructions that fit inside a single
1220       // i-cache sector.
1221       __ align(32);
1222 
1223       __ bind(l_10);
1224       // Use loop with VSX load/store instructions to
1225       // copy 32 elements a time.
1226       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1227       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1228       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1229       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1230       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
1231       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
1232       __ bdnz(l_10);                       // Dec CTR and loop if not zero.
1233 
1234       // Restore DSCR pre-fetch value.
1235       if (VM_Version::has_mfdscr()) {
1236         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1237         __ mtdscr(tmp2);
1238       }
1239 
1240     } // VSX
1241    } // FasterArrayCopy
1242 
1243     __ bind(l_6);
1244 
1245     // copy 4 elements at a time
1246     __ cmpwi(CCR0, R5_ARG3, 4);
1247     __ blt(CCR0, l_1);
1248     __ srdi(tmp1, R5_ARG3, 2);
1249     __ mtctr(tmp1); // is > 0
1250     __ andi_(R5_ARG3, R5_ARG3, 3);
1251 
1252     { // FasterArrayCopy
1253       __ addi(R3_ARG1, R3_ARG1, -4);
1254       __ addi(R4_ARG2, R4_ARG2, -4);
1255       __ bind(l_3);
1256       __ lwzu(tmp2, 4, R3_ARG1);
1257       __ stwu(tmp2, 4, R4_ARG2);
1258       __ bdnz(l_3);
1259       __ addi(R3_ARG1, R3_ARG1, 4);
1260       __ addi(R4_ARG2, R4_ARG2, 4);
1261     }
1262 
1263     // do single element copy
1264     __ bind(l_1);
1265     __ cmpwi(CCR0, R5_ARG3, 0);
1266     __ beq(CCR0, l_4);
1267 
1268     { // FasterArrayCopy
1269       __ mtctr(R5_ARG3);
1270       __ addi(R3_ARG1, R3_ARG1, -1);
1271       __ addi(R4_ARG2, R4_ARG2, -1);
1272 
1273       __ bind(l_5);
1274       __ lbzu(tmp2, 1, R3_ARG1);
1275       __ stbu(tmp2, 1, R4_ARG2);
1276       __ bdnz(l_5);
1277     }
1278 
1279     __ bind(l_4);
1280     __ li(R3_RET, 0); // return 0
1281     __ blr();
1282 
1283     return start;
1284   }
1285 
1286   // Generate stub for conjoint byte copy.  If "aligned" is true, the
1287   // "from" and "to" addresses are assumed to be heapword aligned.
1288   //
1289   // Arguments for generated stub:
1290   //      from:  R3_ARG1
1291   //      to:    R4_ARG2
1292   //      count: R5_ARG3 treated as signed
1293   //
1294   address generate_conjoint_byte_copy(bool aligned, const char * name) {
1295     StubCodeMark mark(this, "StubRoutines", name);
1296     address start = __ function_entry();
1297     assert_positive_int(R5_ARG3);
1298 
1299     Register tmp1 = R6_ARG4;
1300     Register tmp2 = R7_ARG5;
1301     Register tmp3 = R8_ARG6;
1302 
1303     address nooverlap_target = aligned ?
1304       STUB_ENTRY(arrayof_jbyte_disjoint_arraycopy) :
1305       STUB_ENTRY(jbyte_disjoint_arraycopy);
1306 
1307     array_overlap_test(nooverlap_target, 0);
1308     // Do reverse copy. We assume the case of actual overlap is rare enough
1309     // that we don't have to optimize it.
1310     Label l_1, l_2;
1311 
1312     __ b(l_2);
1313     __ bind(l_1);
1314     __ stbx(tmp1, R4_ARG2, R5_ARG3);
1315     __ bind(l_2);
1316     __ addic_(R5_ARG3, R5_ARG3, -1);
1317     __ lbzx(tmp1, R3_ARG1, R5_ARG3);
1318     __ bge(CCR0, l_1);
1319 
1320     __ li(R3_RET, 0); // return 0
1321     __ blr();
1322 
1323     return start;
1324   }
1325 
1326   // Generate stub for disjoint short copy.  If "aligned" is true, the
1327   // "from" and "to" addresses are assumed to be heapword aligned.
1328   //
1329   // Arguments for generated stub:
1330   //      from:  R3_ARG1
1331   //      to:    R4_ARG2
1332   //  elm.count: R5_ARG3 treated as signed
1333   //
1334   // Strategy for aligned==true:
1335   //
1336   //  If length <= 9:
1337   //     1. copy 2 elements at a time (l_6)
1338   //     2. copy last element if original element count was odd (l_1)
1339   //
1340   //  If length > 9:
1341   //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
1342   //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
1343   //     3. copy last element if one was left in step 2. (l_1)
1344   //
1345   //
1346   // Strategy for aligned==false:
1347   //
1348   //  If length <= 9: same as aligned==true case, but NOTE: load/stores
1349   //                  can be unaligned (see comment below)
1350   //
1351   //  If length > 9:
1352   //     1. continue with step 6. if the alignment of from and to mod 4
1353   //        is different.
1354   //     2. align from and to to 4 bytes by copying 1 element if necessary
1355   //     3. at l_2 from and to are 4 byte aligned; continue with
1356   //        5. if they cannot be aligned to 8 bytes because they have
1357   //        got different alignment mod 8.
1358   //     4. at this point we know that both, from and to, have the same
1359   //        alignment mod 8, now copy one element if necessary to get
1360   //        8 byte alignment of from and to.
1361   //     5. copy 4 elements at a time until less than 4 elements are
1362   //        left; depending on step 3. all load/stores are aligned or
1363   //        either all loads or all stores are unaligned.
1364   //     6. copy 2 elements at a time until less than 2 elements are
1365   //        left (l_6); arriving here from step 1., there is a chance
1366   //        that all accesses are unaligned.
1367   //     7. copy last element if one was left in step 6. (l_1)
1368   //
1369   //  There are unaligned data accesses using integer load/store
1370   //  instructions in this stub. POWER allows such accesses.
1371   //
1372   //  According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
1373   //  Chapter 2: Effect of Operand Placement on Performance) unaligned
1374   //  integer load/stores have good performance. Only unaligned
1375   //  floating point load/stores can have poor performance.
1376   //
1377   //  TODO:
1378   //
1379   //  1. check if aligning the backbranch target of loops is beneficial
1380   //
1381   address generate_disjoint_short_copy(bool aligned, const char * name) {
1382     StubCodeMark mark(this, "StubRoutines", name);
1383 
1384     Register tmp1 = R6_ARG4;
1385     Register tmp2 = R7_ARG5;
1386     Register tmp3 = R8_ARG6;
1387     Register tmp4 = R9_ARG7;
1388 
1389     VectorSRegister tmp_vsr1  = VSR1;
1390     VectorSRegister tmp_vsr2  = VSR2;
1391 
1392     address start = __ function_entry();
1393     assert_positive_int(R5_ARG3);
1394 
1395     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
1396 
1397     // don't try anything fancy if arrays don't have many elements
1398     __ li(tmp3, 0);
1399     __ cmpwi(CCR0, R5_ARG3, 9);
1400     __ ble(CCR0, l_6); // copy 2 at a time
1401 
1402     if (!aligned) {
1403       __ xorr(tmp1, R3_ARG1, R4_ARG2);
1404       __ andi_(tmp1, tmp1, 3);
1405       __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
1406 
1407       // At this point it is guaranteed that both, from and to have the same alignment mod 4.
1408 
1409       // Copy 1 element if necessary to align to 4 bytes.
1410       __ andi_(tmp1, R3_ARG1, 3);
1411       __ beq(CCR0, l_2);
1412 
1413       __ lhz(tmp2, 0, R3_ARG1);
1414       __ addi(R3_ARG1, R3_ARG1, 2);
1415       __ sth(tmp2, 0, R4_ARG2);
1416       __ addi(R4_ARG2, R4_ARG2, 2);
1417       __ addi(R5_ARG3, R5_ARG3, -1);
1418       __ bind(l_2);
1419 
1420       // At this point both from and to are at least 4 byte aligned.
1421 
1422       // Copy 4 elements at a time.
1423       // Align to 8 bytes, but only if both from and to have the same alignment mod 8.
1424       __ xorr(tmp2, R3_ARG1, R4_ARG2);
1425       __ andi_(tmp1, tmp2, 7);
1426       __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
1427 
1428       // Copy a 2-element word if necessary to align to 8 bytes.
1429       __ andi_(R0, R3_ARG1, 7);
1430       __ beq(CCR0, l_7);
1431 
1432       __ lwzx(tmp2, R3_ARG1, tmp3);
1433       __ addi(R5_ARG3, R5_ARG3, -2);
1434       __ stwx(tmp2, R4_ARG2, tmp3);
1435       { // FasterArrayCopy
1436         __ addi(R3_ARG1, R3_ARG1, 4);
1437         __ addi(R4_ARG2, R4_ARG2, 4);
1438       }
1439     }
1440 
1441     __ bind(l_7);
1442 
1443     // Copy 16 elements at a time; either the loads or the stores can
1444     // be unaligned if aligned == false.
1445 
1446     { // FasterArrayCopy
1447       __ cmpwi(CCR0, R5_ARG3, 15);
1448       __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
1449 
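           // Split the count: CTR gets count / 16 iterations of the main loop;
           // the low 4 bits stay in R5_ARG3 for the tail copies below.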
1450       __ srdi(tmp1, R5_ARG3, 4);
1451       __ andi_(R5_ARG3, R5_ARG3, 15);
1452       __ mtctr(tmp1);
1453 
1454       if (!VM_Version::has_vsx()) {
1455 
1456         __ bind(l_8);
1457         // Use unrolled version for mass copying (copy 16 elements at a time).
1458         // Load feeding store gets zero latency on Power6, but not on Power5.
1459         // Therefore, the following sequence is tuned to work well on both.
1460         __ ld(tmp1, 0, R3_ARG1);
1461         __ ld(tmp2, 8, R3_ARG1);
1462         __ ld(tmp3, 16, R3_ARG1);
1463         __ ld(tmp4, 24, R3_ARG1);
1464         __ std(tmp1, 0, R4_ARG2);
1465         __ std(tmp2, 8, R4_ARG2);
1466         __ std(tmp3, 16, R4_ARG2);
1467         __ std(tmp4, 24, R4_ARG2);
1468         __ addi(R3_ARG1, R3_ARG1, 32);
1469         __ addi(R4_ARG2, R4_ARG2, 32);
1470         __ bdnz(l_8);
1471 
1472       } else { // Processor supports VSX, so use it to mass copy.
1473 
1474         // Prefetch src data into L2 cache.
1475         __ dcbt(R3_ARG1, 0);
1476 
1477         // If supported set DSCR pre-fetch to deepest.
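             // (The low DSCR bits select the default prefetch depth; or-ing in 7
             // requests the deepest setting, an assumption based on the Power ISA
             // DSCR definition.)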
1478         if (VM_Version::has_mfdscr()) {
1479           __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1480           __ mtdscr(tmp2);
1481         }
1482         __ li(tmp1, 16);
1483 
1484         // Align the backbranch target to 32 bytes rather than 16: the loop
1485         // contains fewer than 8 instructions, so with 32-byte alignment it
1486         // fits entirely within a single i-cache sector.
1487         __ align(32);
1488 
1489         __ bind(l_9);
1490         // Use loop with VSX load/store instructions to
1491         // copy 16 elements at a time.
1492         __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
1493         __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
1494         __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
1495         __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
1496         __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
1497         __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32.
1498         __ bdnz(l_9);                        // Dec CTR and loop if not zero.
1499 
1500         // Restore DSCR pre-fetch value.
1501         if (VM_Version::has_mfdscr()) {
1502           __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1503           __ mtdscr(tmp2);
1504         }
1505 
1506       }
1507     } // FasterArrayCopy
1508     __ bind(l_6);
1509 
1510     // copy 2 elements at a time
1511     { // FasterArrayCopy
1512       __ cmpwi(CCR0, R5_ARG3, 2);
1513       __ blt(CCR0, l_1);
1514       __ srdi(tmp1, R5_ARG3, 1);
1515       __ andi_(R5_ARG3, R5_ARG3, 1);
1516 
1517       __ addi(R3_ARG1, R3_ARG1, -4);
1518       __ addi(R4_ARG2, R4_ARG2, -4);
1519       __ mtctr(tmp1);
1520 
1521       __ bind(l_3);
1522       __ lwzu(tmp2, 4, R3_ARG1);
1523       __ stwu(tmp2, 4, R4_ARG2);
1524       __ bdnz(l_3);
1525 
1526       __ addi(R3_ARG1, R3_ARG1, 4);
1527       __ addi(R4_ARG2, R4_ARG2, 4);
1528     }
1529 
1530     // do single element copy
1531     __ bind(l_1);
1532     __ cmpwi(CCR0, R5_ARG3, 0);
1533     __ beq(CCR0, l_4);
1534 
1535     { // FasterArrayCopy
1536       __ mtctr(R5_ARG3);
1537       __ addi(R3_ARG1, R3_ARG1, -2);
1538       __ addi(R4_ARG2, R4_ARG2, -2);
1539 
1540       __ bind(l_5);
1541       __ lhzu(tmp2, 2, R3_ARG1);
1542       __ sthu(tmp2, 2, R4_ARG2);
1543       __ bdnz(l_5);
1544     }
1545     __ bind(l_4);
1546     __ li(R3_RET, 0); // return 0
1547     __ blr();
1548 
1549     return start;
1550   }
1551 
1552   // Generate stub for conjoint short copy.  If "aligned" is true, the
1553   // "from" and "to" addresses are assumed to be heapword aligned.
1554   //
1555   // Arguments for generated stub:
1556   //      from:  R3_ARG1
1557   //      to:    R4_ARG2
1558   //      count: R5_ARG3 treated as signed
1559   //
1560   address generate_conjoint_short_copy(bool aligned, const char * name) {
1561     StubCodeMark mark(this, "StubRoutines", name);
1562     address start = __ function_entry();
1563     assert_positive_int(R5_ARG3);
1564 
1565     Register tmp1 = R6_ARG4;
1566     Register tmp2 = R7_ARG5;
1567     Register tmp3 = R8_ARG6;
1568 
1569     address nooverlap_target = aligned ?
1570       STUB_ENTRY(arrayof_jshort_disjoint_arraycopy) :
1571       STUB_ENTRY(jshort_disjoint_arraycopy);
1572 
1573     array_overlap_test(nooverlap_target, 1);
1574 
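         // Reverse copy, one element at a time: tmp1 starts at the byte size of
         // the array and is pre-decremented by 2 each pass (addic_ sets CR0), so
         // the loop exits once the offset goes negative. Copying from the last
         // element down to the first makes overlapping src/dst regions safe.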
1575     Label l_1, l_2;
1576     __ sldi(tmp1, R5_ARG3, 1);
1577     __ b(l_2);
1578     __ bind(l_1);
1579     __ sthx(tmp2, R4_ARG2, tmp1);
1580     __ bind(l_2);
1581     __ addic_(tmp1, tmp1, -2);
1582     __ lhzx(tmp2, R3_ARG1, tmp1);
1583     __ bge(CCR0, l_1);
1584 
1585     __ li(R3_RET, 0); // return 0
1586     __ blr();
1587 
1588     return start;
1589   }
1590 
1591   // Generate core code for disjoint int copy (and oop copy on 32-bit).  If "aligned"
1592   // is true, the "from" and "to" addresses are assumed to be heapword aligned.
1593   //
1594   // Arguments:
1595   //      from:  R3_ARG1
1596   //      to:    R4_ARG2
1597   //      count: R5_ARG3 treated as signed
1598   //
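       //
       // In outline (illustrative only): the core copies count / 8 iterations of
       // an 8-int (32-byte) main loop, then count % 8 single ints in the tail
       // loop at l_2. Arrays of at most 5 elements go straight to the tail, and
       // one leading int may be copied first so that from and to become 8-byte
       // aligned when their alignment mod 8 already matches.
       //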
1599   void generate_disjoint_int_copy_core(bool aligned) {
1600     Register tmp1 = R6_ARG4;
1601     Register tmp2 = R7_ARG5;
1602     Register tmp3 = R8_ARG6;
1603     Register tmp4 = R0;
1604 
1605     VectorSRegister tmp_vsr1  = VSR1;
1606     VectorSRegister tmp_vsr2  = VSR2;
1607 
1608     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1609 
1610     // for short arrays, just do single element copy
1611     __ li(tmp3, 0);
1612     __ cmpwi(CCR0, R5_ARG3, 5);
1613     __ ble(CCR0, l_2);
1614 
1615     if (!aligned) {
1616         // check if arrays have same alignment mod 8.
1617         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1618         __ andi_(R0, tmp1, 7);
1619         // Not the same alignment, but ld and std just need to be 4 byte aligned.
1620         __ bne(CCR0, l_4); // different alignment mod 8 -> skip the 8-byte alignment step
1621 
1622         // Copy 1 element if necessary to align from and to on an 8 byte boundary.
1623         __ andi_(R0, R3_ARG1, 7);
1624         __ beq(CCR0, l_4);
1625 
1626         __ lwzx(tmp2, R3_ARG1, tmp3);
1627         __ addi(R5_ARG3, R5_ARG3, -1);
1628         __ stwx(tmp2, R4_ARG2, tmp3);
1629         { // FasterArrayCopy
1630           __ addi(R3_ARG1, R3_ARG1, 4);
1631           __ addi(R4_ARG2, R4_ARG2, 4);
1632         }
1633         __ bind(l_4);
1634       }
1635 
1636     { // FasterArrayCopy
1637       __ cmpwi(CCR0, R5_ARG3, 7);
1638       __ ble(CCR0, l_2); // copy 1 at a time if less than 8 elements remain
1639 
1640       __ srdi(tmp1, R5_ARG3, 3);
1641       __ andi_(R5_ARG3, R5_ARG3, 7);
1642       __ mtctr(tmp1);
1643 
1644      if (!VM_Version::has_vsx()) {
1645 
1646       __ bind(l_6);
1647       // Use unrolled version for mass copying (copy 8 elements at a time).
1648       // Load feeding store gets zero latency on Power6, but not on Power5.
1649       // Therefore, the following sequence is tuned to work well on both.
1650       __ ld(tmp1, 0, R3_ARG1);
1651       __ ld(tmp2, 8, R3_ARG1);
1652       __ ld(tmp3, 16, R3_ARG1);
1653       __ ld(tmp4, 24, R3_ARG1);
1654       __ std(tmp1, 0, R4_ARG2);
1655       __ std(tmp2, 8, R4_ARG2);
1656       __ std(tmp3, 16, R4_ARG2);
1657       __ std(tmp4, 24, R4_ARG2);
1658       __ addi(R3_ARG1, R3_ARG1, 32);
1659       __ addi(R4_ARG2, R4_ARG2, 32);
1660       __ bdnz(l_6);
1661 
1662     } else { // Processor supports VSX, so use it to mass copy.
1663 
1664       // Prefetch the data into the L2 cache.
1665       __ dcbt(R3_ARG1, 0);
1666 
1667       // If supported set DSCR pre-fetch to deepest.
1668       if (VM_Version::has_mfdscr()) {
1669         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1670         __ mtdscr(tmp2);
1671       }
1672 
1673       __ li(tmp1, 16);
1674 
1675       // Align the backbranch target to 32 bytes rather than 16: the loop
1676       // contains fewer than 8 instructions, so with 32-byte alignment it
1677       // fits entirely within a single i-cache sector.
1678       __ align(32);
1679 
1680       __ bind(l_7);
1681       // Use loop with VSX load/store instructions to
1682       // copy 8 elements at a time.
1683       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1684       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1685       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1686       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1687       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
1688       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
1689       __ bdnz(l_7);                        // Dec CTR and loop if not zero.
1690 
1691       // Restore DSCR pre-fetch value.
1692       if (VM_Version::has_mfdscr()) {
1693         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1694         __ mtdscr(tmp2);
1695       }
1696 
1697     } // VSX
1698    } // FasterArrayCopy
1699 
1700     // copy 1 element at a time
1701     __ bind(l_2);
1702     __ cmpwi(CCR0, R5_ARG3, 0);
1703     __ beq(CCR0, l_1);
1704 
1705     { // FasterArrayCopy
1706       __ mtctr(R5_ARG3);
1707       __ addi(R3_ARG1, R3_ARG1, -4);
1708       __ addi(R4_ARG2, R4_ARG2, -4);
1709 
1710       __ bind(l_3);
1711       __ lwzu(tmp2, 4, R3_ARG1);
1712       __ stwu(tmp2, 4, R4_ARG2);
1713       __ bdnz(l_3);
1714     }
1715 
1716     __ bind(l_1);
1717     return;
1718   }
1719 
1720   // Generate stub for disjoint int copy.  If "aligned" is true, the
1721   // "from" and "to" addresses are assumed to be heapword aligned.
1722   //
1723   // Arguments for generated stub:
1724   //      from:  R3_ARG1
1725   //      to:    R4_ARG2
1726   //      count: R5_ARG3 treated as signed
1727   //
1728   address generate_disjoint_int_copy(bool aligned, const char * name) {
1729     StubCodeMark mark(this, "StubRoutines", name);
1730     address start = __ function_entry();
1731     assert_positive_int(R5_ARG3);
1732     generate_disjoint_int_copy_core(aligned);
1733     __ li(R3_RET, 0); // return 0
1734     __ blr();
1735     return start;
1736   }
1737 
1738   // Generate core code for conjoint int copy (and oop copy on
1739   // 32-bit).  If "aligned" is true, the "from" and "to" addresses
1740   // are assumed to be heapword aligned.
1741   //
1742   // Arguments:
1743   //      from:  R3_ARG1
1744   //      to:    R4_ARG2
1745   //      count: R5_ARG3 treated as signed
1746   //
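       //
       // In outline (illustrative only): the pointers are first advanced past
       // the last element and the copy runs backwards, 8 ints (32 bytes) per
       // main-loop iteration plus a single-int tail (after an optional
       // single-element step to reach 8-byte alignment), so overlapping regions
       // with dst > src are handled correctly.
       //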
1747   void generate_conjoint_int_copy_core(bool aligned) {
1748     // Do reverse copy.  We assume the case of actual overlap is rare enough
1749     // that we don't have to optimize it.
1750 
1751     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1752 
1753     Register tmp1 = R6_ARG4;
1754     Register tmp2 = R7_ARG5;
1755     Register tmp3 = R8_ARG6;
1756     Register tmp4 = R0;
1757 
1758     VectorSRegister tmp_vsr1  = VSR1;
1759     VectorSRegister tmp_vsr2  = VSR2;
1760 
1761     { // FasterArrayCopy
1762       __ cmpwi(CCR0, R5_ARG3, 0);
1763       __ beq(CCR0, l_6);
1764 
1765       __ sldi(R5_ARG3, R5_ARG3, 2);
1766       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1767       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1768       __ srdi(R5_ARG3, R5_ARG3, 2);
1769 
1770       if (!aligned) {
1771         // check if arrays have same alignment mod 8.
1772         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1773         __ andi_(R0, tmp1, 7);
1774         // Not the same alignment, but ld and std just need to be 4 byte aligned.
1775         __ bne(CCR0, l_7); // different alignment mod 8 -> skip the 8-byte alignment step
1776 
1777         // Copy 1 element if necessary to align from and to on an 8 byte boundary.
1778         __ andi_(R0, R3_ARG1, 7);
1779         __ beq(CCR0, l_7);
1780 
1781         __ addi(R3_ARG1, R3_ARG1, -4);
1782         __ addi(R4_ARG2, R4_ARG2, -4);
1783         __ addi(R5_ARG3, R5_ARG3, -1);
1784         __ lwzx(tmp2, R3_ARG1);
1785         __ stwx(tmp2, R4_ARG2);
1786         __ bind(l_7);
1787       }
1788 
1789       __ cmpwi(CCR0, R5_ARG3, 7);
1790       __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
1791 
1792       __ srdi(tmp1, R5_ARG3, 3);
1793       __ andi(R5_ARG3, R5_ARG3, 7);
1794       __ mtctr(tmp1);
1795 
1796      if (!VM_Version::has_vsx()) {
1797       __ bind(l_4);
1798       // Use unrolled version for mass copying (copy 8 elements at a time).
1799       // Load feeding store gets zero latency on Power6, but not on Power5.
1800       // Therefore, the following sequence is tuned to work well on both.
1801       __ addi(R3_ARG1, R3_ARG1, -32);
1802       __ addi(R4_ARG2, R4_ARG2, -32);
1803       __ ld(tmp4, 24, R3_ARG1);
1804       __ ld(tmp3, 16, R3_ARG1);
1805       __ ld(tmp2, 8, R3_ARG1);
1806       __ ld(tmp1, 0, R3_ARG1);
1807       __ std(tmp4, 24, R4_ARG2);
1808       __ std(tmp3, 16, R4_ARG2);
1809       __ std(tmp2, 8, R4_ARG2);
1810       __ std(tmp1, 0, R4_ARG2);
1811       __ bdnz(l_4);
1812      } else {  // Processor supports VSX, so use it to mass copy.
1813       // Prefetch the data into the L2 cache.
1814       __ dcbt(R3_ARG1, 0);
1815 
1816       // If supported set DSCR pre-fetch to deepest.
1817       if (VM_Version::has_mfdscr()) {
1818         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1819         __ mtdscr(tmp2);
1820       }
1821 
1822       __ li(tmp1, 16);
1823 
1824       // Align the backbranch target to 32 bytes rather than 16: the loop
1825       // contains fewer than 8 instructions, so with 32-byte alignment it
1826       // fits entirely within a single i-cache sector.
1827       __ align(32);
1828 
1829       __ bind(l_4);
1830       // Use loop with VSX load/store instructions to
1831       // copy 8 elements at a time.
1832       __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
1833       __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
1834       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
1835       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1836       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1837       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1838       __ bdnz(l_4);
1839 
1840       // Restore DSCR pre-fetch value.
1841       if (VM_Version::has_mfdscr()) {
1842         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1843         __ mtdscr(tmp2);
1844       }
1845      }
1846 
1847       __ cmpwi(CCR0, R5_ARG3, 0);
1848       __ beq(CCR0, l_6);
1849 
1850       __ bind(l_5);
1851       __ mtctr(R5_ARG3);
1852       __ bind(l_3);
1853       __ lwz(R0, -4, R3_ARG1);
1854       __ stw(R0, -4, R4_ARG2);
1855       __ addi(R3_ARG1, R3_ARG1, -4);
1856       __ addi(R4_ARG2, R4_ARG2, -4);
1857       __ bdnz(l_3);
1858 
1859       __ bind(l_6);
1860     }
1861   }
1862 
1863   // Generate stub for conjoint int copy.  If "aligned" is true, the
1864   // "from" and "to" addresses are assumed to be heapword aligned.
1865   //
1866   // Arguments for generated stub:
1867   //      from:  R3_ARG1
1868   //      to:    R4_ARG2
1869   //      count: R5_ARG3 treated as signed
1870   //
1871   address generate_conjoint_int_copy(bool aligned, const char * name) {
1872     StubCodeMark mark(this, "StubRoutines", name);
1873     address start = __ function_entry();
1874     assert_positive_int(R5_ARG3);
1875     address nooverlap_target = aligned ?
1876       STUB_ENTRY(arrayof_jint_disjoint_arraycopy) :
1877       STUB_ENTRY(jint_disjoint_arraycopy);
1878 
1879     array_overlap_test(nooverlap_target, 2);
1880 
1881     generate_conjoint_int_copy_core(aligned);
1882 
1883     __ li(R3_RET, 0); // return 0
1884     __ blr();
1885 
1886     return start;
1887   }
1888 
1889   // Generate core code for disjoint long copy (and oop copy on
1890   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
1891   // are assumed to be heapword aligned.
1892   //
1893   // Arguments:
1894   //      from:  R3_ARG1
1895   //      to:    R4_ARG2
1896   //      count: R5_ARG3 treated as signed
1897   //
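       //
       // In outline (illustrative only): count / 4 iterations of a 4-long
       // (32-byte) main loop, then count % 4 single longs in the tail loop at
       // l_3. No alignment pre-loop is needed since jlong elements are already
       // 8-byte aligned.
       //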
1898   void generate_disjoint_long_copy_core(bool aligned) {
1899     Register tmp1 = R6_ARG4;
1900     Register tmp2 = R7_ARG5;
1901     Register tmp3 = R8_ARG6;
1902     Register tmp4 = R0;
1903 
1904     Label l_1, l_2, l_3, l_4, l_5;
1905 
1906     VectorSRegister tmp_vsr1  = VSR1;
1907     VectorSRegister tmp_vsr2  = VSR2;
1908 
1909     { // FasterArrayCopy
1910       __ cmpwi(CCR0, R5_ARG3, 3);
1911       __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
1912 
1913       __ srdi(tmp1, R5_ARG3, 2);
1914       __ andi_(R5_ARG3, R5_ARG3, 3);
1915       __ mtctr(tmp1);
1916 
1917     if (!VM_Version::has_vsx()) {
1918       __ bind(l_4);
1919       // Use unrolled version for mass copying (copy 4 elements at a time).
1920       // Load feeding store gets zero latency on Power6, but not on Power5.
1921       // Therefore, the following sequence is tuned to work well on both.
1922       __ ld(tmp1, 0, R3_ARG1);
1923       __ ld(tmp2, 8, R3_ARG1);
1924       __ ld(tmp3, 16, R3_ARG1);
1925       __ ld(tmp4, 24, R3_ARG1);
1926       __ std(tmp1, 0, R4_ARG2);
1927       __ std(tmp2, 8, R4_ARG2);
1928       __ std(tmp3, 16, R4_ARG2);
1929       __ std(tmp4, 24, R4_ARG2);
1930       __ addi(R3_ARG1, R3_ARG1, 32);
1931       __ addi(R4_ARG2, R4_ARG2, 32);
1932       __ bdnz(l_4);
1933 
1934     } else { // Processor supports VSX, so use it to mass copy.
1935 
1936       // Prefetch the data into the L2 cache.
1937       __ dcbt(R3_ARG1, 0);
1938 
1939       // If supported set DSCR pre-fetch to deepest.
1940       if (VM_Version::has_mfdscr()) {
1941         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1942         __ mtdscr(tmp2);
1943       }
1944 
1945       __ li(tmp1, 16);
1946 
1947       // Align the backbranch target to 32 bytes rather than 16: the loop
1948       // contains fewer than 8 instructions, so with 32-byte alignment it
1949       // fits entirely within a single i-cache sector.
1950       __ align(32);
1951 
1952       __ bind(l_5);
1953       // Use loop with VSX load/store instructions to
1954       // copy 4 elements at a time.
1955       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1956       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1957       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1958       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1959       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
1960       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
1961       __ bdnz(l_5);                        // Dec CTR and loop if not zero.
1962 
1963       // Restore DSCR pre-fetch value.
1964       if (VM_Version::has_mfdscr()) {
1965         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1966         __ mtdscr(tmp2);
1967       }
1968 
1969     } // VSX
1970    } // FasterArrayCopy
1971 
1972     // copy 1 element at a time
1973     __ bind(l_3);
1974     __ cmpwi(CCR0, R5_ARG3, 0);
1975     __ beq(CCR0, l_1);
1976 
1977     { // FasterArrayCopy
1978       __ mtctr(R5_ARG3);
1979       __ addi(R3_ARG1, R3_ARG1, -8);
1980       __ addi(R4_ARG2, R4_ARG2, -8);
1981 
1982       __ bind(l_2);
1983       __ ldu(R0, 8, R3_ARG1);
1984       __ stdu(R0, 8, R4_ARG2);
1985       __ bdnz(l_2);
1986 
1987     }
1988     __ bind(l_1);
1989   }
1990 
1991   // Generate stub for disjoint long copy.  If "aligned" is true, the
1992   // "from" and "to" addresses are assumed to be heapword aligned.
1993   //
1994   // Arguments for generated stub:
1995   //      from:  R3_ARG1
1996   //      to:    R4_ARG2
1997   //      count: R5_ARG3 treated as signed
1998   //
1999   address generate_disjoint_long_copy(bool aligned, const char * name) {
2000     StubCodeMark mark(this, "StubRoutines", name);
2001     address start = __ function_entry();
2002     assert_positive_int(R5_ARG3);
2003     generate_disjoint_long_copy_core(aligned);
2004     __ li(R3_RET, 0); // return 0
2005     __ blr();
2006 
2007     return start;
2008   }
2009 
2010   // Generate core code for conjoint long copy (and oop copy on
2011   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
2012   // are assumed to be heapword aligned.
2013   //
2014   // Arguments:
2015   //      from:  R3_ARG1
2016   //      to:    R4_ARG2
2017   //      count: R5_ARG3 treated as signed
2018   //
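       //
       // In outline (illustrative only): as in the disjoint case, but the
       // pointers are first advanced past the last element and the copy runs
       // backwards (4 longs per main-loop iteration, then a single-long tail).
       //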
2019   void generate_conjoint_long_copy_core(bool aligned) {
2020     Register tmp1 = R6_ARG4;
2021     Register tmp2 = R7_ARG5;
2022     Register tmp3 = R8_ARG6;
2023     Register tmp4 = R0;
2024 
2025     VectorSRegister tmp_vsr1  = VSR1;
2026     VectorSRegister tmp_vsr2  = VSR2;
2027 
2028     Label l_1, l_2, l_3, l_4, l_5;
2029 
2030     __ cmpwi(CCR0, R5_ARG3, 0);
2031     __ beq(CCR0, l_1);
2032 
2033     { // FasterArrayCopy
2034       __ sldi(R5_ARG3, R5_ARG3, 3);
2035       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
2036       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
2037       __ srdi(R5_ARG3, R5_ARG3, 3);
2038 
2039       __ cmpwi(CCR0, R5_ARG3, 3);
2040       __ ble(CCR0, l_5); // copy 1 at a time if less than 4 elements remain
2041 
2042       __ srdi(tmp1, R5_ARG3, 2);
2043       __ andi(R5_ARG3, R5_ARG3, 3);
2044       __ mtctr(tmp1);
2045 
2046      if (!VM_Version::has_vsx()) {
2047       __ bind(l_4);
2048       // Use unrolled version for mass copying (copy 4 elements at a time).
2049       // Load feeding store gets zero latency on Power6, but not on Power5.
2050       // Therefore, the following sequence is tuned to work well on both.
2051       __ addi(R3_ARG1, R3_ARG1, -32);
2052       __ addi(R4_ARG2, R4_ARG2, -32);
2053       __ ld(tmp4, 24, R3_ARG1);
2054       __ ld(tmp3, 16, R3_ARG1);
2055       __ ld(tmp2, 8, R3_ARG1);
2056       __ ld(tmp1, 0, R3_ARG1);
2057       __ std(tmp4, 24, R4_ARG2);
2058       __ std(tmp3, 16, R4_ARG2);
2059       __ std(tmp2, 8, R4_ARG2);
2060       __ std(tmp1, 0, R4_ARG2);
2061       __ bdnz(l_4);
2062      } else { // Processor supports VSX, so use it to mass copy.
2063       // Prefetch the data into the L2 cache.
2064       __ dcbt(R3_ARG1, 0);
2065 
2066       // If supported set DSCR pre-fetch to deepest.
2067       if (VM_Version::has_mfdscr()) {
2068         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
2069         __ mtdscr(tmp2);
2070       }
2071 
2072       __ li(tmp1, 16);
2073 
2074       // Align the backbranch target to 32 bytes rather than 16: the loop
2075       // contains fewer than 8 instructions, so with 32-byte alignment it
2076       // fits entirely within a single i-cache sector.
2077       __ align(32);
2078 
2079       __ bind(l_4);
2080       // Use loop with VSX load/store instructions to
2081       // copy 4 elements at a time.
2082       __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
2083       __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
2084       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
2085       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
2086       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
2087       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
2088       __ bdnz(l_4);
2089 
2090       // Restore DSCR pre-fetch value.
2091       if (VM_Version::has_mfdscr()) {
2092         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
2093         __ mtdscr(tmp2);
2094       }
2095      }
2096 
2097       __ cmpwi(CCR0, R5_ARG3, 0);
2098       __ beq(CCR0, l_1);
2099 
2100       __ bind(l_5);
2101       __ mtctr(R5_ARG3);
2102       __ bind(l_3);
2103       __ ld(R0, -8, R3_ARG1);
2104       __ std(R0, -8, R4_ARG2);
2105       __ addi(R3_ARG1, R3_ARG1, -8);
2106       __ addi(R4_ARG2, R4_ARG2, -8);
2107       __ bdnz(l_3);
2108 
2109     }
2110     __ bind(l_1);
2111   }
2112 
2113   // Generate stub for conjoint long copy.  If "aligned" is true, the
2114   // "from" and "to" addresses are assumed to be heapword aligned.
2115   //
2116   // Arguments for generated stub:
2117   //      from:  R3_ARG1
2118   //      to:    R4_ARG2
2119   //      count: R5_ARG3 treated as signed
2120   //
2121   address generate_conjoint_long_copy(bool aligned, const char * name) {
2122     StubCodeMark mark(this, "StubRoutines", name);
2123     address start = __ function_entry();
2124     assert_positive_int(R5_ARG3);
2125     address nooverlap_target = aligned ?
2126       STUB_ENTRY(arrayof_jlong_disjoint_arraycopy) :
2127       STUB_ENTRY(jlong_disjoint_arraycopy);
2128 
2129     array_overlap_test(nooverlap_target, 3);
2130     generate_conjoint_long_copy_core(aligned);
2131 
2132     __ li(R3_RET, 0); // return 0
2133     __ blr();
2134 
2135     return start;
2136   }
2137 
2138   // Generate stub for conjoint oop copy.  If "aligned" is true, the
2139   // "from" and "to" addresses are assumed to be heapword aligned.
2140   //
2141   // Arguments for generated stub:
2142   //      from:  R3_ARG1
2143   //      to:    R4_ARG2
2144   //      count: R5_ARG3 treated as signed
2145   //      dest_uninitialized: G1 support
2146   //
2147   address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
2148     StubCodeMark mark(this, "StubRoutines", name);
2149 
2150     address start = __ function_entry();
2151     assert_positive_int(R5_ARG3);
2152     address nooverlap_target = aligned ?
2153       STUB_ENTRY(arrayof_oop_disjoint_arraycopy) :
2154       STUB_ENTRY(oop_disjoint_arraycopy);
2155 
2156     gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
2157 
2158     // Save arguments.
2159     __ mr(R9_ARG7, R4_ARG2);
2160     __ mr(R10_ARG8, R5_ARG3);
2161 
2162     if (UseCompressedOops) {
2163       array_overlap_test(nooverlap_target, 2);
2164       generate_conjoint_int_copy_core(aligned);
2165     } else {
2166       array_overlap_test(nooverlap_target, 3);
2167       generate_conjoint_long_copy_core(aligned);
2168     }
2169 
2170     gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1);
2171     __ li(R3_RET, 0); // return 0
2172     __ blr();
2173     return start;
2174   }
2175 
2176   // Generate stub for disjoint oop copy.  If "aligned" is true, the
2177   // "from" and "to" addresses are assumed to be heapword aligned.
2178   //
2179   // Arguments for generated stub:
2180   //      from:  R3_ARG1
2181   //      to:    R4_ARG2
2182   //      count: R5_ARG3 treated as signed
2183   //      dest_uninitialized: G1 support
2184   //
2185   address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
2186     StubCodeMark mark(this, "StubRoutines", name);
2187     address start = __ function_entry();
2188     assert_positive_int(R5_ARG3);
2189     gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
2190 
2191     // Save some arguments; the copy core destroys them.
2192     // They are needed for the post barrier.
2193     __ mr(R9_ARG7, R4_ARG2);
2194     __ mr(R10_ARG8, R5_ARG3);
2195 
2196     if (UseCompressedOops) {
2197       generate_disjoint_int_copy_core(aligned);
2198     } else {
2199       generate_disjoint_long_copy_core(aligned);
2200     }
2201 
2202     gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1);
2203     __ li(R3_RET, 0); // return 0
2204     __ blr();
2205 
2206     return start;
2207   }
2208 
2209 
2210   // Helper for generating a dynamic type check.
2211   // Smashes only the given temp registers.
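       // The check uses HotSpot's two-part subtype test: the fast path decides
       // the common cases directly via super_check_offset (branching to
       // L_success or L_miss), and only inconclusive cases fall into the slow
       // path, which scans the secondary supers list.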
2212   void generate_type_check(Register sub_klass,
2213                            Register super_check_offset,
2214                            Register super_klass,
2215                            Register temp,
2216                            Label& L_success) {
2217     assert_different_registers(sub_klass, super_check_offset, super_klass);
2218 
2219     BLOCK_COMMENT("type_check:");
2220 
2221     Label L_miss;
2222 
2223     __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, R0, &L_success, &L_miss, NULL,
2224                                      super_check_offset);
2225     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp, R0, &L_success, NULL);
2226 
2227     // Fall through on failure!
2228     __ bind(L_miss);
2229   }
2230 
2231 
2232   //  Generate stub for checked oop copy.
2233   //
2234   // Arguments for generated stub:
2235   //      from:  R3
2236   //      to:    R4
2237   //      count: R5 treated as signed
2238   //      ckoff: R6 (super_check_offset)
2239   //      ckval: R7 (super_klass)
2240   //      ret:   R3 zero for success; (-1^K) where K is partial transfer count
2241   //
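       //
       // In outline (illustrative only): each element is loaded (nulls are
       // stored directly via store_null), its klass is checked against the
       // destination element klass, and it is stored only if the check succeeds.
       // If a check fails, the stub stops early, emits card marks for the
       // elements already copied, and returns (-1 ^ K) where K is that partial
       // count.
       //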
2242   address generate_checkcast_copy(const char *name, bool dest_uninitialized) {
2243 
2244     const Register R3_from   = R3_ARG1;      // source array address
2245     const Register R4_to     = R4_ARG2;      // destination array address
2246     const Register R5_count  = R5_ARG3;      // elements count
2247     const Register R6_ckoff  = R6_ARG4;      // super_check_offset
2248     const Register R7_ckval  = R7_ARG5;      // super_klass
2249 
2250     const Register R8_offset = R8_ARG6;      // loop var, with stride wordSize
2251     const Register R9_remain = R9_ARG7;      // loop var, with stride -1
2252     const Register R10_oop   = R10_ARG8;     // actual oop copied
2253     const Register R11_klass = R11_scratch1; // oop._klass
2254     const Register R12_tmp   = R12_scratch2;
2255 
2256     const Register R2_minus1 = R2;
2257 
2258     //__ align(CodeEntryAlignment);
2259     StubCodeMark mark(this, "StubRoutines", name);
2260     address start = __ function_entry();
2261 
2262     // Assert that int is 64 bit sign extended and arrays are not conjoint.
2263 #ifdef ASSERT
2264     {
2265     assert_positive_int(R5_ARG3);
2266     const Register tmp1 = R11_scratch1, tmp2 = R12_scratch2;
2267     Label no_overlap;
2268     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
2269     __ sldi(tmp2, R5_ARG3, LogBytesPerHeapOop); // size in bytes
2270     __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
2271     __ cmpld(CCR1, tmp1, tmp2);
2272     __ crnand(CCR0, Assembler::less, CCR1, Assembler::less);
2273     // The arrays overlap iff src is before dst and the distance is smaller than the size.
2274     // Otherwise branch to no_overlap and skip the assertion failure.
2275     __ blt(CCR0, no_overlap);
2276     __ stop("overlap in checkcast_copy", 0x9543);
2277     __ bind(no_overlap);
2278     }
2279 #endif
2280 
2281     gen_write_ref_array_pre_barrier(R3_from, R4_to, R5_count, dest_uninitialized, R12_tmp, /* preserve: */ R6_ckoff, R7_ckval);
2282 
2283     //inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R12_tmp, R3_RET);
2284 
2285     Label load_element, store_element, store_null, success, do_card_marks;
2286     __ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it.
2287     __ li(R8_offset, 0);                   // Offset from start of arrays.
2288     __ li(R2_minus1, -1);
2289     __ bne(CCR0, load_element);
2290 
2291     // Empty array: Nothing to do.
2292     __ li(R3_RET, 0);           // Return 0 on (trivial) success.
2293     __ blr();
2294 
2295     // ======== begin loop ========
2296     // (Entry is load_element.)
2297     __ align(OptoLoopAlignment);
2298     __ bind(store_element);
2299     if (UseCompressedOops) {
2300       __ encode_heap_oop_not_null(R10_oop);
2301       __ bind(store_null);
2302       __ stw(R10_oop, R8_offset, R4_to);
2303     } else {
2304       __ bind(store_null);
2305       __ std(R10_oop, R8_offset, R4_to);
2306     }
2307 
2308     __ addi(R8_offset, R8_offset, heapOopSize);   // Step to next offset.
2309     __ add_(R9_remain, R2_minus1, R9_remain);     // Decrement the count.
2310     __ beq(CCR0, success);
2311 
2312     // ======== loop entry is here ========
2313     __ bind(load_element);
2314     __ load_heap_oop(R10_oop, R8_offset, R3_from, &store_null);  // Load the oop.
2315 
2316     __ load_klass(R11_klass, R10_oop); // Query the object klass.
2317 
2318     generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp,
2319                         // Branch to this on success:
2320                         store_element);
2321     // ======== end loop ========
2322 
2323     // It was a real error; we must depend on the caller to finish the job.
2324     // Register R9_remain has number of *remaining* oops, R5_count number of *total* oops.
2325     // Emit GC store barriers for the oops we have copied (R5_count minus R9_remain),
2326     // and report their number to the caller.
2327     __ subf_(R5_count, R9_remain, R5_count);
2328     __ nand(R3_RET, R5_count, R5_count);   // report (-1^K) to caller
2329     __ bne(CCR0, do_card_marks);
2330     __ blr();
2331 
2332     __ bind(success);
2333     __ li(R3_RET, 0);
2334 
2335     __ bind(do_card_marks);
2336     // Store check on R4_to[0..R5_count-1].
2337     gen_write_ref_array_post_barrier(R4_to, R5_count, R12_tmp, /* preserve: */ R3_RET);
2338     __ blr();
2339     return start;
2340   }
2341 
2342 
2343   //  Generate 'unsafe' array copy stub.
2344   //  Though just as safe as the other stubs, it takes an unscaled
2345   //  size_t argument instead of an element count.
2346   //
2347   // Arguments for generated stub:
2348   //      from:  R3
2349   //      to:    R4
2350   //      count: R5 byte count, treated as ssize_t, can be zero
2351   //
2352   // Examines the alignment of the operands and dispatches
2353   // to a long, int, short, or byte copy loop.
2354   //
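       // In outline (illustrative only), the dispatch below is equivalent to:
       //
       //   bits = from | to | count;
       //   if      ((bits & 7) == 0) { count >>= 3; goto long_copy;  }
       //   else if ((bits & 3) == 0) { count >>= 2; goto int_copy;   }
       //   else if ((bits & 1) == 0) { count >>= 1; goto short_copy; }
       //   else                      {              goto byte_copy;  }
       //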
2355   address generate_unsafe_copy(const char* name,
2356                                address byte_copy_entry,
2357                                address short_copy_entry,
2358                                address int_copy_entry,
2359                                address long_copy_entry) {
2360 
2361     const Register R3_from   = R3_ARG1;      // source array address
2362     const Register R4_to     = R4_ARG2;      // destination array address
2363     const Register R5_count  = R5_ARG3;      // elements count (as long on PPC64)
2364 
2365     const Register R6_bits   = R6_ARG4;      // test copy of low bits
2366     const Register R7_tmp    = R7_ARG5;
2367 
2368     //__ align(CodeEntryAlignment);
2369     StubCodeMark mark(this, "StubRoutines", name);
2370     address start = __ function_entry();
2371 
2372     // Bump this on entry, not on exit:
2373     //inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R6_bits, R7_tmp);
2374 
2375     Label short_copy, int_copy, long_copy;
2376 
2377     __ orr(R6_bits, R3_from, R4_to);
2378     __ orr(R6_bits, R6_bits, R5_count);
2379     __ andi_(R0, R6_bits, (BytesPerLong-1));
2380     __ beq(CCR0, long_copy);
2381 
2382     __ andi_(R0, R6_bits, (BytesPerInt-1));
2383     __ beq(CCR0, int_copy);
2384 
2385     __ andi_(R0, R6_bits, (BytesPerShort-1));
2386     __ beq(CCR0, short_copy);
2387 
2388     // byte_copy:
2389     __ b(byte_copy_entry);
2390 
2391     __ bind(short_copy);
2392     __ srwi(R5_count, R5_count, LogBytesPerShort);
2393     __ b(short_copy_entry);
2394 
2395     __ bind(int_copy);
2396     __ srwi(R5_count, R5_count, LogBytesPerInt);
2397     __ b(int_copy_entry);
2398 
2399     __ bind(long_copy);
2400     __ srwi(R5_count, R5_count, LogBytesPerLong);
2401     __ b(long_copy_entry);
2402 
2403     return start;
2404   }
2405 
2406 
2407   // Perform range checks on the proposed arraycopy.
2408   // Kills the two temps, but nothing else.
2409   // Also, clean the sign bits of src_pos and dst_pos.
2410   void arraycopy_range_checks(Register src,     // source array oop
2411                               Register src_pos, // source position
2412                               Register dst,     // destination array oop
2413                               Register dst_pos, // destination position
2414                               Register length,  // length of copy
2415                               Register temp1, Register temp2,
2416                               Label& L_failed) {
2417     BLOCK_COMMENT("arraycopy_range_checks:");
2418 
2419     const Register array_length = temp1;  // scratch
2420     const Register end_pos      = temp2;  // scratch
2421 
2422     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
2423     __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), src);
2424     __ add(end_pos, src_pos, length);  // src_pos + length
2425     __ cmpd(CCR0, end_pos, array_length);
2426     __ bgt(CCR0, L_failed);
2427 
2428     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2429     __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), dst);
2430     __ add(end_pos, dst_pos, length);  // dst_pos + length
2431     __ cmpd(CCR0, end_pos, array_length);
2432     __ bgt(CCR0, L_failed);
2433 
2434     BLOCK_COMMENT("arraycopy_range_checks done");
2435   }
2436 
2437 
2438   //
2439   //  Generate generic array copy stubs
2440   //
2441   //  Input:
2442   //    R3    -  src oop
2443   //    R4    -  src_pos
2444   //    R5    -  dst oop
2445   //    R6    -  dst_pos
2446   //    R7    -  element count
2447   //
2448   //  Output:
2449   //    R3 ==  0  -  success
2450   //    R3 == -1  -  need to call System.arraycopy
2451   //
2452   address generate_generic_copy(const char *name,
2453                                 address entry_jbyte_arraycopy,
2454                                 address entry_jshort_arraycopy,
2455                                 address entry_jint_arraycopy,
2456                                 address entry_oop_arraycopy,
2457                                 address entry_disjoint_oop_arraycopy,
2458                                 address entry_jlong_arraycopy,
2459                                 address entry_checkcast_arraycopy) {
2460     Label L_failed, L_objArray;
2461 
2462     // Input registers
2463     const Register src       = R3_ARG1;  // source array oop
2464     const Register src_pos   = R4_ARG2;  // source position
2465     const Register dst       = R5_ARG3;  // destination array oop
2466     const Register dst_pos   = R6_ARG4;  // destination position
2467     const Register length    = R7_ARG5;  // elements count
2468 
2469     // registers used as temp
2470     const Register src_klass = R8_ARG6;  // source array klass
2471     const Register dst_klass = R9_ARG7;  // destination array klass
2472     const Register lh        = R10_ARG8; // layout helper
2473     const Register temp      = R2;
2474 
2475     //__ align(CodeEntryAlignment);
2476     StubCodeMark mark(this, "StubRoutines", name);
2477     address start = __ function_entry();
2478 
2479     // Bump this on entry, not on exit:
2480     //inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp);
2481 
2482     // In principle, the int arguments could be dirty.
2483 
2484     //-----------------------------------------------------------------------
2485     // Assembler stubs will be used for this call to arraycopy
2486     // if the following conditions are met:
2487     //
2488     // (1) src and dst must not be null.
2489     // (2) src_pos must not be negative.
2490     // (3) dst_pos must not be negative.
2491     // (4) length  must not be negative.
2492     // (5) src klass and dst klass should be the same and not NULL.
2493     // (6) src and dst should be arrays.
2494     // (7) src_pos + length must not exceed length of src.
2495     // (8) dst_pos + length must not exceed length of dst.
2496     BLOCK_COMMENT("arraycopy initial argument checks");
2497 
2498     __ cmpdi(CCR1, src, 0);      // if (src == NULL) return -1;
2499     __ extsw_(src_pos, src_pos); // if (src_pos < 0) return -1;
2500     __ cmpdi(CCR5, dst, 0);      // if (dst == NULL) return -1;
2501     __ cror(CCR1, Assembler::equal, CCR0, Assembler::less);
2502     __ extsw_(dst_pos, dst_pos); // if (dst_pos < 0) return -1;
2503     __ cror(CCR5, Assembler::equal, CCR0, Assembler::less);
2504     __ extsw_(length, length);   // if (length < 0) return -1;
2505     __ cror(CCR1, Assembler::equal, CCR5, Assembler::equal);
2506     __ cror(CCR1, Assembler::equal, CCR0, Assembler::less);
2507     __ beq(CCR1, L_failed);
2508 
2509     BLOCK_COMMENT("arraycopy argument klass checks");
2510     __ load_klass(src_klass, src);
2511     __ load_klass(dst_klass, dst);
2512 
2513     // Load layout helper
2514     //
2515     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2516     // 32        30    24            16              8     2                 0
2517     //
2518     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2519     //
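         // For example (illustrative): a jint[] is encoded with array_tag 0x3,
         // element_type T_INT and log2_element_size 2, while an Object[] carries
         // array_tag 0x2. Non-array klasses have a value >= _lh_neutral_value,
         // which is why the signed compare against _lh_neutral_value below
         // identifies arrays.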
2520 
2521     int lh_offset = in_bytes(Klass::layout_helper_offset());
2522 
2523     // Load the 32-bit signed layout helper value.
2524     __ lwz(lh, lh_offset, src_klass);
2525 
2526     // Handle objArrays completely differently...
2527     jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2528     __ load_const_optimized(temp, objArray_lh, R0);
2529     __ cmpw(CCR0, lh, temp);
2530     __ beq(CCR0, L_objArray);
2531 
2532     __ cmpd(CCR5, src_klass, dst_klass);          // if (src->klass() != dst->klass()) return -1;
2533     __ cmpwi(CCR6, lh, Klass::_lh_neutral_value); // if (!src->is_Array()) return -1;
2534 
2535     __ crnand(CCR5, Assembler::equal, CCR6, Assembler::less);
2536     __ beq(CCR5, L_failed);
2537 
2538     // At this point, it is known to be a typeArray (array_tag 0x3).
2539 #ifdef ASSERT
2540     { Label L;
2541       jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2542       __ load_const_optimized(temp, lh_prim_tag_in_place, R0);
2543       __ cmpw(CCR0, lh, temp);
2544       __ bge(CCR0, L);
2545       __ stop("must be a primitive array");
2546       __ bind(L);
2547     }
2548 #endif
2549 
2550     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2551                            temp, dst_klass, L_failed);
2552 
2553     // TypeArrayKlass
2554     //
2555     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2556     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2557     //
2558 
2559     const Register offset = dst_klass;    // array offset
2560     const Register elsize = src_klass;    // log2 element size
2561 
2562     __ rldicl(offset, lh, 64 - Klass::_lh_header_size_shift, 64 - exact_log2(Klass::_lh_header_size_mask + 1));
2563     __ andi(elsize, lh, Klass::_lh_log2_element_size_mask);
2564     __ add(src, offset, src);       // src array offset
2565     __ add(dst, offset, dst);       // dst array offset
2566 
2567     // Next registers should be set before the jump to corresponding stub.
2568     const Register from     = R3_ARG1;  // source array address
2569     const Register to       = R4_ARG2;  // destination array address
2570     const Register count    = R5_ARG3;  // elements count
2571 
2572     // 'from', 'to', 'count' registers should be set in this order
2573     // since they are the same as 'src', 'src_pos', 'dst'.
2574 
2575     BLOCK_COMMENT("scale indexes to element size");
2576     __ sld(src_pos, src_pos, elsize);
2577     __ sld(dst_pos, dst_pos, elsize);
2578     __ add(from, src_pos, src);  // src_addr
2579     __ add(to, dst_pos, dst);    // dst_addr
2580     __ mr(count, length);        // length
2581 
2582     BLOCK_COMMENT("choose copy loop based on element size");
2583     // Using conditional branches with range 32kB.
2584     const int bo = Assembler::bcondCRbiIs1, bi = Assembler::bi0(CCR0, Assembler::equal);
2585     __ cmpwi(CCR0, elsize, 0);
2586     __ bc(bo, bi, entry_jbyte_arraycopy);
2587     __ cmpwi(CCR0, elsize, LogBytesPerShort);
2588     __ bc(bo, bi, entry_jshort_arraycopy);
2589     __ cmpwi(CCR0, elsize, LogBytesPerInt);
2590     __ bc(bo, bi, entry_jint_arraycopy);
2591 #ifdef ASSERT
2592     { Label L;
2593       __ cmpwi(CCR0, elsize, LogBytesPerLong);
2594       __ beq(CCR0, L);
2595       __ stop("must be long copy, but elsize is wrong");
2596       __ bind(L);
2597     }
2598 #endif
2599     __ b(entry_jlong_arraycopy);
2600 
2601     // ObjArrayKlass
2602   __ bind(L_objArray);
2603     // live at this point:  src_klass, dst_klass, src[_pos], dst[_pos], length
2604 
2605     Label L_disjoint_plain_copy, L_checkcast_copy;
2606     //  test array classes for subtyping
2607     __ cmpd(CCR0, src_klass, dst_klass);         // usual case is exact equality
2608     __ bne(CCR0, L_checkcast_copy);
2609 
2610     // Identically typed arrays can be copied without element-wise checks.
2611     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2612                            temp, lh, L_failed);
2613 
2614     __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2615     __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2616     __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2617     __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2618     __ add(from, src_pos, src);  // src_addr
2619     __ add(to, dst_pos, dst);    // dst_addr
2620     __ mr(count, length);        // length
2621     __ b(entry_oop_arraycopy);
2622 
2623   __ bind(L_checkcast_copy);
2624     // live at this point:  src_klass, dst_klass
2625     {
2626       // Before looking at dst.length, make sure dst is also an objArray.
2627       __ lwz(temp, lh_offset, dst_klass);
2628       __ cmpw(CCR0, lh, temp);
2629       __ bne(CCR0, L_failed);
2630 
2631       // It is safe to examine both src.length and dst.length.
2632       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2633                              temp, lh, L_failed);
2634 
2635       // Marshal the base address arguments now, freeing registers.
2636       __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2637       __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2638       __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2639       __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2640       __ add(from, src_pos, src);  // src_addr
2641       __ add(to, dst_pos, dst);    // dst_addr
2642       __ mr(count, length);        // length
2643 
2644       Register sco_temp = R6_ARG4;             // This register is free now.
2645       assert_different_registers(from, to, count, sco_temp,
2646                                  dst_klass, src_klass);
2647 
2648       // Generate the type check.
2649       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2650       __ lwz(sco_temp, sco_offset, dst_klass);
2651       generate_type_check(src_klass, sco_temp, dst_klass,
2652                           temp, L_disjoint_plain_copy);
2653 
2654       // Fetch destination element klass from the ObjArrayKlass header.
2655       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2656 
2657       // The checkcast_copy loop needs two extra arguments:
2658       __ ld(R7_ARG5, ek_offset, dst_klass);   // dest elem klass
2659       __ lwz(R6_ARG4, sco_offset, R7_ARG5);   // sco of elem klass
2660       __ b(entry_checkcast_arraycopy);
2661     }
2662 
2663     __ bind(L_disjoint_plain_copy);
2664     __ b(entry_disjoint_oop_arraycopy);
2665 
2666   __ bind(L_failed);
2667     __ li(R3_RET, -1); // return -1
2668     __ blr();
2669     return start;
2670   }
2671 
2672   // Arguments for generated stub (little endian only):
2673   //   R3_ARG1   - source byte array address
2674   //   R4_ARG2   - destination byte array address
2675   //   R5_ARG3   - round key array
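       //
       // In outline (illustrative only): the 16-byte block is XORed with the
       // first round key, passed through vcipher for each middle round and
       // finished with vcipherlast. keylen (44, 52 or 60 ints of expanded key)
       // corresponds to AES-128, AES-192 and AES-256, so the keylen checks below
       // skip the rounds that shorter keys do not have.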
2676   address generate_aescrypt_encryptBlock() {
2677     assert(UseAES, "need AES instruction support");
2678     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2679 
2680     address start = __ function_entry();
2681 
2682     Label L_doLast;
2683 
2684     Register from           = R3_ARG1;  // source array address
2685     Register to             = R4_ARG2;  // destination array address
2686     Register key            = R5_ARG3;  // round key array
2687 
2688     Register keylen         = R8;
2689     Register temp           = R9;
2690     Register keypos         = R10;
2691     Register hex            = R11;
2692     Register fifteen        = R12;
2693 
2694     VectorRegister vRet     = VR0;
2695 
2696     VectorRegister vKey1    = VR1;
2697     VectorRegister vKey2    = VR2;
2698     VectorRegister vKey3    = VR3;
2699     VectorRegister vKey4    = VR4;
2700 
2701     VectorRegister fromPerm = VR5;
2702     VectorRegister keyPerm  = VR6;
2703     VectorRegister toPerm   = VR7;
2704     VectorRegister fSplt    = VR8;
2705 
2706     VectorRegister vTmp1    = VR9;
2707     VectorRegister vTmp2    = VR10;
2708     VectorRegister vTmp3    = VR11;
2709     VectorRegister vTmp4    = VR12;
2710 
2711     VectorRegister vLow     = VR13;
2712     VectorRegister vHigh    = VR14;
2713 
2714     __ li              (hex, 16);
2715     __ li              (fifteen, 15);
2716     __ vspltisb        (fSplt, 0x0f);
2717 
2718     // load unaligned from[0-15] to vRet
2719     __ lvx             (vRet, from);
2720     __ lvx             (vTmp1, fifteen, from);
2721     __ lvsl            (fromPerm, from);
2722     __ vxor            (fromPerm, fromPerm, fSplt);
2723     __ vperm           (vRet, vRet, vTmp1, fromPerm);
2724 
2725     // load keylen (44 or 52 or 60)
2726     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2727 
2728     // Set up keyPerm, the permute vector used to load the round keys.
2729     __ lvsr            (keyPerm, key);
2730     __ vxor            (vTmp2, vTmp2, vTmp2);
2731     __ vspltisb        (vTmp2, -16);
2732     __ vrld            (keyPerm, keyPerm, vTmp2);
2733     __ vrld            (keyPerm, keyPerm, vTmp2);
2734     __ vsldoi          (keyPerm, keyPerm, keyPerm, -8);
2735 
2736     // load the 1st round key to vKey1
2737     __ li              (keypos, 0);
2738     __ lvx             (vKey1, keypos, key);
2739     __ addi            (keypos, keypos, 16);
2740     __ lvx             (vTmp1, keypos, key);
2741     __ vperm           (vKey1, vTmp1, vKey1, keyPerm);
2742 
2743     // 1st round
2744     __ vxor (vRet, vRet, vKey1);
2745 
2746     // load the 2nd round key to vKey1
2747     __ addi            (keypos, keypos, 16);
2748     __ lvx             (vTmp2, keypos, key);
2749     __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
2750 
2751     // load the 3rd round key to vKey2
2752     __ addi            (keypos, keypos, 16);
2753     __ lvx             (vTmp1, keypos, key);
2754     __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
2755 
2756     // load the 4th round key to vKey3
2757     __ addi            (keypos, keypos, 16);
2758     __ lvx             (vTmp2, keypos, key);
2759     __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
2760 
2761     // load the 5th round key to vKey4
2762     __ addi            (keypos, keypos, 16);
2763     __ lvx             (vTmp1, keypos, key);
2764     __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
2765 
2766     // 2nd - 5th rounds
2767     __ vcipher (vRet, vRet, vKey1);
2768     __ vcipher (vRet, vRet, vKey2);
2769     __ vcipher (vRet, vRet, vKey3);
2770     __ vcipher (vRet, vRet, vKey4);
2771 
2772     // load the 6th round key to vKey1
2773     __ addi            (keypos, keypos, 16);
2774     __ lvx             (vTmp2, keypos, key);
2775     __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
2776 
2777     // load the 7th round key to vKey2
2778     __ addi            (keypos, keypos, 16);
2779     __ lvx             (vTmp1, keypos, key);
2780     __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
2781 
2782     // load the 8th round key to vKey3
2783     __ addi            (keypos, keypos, 16);
2784     __ lvx             (vTmp2, keypos, key);
2785     __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
2786 
2787     // load the 9th round key to vKey4
2788     __ addi            (keypos, keypos, 16);
2789     __ lvx             (vTmp1, keypos, key);
2790     __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
2791 
2792     // 6th - 9th rounds
2793     __ vcipher (vRet, vRet, vKey1);
2794     __ vcipher (vRet, vRet, vKey2);
2795     __ vcipher (vRet, vRet, vKey3);
2796     __ vcipher (vRet, vRet, vKey4);
2797 
2798     // load the 10th round key to vKey1
2799     __ addi            (keypos, keypos, 16);
2800     __ lvx             (vTmp2, keypos, key);
2801     __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
2802 
2803     // load the 11th round key to vKey2
2804     __ addi            (keypos, keypos, 16);
2805     __ lvx             (vTmp1, keypos, key);
2806     __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
2807 
2808     // if all round keys are loaded, skip next 4 rounds
2809     __ cmpwi           (CCR0, keylen, 44);
2810     __ beq             (CCR0, L_doLast);
2811 
2812     // 10th - 11th rounds
2813     __ vcipher (vRet, vRet, vKey1);
2814     __ vcipher (vRet, vRet, vKey2);
2815 
2816     // load the 12th round key to vKey1
2817     __ addi            (keypos, keypos, 16);
2818     __ lvx             (vTmp2, keypos, key);
2819     __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
2820 
2821     // load the 13th round key to vKey2
2822     __ addi            (keypos, keypos, 16);
2823     __ lvx             (vTmp1, keypos, key);
2824     __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
2825 
2826     // if all round keys are loaded, skip next 2 rounds
2827     __ cmpwi           (CCR0, keylen, 52);
2828     __ beq             (CCR0, L_doLast);
2829 
2830     // 12th - 13th rounds
2831     __ vcipher (vRet, vRet, vKey1);
2832     __ vcipher (vRet, vRet, vKey2);
2833 
2834     // load the 14th round key to vKey1
2835     __ addi            (keypos, keypos, 16);
2836     __ lvx             (vTmp2, keypos, key);
2837     __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
2838 
2839     // load the 15th round key to vKey2
2840     __ addi            (keypos, keypos, 16);
2841     __ lvx             (vTmp1, keypos, key);
2842     __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
2843 
2844     __ bind(L_doLast);
2845 
2846     // last two rounds
2847     __ vcipher (vRet, vRet, vKey1);
2848     __ vcipherlast (vRet, vRet, vKey2);
2849 
2850     __ neg             (temp, to);
2851     __ lvsr            (toPerm, temp);
2852     __ vspltisb        (vTmp2, -1);
2853     __ vxor            (vTmp1, vTmp1, vTmp1);
2854     __ vperm           (vTmp2, vTmp2, vTmp1, toPerm);
2855     __ vxor            (toPerm, toPerm, fSplt);
2856     __ lvx             (vTmp1, to);
2857     __ vperm           (vRet, vRet, vRet, toPerm);
2858     __ vsel            (vTmp1, vTmp1, vRet, vTmp2);
2859     __ lvx             (vTmp4, fifteen, to);
2860     __ stvx            (vTmp1, to);
2861     __ vsel            (vRet, vRet, vTmp4, vTmp2);
2862     __ stvx            (vRet, fifteen, to);
2863 
2864     __ blr();
2865      return start;
2866   }
2867 
2868   // Arguments for generated stub (little endian only):
2869   //   R3_ARG1   - source byte array address
2870   //   R4_ARG2   - destination byte array address
2871   //   R5_ARG3   - K (key) in little endian int array
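       //
       // In outline (illustrative only): decryption walks the expanded key
       // schedule backwards, XORing with the last round key and then applying
       // vncipher rounds; the branches on keylen (44, 52, 60) select where in
       // the key array that backwards walk starts, since AES-128/192/256 differ
       // only in the number of rounds.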
2872   address generate_aescrypt_decryptBlock() {
2873     assert(UseAES, "need AES instruction support");
2874     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2875 
2876     address start = __ function_entry();
2877 
2878     Label L_doLast;
2879     Label L_do44;
2880     Label L_do52;
2881     Label L_do60;
2882 
2883     Register from           = R3_ARG1;  // source array address
2884     Register to             = R4_ARG2;  // destination array address
2885     Register key            = R5_ARG3;  // round key array
2886 
2887     Register keylen         = R8;
2888     Register temp           = R9;
2889     Register keypos         = R10;
2890     Register hex            = R11;
2891     Register fifteen        = R12;
2892 
2893     VectorRegister vRet     = VR0;
2894 
2895     VectorRegister vKey1    = VR1;
2896     VectorRegister vKey2    = VR2;
2897     VectorRegister vKey3    = VR3;
2898     VectorRegister vKey4    = VR4;
2899     VectorRegister vKey5    = VR5;
2900 
2901     VectorRegister fromPerm = VR6;
2902     VectorRegister keyPerm  = VR7;
2903     VectorRegister toPerm   = VR8;
2904     VectorRegister fSplt    = VR9;
2905 
2906     VectorRegister vTmp1    = VR10;
2907     VectorRegister vTmp2    = VR11;
2908     VectorRegister vTmp3    = VR12;
2909     VectorRegister vTmp4    = VR13;
2910 
2911     VectorRegister vLow     = VR14;
2912     VectorRegister vHigh    = VR15;
2913 
2914     __ li              (hex, 16);
2915     __ li              (fifteen, 15);
2916     __ vspltisb        (fSplt, 0x0f);
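         // (fSplt holds 0x0f in every byte. XORing a vperm control vector
         // with it mirrors the byte indices within each 16-byte vector,
         // which provides the little-endian byte swap noted below.)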
2917 
2918     // load unaligned from[0-15] to vRet
2919     __ lvx             (vRet, from);
2920     __ lvx             (vTmp1, fifteen, from);
2921     __ lvsl            (fromPerm, from);
2922     __ vxor            (fromPerm, fromPerm, fSplt);
2923     __ vperm           (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
2924 
2925     // load keylen (44 or 52 or 60)
2926     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
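         // keylen is the key int[] length, 4*(rounds+1): 44/52/60 for
         // AES-128/192/256. Decryption applies the round keys in reverse
         // order, so the code below starts at the last round key and walks
         // backwards through the schedule in 16-byte steps; each key is
         // re-aligned via keyPerm because the int[] data need not be
         // 16-byte aligned.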
2927 
2928     // to load keys
2929     __ lvsr            (keyPerm, key);
2930     __ vxor            (vTmp2, vTmp2, vTmp2);
2931     __ vspltisb        (vTmp2, -16);
2932     __ vrld            (keyPerm, keyPerm, vTmp2);
2933     __ vrld            (keyPerm, keyPerm, vTmp2);
2934     __ vsldoi          (keyPerm, keyPerm, keyPerm, -8);
2935 
2936     __ cmpwi           (CCR0, keylen, 44);
2937     __ beq             (CCR0, L_do44);
2938 
2939     __ cmpwi           (CCR0, keylen, 52);
2940     __ beq             (CCR0, L_do52);
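         // Fall through: keylen == 60, i.e. AES-256 with 14 rounds.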
2941 
2942     // load the 15th round key to vKey1
2943     __ li              (keypos, 240);
2944     __ lvx             (vTmp1, keypos, key);
2945     __ addi            (keypos, keypos, -16);
2946     __ lvx             (vTmp2, keypos, key);
2947     __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
2948 
2949     // load the 14th round key to vKey2
2950     __ addi            (keypos, keypos, -16);
2951     __ lvx             (vTmp1, keypos, key);
2952     __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
2953 
2954     // load the 13th round key to vKey3
2955     __ addi            (keypos, keypos, -16);
2956     __ lvx             (vTmp2, keypos, key);
2957     __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
2958 
2959     // load the 12th round key to vKey4
2960     __ addi            (keypos, keypos, -16);
2961     __ lvx             (vTmp1, keypos, key);
2962     __ vperm           (vKey4, vTmp2, vTmp1, keyPerm);
2963 
2964     // load the 11th round key to vKey5
2965     __ addi            (keypos, keypos, -16);
2966     __ lvx             (vTmp2, keypos, key);
2967     __ vperm           (vKey5, vTmp1, vTmp2, keyPerm);
2968 
2969     // 1st - 5th rounds
2970     __ vxor            (vRet, vRet, vKey1);
2971     __ vncipher        (vRet, vRet, vKey2);
2972     __ vncipher        (vRet, vRet, vKey3);
2973     __ vncipher        (vRet, vRet, vKey4);
2974     __ vncipher        (vRet, vRet, vKey5);
2975 
2976     __ b               (L_doLast);
2977 
2978     __ bind            (L_do52);
2979 
2980     // load the 13th round key to vKey1
2981     __ li              (keypos, 208);
2982     __ lvx             (vTmp1, keypos, key);
2983     __ addi            (keypos, keypos, -16);
2984     __ lvx             (vTmp2, keypos, key);
2985     __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
2986 
2987     // load the 12th round key to vKey2
2988     __ addi            (keypos, keypos, -16);
2989     __ lvx             (vTmp1, keypos, key);
2990     __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
2991 
2992     // load the 11th round key to vKey3
2993     __ addi            (keypos, keypos, -16);
2994     __ lvx             (vTmp2, keypos, key);
2995     __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
2996 
2997     // 1st - 3rd rounds
2998     __ vxor            (vRet, vRet, vKey1);
2999     __ vncipher        (vRet, vRet, vKey2);
3000     __ vncipher        (vRet, vRet, vKey3);
3001 
3002     __ b               (L_doLast);
3003 
3004     __ bind            (L_do44);
3005 
3006     // load the 11th round key to vKey1
3007     __ li              (keypos, 176);
3008     __ lvx             (vTmp1, keypos, key);
3009     __ addi            (keypos, keypos, -16);
3010     __ lvx             (vTmp2, keypos, key);
3011     __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
3012 
3013     // 1st round
3014     __ vxor            (vRet, vRet, vKey1);
3015 
3016     __ bind            (L_doLast);
3017 
3018     // load the 10th round key to vKey1
3019     __ addi            (keypos, keypos, -16);
3020     __ lvx             (vTmp1, keypos, key);
3021     __ vperm           (vKey1, vTmp2, vTmp1, keyPerm);
3022 
3023     // load the 9th round key to vKey2
3024     __ addi            (keypos, keypos, -16);
3025     __ lvx             (vTmp2, keypos, key);
3026     __ vperm           (vKey2, vTmp1, vTmp2, keyPerm);
3027 
3028     // load the 8th round key to vKey3
3029     __ addi            (keypos, keypos, -16);
3030     __ lvx             (vTmp1, keypos, key);
3031     __ vperm           (vKey3, vTmp2, vTmp1, keyPerm);
3032 
3033     // load the 7th round key to vKey4
3034     __ addi            (keypos, keypos, -16);
3035     __ lvx             (vTmp2, keypos, key);
3036     __ vperm           (vKey4, vTmp1, vTmp2, keyPerm);
3037 
3038     // load the 6th round key to vKey5
3039     __ addi            (keypos, keypos, -16);
3040     __ lvx             (vTmp1, keypos, key);
3041     __ vperm           (vKey5, vTmp2, vTmp1, keyPerm);
3042 
3043     // last 10th - 6th rounds
3044     __ vncipher        (vRet, vRet, vKey1);
3045     __ vncipher        (vRet, vRet, vKey2);
3046     __ vncipher        (vRet, vRet, vKey3);
3047     __ vncipher        (vRet, vRet, vKey4);
3048     __ vncipher        (vRet, vRet, vKey5);
3049 
3050     // load the 5th round key to vKey1
3051     __ addi            (keypos, keypos, -16);
3052     __ lvx             (vTmp2, keypos, key);
3053     __ vperm           (vKey1, vTmp1, vTmp2, keyPerm);
3054 
3055     // load the 4th round key to vKey2
3056     __ addi            (keypos, keypos, -16);
3057     __ lvx             (vTmp1, keypos, key);
3058     __ vperm           (vKey2, vTmp2, vTmp1, keyPerm);
3059 
3060     // load the 3rd round key to vKey3
3061     __ addi            (keypos, keypos, -16);
3062     __ lvx             (vTmp2, keypos, key);
3063     __ vperm           (vKey3, vTmp1, vTmp2, keyPerm);
3064 
3065     // load the 2nd round key to vKey4
3066     __ addi            (keypos, keypos, -16);
3067     __ lvx             (vTmp1, keypos, key);
3068     __ vperm           (vKey4, vTmp2, vTmp1, keyPerm);
3069 
3070     // load the 1st round key to vKey5
3071     __ addi            (keypos, keypos, -16);
3072     __ lvx             (vTmp2, keypos, key);
3073     __ vperm           (vKey5, vTmp1, vTmp2, keyPerm);
3074 
3075     // last 5th - 1st rounds
3076     __ vncipher        (vRet, vRet, vKey1);
3077     __ vncipher        (vRet, vRet, vKey2);
3078     __ vncipher        (vRet, vRet, vKey3);
3079     __ vncipher        (vRet, vRet, vKey4);
3080     __ vncipherlast    (vRet, vRet, vKey5);
3081 
3082     __ neg             (temp, to);
3083     __ lvsr            (toPerm, temp);
3084     __ vspltisb        (vTmp2, -1);
3085     __ vxor            (vTmp1, vTmp1, vTmp1);
3086     __ vperm           (vTmp2, vTmp2, vTmp1, toPerm);
3087     __ vxor            (toPerm, toPerm, fSplt);
3088     __ lvx             (vTmp1, to);
3089     __ vperm           (vRet, vRet, vRet, toPerm);
3090     __ vsel            (vTmp1, vTmp1, vRet, vTmp2);
3091     __ lvx             (vTmp4, fifteen, to);
3092     __ stvx            (vTmp1, to);
3093     __ vsel            (vRet, vRet, vTmp4, vTmp2);
3094     __ stvx            (vRet, fifteen, to);
3095 
3096     __ blr();
3097     return start;
3098   }
3099 
3100   void generate_arraycopy_stubs() {
3101     // Note: the disjoint stubs must be generated first, as some of
3102     // the conjoint stubs use them.
3103 
3104     // non-aligned disjoint versions
3105     StubRoutines::_jbyte_disjoint_arraycopy       = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
3106     StubRoutines::_jshort_disjoint_arraycopy      = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
3107     StubRoutines::_jint_disjoint_arraycopy        = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
3108     StubRoutines::_jlong_disjoint_arraycopy       = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
3109     StubRoutines::_oop_disjoint_arraycopy         = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy", false);
3110     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy_uninit", true);
3111 
3112     // aligned disjoint versions
3113     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
3114     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
3115     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
3116     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
3117     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy", false);
3118     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy_uninit", true);
3119 
3120     // non-aligned conjoint versions
3121     StubRoutines::_jbyte_arraycopy      = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
3122     StubRoutines::_jshort_arraycopy     = generate_conjoint_short_copy(false, "jshort_arraycopy");
3123     StubRoutines::_jint_arraycopy       = generate_conjoint_int_copy(false, "jint_arraycopy");
3124     StubRoutines::_jlong_arraycopy      = generate_conjoint_long_copy(false, "jlong_arraycopy");
3125     StubRoutines::_oop_arraycopy        = generate_conjoint_oop_copy(false, "oop_arraycopy", false);
3126     StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, "oop_arraycopy_uninit", true);
3127 
3128     // aligned conjoint versions
3129     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
3130     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
3131     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
3132     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
3133     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
3134     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy_uninit", true);
3135 
3136     // special/generic versions
3137     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", false);
3138     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", true);
3139 
3140     StubRoutines::_unsafe_arraycopy  = generate_unsafe_copy("unsafe_arraycopy",
3141                                                             STUB_ENTRY(jbyte_arraycopy),
3142                                                             STUB_ENTRY(jshort_arraycopy),
3143                                                             STUB_ENTRY(jint_arraycopy),
3144                                                             STUB_ENTRY(jlong_arraycopy));
3145     StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
3146                                                              STUB_ENTRY(jbyte_arraycopy),
3147                                                              STUB_ENTRY(jshort_arraycopy),
3148                                                              STUB_ENTRY(jint_arraycopy),
3149                                                              STUB_ENTRY(oop_arraycopy),
3150                                                              STUB_ENTRY(oop_disjoint_arraycopy),
3151                                                              STUB_ENTRY(jlong_arraycopy),
3152                                                              STUB_ENTRY(checkcast_arraycopy));
3153 
3154     // fill routines
3155     if (OptimizeFill) {
3156       StubRoutines::_jbyte_fill          = generate_fill(T_BYTE,  false, "jbyte_fill");
3157       StubRoutines::_jshort_fill         = generate_fill(T_SHORT, false, "jshort_fill");
3158       StubRoutines::_jint_fill           = generate_fill(T_INT,   false, "jint_fill");
3159       StubRoutines::_arrayof_jbyte_fill  = generate_fill(T_BYTE,  true, "arrayof_jbyte_fill");
3160       StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3161       StubRoutines::_arrayof_jint_fill   = generate_fill(T_INT,   true, "arrayof_jint_fill");
3162     }
3163   }
3164 
3165   // Safefetch stubs.
3166   void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
3167     // safefetch signatures:
3168     //   int      SafeFetch32(int*      adr, int      errValue);
3169     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3170     //
3171     // arguments:
3172     //   R3_ARG1 = adr
3173     //   R4_ARG2 = errValue
3174     //
3175     // result:
3176     //   R3_RET  = *adr or errValue
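         //
         // Illustrative use from VM code (a sketch; callers reach this stub
         // through the StubRoutines entry points):
         //   int v = SafeFetch32((int*) addr, 0xdeadbeef);
         //   if (v == 0xdeadbeef) { /* load faulted or read the error value */ }
         // A fault at *fault_pc is recognized by the signal handler and
         // execution resumes at *continuation_pc, so errValue (still in
         // R4_ARG2) is returned instead.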
3177 
3178     StubCodeMark mark(this, "StubRoutines", name);
3179 
3180     // Entry point, pc or function descriptor.
3181     *entry = __ function_entry();
3182 
3183     // Load *adr into R4_ARG2, may fault.
3184     *fault_pc = __ pc();
3185     switch (size) {
3186       case 4:
3187         // int32_t, sign-extended
3188         __ lwa(R4_ARG2, 0, R3_ARG1);
3189         break;
3190       case 8:
3191         // int64_t
3192         __ ld(R4_ARG2, 0, R3_ARG1);
3193         break;
3194       default:
3195         ShouldNotReachHere();
3196     }
3197 
3198     // return errValue or *adr
3199     *continuation_pc = __ pc();
3200     __ mr(R3_RET, R4_ARG2);
3201     __ blr();
3202   }
3203 
3204   // Stub for BigInteger::multiplyToLen()
3205   //
3206   //  Arguments:
3207   //
3208   //  Input:
3209   //    R3 - x address
3210   //    R4 - x length
3211   //    R5 - y address
3212   //    R6 - y length
3213   //    R7 - z address
3214   //    R8 - z length
3215   //
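       // (Sketch of the contract, matching BigInteger.multiplyToLen: x, y and
       //  z are arrays of 32-bit limbs and the stub computes z = x * y with
       //  zlen == xlen + ylen; the multiplication itself is emitted by
       //  MacroAssembler::multiply_to_len below.)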
3216   address generate_multiplyToLen() {
3217 
3218     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3219 
3220     address start = __ function_entry();
3221 
3222     const Register x     = R3;
3223     const Register xlen  = R4;
3224     const Register y     = R5;
3225     const Register ylen  = R6;
3226     const Register z     = R7;
3227     const Register zlen  = R8;
3228 
3229     const Register tmp1  = R2; // TOC not used.
3230     const Register tmp2  = R9;
3231     const Register tmp3  = R10;
3232     const Register tmp4  = R11;
3233     const Register tmp5  = R12;
3234 
3235     // non-volatile regs
3236     const Register tmp6  = R31;
3237     const Register tmp7  = R30;
3238     const Register tmp8  = R29;
3239     const Register tmp9  = R28;
3240     const Register tmp10 = R27;
3241     const Register tmp11 = R26;
3242     const Register tmp12 = R25;
3243     const Register tmp13 = R24;
3244 
3245     BLOCK_COMMENT("Entry:");
3246 
3247     // C2 does not respect int to long conversion for stub calls.
3248     __ clrldi(xlen, xlen, 32);
3249     __ clrldi(ylen, ylen, 32);
3250     __ clrldi(zlen, zlen, 32);
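         // (clrldi rX, rX, 32 clears the upper 32 bits, i.e. zero-extends
         //  the 32-bit length arguments passed by the compiled caller.)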
3251 
3252     // Save non-volatile regs (frameless).
3253     int current_offs = 8;
3254     __ std(R24, -current_offs, R1_SP); current_offs += 8;
3255     __ std(R25, -current_offs, R1_SP); current_offs += 8;
3256     __ std(R26, -current_offs, R1_SP); current_offs += 8;
3257     __ std(R27, -current_offs, R1_SP); current_offs += 8;
3258     __ std(R28, -current_offs, R1_SP); current_offs += 8;
3259     __ std(R29, -current_offs, R1_SP); current_offs += 8;
3260     __ std(R30, -current_offs, R1_SP); current_offs += 8;
3261     __ std(R31, -current_offs, R1_SP);
3262 
3263     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5,
3264                        tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13);
3265 
3266     // Restore non-volatile regs.
3267     current_offs = 8;
3268     __ ld(R24, -current_offs, R1_SP); current_offs += 8;
3269     __ ld(R25, -current_offs, R1_SP); current_offs += 8;
3270     __ ld(R26, -current_offs, R1_SP); current_offs += 8;
3271     __ ld(R27, -current_offs, R1_SP); current_offs += 8;
3272     __ ld(R28, -current_offs, R1_SP); current_offs += 8;
3273     __ ld(R29, -current_offs, R1_SP); current_offs += 8;
3274     __ ld(R30, -current_offs, R1_SP); current_offs += 8;
3275     __ ld(R31, -current_offs, R1_SP);
3276 
3277     __ blr();  // Return to caller.
3278 
3279     return start;
3280   }
3281 
3282   /**
3283    * Arguments:
3284    *
3285    * Inputs:
3286    *   R3_ARG1    - int   crc
3287    *   R4_ARG2    - byte* buf
3288    *   R5_ARG3    - int   length (of buffer)
3289    *
3290    * scratch:
3291    *   R2, R6-R12
3292    *
3293    * Output:
3294    *   R3_RET     - int   crc result
3295    */
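       // Reference semantics (a sketch of the table-driven core; any bit
       // inversion required by the caller happens outside it, and the stub
       // computes the same result with word- and vector-wide folding):
       //   uint32_t c = crc;
       //   for (int i = 0; i < length; i++)
       //     c = crc_table[(c ^ buf[i]) & 0xff] ^ (c >> 8);
       // Polynomial: reflected CRC-32 (0xEDB88320), as used by java.util.zip.CRC32.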
3296   // Compute CRC32 function.
3297   address generate_CRC32_updateBytes(const char* name) {
3298     __ align(CodeEntryAlignment);
3299     StubCodeMark mark(this, "StubRoutines", name);
3300     address start = __ function_entry();  // Remember stub start address (is rtn value).
3301 
3302     // arguments to kernel_crc32:
3303     const Register crc     = R3_ARG1;  // Current checksum, preset by caller or result from previous call.
3304     const Register data    = R4_ARG2;  // source byte array
3305     const Register dataLen = R5_ARG3;  // #bytes to process
3306 
3307     const Register table   = R6;       // crc table address
3308 
3309 #ifdef VM_LITTLE_ENDIAN
3310     if (VM_Version::has_vpmsumb()) {
3311       const Register constants    = R2;  // constants address
3312       const Register bconstants   = R8;  // Barrett constants table address
3313 
3314       const Register t0      = R9;
3315       const Register t1      = R10;
3316       const Register t2      = R11;
3317       const Register t3      = R12;
3318       const Register t4      = R7;
3319 
3320       BLOCK_COMMENT("Stub body {");
3321       assert_different_registers(crc, data, dataLen, table);
3322 
3323       StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
3324       StubRoutines::ppc64::generate_load_crc_constants_addr(_masm, constants);
3325       StubRoutines::ppc64::generate_load_crc_barret_constants_addr(_masm, bconstants);
3326 
3327       __ kernel_crc32_1word_vpmsumd(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4);
3328 
3329       BLOCK_COMMENT("return");
3330       __ mr_if_needed(R3_RET, crc);      // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
3331       __ blr();
3332 
3333       BLOCK_COMMENT("} Stub body");
3334     } else
3335 #endif
3336     {
3337       const Register t0      = R2;
3338       const Register t1      = R7;
3339       const Register t2      = R8;
3340       const Register t3      = R9;
3341       const Register tc0     = R10;
3342       const Register tc1     = R11;
3343       const Register tc2     = R12;
3344 
3345       BLOCK_COMMENT("Stub body {");
3346       assert_different_registers(crc, data, dataLen, table);
3347 
3348       StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
3349 
3350       __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
3351 
3352       BLOCK_COMMENT("return");
3353       __ mr_if_needed(R3_RET, crc);      // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
3354       __ blr();
3355 
3356       BLOCK_COMMENT("} Stub body");
3357     }
3358 
3359     return start;
3360   }
3361 
3362   // Initialization
3363   void generate_initial() {
3364     // Generates the initial stubs and initializes the entry points
3365 
3366     // Entry points that exist in all platforms.
3367     // Entry points that exist on all platforms.
3368     // Note: This code could be shared among different platforms, but the
3369     // benefit seems smaller than the cost of having a much more
3370     // complicated generator structure. See also the comment in
3371     // stubRoutines.hpp.
3372     StubRoutines::_forward_exception_entry          = generate_forward_exception();
3373     StubRoutines::_call_stub_entry                  = generate_call_stub(StubRoutines::_call_stub_return_address);
3374     StubRoutines::_catch_exception_entry            = generate_catch_exception();
3375 
3376     // Build this early so it's available for the interpreter.
3377     StubRoutines::_throw_StackOverflowError_entry   =
3378       generate_throw_exception("StackOverflowError throw_exception",
3379                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
3380     StubRoutines::_throw_delayed_StackOverflowError_entry =
3381       generate_throw_exception("delayed StackOverflowError throw_exception",
3382                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError), false);
3383 
3384     // CRC32 Intrinsics.
3385     if (UseCRC32Intrinsics) {
3386       StubRoutines::_crc_table_adr    = (address)StubRoutines::ppc64::_crc_table;
3387       StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes("CRC32_updateBytes");
3388     }
3389   }
3390 
3391   void generate_all() {
3392     // Generates all stubs and initializes the entry points
3393 
3394     // These entry points require SharedInfo::stack0 to be set up in
3395     // non-core builds
3396     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
3397     // Handle IncompatibleClassChangeError in itable stubs.
3398     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
3399     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
3400 
3401     // support for verify_oop (must happen after universe_init)
3402     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop();
3403 
3404     // arraycopy stubs used by compilers
3405     generate_arraycopy_stubs();
3406 
3407     // Safefetch stubs.
3408     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
3409                                                        &StubRoutines::_safefetch32_fault_pc,
3410                                                        &StubRoutines::_safefetch32_continuation_pc);
3411     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
3412                                                        &StubRoutines::_safefetchN_fault_pc,
3413                                                        &StubRoutines::_safefetchN_continuation_pc);
3414 
3415 #ifdef COMPILER2
3416     if (UseMultiplyToLenIntrinsic) {
3417       StubRoutines::_multiplyToLen = generate_multiplyToLen();
3418     }
3419 #endif
3420 
3421     if (UseMontgomeryMultiplyIntrinsic) {
3422       StubRoutines::_montgomeryMultiply
3423         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
3424     }
3425     if (UseMontgomerySquareIntrinsic) {
3426       StubRoutines::_montgomerySquare
3427         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
3428     }
3429 
3430     if (UseAESIntrinsics) {
3431       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
3432       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
3433     }
3434 
3435   }
3436 
3437  public:
3438   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3439     // replace the standard masm with a special one:
3440     _masm = new MacroAssembler(code);
3441     if (all) {
3442       generate_all();
3443     } else {
3444       generate_initial();
3445     }
3446   }
3447 };
3448 
3449 void StubGenerator_generate(CodeBuffer* code, bool all) {
3450   StubGenerator g(code, all);
3451 }