| # Copyright (C) 2023-2025 Apple Inc. All rights reserved. |
| # |
| # Redistribution and use in source and binary forms, with or without |
| # modification, are permitted provided that the following conditions |
| # are met: |
| # 1. Redistributions of source code must retain the above copyright |
| # notice, this list of conditions and the following disclaimer. |
| # 2. Redistributions in binary form must reproduce the above copyright |
| # notice, this list of conditions and the following disclaimer in the |
| # documentation and/or other materials provided with the distribution. |
| # |
| # THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' |
| # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, |
| # THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS |
| # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF |
| # THE POSSIBILITY OF SUCH DAMAGE. |
| |
| # Callee save |
| |
| macro saveIPIntRegisters() |
# NOTE: We intentionally don't save pinned wasm registers here. These are saved
# and restored when entering Wasm by the JSToWasm wrapper, and changes to them are meant
# to be observable within the same Wasm module.
| subp IPIntCalleeSaveSpaceStackAligned, sp |
| if ARM64 or ARM64E |
| storepairq MC, PC, -2 * SlotSize[cfr] |
| elsif X86_64 or RISCV64 |
| storep PC, -1 * SlotSize[cfr] |
| storep MC, -2 * SlotSize[cfr] |
| end |
| end |
| |
| macro restoreIPIntRegisters() |
# NOTE: We intentionally don't restore pinned wasm registers here. These are saved
# and restored when entering Wasm by the JSToWasm wrapper, and changes to them are meant
# to be observable within the same Wasm module.
| if ARM64 or ARM64E |
| loadpairq -2 * SlotSize[cfr], MC, PC |
| elsif X86_64 or RISCV64 |
| loadp -1 * SlotSize[cfr], PC |
| loadp -2 * SlotSize[cfr], MC |
| end |
| addp IPIntCalleeSaveSpaceStackAligned, sp |
| end |
| |
| # Dispatch target bases |
| |
| if ARM64 or ARM64E or X86_64 |
| const ipint_dispatch_base = _ipint_unreachable |
| end |
| |
| if ARM64 or ARM64E |
| const ipint_gc_dispatch_base = _ipint_struct_new |
| const ipint_conversion_dispatch_base = _ipint_i32_trunc_sat_f32_s |
| const ipint_simd_dispatch_base = _ipint_simd_v128_load_mem |
| const ipint_atomic_dispatch_base = _ipint_memory_atomic_notify |
| end |
| |
| # Tail-call bytecode dispatch |
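# Each opcode handler is padded to a fixed power-of-two size (alignIPInt), so the
# handler for opcode N lives at ipint_dispatch_base + (N << log2(alignIPInt)).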
| |
| macro nextIPIntInstruction() |
| loadb [PC], t0 |
| if ARM64 or ARM64E |
| # x0 = opcode |
| pcrtoaddr ipint_dispatch_base, t7 |
| addlshiftp t7, t0, (constexpr (WTF::fastLog2(JSC::IPInt::alignIPInt))), t0 |
| jmp t0 |
| elsif X86_64 |
| pcrtoaddr ipint_dispatch_base, t1 |
| lshiftq (constexpr (WTF::fastLog2(JSC::IPInt::alignIPInt))), t0 |
| addq t1, t0 |
| jmp t0 |
| else |
| error |
| end |
| end |
| |
| # Stack operations |
| # Every value on the stack is always 16 bytes! This makes life easy. |
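# (16 bytes = sizeof(v128), so SIMD values need no special-casing on the value stack.)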
| |
macro pushQuad(reg)
if ARM64 or ARM64E or X86_64
push reg, reg
else
break
end
end
| |
| macro pushQuadPair(reg1, reg2) |
| push reg1, reg2 |
| end |
| |
| macro popQuad(reg) |
| # FIXME: emit post-increment in offlineasm |
| if ARM64 or ARM64E |
| loadqinc [sp], reg, V128ISize |
| elsif X86_64 |
| loadq [sp], reg |
| addq V128ISize, sp |
| else |
| break |
| end |
| end |
| |
| macro pushVec(reg) |
| pushv reg |
| end |
| |
| macro popVec(reg) |
| popv reg |
| end |
| |
| # Typed push/pop to make code pretty |
| |
| macro pushInt32(reg) |
| pushQuad(reg) |
| end |
| |
| macro popInt32(reg) |
| popQuad(reg) |
| end |
| |
| macro pushFloat32(reg) |
| pushv reg |
| end |
| |
| macro popFloat32(reg) |
| popv reg |
| end |
| |
| macro pushInt64(reg) |
| pushQuad(reg) |
| end |
| |
| macro popInt64(reg) |
| popQuad(reg) |
| end |
| |
| macro pushFloat64(reg) |
| pushv reg |
| end |
| |
| macro popFloat64(reg) |
| popv reg |
| end |
| |
| # Entering IPInt |
| |
| # MC = location in argumINT bytecode |
| # csr0 = tmp |
# sc0 = dst
| # csr2 = src |
| # csr3 = end |
| # csr4 = for dispatch |
| |
| const argumINTTmp = csr0 |
| const argumINTDst = sc0 |
| const argumINTSrc = csr2 |
| const argumINTEnd = csr3 |
| const argumINTDsp = csr4 |
| |
| macro ipintEntry() |
| const argumINTEndAsScratch = argumINTEnd |
| checkStackOverflow(ws0, argumINTEndAsScratch) |
| |
| # Allocate space for locals and rethrow values |
| if ARM64 or ARM64E |
| loadpairi Wasm::IPIntCallee::m_localSizeToAlloc[ws0], argumINTTmp, argumINTEnd |
| else |
| loadi Wasm::IPIntCallee::m_localSizeToAlloc[ws0], argumINTTmp |
| loadi Wasm::IPIntCallee::m_numRethrowSlotsToAlloc[ws0], argumINTEnd |
| end |
| mulp LocalSize, argumINTEnd |
| mulp LocalSize, argumINTTmp |
| # Allocate locals first (closest to CFR) |
| subp argumINTTmp, sp |
| move sp, argumINTDsp |
| # Allocate rethrow slots below locals |
| subp argumINTEnd, sp |
| # argumINTEnd = boundary for zero-init loop. Handlers write [argumINTDst] then subp, |
| # so after localSizeToAlloc handlers, argumINTDst = argumINTDsp - LocalSize. |
| move argumINTDsp, argumINTEnd |
| subp LocalSize, argumINTEnd |
| loadp Wasm::IPIntCallee::m_argumINTBytecode + VectorBufferOffset[ws0], MC |
| |
| push argumINTTmp, argumINTDst, argumINTSrc, argumINTEnd |
| |
| # Start writing at local[0] = CFR - IPIntLocalsBaseOffset, going downward |
| leap -IPIntLocalsBaseOffset[cfr], argumINTDst |
| leap FirstArgumentOffset[cfr], argumINTSrc |
| |
| validateOpcodeConfig(argumINTTmp) |
| argumINTDispatch() |
| end |
| |
| macro argumINTDispatch() |
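# argumINT: per-function mini-bytecode that copies incoming call arguments into locals
# (see ipintEntry). Handlers are padded to alignArgumInt bytes, using the same
# padded-handler dispatch scheme as nextIPIntInstruction.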
| loadb [MC], argumINTTmp |
| addp 1, MC |
| bbgteq argumINTTmp, (constexpr IPInt::ArgumINTBytecode::NumOpcodes), _ipint_argument_dispatch_err |
| lshiftp (constexpr (WTF::fastLog2(JSC::IPInt::alignArgumInt))), argumINTTmp |
| if ARM64 or ARM64E or X86_64 |
| pcrtoaddr _argumINT_begin, argumINTDsp |
| addp argumINTTmp, argumINTDsp |
| jmp argumINTDsp |
| else |
| break |
| end |
| end |
| |
| macro argumINTInitializeDefaultLocals() |
| # zero out remaining locals (argumINTDst moves downward toward argumINTEnd) |
| bpeq argumINTDst, argumINTEnd, .ipint_entry_finish_zero |
| loadb [MC], argumINTTmp |
| addp 1, MC |
| sxb2p argumINTTmp, argumINTTmp |
| andp ValueNull, argumINTTmp |
| if ARM64 or ARM64E |
| # offlineasm doesn't have xzr so emit it |
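# (presumably the ARM64 counterpart of the X86_64 path below: store argumINTTmp
# and a zero qword at [argumINTDst])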
| emit "stp x19, xzr, [x9]" |
| elsif X86_64 |
| storep argumINTTmp, [argumINTDst] |
| storep 0, 8[argumINTDst] |
| end |
| subp LocalSize, argumINTDst |
| end |
| |
| macro argumINTFinish() |
| pop argumINTEnd, argumINTSrc, argumINTDst, argumINTTmp |
| end |
| |
| ############################# |
| # 0x00 - 0x11: control flow # |
| ############################# |
| |
| ipintOp(_unreachable, macro() |
| jmp _ipint_throw_Unreachable |
| end) |
| |
| ipintOp(_nop, macro() |
| # nop |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_block, macro() |
| # block |
| validateOpcodeConfig(t0) |
| if ARM64 or ARM64E |
| loadpairi IPInt::BlockMetadata::deltaPC[MC], t0, t1 |
| else |
| loadi IPInt::BlockMetadata::deltaPC[MC], t0 |
| loadi IPInt::BlockMetadata::deltaMC[MC], t1 |
| end |
| sxi2q t0, t0 |
| sxi2q t1, t1 |
| advancePCByReg(t0) |
| advanceMCByReg(t1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_loop, macro() |
| # loop |
| # We already validateOpcodeConfig in ipintLoopOSR. |
| ipintLoopOSR(1) |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMCByReg(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_if, macro() |
| # if |
| validateOpcodeConfig(t1) |
| popInt32(t0) |
| bineq 0, t0, .ipint_if_taken |
| if ARM64 or ARM64E |
| loadpairi IPInt::IfMetadata::elseDeltaPC[MC], t0, t1 |
| else |
| loadi IPInt::IfMetadata::elseDeltaPC[MC], t0 |
| loadi IPInt::IfMetadata::elseDeltaMC[MC], t1 |
| end |
| advancePCByReg(t0) |
| advanceMCByReg(t1) |
| nextIPIntInstruction() |
| .ipint_if_taken: |
| # Skip LEB128 |
| loadb IPInt::IfMetadata::instructionLength[MC], t0 |
| advanceMC(constexpr (sizeof(IPInt::IfMetadata))) |
| advancePCByReg(t0) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_else, macro() |
| # else |
| # Counterintuitively, we only run this instruction if the if |
| # clause is TAKEN. This is used to branch to the end of the |
| # block. |
| validateOpcodeConfig(t0) |
| if ARM64 or ARM64E |
| loadpairi IPInt::BlockMetadata::deltaPC[MC], t0, t1 |
| else |
| loadi IPInt::BlockMetadata::deltaPC[MC], t0 |
| loadi IPInt::BlockMetadata::deltaMC[MC], t1 |
| end |
| # always skipping forward - no need to sign-extend t0, t1 |
| advancePCByReg(t0) |
| advanceMCByReg(t1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_try, macro() |
| validateOpcodeConfig(t0) |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_catch, macro() |
| # Counterintuitively, like else, we only run this instruction |
# if no exception was thrown during the preceding try or catch block.
| validateOpcodeConfig(t0) |
| if ARM64 or ARM64E |
| loadpairi IPInt::BlockMetadata::deltaPC[MC], t0, t1 |
| else |
| loadi IPInt::BlockMetadata::deltaPC[MC], t0 |
| loadi IPInt::BlockMetadata::deltaMC[MC], t1 |
| end |
| # always skipping forward - no need to sign-extend t0, t1 |
| advancePCByReg(t0) |
| advanceMCByReg(t1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_throw, macro() |
| saveCallSiteIndex() |
| |
| loadp JSWebAssemblyInstance::m_vm[wasmInstance], t0 |
| loadp VM::topEntryFrame[t0], t0 |
| copyCalleeSavesToEntryFrameCalleeSavesBuffer(t0) |
| |
| move cfr, a1 |
| move sp, a2 |
| loadi IPInt::ThrowMetadata::exceptionIndex[MC], a3 |
| operationCall(macro() cCall4(_ipint_extern_throw_exception) end) |
| jumpToException() |
| end) |
| |
| ipintOp(_rethrow, macro() |
| saveCallSiteIndex() |
| |
| loadp JSWebAssemblyInstance::m_vm[wasmInstance], t0 |
| loadp VM::topEntryFrame[t0], t0 |
| copyCalleeSavesToEntryFrameCalleeSavesBuffer(t0) |
| |
| move cfr, a1 |
| loadi IPInt::RethrowMetadata::tryDepth[MC], a2 |
| operationCall(macro() cCall3(_ipint_extern_rethrow_exception) end) |
| jumpToException() |
| end) |
| |
| ipintOp(_throw_ref, macro() |
| popQuad(a2) |
| bieq a2, ValueNull, _ipint_throw_NullExnrefReference |
| |
| saveCallSiteIndex() |
| |
| loadp JSWebAssemblyInstance::m_vm[wasmInstance], t0 |
| loadp VM::topEntryFrame[t0], t0 |
| copyCalleeSavesToEntryFrameCalleeSavesBuffer(t0) |
| |
| move cfr, a1 |
| operationCall(macro() cCall3(_ipint_extern_throw_ref) end) |
| jumpToException() |
| end) |
| |
| macro uintDispatch() |
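# uINT: per-function mini-bytecode used to marshal return values (dispatched from
# .ipint_end_ret); same padded-handler dispatch scheme as nextIPIntInstruction.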
| loadb [MC], sc1 |
| addq 1, MC |
| bigteq sc1, (constexpr IPInt::UIntBytecode::NumOpcodes), _ipint_uint_dispatch_err |
| lshiftq (constexpr (WTF::fastLog2(JSC::IPInt::alignUInt))), sc1 |
| pcrtoaddr _uint_begin, PC |
| addq PC, sc1 |
| jmp sc1 |
| end |
| |
| ipintOp(_end, macro() |
| validateOpcodeConfig(t1) |
| if X86_64 |
| loadp UnboxedWasmCalleeStackSlot[cfr], ws0 |
| end |
| loadp Wasm::IPIntCallee::m_bytecodeEnd[ws0], t1 |
| bqeq PC, t1, .ipint_end_ret |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
# This handler is defined outside the ipintOp scope to keep the end implementation tight.
| .ipint_end_ret: |
| loadp Wasm::IPIntCallee::m_uINTBytecode + VectorBufferOffset[ws0], MC |
| ipintEpilogueOSR(10) |
| if X86_64 |
| loadp UnboxedWasmCalleeStackSlot[cfr], ws0 |
| end |
| loadi Wasm::IPIntCallee::m_topOfReturnStackFPOffset[ws0], sc0 |
| addp cfr, sc0 |
| |
| // We've already validateOpcodeConfig() in all the places that can jump to .ipint_end_ret. |
| uintDispatch() |
| |
| ipintOp(_br, macro() |
| # br |
| validateOpcodeConfig(t0) |
| loadh IPInt::BranchTargetMetadata::toPop[MC], t0 |
| # number to keep |
| loadh IPInt::BranchTargetMetadata::toKeep[MC], t1 |
| |
| # ex. pop 3 and keep 2 |
| # |
| # +4 +3 +2 +1 sp |
| # a b c d e |
| # d e |
| # |
| # [sp + k + numToPop] = [sp + k] for k in numToKeep-1 -> 0 |
| move t0, t2 |
| mulq StackValueSize, t2 |
| leap [sp, t2], t2 |
| |
| .ipint_br_poploop: |
| bqeq t1, 0, .ipint_br_popend |
| subq 1, t1 |
| move t1, t3 |
| mulq StackValueSize, t3 |
| loadq [sp, t3], t0 |
| storeq t0, [t2, t3] |
| loadq 8[sp, t3], t0 |
| storeq t0, 8[t2, t3] |
| jmp .ipint_br_poploop |
| .ipint_br_popend: |
| loadh IPInt::BranchTargetMetadata::toPop[MC], t0 |
| mulq StackValueSize, t0 |
| leap [sp, t0], sp |
| |
| if ARM64 or ARM64E |
| loadpairi IPInt::BlockMetadata::deltaPC[MC], t0, t1 |
| else |
| loadi IPInt::BlockMetadata::deltaPC[MC], t0 |
| loadi IPInt::BlockMetadata::deltaMC[MC], t1 |
| end |
| sxi2q t0, t0 |
| sxi2q t1, t1 |
| advancePCByReg(t0) |
| advanceMCByReg(t1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_br_if, macro() |
| # pop i32 |
| validateOpcodeConfig(t2) |
| popInt32(t0) |
| bineq t0, 0, _ipint_br |
| loadb IPInt::BranchMetadata::instructionLength[MC], t0 |
| advanceMC(constexpr (sizeof(IPInt::BranchMetadata))) |
| advancePCByReg(t0) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_br_table, macro() |
| # br_table |
| validateOpcodeConfig(t2) |
| popInt32(t0) |
| loadi IPInt::SwitchMetadata::size[MC], t1 |
| advanceMC(constexpr (sizeof(IPInt::SwitchMetadata))) |
| bib t0, t1, .ipint_br_table_clamped |
| subq t1, 1, t0 |
| .ipint_br_table_clamped: |
| move t0, t1 |
| muli (constexpr (sizeof(IPInt::BranchTargetMetadata))), t0 |
| addq t0, MC |
| jmp _ipint_br |
| end) |
| |
| ipintOp(_return, macro() |
| validateOpcodeConfig(MC) |
| # ret |
| |
| if X86_64 |
| loadp UnboxedWasmCalleeStackSlot[cfr], ws0 |
| end |
| |
# This is guaranteed to go to an end instruction, so skip the
# dispatch and end-of-program check for speed
| jmp .ipint_end_ret |
| end) |
| |
| if ARM64 or ARM64E |
| const IPIntCallCallee = sc1 |
| const IPIntCallFunctionSlot = sc0 |
| elsif X86_64 |
| const IPIntCallCallee = t7 |
| const IPIntCallFunctionSlot = t6 |
| end |
| |
| ipintOp(_call, macro() |
| // The operationCall below already calls validateOpcodeConfig(). |
| saveCallSiteIndex() |
| |
| loadb IPInt::CallMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| |
| move cfr, a1 |
| move MC, a2 |
| advanceMC(IPInt::CallMetadata::signature) |
| |
| subq 16, sp |
| move sp, a3 |
| |
| # operation returns the entrypoint in r0 and the target instance in r1 |
| # operation stores the target callee to sp[0] and target function info to sp[1] |
| operationCall(macro() cCall4(_ipint_extern_prepare_call) end) |
| loadq [sp], IPIntCallCallee |
| loadq 8[sp], IPIntCallFunctionSlot |
| addq 16, sp |
| |
| # call |
| jmp .ipint_call_common |
| end) |
| |
| ipintOp(_call_indirect, macro() |
| // The operationCall below already calls validateOpcodeConfig(). |
| saveCallSiteIndex() |
| |
| # Get function index by pointer, use it as a return for callee |
| move sp, a2 |
| |
| # Get callIndirectMetadata |
| move cfr, a1 |
| move MC, a3 |
| |
| operationCallMayThrow(macro() cCall4(_ipint_extern_prepare_call_indirect) end) |
| |
| # operationCallMayThrow saves the call site index, so we have to advance the PC after. |
| # Otherwise, the wrong call site index will be saved. |
| loadb IPInt::CallIndirectMetadata::length[MC], t3 |
| advancePCByReg(t3) |
| advanceMC(IPInt::CallIndirectMetadata::signature) |
| |
| loadq [sp], IPIntCallCallee |
| loadq 8[sp], IPIntCallFunctionSlot |
| addq 16, sp |
| |
| jmp .ipint_call_common |
| end) |
| |
| ipintOp(_return_call, macro() |
| // The operationCall below already calls validateOpcodeConfig(). |
| saveCallSiteIndex() |
| |
| loadb IPInt::TailCallMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| |
| move cfr, a1 |
| move MC, a2 |
| subq 16, sp |
| move sp, a3 |
| |
| # operation returns the entrypoint in r0 and the target instance in r1 |
| # this operation stores the boxed Callee into *r2 |
| operationCall(macro() cCall4(_ipint_extern_prepare_call) end) |
| |
| loadq [sp], IPIntCallCallee |
| loadq 8[sp], IPIntCallFunctionSlot |
| addq 16, sp |
| |
| loadi IPInt::TailCallMetadata::callerStackArgSize[MC], t3 |
| advanceMC(IPInt::TailCallMetadata::argumentBytecode) |
| jmp .ipint_tail_call_common |
| end) |
| |
| ipintOp(_return_call_indirect, macro() |
| // The operationCallMayThrow below already calls validateOpcodeConfig(). |
| saveCallSiteIndex() |
| |
| # Get function index by pointer, use it as a return for callee |
| move sp, a2 |
| |
| # Get callIndirectMetadata |
| move cfr, a1 |
| move MC, a3 |
| operationCallMayThrow(macro() cCall4(_ipint_extern_prepare_call_indirect) end) |
| |
| # operationCallMayThrow saves the call site index, so we have to advance the PC after. |
| # Otherwise, the wrong call site index will be saved. |
| loadb IPInt::TailCallIndirectMetadata::length[MC], t3 |
| advancePCByReg(t3) |
| |
| loadq [sp], IPIntCallCallee |
| loadq 8[sp], IPIntCallFunctionSlot |
| addq 16, sp |
| |
| loadi IPInt::TailCallIndirectMetadata::callerStackArgSize[MC], t3 |
| advanceMC(IPInt::TailCallIndirectMetadata::argumentBytecode) |
| jmp .ipint_tail_call_common |
| end) |
| |
| ipintOp(_call_ref, macro() |
| // The operationCall below already calls validateOpcodeConfig(). |
| saveCallSiteIndex() |
| |
| move cfr, a1 |
| move MC, a2 |
| move sp, a3 |
| |
| operationCallMayThrow(macro() cCall4(_ipint_extern_prepare_call_ref) end) |
| loadq [sp], IPIntCallCallee |
| loadq 8[sp], IPIntCallFunctionSlot |
| addq 16, sp |
| |
| loadb IPInt::CallRefMetadata::length[MC], t3 |
| advanceMC(IPInt::CallRefMetadata::signature) |
| advancePCByReg(t3) |
| |
| jmp .ipint_call_common |
| end) |
| |
| ipintOp(_return_call_ref, macro() |
| // The operationCallMayThrow below already calls validateOpcodeConfig(). |
| saveCallSiteIndex() |
| |
| move cfr, a1 |
| move MC, a2 |
| move sp, a3 |
| operationCallMayThrow(macro() cCall4(_ipint_extern_prepare_call_ref) end) |
| |
| # operationCallMayThrow saves the call site index, so we have to advance the PC after. |
| # Otherwise, the wrong call site index will be saved. |
| loadb IPInt::TailCallRefMetadata::length[MC], t3 |
| advancePCByReg(t3) |
| |
| loadq [sp], IPIntCallCallee |
| loadq 8[sp], IPIntCallFunctionSlot |
| addq 16, sp |
| |
| loadi IPInt::TailCallRefMetadata::callerStackArgSize[MC], t3 |
| advanceMC(IPInt::TailCallRefMetadata::argumentBytecode) |
| jmp .ipint_tail_call_common |
| end) |
| |
| reservedOpcode(0x16) |
| reservedOpcode(0x17) |
| |
| ipintOp(_delegate, macro() |
| # Counterintuitively, like else, we only run this instruction |
# if no exception was thrown during the preceding try or catch block.
| validateOpcodeConfig(t0) |
| if ARM64 or ARM64E |
| loadpairi IPInt::BlockMetadata::deltaPC[MC], t0, t1 |
| else |
| loadi IPInt::BlockMetadata::deltaPC[MC], t0 |
| loadi IPInt::BlockMetadata::deltaMC[MC], t1 |
| end |
| # always skipping forward - no need to sign-extend t0, t1 |
| advancePCByReg(t0) |
| advanceMCByReg(t1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_catch_all, macro() |
| # Counterintuitively, like else, we only run this instruction |
# if no exception was thrown during the preceding try or catch block.
| validateOpcodeConfig(t0) |
| if ARM64 or ARM64E |
| loadpairi IPInt::BlockMetadata::deltaPC[MC], t0, t1 |
| else |
| loadi IPInt::BlockMetadata::deltaPC[MC], t0 |
| loadi IPInt::BlockMetadata::deltaMC[MC], t1 |
| end |
| # always skipping forward - no need to sign-extend t0, t1 |
| advancePCByReg(t0) |
| advanceMCByReg(t1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_drop, macro() |
| addq StackValueSize, sp |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_select, macro() |
| popInt32(t0) |
| bieq t0, 0, .ipint_select_val2 |
| addq StackValueSize, sp |
| advancePC(1) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| .ipint_select_val2: |
| popVec(v1) |
| popVec(v0) |
| pushVec(v1) |
| advancePC(1) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_select_t, macro() |
| popInt32(t0) |
| bieq t0, 0, .ipint_select_t_val2 |
| addq StackValueSize, sp |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| .ipint_select_t_val2: |
| popVec(v1) |
| popVec(v0) |
| pushVec(v1) |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(0x1d) |
| reservedOpcode(0x1e) |
| |
| ipintOp(_try_table, macro() |
| # advance MC/PC |
| validateOpcodeConfig(t0) |
| if ARM64 or ARM64E |
| loadpairi IPInt::BlockMetadata::deltaPC[MC], t0, t1 |
| else |
| loadi IPInt::BlockMetadata::deltaPC[MC], t0 |
| loadi IPInt::BlockMetadata::deltaMC[MC], t1 |
| end |
| # always skipping forward - no need to sign-extend t0, t1 |
| advancePCByReg(t0) |
| advanceMCByReg(t1) |
| nextIPIntInstruction() |
| end) |
| |
| ################################### |
| # 0x20 - 0x26: get and set values # |
| ################################### |
| |
| macro localGet() |
| # Index into locals: local[i] = CFR - IPIntLocalsBaseOffset - i * LocalSize |
| lshiftp (constexpr (WTF::fastLog2(JSC::IPInt::LOCAL_SIZE))), t0 |
| subp cfr, t0, t0 |
| loadv -IPIntLocalsBaseOffset[t0], v0 |
| end |
| |
| macro localSet() |
| # Store to locals: local[i] = CFR - IPIntLocalsBaseOffset - i * LocalSize |
| lshiftp (constexpr (WTF::fastLog2(JSC::IPInt::LOCAL_SIZE))), t0 |
| subp cfr, t0, t0 |
| storev v0, -IPIntLocalsBaseOffset[t0] |
| end |
| |
| ipintOp(_local_get, macro() |
| # local.get |
| loadb 1[PC], t0 |
| bbaeq t0, 0x80, .ipint_local_get_slow_path |
| localGet() |
| pushVec(v0) |
| advancePC(2) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_local_set, macro() |
| # local.set |
| loadb 1[PC], t0 |
| bbaeq t0, 0x80, .ipint_local_set_slow_path |
| popVec(v0) |
| localSet() |
| advancePC(2) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_local_tee, macro() |
| # local.tee |
| loadb 1[PC], t0 |
| bbaeq t0, 0x80, .ipint_local_tee_slow_path |
| loadv [sp], v0 |
| localSet() |
| advancePC(2) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_global_get, macro() |
| loadb IPInt::GlobalMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| |
| # Load pre-computed index from metadata |
| loadb IPInt::GlobalMetadata::bindingMode[MC], t2 |
| loadi IPInt::GlobalMetadata::index[MC], t1 |
| loadp JSWebAssemblyInstance::m_globals[wasmInstance], t0 |
| advanceMC(constexpr (sizeof(IPInt::GlobalMetadata))) |
| |
| lshiftp 1, t1 |
| bieq t2, 0, .ipint_global_get_embedded |
| loadp [t0, t1, 8], t0 |
| loadv [t0], v0 |
| pushVec(v0) |
| nextIPIntInstruction() |
| |
| .ipint_global_get_embedded: |
| loadv [t0, t1, 8], v0 |
| pushVec(v0) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_global_set, macro() |
| # isRef = 1 => ref, use slowpath |
| loadb IPInt::GlobalMetadata::isRef[MC], t0 |
| bineq t0, 0, .ipint_global_set_refpath |
| # bindingMode = 1 => portable |
| loadb IPInt::GlobalMetadata::bindingMode[MC], t2 |
| # get global addr |
| loadp JSWebAssemblyInstance::m_globals[wasmInstance], t0 |
| # get value to store |
| popVec(v0) |
| # get index |
| loadi IPInt::GlobalMetadata::index[MC], t1 |
| lshiftp 1, t1 |
| bieq t2, 0, .ipint_global_set_embedded |
| # portable: dereference then set |
| loadp [t0, t1, 8], t0 |
| storev v0, [t0] |
| loadb IPInt::GlobalMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::GlobalMetadata))) |
| jmp .ipint_global_set_dispatch |
| |
| .ipint_global_set_embedded: |
| # embedded: set directly |
| storev v0, [t0, t1, 8] |
| loadb IPInt::GlobalMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::GlobalMetadata))) |
| jmp .ipint_global_set_dispatch |
| |
| .ipint_global_set_refpath: |
| loadi IPInt::GlobalMetadata::index[MC], a1 |
| # Pop from stack |
| popQuad(a2) |
| operationCall(macro() cCall3(_ipint_extern_set_global_ref) end) |
| |
| loadb IPInt::GlobalMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::GlobalMetadata))) |
| |
| .ipint_global_set_dispatch: |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_table_get, macro() |
| # Load pre-computed index from metadata |
| loadi IPInt::TableAccessMetadata::index[MC], a1 |
| popInt32(a2) |
| |
| operationCallMayThrow(macro() cCall3(_ipint_extern_table_get) end) |
| |
| pushQuad(r0) |
| |
| loadb IPInt::TableAccessMetadata::instructionLength[MC], t0 |
| |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::TableAccessMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_table_set, macro() |
| # Load pre-computed index from metadata |
| loadi IPInt::TableAccessMetadata::index[MC], a1 |
| popQuad(a3) |
| popInt32(a2) |
| operationCallMayThrow(macro() cCall4(_ipint_extern_table_set) end) |
| |
| loadb IPInt::TableAccessMetadata::instructionLength[MC], t0 |
| |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::TableAccessMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(0x27) |
| |
| macro popMemoryIndex(reg) |
popInt64(reg) # Note that popInt32 and popInt64 are the same implementation.
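# For 32-bit memories the index is an i32, so zero-extend to clear any stale upper
# bits; memory64 uses the full 64-bit value as-is.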
| btbnz JSWebAssemblyInstance::m_cachedIsMemory64[wasmInstance], .done |
| zxi2q reg, reg |
| .done: |
| end |
| |
| macro baddpc(src, dst, label) |
| # FIXME: make this a proper instruction |
| addp src, dst |
| bpb dst, src, label # unsigned overflow check |
| end |
| |
| |
| macro loadStoreMakePointerFast(alignAccess, offsetAccess, wasmAddrReg, size, scratch, scratch2, slowLabel) |
| # overwrites wasmAddrReg with computed pointer. |
| # Fast path: alignment byte < 0x40 (single-byte, no multi-memory), |
| # and offset byte < 0x80 (single-byte). Memory index is 0. |
| # alignAccess/offsetAccess are memory access patterns for the memarg bytes. |
| # For non-SIMD: pass (1[PC], 2[PC]). For SIMD: pass ([t4], 1[t4]). |
| |
| # Check alignment byte: if >= 0x40, it's multi-memory or unusual alignment |
| loadb alignAccess, scratch2 # alignment/flags byte |
| bbaeq scratch2, 0x40, slowLabel |
| loadb offsetAccess, scratch # offset byte |
| bbaeq scratch, 0x80, slowLabel |
| |
| # Both single-byte, memory index = 0. scratch = offset value. |
| baddpc(scratch, wasmAddrReg, _ipint_throw_OutOfBoundsMemoryAccess) |
| move size - 1, scratch2 |
| baddpc(wasmAddrReg, scratch2, _ipint_throw_OutOfBoundsMemoryAccess) |
| |
| bpaeq scratch2, boundsCheckingSize, _ipint_throw_OutOfBoundsMemoryAccess # scratch2 contains wasm address + size - 1 |
| addp memoryBase, wasmAddrReg |
| end |
| |
| # Note: wasmAddrReg (t0) is set by the handler's popMemoryIndex before branching here. |
| # For store ops, the data register (t3 for int, ft0 for float) is also set by the handler. |
| macro loadStoreMakePointerSlow(cursor, wasmAddrReg, size, scratch, scratch2, decodeScratch1, decodeScratch2) |
| # 1. Decode flags/alignment, check multi-memory bit |
| decodeLEBVarUInt(scratch, cursor, decodeScratch1, decodeScratch2) |
| |
| # 2. If multi-memory, decode memory index; otherwise 0 |
| btiz scratch, 0x40, .memoryIndex0 |
| decodeLEBVarUInt(scratch, cursor, decodeScratch1, decodeScratch2) |
| jmp .decodeOffset |
| .memoryIndex0: |
| move 0, scratch |
| |
| .decodeOffset: |
| # 3. Decode offset |
| decodeLEBVarUInt(scratch2, cursor, decodeScratch1, decodeScratch2) |
| |
| baddpc(scratch2, wasmAddrReg, _ipint_throw_OutOfBoundsMemoryAccess) |
| move size - 1, scratch2 |
| baddpc(wasmAddrReg, scratch2, _ipint_throw_OutOfBoundsMemoryAccess) |
| |
| btinz scratch, .memoryIsNotZero |
| bpaeq scratch2, boundsCheckingSize, _ipint_throw_OutOfBoundsMemoryAccess # scratch2 contains wasm address + size - 1 |
| addp memoryBase, wasmAddrReg |
| jmp .done |
| |
| .memoryIsNotZero: |
| mulp constexpr (sizeof(JSWebAssemblyInstance::WasmMemoryBaseAndSize)), scratch |
| # FIXME: it's probably worth trying to use a loadpair here, but that requires a separate x86 codepath |
| loadp (constexpr (JSWebAssemblyInstance::offsetOfCachedMemoryBaseSizePair(0) + sizeof(void*))) [wasmInstance, scratch], decodeScratch1 # bounds checking size |
| bpaeq scratch2, decodeScratch1, _ipint_throw_OutOfBoundsMemoryAccess # scratch2 contains wasm address + size - 1 |
| loadp (constexpr (JSWebAssemblyInstance::offsetOfCachedMemoryBaseSizePair(0))) [wasmInstance, scratch], scratch2 # memory base |
| addp scratch2, wasmAddrReg |
| .done: |
| end |
| |
| ipintOp(_i32_load_mem, macro() |
| # i32.load |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 4, t1, t2, .ipint_i32_load_mem_slow_path) |
| # load memory location |
| loadi [t0], t1 |
| pushInt32(t1) |
| |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_load_mem, macro() |
# i64.load
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 8, t1, t2, .ipint_i64_load_mem_slow_path) |
| # load memory location |
| loadq [t0], t1 |
| pushInt64(t1) |
| |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_load_mem, macro() |
| # f32.load |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 4, t1, t2, .ipint_f32_load_mem_slow_path) |
| # load memory location |
| loadf [t0], ft0 |
| pushFloat32(ft0) |
| |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_load_mem, macro() |
| # f64.load |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 8, t1, t2, .ipint_f64_load_mem_slow_path) |
| # load memory location |
| loadd [t0], ft0 |
| pushFloat64(ft0) |
| |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_load8s_mem, macro() |
| # i32.load8_s |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 1, t1, t2, .ipint_i32_load8s_mem_slow_path) |
| loadbsi [t0], t1 |
| pushInt32(t1) |
| |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_load8u_mem, macro() |
| # i32.load8_u |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 1, t1, t2, .ipint_i32_load8u_mem_slow_path) |
| loadb [t0], t1 |
| pushInt32(t1) |
| |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_load16s_mem, macro() |
| # i32.load16_s |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 2, t1, t2, .ipint_i32_load16s_mem_slow_path) |
| loadhsi [t0], t1 |
| pushInt32(t1) |
| |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_load16u_mem, macro() |
| # i32.load16_u |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 2, t1, t2, .ipint_i32_load16u_mem_slow_path) |
| loadh [t0], t1 |
| pushInt32(t1) |
| |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_load8s_mem, macro() |
| # i64.load8_s |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 1, t1, t2, .ipint_i64_load8s_mem_slow_path) |
| loadbsq [t0], t1 |
| pushInt64(t1) |
| |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_load8u_mem, macro() |
| # i64.load8_u |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 1, t1, t2, .ipint_i64_load8u_mem_slow_path) |
| loadb [t0], t1 |
| pushInt64(t1) |
| |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_load16s_mem, macro() |
| # i64.load16_s |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 2, t1, t2, .ipint_i64_load16s_mem_slow_path) |
| loadhsq [t0], t1 |
| pushInt64(t1) |
| |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_load16u_mem, macro() |
| # i64.load16_u |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 2, t1, t2, .ipint_i64_load16u_mem_slow_path) |
| loadh [t0], t1 |
| pushInt64(t1) |
| |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_load32s_mem, macro() |
| # i64.load32_s |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 4, t1, t2, .ipint_i64_load32s_mem_slow_path) |
| loadi [t0], t1 |
| sxi2q t1, t1 |
| pushInt64(t1) |
| |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_load32u_mem, macro() |
# i64.load32_u
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 4, t1, t2, .ipint_i64_load32u_mem_slow_path) |
| loadi [t0], t1 |
| pushInt64(t1) |
| |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_store_mem, macro() |
| # i32.store |
| # pop data |
| popInt32(t3) |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 4, t1, t2, .ipint_i32_store_mem_slow_path) |
| storei t3, [t0] |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_store_mem, macro() |
| # i64.store |
| # pop data |
| popInt64(t3) |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 8, t1, t2, .ipint_i64_store_mem_slow_path) |
| storeq t3, [t0] |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_store_mem, macro() |
| # f32.store |
| # pop data |
| popFloat32(ft0) |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 4, t1, t2, .ipint_f32_store_mem_slow_path) |
| storef ft0, [t0] |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_store_mem, macro() |
| # f64.store |
| # pop data |
| popFloat64(ft0) |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 8, t1, t2, .ipint_f64_store_mem_slow_path) |
| stored ft0, [t0] |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_store8_mem, macro() |
| # i32.store8 |
| # pop data |
| popInt32(t3) |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 1, t1, t2, .ipint_i32_store8_mem_slow_path) |
| storeb t3, [t0] |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_store16_mem, macro() |
| # i32.store16 |
| # pop data |
| popInt32(t3) |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 2, t1, t2, .ipint_i32_store16_mem_slow_path) |
| storeh t3, [t0] |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_store8_mem, macro() |
| # i64.store8 |
| # pop data |
| popInt64(t3) |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 1, t1, t2, .ipint_i64_store8_mem_slow_path) |
| storeb t3, [t0] |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_store16_mem, macro() |
| # i64.store16 |
| # pop data |
| popInt64(t3) |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 2, t1, t2, .ipint_i64_store16_mem_slow_path) |
| storeh t3, [t0] |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_store32_mem, macro() |
| # i64.store32 |
| # pop data |
| popInt64(t3) |
| # pop index |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast(1[PC], 2[PC], t0, 4, t1, t2, .ipint_i64_store32_mem_slow_path) |
| storei t3, [t0] |
| advancePC(3) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_memory_size, macro() |
| loadb IPInt::MemorySizeMetadata::memoryIndex[MC], t0 |
| advanceMC(constexpr (sizeof(IPInt::MemorySizeMetadata))) |
| btinz t0, .callMemorySize |
| loadp constexpr (JSWebAssemblyInstance::offsetOfCachedMemory0Size())[wasmInstance], t0 # size of memory 0 |
| jmp .doneLoadingMemorySize |
| .callMemorySize: |
| move t0, a1 |
| operationCall(macro() cCall2(_ipint_extern_memory_size) end) |
| move r0, t0 |
| .doneLoadingMemorySize: |
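# Convert the byte size to 64 KiB wasm pages.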
| urshiftp 16, t0 |
| zxi2q t0, t0 |
| pushInt32(t0) |
| advancePC(2) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_memory_grow, macro() |
| popInt32(a1) |
| loadb IPInt::MemoryGrowMetadata::memoryIndex[MC], a2 |
| advanceMC(constexpr (sizeof(IPInt::MemoryGrowMetadata))) |
| operationCall(macro() cCall3(_ipint_extern_memory_grow) end) |
| pushInt32(r0) |
| ipintReloadMemory(t2) |
| advancePC(2) |
| nextIPIntInstruction() |
| end) |
| |
| ################################ |
| # 0x41 - 0x44: constant values # |
| ################################ |
| |
| ipintOp(_i32_const, macro() |
| # i32.const - decode signed LEB128 from bytecode |
| loadb 1[PC], t0 |
| bbaeq t0, 0x80, .ipint_i32_const_slow_path |
| # single byte: sign extend from 7 bits |
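# e.g. byte 0x7f: << 25 gives 0xfe000000, arithmetic >> 25 gives 0xffffffff (-1)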
| lshifti 25, t0 |
| rshifti 25, t0 |
| pushInt32(t0) |
| advancePC(2) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_const, macro() |
| # i64.const - decode signed LEB128 from bytecode |
| loadb 1[PC], t0 |
| bbaeq t0, 0x80, .ipint_i64_const_slow_path |
| # single byte: sign extend from 7 bits |
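# same trick as i32.const, with 57 = 64 - 7 for a 64-bit result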
| lshiftq 57, t0 |
| rshiftq 57, t0 |
| pushInt64(t0) |
| advancePC(2) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_const, macro() |
| # f32.const |
# Load the 4-byte IEEE 754 literal directly from the instruction stream
| loadf 1[PC], ft0 |
| pushFloat32(ft0) |
| |
| advancePC(5) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_const, macro() |
| # f64.const |
# Load the 8-byte IEEE 754 literal directly from the instruction stream
| loadd 1[PC], ft0 |
| pushFloat64(ft0) |
| |
| advancePC(9) |
| nextIPIntInstruction() |
| end) |
| |
| ############################### |
| # 0x45 - 0x4f: i32 comparison # |
| ############################### |
| |
| ipintOp(_i32_eqz, macro() |
| # i32.eqz |
| popInt32(t0) |
| cieq t0, 0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_eq, macro() |
| # i32.eq |
| popInt32(t1) |
| popInt32(t0) |
| cieq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_ne, macro() |
| # i32.ne |
| popInt32(t1) |
| popInt32(t0) |
| cineq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_lt_s, macro() |
| # i32.lt_s |
| popInt32(t1) |
| popInt32(t0) |
| cilt t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_lt_u, macro() |
| # i32.lt_u |
| popInt32(t1) |
| popInt32(t0) |
| cib t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_gt_s, macro() |
| # i32.gt_s |
| popInt32(t1) |
| popInt32(t0) |
| cigt t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_gt_u, macro() |
| # i32.gt_u |
| popInt32(t1) |
| popInt32(t0) |
| cia t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_le_s, macro() |
| # i32.le_s |
| popInt32(t1) |
| popInt32(t0) |
| cilteq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_le_u, macro() |
| # i32.le_u |
| popInt32(t1) |
| popInt32(t0) |
| cibeq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_ge_s, macro() |
| # i32.ge_s |
| popInt32(t1) |
| popInt32(t0) |
| cigteq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_ge_u, macro() |
| # i32.ge_u |
| popInt32(t1) |
| popInt32(t0) |
| ciaeq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ############################### |
| # 0x50 - 0x5a: i64 comparison # |
| ############################### |
| |
| ipintOp(_i64_eqz, macro() |
| # i64.eqz |
| popInt64(t0) |
| cqeq t0, 0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_eq, macro() |
| # i64.eq |
| popInt64(t1) |
| popInt64(t0) |
| cqeq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_ne, macro() |
| # i64.ne |
| popInt64(t1) |
| popInt64(t0) |
| cqneq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_lt_s, macro() |
| # i64.lt_s |
| popInt64(t1) |
| popInt64(t0) |
| cqlt t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_lt_u, macro() |
| # i64.lt_u |
| popInt64(t1) |
| popInt64(t0) |
| cqb t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_gt_s, macro() |
| # i64.gt_s |
| popInt64(t1) |
| popInt64(t0) |
| cqgt t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_gt_u, macro() |
| # i64.gt_u |
| popInt64(t1) |
| popInt64(t0) |
| cqa t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_le_s, macro() |
| # i64.le_s |
| popInt64(t1) |
| popInt64(t0) |
| cqlteq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_le_u, macro() |
| # i64.le_u |
| popInt64(t1) |
| popInt64(t0) |
| cqbeq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_ge_s, macro() |
| # i64.ge_s |
| popInt64(t1) |
| popInt64(t0) |
| cqgteq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_ge_u, macro() |
| # i64.ge_u |
| popInt64(t1) |
| popInt64(t0) |
| cqaeq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ############################### |
| # 0x5b - 0x60: f32 comparison # |
| ############################### |
| |
| ipintOp(_f32_eq, macro() |
| # f32.eq |
| popFloat32(ft1) |
| popFloat32(ft0) |
| cfeq ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_ne, macro() |
| # f32.ne |
| popFloat32(ft1) |
| popFloat32(ft0) |
| cfnequn ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_lt, macro() |
| # f32.lt |
| popFloat32(ft1) |
| popFloat32(ft0) |
| cflt ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_gt, macro() |
| # f32.gt |
| popFloat32(ft1) |
| popFloat32(ft0) |
| cfgt ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_le, macro() |
| # f32.le |
| popFloat32(ft1) |
| popFloat32(ft0) |
| cflteq ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_ge, macro() |
| # f32.ge |
| popFloat32(ft1) |
| popFloat32(ft0) |
| cfgteq ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ############################### |
| # 0x61 - 0x66: f64 comparison # |
| ############################### |
| |
| ipintOp(_f64_eq, macro() |
| # f64.eq |
| popFloat64(ft1) |
| popFloat64(ft0) |
| cdeq ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_ne, macro() |
| # f64.ne |
| popFloat64(ft1) |
| popFloat64(ft0) |
| cdnequn ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_lt, macro() |
| # f64.lt |
| popFloat64(ft1) |
| popFloat64(ft0) |
| cdlt ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_gt, macro() |
| # f64.gt |
| popFloat64(ft1) |
| popFloat64(ft0) |
| cdgt ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_le, macro() |
| # f64.le |
| popFloat64(ft1) |
| popFloat64(ft0) |
| cdlteq ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_ge, macro() |
| # f64.ge |
| popFloat64(ft1) |
| popFloat64(ft0) |
| cdgteq ft0, ft1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ############################### |
| # 0x67 - 0x78: i32 operations # |
| ############################### |
| |
| ipintOp(_i32_clz, macro() |
| # i32.clz |
| popInt32(t0) |
| lzcnti t0, t1 |
| pushInt32(t1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_ctz, macro() |
| # i32.ctz |
| popInt32(t0) |
| tzcnti t0, t1 |
| pushInt32(t1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_popcnt, macro() |
| # i32.popcnt |
| popInt32(t1) |
| operationCall(macro() cCall2(_slow_path_wasm_popcount) end) |
| pushInt32(r1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_add, macro() |
| # i32.add |
| popInt32(t1) |
| popInt32(t0) |
| addi t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_sub, macro() |
| # i32.sub |
| popInt32(t1) |
| popInt32(t0) |
| subi t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_mul, macro() |
| # i32.mul |
| popInt32(t1) |
| popInt32(t0) |
| muli t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_div_s, macro() |
| # i32.div_s |
| popInt32(t1) |
| popInt32(t0) |
| btiz t1, _ipint_throw_DivisionByZero |
| |
| bineq t1, -1, .ipint_i32_div_s_safe |
| bieq t0, constexpr INT32_MIN, _ipint_throw_IntegerOverflow |
| |
| .ipint_i32_div_s_safe: |
| if X86_64 |
# FIXME: Add a way to static_assert that t0 is rax and t2 is rdx
| # https://bugs.webkit.org/show_bug.cgi?id=203692 |
| cdqi |
| idivi t1 |
| elsif ARM64 or ARM64E or RISCV64 |
| divis t1, t0 |
| else |
| error |
| end |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_div_u, macro() |
| # i32.div_u |
| popInt32(t1) |
| popInt32(t0) |
| btiz t1, _ipint_throw_DivisionByZero |
| |
| if X86_64 |
| xori t2, t2 |
| udivi t1 |
| elsif ARM64 or ARM64E or RISCV64 |
| divi t1, t0 |
| else |
| error |
| end |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_rem_s, macro() |
| # i32.rem_s |
| popInt32(t1) |
| popInt32(t0) |
| |
| btiz t1, _ipint_throw_DivisionByZero |
| |
| bineq t1, -1, .ipint_i32_rem_s_safe |
| bineq t0, constexpr INT32_MIN, .ipint_i32_rem_s_safe |
| |
| move 0, t2 |
| jmp .ipint_i32_rem_s_return |
| |
| .ipint_i32_rem_s_safe: |
| if X86_64 |
# FIXME: Add a way to static_assert that t0 is rax and t2 is rdx
| # https://bugs.webkit.org/show_bug.cgi?id=203692 |
| cdqi |
| idivi t1 |
| elsif ARM64 or ARM64E |
| divis t1, t0, t2 |
| muli t1, t2 |
| subi t0, t2, t2 |
| elsif RISCV64 |
| remis t0, t1, t2 |
| else |
| error |
| end |
| |
| .ipint_i32_rem_s_return: |
| pushInt32(t2) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_rem_u, macro() |
| # i32.rem_u |
| popInt32(t1) |
| popInt32(t0) |
| btiz t1, _ipint_throw_DivisionByZero |
| |
| if X86_64 |
| xori t2, t2 |
| udivi t1 |
| elsif ARM64 or ARM64E |
| divi t1, t0, t2 |
| muli t1, t2 |
| subi t0, t2, t2 |
| elsif RISCV64 |
| remi t0, t1, t2 |
| else |
| error |
| end |
| pushInt32(t2) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_and, macro() |
| # i32.and |
| popInt32(t1) |
| popInt32(t0) |
| andi t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_or, macro() |
| # i32.or |
| popInt32(t1) |
| popInt32(t0) |
| ori t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_xor, macro() |
| # i32.xor |
| popInt32(t1) |
| popInt32(t0) |
| xori t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_shl, macro() |
| # i32.shl |
| popInt32(t1) |
| popInt32(t0) |
| lshifti t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_shr_s, macro() |
| # i32.shr_s |
| popInt32(t1) |
| popInt32(t0) |
| rshifti t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_shr_u, macro() |
| # i32.shr_u |
| popInt32(t1) |
| popInt32(t0) |
| urshifti t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_rotl, macro() |
| # i32.rotl |
| popInt32(t1) |
| popInt32(t0) |
| lrotatei t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_rotr, macro() |
| # i32.rotr |
| popInt32(t1) |
| popInt32(t0) |
| rrotatei t1, t0 |
| pushInt32(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ############################### |
| # 0x79 - 0x8a: i64 operations # |
| ############################### |
| |
| ipintOp(_i64_clz, macro() |
| # i64.clz |
| popInt64(t0) |
| lzcntq t0, t1 |
| pushInt64(t1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_ctz, macro() |
| # i64.ctz |
| popInt64(t0) |
| tzcntq t0, t1 |
| pushInt64(t1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_popcnt, macro() |
| # i64.popcnt |
| popInt64(t1) |
| operationCall(macro() cCall2(_slow_path_wasm_popcountll) end) |
| pushInt64(r1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_add, macro() |
| # i64.add |
| popInt64(t1) |
| popInt64(t0) |
| addq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_sub, macro() |
| # i64.sub |
| popInt64(t1) |
| popInt64(t0) |
| subq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_mul, macro() |
| # i64.mul |
| popInt64(t1) |
| popInt64(t0) |
| mulq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_div_s, macro() |
| # i64.div_s |
| popInt64(t1) |
| popInt64(t0) |
| btqz t1, _ipint_throw_DivisionByZero |
| |
| bqneq t1, -1, .ipint_i64_div_s_safe |
| bqeq t0, constexpr INT64_MIN, _ipint_throw_IntegerOverflow |
| |
| .ipint_i64_div_s_safe: |
| if X86_64 |
# FIXME: Add a way to static_assert that t0 is rax and t2 is rdx
| # https://bugs.webkit.org/show_bug.cgi?id=203692 |
| cqoq |
| idivq t1 |
| elsif ARM64 or ARM64E or RISCV64 |
| divqs t1, t0 |
| else |
| error |
| end |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_div_u, macro() |
| # i64.div_u |
| popInt64(t1) |
| popInt64(t0) |
| btqz t1, _ipint_throw_DivisionByZero |
| |
| if X86_64 |
| xorq t2, t2 |
| udivq t1 |
| elsif ARM64 or ARM64E or RISCV64 |
| divq t1, t0 |
| else |
| error |
| end |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_rem_s, macro() |
| # i64.rem_s |
| popInt64(t1) |
| popInt64(t0) |
| |
| btqz t1, _ipint_throw_DivisionByZero |
| |
| bqneq t1, -1, .ipint_i64_rem_s_safe |
| bqneq t0, constexpr INT64_MIN, .ipint_i64_rem_s_safe |
| |
| move 0, t2 |
| jmp .ipint_i64_rem_s_return |
| |
| .ipint_i64_rem_s_safe: |
| if X86_64 |
# FIXME: Add a way to static_assert that t0 is rax and t2 is rdx
| # https://bugs.webkit.org/show_bug.cgi?id=203692 |
| cqoq |
| idivq t1 |
| elsif ARM64 or ARM64E |
| divqs t1, t0, t2 |
| mulq t1, t2 |
| subq t0, t2, t2 |
| elsif RISCV64 |
| remqs t0, t1, t2 |
| else |
| error |
| end |
| |
| .ipint_i64_rem_s_return: |
| pushInt64(t2) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_rem_u, macro() |
| # i64.rem_u |
| popInt64(t1) |
| popInt64(t0) |
| btqz t1, _ipint_throw_DivisionByZero |
| |
| if X86_64 |
| xorq t2, t2 |
| udivq t1 |
| elsif ARM64 or ARM64E |
| divq t1, t0, t2 |
| mulq t1, t2 |
| subq t0, t2, t2 |
| elsif RISCV64 |
| remq t0, t1, t2 |
| else |
| error |
| end |
| pushInt64(t2) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_and, macro() |
| # i64.and |
| popInt64(t1) |
| popInt64(t0) |
| andq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_or, macro() |
| # i64.or |
| popInt64(t1) |
| popInt64(t0) |
| orq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_xor, macro() |
| # i64.xor |
| popInt64(t1) |
| popInt64(t0) |
| xorq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_shl, macro() |
| # i64.shl |
| popInt64(t1) |
| popInt64(t0) |
| lshiftq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_shr_s, macro() |
| # i64.shr_s |
| popInt64(t1) |
| popInt64(t0) |
| rshiftq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_shr_u, macro() |
| # i64.shr_u |
| popInt64(t1) |
| popInt64(t0) |
| urshiftq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_rotl, macro() |
| # i64.rotl |
| popInt64(t1) |
| popInt64(t0) |
| lrotateq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_rotr, macro() |
| # i64.rotr |
| popInt64(t1) |
| popInt64(t0) |
| rrotateq t1, t0 |
| pushInt64(t0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ############################### |
| # 0x8b - 0x98: f32 operations # |
| ############################### |
| |
| ipintOp(_f32_abs, macro() |
| # f32.abs |
| popFloat32(ft0) |
| absf ft0, ft1 |
| pushFloat32(ft1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_neg, macro() |
| # f32.neg |
| popFloat32(ft0) |
| negf ft0, ft1 |
| pushFloat32(ft1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_ceil, macro() |
| # f32.ceil |
| popFloat32(ft0) |
| ceilf ft0, ft1 |
| pushFloat32(ft1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_floor, macro() |
| # f32.floor |
| popFloat32(ft0) |
| floorf ft0, ft1 |
| pushFloat32(ft1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_trunc, macro() |
| # f32.trunc |
| popFloat32(ft0) |
| truncatef ft0, ft1 |
| pushFloat32(ft1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_nearest, macro() |
| # f32.nearest |
| popFloat32(ft0) |
| roundf ft0, ft1 |
| pushFloat32(ft1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_sqrt, macro() |
| # f32.sqrt |
| popFloat32(ft0) |
| sqrtf ft0, ft1 |
| pushFloat32(ft1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_add, macro() |
| # f32.add |
| popFloat32(ft1) |
| popFloat32(ft0) |
| addf ft1, ft0 |
| pushFloat32(ft0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_sub, macro() |
| # f32.sub |
| popFloat32(ft1) |
| popFloat32(ft0) |
| subf ft1, ft0 |
| pushFloat32(ft0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_mul, macro() |
| # f32.mul |
| popFloat32(ft1) |
| popFloat32(ft0) |
| mulf ft1, ft0 |
| pushFloat32(ft0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_div, macro() |
| # f32.div |
| popFloat32(ft1) |
| popFloat32(ft0) |
| divf ft1, ft0 |
| pushFloat32(ft0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_min, macro() |
| # f32.min |
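# If either operand is NaN, none of the branches below are taken and the addf in the
# NaN case propagates the NaN. The equal case ORs the bit patterns so min(+0, -0) is -0.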
| popFloat32(ft1) |
| popFloat32(ft0) |
| bfeq ft0, ft1, .ipint_f32_min_equal |
| bflt ft0, ft1, .ipint_f32_min_lt |
| bfgt ft0, ft1, .ipint_f32_min_return |
| |
| .ipint_f32_min_NaN: |
| addf ft0, ft1 |
| pushFloat32(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f32_min_equal: |
| orf ft0, ft1 |
| pushFloat32(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f32_min_lt: |
| moved ft0, ft1 |
| pushFloat32(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f32_min_return: |
| pushFloat32(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_max, macro() |
| # f32.max |
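# Mirror of f32.min: NaN falls through to addf, and the equal case ANDs the bit
# patterns so max(+0, -0) is +0.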
| popFloat32(ft1) |
| popFloat32(ft0) |
| |
| bfeq ft1, ft0, .ipint_f32_max_equal |
| bflt ft1, ft0, .ipint_f32_max_lt |
| bfgt ft1, ft0, .ipint_f32_max_return |
| |
| .ipint_f32_max_NaN: |
| addf ft0, ft1 |
| pushFloat32(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f32_max_equal: |
| andf ft0, ft1 |
| pushFloat32(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f32_max_lt: |
| moved ft0, ft1 |
| pushFloat32(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f32_max_return: |
| pushFloat32(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_copysign, macro() |
| # f32.copysign |
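# Done with integer masks: take the sign bit (0x80000000) of the second operand
# and the magnitude bits (0x7fffffff) of the first, then OR them back together.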
| popFloat32(ft1) |
| popFloat32(ft0) |
| |
| ff2i ft1, t1 |
| move 0x80000000, t2 |
| andi t2, t1 |
| |
| ff2i ft0, t0 |
| move 0x7fffffff, t2 |
| andi t2, t0 |
| |
| ori t1, t0 |
| fi2f t0, ft0 |
| |
| pushFloat32(ft0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ############################### |
| # 0x99 - 0xa6: f64 operations # |
| ############################### |
| |
| ipintOp(_f64_abs, macro() |
| # f64.abs |
| popFloat64(ft0) |
| absd ft0, ft1 |
| pushFloat64(ft1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_neg, macro() |
| # f64.neg |
| popFloat64(ft0) |
| negd ft0, ft1 |
| pushFloat64(ft1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_ceil, macro() |
| # f64.ceil |
| popFloat64(ft0) |
| ceild ft0, ft1 |
| pushFloat64(ft1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_floor, macro() |
| # f64.floor |
| popFloat64(ft0) |
| floord ft0, ft1 |
| pushFloat64(ft1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_trunc, macro() |
| # f64.trunc |
| popFloat64(ft0) |
| truncated ft0, ft1 |
| pushFloat64(ft1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_nearest, macro() |
| # f64.nearest |
| popFloat64(ft0) |
| roundd ft0, ft1 |
| pushFloat64(ft1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_sqrt, macro() |
| # f64.sqrt |
| popFloat64(ft0) |
| sqrtd ft0, ft1 |
| pushFloat64(ft1) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_add, macro() |
| # f64.add |
| popFloat64(ft1) |
| popFloat64(ft0) |
| addd ft1, ft0 |
| pushFloat64(ft0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_sub, macro() |
| # f64.sub |
| popFloat64(ft1) |
| popFloat64(ft0) |
| subd ft1, ft0 |
| pushFloat64(ft0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_mul, macro() |
| # f64.mul |
| popFloat64(ft1) |
| popFloat64(ft0) |
| muld ft1, ft0 |
| pushFloat64(ft0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_div, macro() |
| # f64.div |
| popFloat64(ft1) |
| popFloat64(ft0) |
| divd ft1, ft0 |
| pushFloat64(ft0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_min, macro() |
| # f64.min |
| popFloat64(ft1) |
| popFloat64(ft0) |
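| # Same NaN and signed-zero handling as f32.min above, in double precision. |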
| bdeq ft0, ft1, .ipint_f64_min_equal |
| bdlt ft0, ft1, .ipint_f64_min_lt |
| bdgt ft0, ft1, .ipint_f64_min_return |
| |
| .ipint_f64_min_NaN: |
| addd ft0, ft1 |
| pushFloat64(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f64_min_equal: |
| ord ft0, ft1 |
| pushFloat64(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f64_min_lt: |
| moved ft0, ft1 |
| pushFloat64(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f64_min_return: |
| pushFloat64(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_max, macro() |
| # f64.max |
| popFloat64(ft1) |
| popFloat64(ft0) |
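| # Same NaN and signed-zero handling as f32.max above, in double precision. |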
| |
| bdeq ft1, ft0, .ipint_f64_max_equal |
| bdlt ft1, ft0, .ipint_f64_max_lt |
| bdgt ft1, ft0, .ipint_f64_max_return |
| |
| .ipint_f64_max_NaN: |
| addd ft0, ft1 |
| pushFloat64(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f64_max_equal: |
| andd ft0, ft1 |
| pushFloat64(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f64_max_lt: |
| moved ft0, ft1 |
| pushFloat64(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| .ipint_f64_max_return: |
| pushFloat64(ft1) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_copysign, macro() |
| # f64.copysign |
| popFloat64(ft1) |
| popFloat64(ft0) |
| |
| fd2q ft1, t1 |
| move 0x8000000000000000, t2 |
| andq t2, t1 |
| |
| fd2q ft0, t0 |
| move 0x7fffffffffffffff, t2 |
| andq t2, t0 |
| |
| orq t1, t0 |
| fq2d t0, ft0 |
| |
| pushFloat64(ft0) |
| |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ############################ |
| # 0xa7 - 0xc4: conversions # |
| ############################ |
| |
| ipintOp(_i32_wrap_i64, macro() |
| # i32.wrap_i64: the low 32 bits of the i64 are already where an i32 is read from on the stack, so do nothing |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| |
| ipintOp(_i32_trunc_f32_s, macro() |
| popFloat32(ft0) |
| move 0xcf000000, t0 # INT32_MIN (Note that INT32_MIN - 1.0 in float is the same as INT32_MIN in float). |
| fi2f t0, ft1 |
| bfltun ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| move 0x4f000000, t0 # -INT32_MIN |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| truncatef2is ft0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| end) |
| |
| ipintOp(_i32_trunc_f32_u, macro() |
| popFloat32(ft0) |
| move 0xbf800000, t0 # -1.0 |
| fi2f t0, ft1 |
| bfltequn ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| move 0x4f800000, t0 # INT32_MIN * -2.0 |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| truncatef2i ft0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| end) |
| |
| ipintOp(_i32_trunc_f64_s, macro() |
| popFloat64(ft0) |
| move 0xc1e0000000200000, t0 # INT32_MIN - 1.0 |
| fq2d t0, ft1 |
| bdltequn ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| move 0x41e0000000000000, t0 # -INT32_MIN |
| fq2d t0, ft1 |
| bdgtequn ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| truncated2is ft0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| end) |
| |
| ipintOp(_i32_trunc_f64_u, macro() |
| popFloat64(ft0) |
| move 0xbff0000000000000, t0 # -1.0 |
| fq2d t0, ft1 |
| bdltequn ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| move 0x41f0000000000000, t0 # INT32_MIN * -2.0 |
| fq2d t0, ft1 |
| bdgtequn ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| truncated2i ft0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_extend_i32_s, macro() |
| popInt32(t0) |
| sxi2q t0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_extend_i32_u, macro() |
| popInt32(t0) |
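| # Zero-extend: noti on a zeroed register yields the 0x00000000ffffffff mask, |
| # which clears the upper 32 bits of the popped value. |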
| move 0, t1 |
| noti t1 |
| andq t1, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_trunc_f32_s, macro() |
| popFloat32(ft0) |
| move 0xdf000000, t0 # INT64_MIN |
| fi2f t0, ft1 |
| bfltun ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| move 0x5f000000, t0 # -INT64_MIN |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| truncatef2qs ft0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| end) |
| |
| ipintOp(_i64_trunc_f32_u, macro() |
| popFloat32(ft0) |
| move 0xbf800000, t0 # -1.0 |
| fi2f t0, ft1 |
| bfltequn ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| move 0x5f800000, t0 # INT64_MIN * -2.0 |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| truncatef2q ft0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| end) |
| |
| ipintOp(_i64_trunc_f64_s, macro() |
| popFloat64(ft0) |
| move 0xc3e0000000000000, t0 # INT64_MIN |
| fq2d t0, ft1 |
| bdltun ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| move 0x43e0000000000000, t0 # -INT64_MIN |
| fq2d t0, ft1 |
| bdgtequn ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| truncated2qs ft0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| end) |
| |
| ipintOp(_i64_trunc_f64_u, macro() |
| popFloat64(ft0) |
| move 0xbff0000000000000, t0 # -1.0 |
| fq2d t0, ft1 |
| bdltequn ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| move 0x43f0000000000000, t0 # INT64_MIN * -2.0 |
| fq2d t0, ft1 |
| bdgtequn ft0, ft1, _ipint_throw_OutOfBoundsTrunc |
| |
| truncated2q ft0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| |
| end) |
| |
| ipintOp(_f32_convert_i32_s, macro() |
| popInt32(t0) |
| andq 0xffffffff, t0 |
| ci2fs t0, ft0 |
| pushFloat32(ft0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_convert_i32_u, macro() |
| popInt32(t0) |
| andq 0xffffffff, t0 |
| ci2f t0, ft0 |
| pushFloat32(ft0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_convert_i64_s, macro() |
| popInt64(t0) |
| cq2fs t0, ft0 |
| pushFloat32(ft0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_convert_i64_u, macro() |
| popInt64(t0) |
| if X86_64 |
| cq2f t0, t1, ft0 |
| else |
| cq2f t0, ft0 |
| end |
| pushFloat32(ft0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_demote_f64, macro() |
| popFloat64(ft0) |
| cd2f ft0, ft0 |
| pushFloat32(ft0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_convert_i32_s, macro() |
| popInt32(t0) |
| andq 0xffffffff, t0 |
| ci2ds t0, ft0 |
| pushFloat64(ft0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_convert_i32_u, macro() |
| popInt32(t0) |
| andq 0xffffffff, t0 |
| ci2d t0, ft0 |
| pushFloat64(ft0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_convert_i64_s, macro() |
| popInt64(t0) |
| cq2ds t0, ft0 |
| pushFloat64(ft0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_convert_i64_u, macro() |
| popInt64(t0) |
| if X86_64 |
| cq2d t0, t1, ft0 |
| else |
| cq2d t0, ft0 |
| end |
| pushFloat64(ft0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_promote_f32, macro() |
| popFloat32(ft0) |
| cf2d ft0, ft0 |
| pushFloat64(ft0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_reinterpret_f32, macro() |
| popFloat32(ft0) |
| ff2i ft0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_reinterpret_f64, macro() |
| popFloat64(ft0) |
| fd2q ft0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f32_reinterpret_i32, macro() |
| # nop because of stack layout |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_f64_reinterpret_i64, macro() |
| # nop because of stack layout |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_extend8_s, macro() |
| # i32.extend8_s |
| popInt32(t0) |
| sxb2i t0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i32_extend16_s, macro() |
| # i32.extend16_s |
| popInt32(t0) |
| sxh2i t0, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_extend8_s, macro() |
| # i64.extend8_s |
| popInt64(t0) |
| sxb2q t0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_extend16_s, macro() |
| # i64.extend16_s |
| popInt64(t0) |
| sxh2q t0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_extend32_s, macro() |
| # i64.extend32_s |
| popInt64(t0) |
| sxi2q t0, t0 |
| pushInt64(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(0xc5) |
| reservedOpcode(0xc6) |
| reservedOpcode(0xc7) |
| reservedOpcode(0xc8) |
| reservedOpcode(0xc9) |
| reservedOpcode(0xca) |
| reservedOpcode(0xcb) |
| reservedOpcode(0xcc) |
| reservedOpcode(0xcd) |
| reservedOpcode(0xce) |
| reservedOpcode(0xcf) |
| |
| ##################### |
| # 0xd0 - 0xd6: refs # |
| ##################### |
| |
| ipintOp(_ref_null_t, macro() |
| # Push null value, skip heap type LEB128 in bytecode |
| move ValueNull, t0 |
| pushQuad(t0) |
| leap 1[PC], PC |
| skipLEB128(PC, t0) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_ref_is_null, macro() |
| popQuad(t0) |
| cqeq t0, ValueNull, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_ref_func, macro() |
| loadi IPInt::RefFuncMetadata::index[MC], a1 |
| operationCall(macro() cCall2(_ipint_extern_ref_func) end) |
| pushQuad(r0) |
| loadb IPInt::RefFuncMetadata::instructionLength[MC], t0 |
| advancePC(t0) |
| advanceMC(constexpr (sizeof(IPInt::RefFuncMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_ref_eq, macro() |
| popQuad(t0) |
| popQuad(t1) |
| cqeq t0, t1, t0 |
| pushInt32(t0) |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_ref_as_non_null, macro() |
| loadq [sp], t0 |
| bqeq t0, ValueNull, _ipint_throw_NullRefAsNonNull |
| advancePC(1) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_br_on_null, macro() |
| validateOpcodeConfig(t0) |
| loadq [sp], t0 |
| bqneq t0, ValueNull, .br_on_null_not_null |
| |
| # pop the null |
| addq StackValueSize, sp |
| jmp _ipint_br |
| .br_on_null_not_null: |
| loadb IPInt::BranchMetadata::instructionLength[MC], t0 |
| advanceMC(constexpr (sizeof(IPInt::BranchMetadata))) |
| advancePCByReg(t0) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_br_on_non_null, macro() |
| validateOpcodeConfig(t0) |
| loadq [sp], t0 |
| bqneq t0, ValueNull, _ipint_br |
| addq StackValueSize, sp |
| loadb IPInt::BranchMetadata::instructionLength[MC], t0 |
| advanceMC(constexpr (sizeof(IPInt::BranchMetadata))) |
| advancePCByReg(t0) |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(0xd7) |
| reservedOpcode(0xd8) |
| reservedOpcode(0xd9) |
| reservedOpcode(0xda) |
| reservedOpcode(0xdb) |
| reservedOpcode(0xdc) |
| reservedOpcode(0xdd) |
| reservedOpcode(0xde) |
| reservedOpcode(0xdf) |
| reservedOpcode(0xe0) |
| reservedOpcode(0xe1) |
| reservedOpcode(0xe2) |
| reservedOpcode(0xe3) |
| reservedOpcode(0xe4) |
| reservedOpcode(0xe5) |
| reservedOpcode(0xe6) |
| reservedOpcode(0xe7) |
| reservedOpcode(0xe8) |
| reservedOpcode(0xe9) |
| reservedOpcode(0xea) |
| reservedOpcode(0xeb) |
| reservedOpcode(0xec) |
| reservedOpcode(0xed) |
| reservedOpcode(0xee) |
| reservedOpcode(0xef) |
| reservedOpcode(0xf0) |
| reservedOpcode(0xf1) |
| reservedOpcode(0xf2) |
| reservedOpcode(0xf3) |
| reservedOpcode(0xf4) |
| reservedOpcode(0xf5) |
| reservedOpcode(0xf6) |
| reservedOpcode(0xf7) |
| reservedOpcode(0xf8) |
| reservedOpcode(0xf9) |
| reservedOpcode(0xfa) |
| |
| # If the following four instructions are given more descriptive names, |
| # the changes should be matched in IPINT_INSTRUCTIONS in Tools/lldb/debug_ipint.py |
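| |
| # Each prefix below decodes its LEB128 sub-opcode, bounds-checks it against the |
| # highest defined instruction, and tail-jumps through the matching secondary |
| # dispatch table loaded from the opcode config. |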
| |
| ipintOp(_gc_prefix, macro() |
| leap 1[PC], t4 |
| decodeLEBVarUInt(t0, t4, t1, t2) |
| # Security guarantee: always less than 31 (0x00 -> 0x1e) |
| biaeq t0, 0x1f, .ipint_gc_nonexistent |
| leap _os_script_config_storage, t1 |
| loadp JSC::LLInt::OpcodeConfig::ipint_gc_dispatch_base[t1], t1 |
| if ARM64 or ARM64E |
| addlshiftp t1, t0, (constexpr (WTF::fastLog2(JSC::IPInt::alignIPInt))), t0 |
| jmp t0 |
| elsif X86_64 |
| lshiftq (constexpr (WTF::fastLog2(JSC::IPInt::alignIPInt))), t0 |
| addq t1, t0 |
| jmp t0 |
| end |
| |
| .ipint_gc_nonexistent: |
| break |
| end) |
| |
| ipintOp(_conversion_prefix, macro() |
| leap 1[PC], t4 |
| decodeLEBVarUInt(t0, t4, t1, t2) |
| # Security guarantee: always less than 23 (0x00 -> 0x16) |
| biaeq t0, 0x17, .ipint_conversion_nonexistent |
| leap _os_script_config_storage, t1 |
| loadp JSC::LLInt::OpcodeConfig::ipint_conversion_dispatch_base[t1], t1 |
| if ARM64 or ARM64E |
| addlshiftp t1, t0, (constexpr (WTF::fastLog2(JSC::IPInt::alignIPInt))), t0 |
| jmp t0 |
| elsif X86_64 |
| lshiftq (constexpr (WTF::fastLog2(JSC::IPInt::alignIPInt))), t0 |
| addq t1, t0 |
| jmp t0 |
| end |
| |
| .ipint_conversion_nonexistent: |
| break |
| end) |
| |
| ipintOp(_simd_prefix, macro() |
| leap 1[PC], t4 |
| decodeLEBVarUInt(t0, t4, t1, t2) |
| # Security guarantee: always less than 276 (0x00 -> 0x113, including relaxed SIMD) |
| biaeq t0, 0x114, .ipint_simd_nonexistent |
| leap _os_script_config_storage, t1 |
| loadp JSC::LLInt::OpcodeConfig::ipint_simd_dispatch_base[t1], t1 |
| if ARM64 or ARM64E |
| addlshiftp t1, t0, (constexpr (WTF::fastLog2(JSC::IPInt::alignIPInt))), t0 |
| jmp t0 |
| elsif X86_64 |
| lshiftq (constexpr (WTF::fastLog2(JSC::IPInt::alignIPInt))), t0 |
| addq t1, t0 |
| jmp t0 |
| end |
| |
| .ipint_simd_nonexistent: |
| break |
| end) |
| |
| ipintOp(_atomic_prefix, macro() |
| leap 1[PC], t4 |
| decodeLEBVarUInt(t0, t4, t1, t2) |
| # Security guarantee: always less than 79 (0x00 -> 0x4e) |
| biaeq t0, 0x4f, .ipint_atomic_nonexistent |
| leap _os_script_config_storage, t1 |
| loadp JSC::LLInt::OpcodeConfig::ipint_atomic_dispatch_base[t1], t1 |
| if ARM64 or ARM64E |
| addlshiftp t1, t0, (constexpr (WTF::fastLog2(JSC::IPInt::alignAtomicIPInt))), t0 |
| jmp t0 |
| elsif X86_64 |
| lshiftq (constexpr (WTF::fastLog2(JSC::IPInt::alignAtomicIPInt))), t0 |
| addq t1, t0 |
| jmp t0 |
| end |
| |
| .ipint_atomic_nonexistent: |
| break |
| end) |
| |
| reservedOpcode(0xff) |
| break |
| |
| ##################### |
| ## GC instructions ## |
| ##################### |
| |
| ipintOp(_struct_new, macro() |
| loadi IPInt::StructNewMetadata::type[MC], a1 # type |
| move sp, a2 |
| operationCallMayThrow(macro() cCall3(_ipint_extern_struct_new) end) |
| loadh IPInt::StructNewMetadata::params[MC], t1 # number of parameters popped |
| mulq StackValueSize, t1 |
| addq t1, sp |
| pushQuad(r0) |
| loadb IPInt::StructNewMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::StructNewMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_struct_new_default, macro() |
| loadi IPInt::StructNewDefaultMetadata::type[MC], a1 # type |
| operationCallMayThrow(macro() cCall2(_ipint_extern_struct_new_default) end) |
| pushQuad(r0) |
| loadb IPInt::StructNewDefaultMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::StructNewDefaultMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_struct_get, macro() |
| popQuad(a1) # object |
| loadi IPInt::StructGetSetMetadata::fieldIndex[MC], a2 # field index |
| subp StackValueSize, sp # allocate space for result |
| move sp, a3 # result location |
| operationCallMayThrow(macro() cCall4(_ipint_extern_struct_get) end) |
| |
| loadb IPInt::StructGetSetMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::StructGetSetMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_struct_get_s, macro() |
| popQuad(a1) # object |
| loadi IPInt::StructGetSetMetadata::fieldIndex[MC], a2 # field index |
| subp StackValueSize, sp # allocate space for result |
| move sp, a3 # result location |
| operationCallMayThrow(macro() cCall4(_ipint_extern_struct_get_s) end) |
| |
| loadb IPInt::StructGetSetMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::StructGetSetMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_struct_get_u, macro() |
| popQuad(a1) # object |
| loadi IPInt::StructGetSetMetadata::fieldIndex[MC], a2 # field index |
| subp StackValueSize, sp # allocate space for result |
| move sp, a3 # result location |
| operationCallMayThrow(macro() cCall4(_ipint_extern_struct_get) end) |
| |
| loadb IPInt::StructGetSetMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::StructGetSetMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_struct_set, macro() |
| loadp StackValueSize[sp], a1 # object |
| loadi IPInt::StructGetSetMetadata::fieldIndex[MC], a2 # field index |
| move sp, a3 |
| operationCallMayThrow(macro() cCall4(_ipint_extern_struct_set) end) |
| |
| loadb IPInt::StructGetSetMetadata::length[MC], t0 |
| addp 2 * StackValueSize, sp |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::StructGetSetMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_array_new, macro() |
| loadi IPInt::ArrayNewMetadata::type[MC], a1 # type |
| popInt32(a2) # length |
| move sp, a3 # pointer to default value |
| operationCallMayThrow(macro() cCall4(_ipint_extern_array_new) end) |
| addp StackValueSize, sp # pop default value |
| |
| pushQuad(r0) |
| |
| loadb IPInt::ArrayNewMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::ArrayNewMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_array_new_default, macro() |
| loadi IPInt::ArrayNewMetadata::type[MC], a1 # type |
| popInt32(a2) # length |
| operationCallMayThrow(macro() cCall3(_ipint_extern_array_new_default) end) |
| |
| pushQuad(r0) |
| |
| loadb IPInt::ArrayNewMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::ArrayNewMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_array_new_fixed, macro() |
| loadi IPInt::ArrayNewFixedMetadata::type[MC], a1 # type |
| loadi IPInt::ArrayNewFixedMetadata::arraySize[MC], a2 # array length |
| move sp, a3 # arguments |
| operationCallMayThrow(macro() cCall4(_ipint_extern_array_new_fixed) end) |
| |
| # pop all the arguments |
| loadi IPInt::ArrayNewFixedMetadata::arraySize[MC], t3 # array length |
| muli StackValueSize, t3 |
| addp t3, sp |
| |
| pushQuad(r0) |
| |
| loadb IPInt::ArrayNewFixedMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::ArrayNewFixedMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_array_new_data, macro() |
| move MC, a1 # metadata |
| popInt32(a3) # size |
| popInt32(a2) # offset |
| operationCallMayThrow(macro() cCall4(_ipint_extern_array_new_data) end) |
| |
| pushQuad(r0) |
| |
| loadb IPInt::ArrayNewDataMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::ArrayNewDataMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_array_new_elem, macro() |
| move MC, a1 # metadata |
| popInt32(a3) # size |
| popInt32(a2) # offset |
| operationCallMayThrow(macro() cCall4(_ipint_extern_array_new_elem) end) |
| |
| pushQuad(r0) |
| |
| loadb IPInt::ArrayNewElemMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::ArrayNewElemMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_array_get, macro() |
| loadi IPInt::ArrayGetSetMetadata::type[MC], a1 # type |
| move sp, a2 # all args on stack, result will be returned on stack |
| operationCallMayThrow(macro() cCall3(_ipint_extern_array_get) end) |
| |
| addp StackValueSize, sp # 2 args - 1 result |
| |
| loadb IPInt::ArrayGetSetMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::ArrayGetSetMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_array_get_s, macro() |
| loadi IPInt::ArrayGetSetMetadata::type[MC], a1 # type |
| move sp, a2 # all args on stack, result will be returned on stack |
| operationCallMayThrow(macro() cCall3(_ipint_extern_array_get_s) end) |
| |
| addp StackValueSize, sp # 2 args - 1 result |
| |
| loadb IPInt::ArrayGetSetMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::ArrayGetSetMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_array_get_u, macro() |
| loadi IPInt::ArrayGetSetMetadata::type[MC], a1 # type |
| move sp, a2 # all args on stack, result will be returned on stack |
| operationCallMayThrow(macro() cCall3(_ipint_extern_array_get) end) |
| |
| addp StackValueSize, sp # 2 args - 1 result |
| |
| loadb IPInt::ArrayGetSetMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::ArrayGetSetMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_array_set, macro() |
| loadi IPInt::ArrayGetSetMetadata::type[MC], a1 # type |
| move sp, a2 # stack pointer with all the arguments |
| operationCallMayThrow(macro() cCall3(_ipint_extern_array_set) end) |
| |
| addq StackValueSize * 3, sp |
| |
| loadb IPInt::ArrayGetSetMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::ArrayGetSetMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_array_len, macro() |
| popQuad(t0) # array into t0 |
| bqeq t0, ValueNull, _ipint_throw_NullAccess |
| loadi JSWebAssemblyArray::m_size[t0], t0 |
| pushInt32(t0) |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_array_fill, macro() |
| move sp, a1 |
| operationCallMayThrow(macro() cCall2(_ipint_extern_array_fill) end) |
| |
| addp StackValueSize * 4, sp |
| |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_array_copy, macro() |
| move sp, a1 |
| operationCallMayThrow(macro() cCall2(_ipint_extern_array_copy) end) |
| |
| addp StackValueSize * 5, sp |
| |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_array_init_data, macro() |
| loadi IPInt::ArrayInitDataMetadata::dataSegmentIndex[MC], a1 |
| move sp, a2 |
| operationCallMayThrow(macro() cCall3(_ipint_extern_array_init_data) end) |
| |
| addp StackValueSize * 4, sp |
| |
| loadb IPInt::ArrayInitDataMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::ArrayInitDataMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_array_init_elem, macro() |
| loadi IPInt::ArrayInitElemMetadata::elemSegmentIndex[MC], a1 |
| move sp, a2 |
| operationCallMayThrow(macro() cCall3(_ipint_extern_array_init_elem) end) |
| |
| addp StackValueSize * 4, sp |
| |
| loadb IPInt::ArrayInitElemMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::ArrayInitElemMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_ref_test, macro() |
| loadi IPInt::RefTestCastMetadata::toHeapType[MC], a1 |
| move 0, a2 # allowNull |
| popQuad(a3) |
| operationCall(macro() cCall3(_ipint_extern_ref_test) end) |
| |
| pushInt32(r0) |
| |
| loadb IPInt::RefTestCastMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::RefTestCastMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_ref_test_nullable, macro() |
| loadi IPInt::RefTestCastMetadata::toHeapType[MC], a1 |
| move 1, a2 # allowNull |
| popQuad(a3) |
| operationCall(macro() cCall3(_ipint_extern_ref_test) end) |
| |
| pushInt32(r0) |
| |
| loadb IPInt::RefTestCastMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::RefTestCastMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_ref_cast, macro() |
| loadi IPInt::RefTestCastMetadata::toHeapType[MC], a1 |
| move 0, a2 # allowNull |
| popQuad(a3) |
| operationCallMayThrow(macro() cCall3(_ipint_extern_ref_cast) end) |
| |
| pushInt32(r0) |
| |
| loadb IPInt::RefTestCastMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::RefTestCastMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_ref_cast_nullable, macro() |
| loadi IPInt::RefTestCastMetadata::toHeapType[MC], a1 |
| move 1, a2 # allowNull |
| popQuad(a3) |
| operationCallMayThrow(macro() cCall3(_ipint_extern_ref_cast) end) |
| |
| pushInt32(r0) |
| |
| loadb IPInt::RefTestCastMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::RefTestCastMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_br_on_cast, macro() |
| validateOpcodeConfig(a1) |
| loadi IPInt::RefTestCastMetadata::toHeapType[MC], a1 |
| # fb 18 FLAGS |
| loadb 2[PC], a2 |
| rshifti 1, a2 # bit 1 = null2 |
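| # The shift leaves the target-type nullability flag in bit 0, which becomes the |
| # allowNull argument to the ref.test helper. |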
| loadq [sp], a3 |
| operationCall(macro() cCall3(_ipint_extern_ref_test) end) |
| |
| advanceMC(constexpr (sizeof(IPInt::RefTestCastMetadata))) |
| |
| bineq r0, 0, _ipint_br |
| loadb IPInt::BranchMetadata::instructionLength[MC], t0 |
| advanceMC(constexpr (sizeof(IPInt::BranchMetadata))) |
| advancePCByReg(t0) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_br_on_cast_fail, macro() |
| validateOpcodeConfig(a1) |
| loadi IPInt::RefTestCastMetadata::toHeapType[MC], a1 |
| loadb 2[PC], a2 |
| # fb 19 FLAGS |
| rshifti 1, a2 # bit 1 = null2 |
| loadq [sp], a3 |
| operationCall(macro() cCall3(_ipint_extern_ref_test) end) |
| |
| advanceMC(constexpr (sizeof(IPInt::RefTestCastMetadata))) |
| |
| bieq r0, 0, _ipint_br |
| loadb IPInt::BranchMetadata::instructionLength[MC], t0 |
| advanceMC(constexpr (sizeof(IPInt::BranchMetadata))) |
| advancePCByReg(t0) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_any_convert_extern, macro() |
| popQuad(a1) |
| operationCall(macro() cCall2(_ipint_extern_any_convert_extern) end) |
| pushQuad(r0) |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_extern_convert_any, macro() |
| # do nothing |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_ref_i31, macro() |
| popInt32(t0) |
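| # Shifting left then arithmetic-right by 1 sign-extends bit 30 through bit 31, |
| # wrapping the value into the signed 31-bit range before tagging it as a number. |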
| lshifti 0x1, t0 |
| rshifti 0x1, t0 |
| orq TagNumber, t0 |
| pushQuad(t0) |
| |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i31_get_s, macro() |
| popQuad(t0) |
| bqeq t0, ValueNull, _ipint_throw_NullI31Get |
| pushInt32(t0) |
| |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i31_get_u, macro() |
| popQuad(t0) |
| bqeq t0, ValueNull, _ipint_throw_NullI31Get |
| andq 0x7fffffff, t0 |
| pushInt32(t0) |
| |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ############################# |
| ## Conversion instructions ## |
| ############################# |
| |
| ipintOp(_i32_trunc_sat_f32_s, macro() |
| popFloat32(ft0) |
| |
| move 0xcf000000, t0 # INT32_MIN (Note that INT32_MIN - 1.0 in float is the same as INT32_MIN in float). |
| fi2f t0, ft1 |
| bfltun ft0, ft1, .ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMinOrNaN |
| |
| move 0x4f000000, t0 # -INT32_MIN |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, .ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMax |
| |
| truncatef2is ft0, t0 |
| pushInt32(t0) |
| |
| .end: |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| |
| .ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMinOrNaN: |
| bfeq ft0, ft0, .ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMin |
| move 0, t0 |
| pushInt32(t0) |
| jmp .end |
| |
| .ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMax: |
| move (constexpr INT32_MAX), t0 |
| pushInt32(t0) |
| jmp .end |
| |
| .ipint_i32_trunc_sat_f32_s_outOfBoundsTruncSatMin: |
| move (constexpr INT32_MIN), t0 |
| pushInt32(t0) |
| jmp .end |
| end) |
| |
| ipintOp(_i32_trunc_sat_f32_u, macro() |
| popFloat32(ft0) |
| |
| move 0xbf800000, t0 # -1.0 |
| fi2f t0, ft1 |
| bfltequn ft0, ft1, .ipint_i32_trunc_sat_f32_u_outOfBoundsTruncSatMin |
| |
| move 0x4f800000, t0 # INT32_MIN * -2.0 |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, .ipint_i32_trunc_sat_f32_u_outOfBoundsTruncSatMax |
| |
| truncatef2i ft0, t0 |
| pushInt32(t0) |
| |
| .end: |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| |
| .ipint_i32_trunc_sat_f32_u_outOfBoundsTruncSatMin: |
| move 0, t0 |
| pushInt32(t0) |
| jmp .end |
| |
| .ipint_i32_trunc_sat_f32_u_outOfBoundsTruncSatMax: |
| move (constexpr UINT32_MAX), t0 |
| pushInt32(t0) |
| jmp .end |
| end) |
| |
| ipintOp(_i32_trunc_sat_f64_s, macro() |
| popFloat64(ft0) |
| |
| move 0xc1e0000000200000, t0 # INT32_MIN - 1.0 |
| fq2d t0, ft1 |
| bdltequn ft0, ft1, .ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMinOrNaN |
| |
| move 0x41e0000000000000, t0 # -INT32_MIN |
| fq2d t0, ft1 |
| bdgtequn ft0, ft1, .ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMax |
| |
| truncated2is ft0, t0 |
| pushInt32(t0) |
| |
| .end: |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| |
| .ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMinOrNaN: |
| bdeq ft0, ft0, .ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMin |
| move 0, t0 |
| pushInt32(t0) |
| jmp .end |
| |
| .ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMax: |
| move (constexpr INT32_MAX), t0 |
| pushInt32(t0) |
| jmp .end |
| |
| .ipint_i32_trunc_sat_f64_s_outOfBoundsTruncSatMin: |
| move (constexpr INT32_MIN), t0 |
| pushInt32(t0) |
| jmp .end |
| end) |
| |
| ipintOp(_i32_trunc_sat_f64_u, macro() |
| popFloat64(ft0) |
| |
| move 0xbff0000000000000, t0 # -1.0 |
| fq2d t0, ft1 |
| bdltequn ft0, ft1, .ipint_i32_trunc_sat_f64_u_outOfBoundsTruncSatMin |
| |
| move 0x41f0000000000000, t0 # INT32_MIN * -2.0 |
| fq2d t0, ft1 |
| bdgtequn ft0, ft1, .ipint_i32_trunc_sat_f64_u_outOfBoundsTruncSatMax |
| |
| truncated2i ft0, t0 |
| pushInt32(t0) |
| |
| .end: |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| |
| .ipint_i32_trunc_sat_f64_u_outOfBoundsTruncSatMin: |
| move 0, t0 |
| pushInt32(t0) |
| jmp .end |
| |
| .ipint_i32_trunc_sat_f64_u_outOfBoundsTruncSatMax: |
| move (constexpr UINT32_MAX), t0 |
| pushInt32(t0) |
| jmp .end |
| end) |
| |
| ipintOp(_i64_trunc_sat_f32_s, macro() |
| popFloat32(ft0) |
| |
| move 0xdf000000, t0 # INT64_MIN |
| fi2f t0, ft1 |
| bfltun ft0, ft1, .ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMinOrNaN |
| |
| move 0x5f000000, t0 # -INT64_MIN |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, .ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMax |
| |
| truncatef2qs ft0, t0 |
| pushInt64(t0) |
| |
| .end: |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| |
| .ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMinOrNaN: |
| bfeq ft0, ft0, .ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMin |
| move 0, t0 |
| pushInt64(t0) |
| jmp .end |
| |
| .ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMax: |
| move (constexpr INT64_MAX), t0 |
| pushInt64(t0) |
| jmp .end |
| |
| .ipint_i64_trunc_sat_f32_s_outOfBoundsTruncSatMin: |
| move (constexpr INT64_MIN), t0 |
| pushInt64(t0) |
| jmp .end |
| end) |
| |
| ipintOp(_i64_trunc_sat_f32_u, macro() |
| popFloat32(ft0) |
| |
| move 0xbf800000, t0 # -1.0 |
| fi2f t0, ft1 |
| bfltequn ft0, ft1, .ipint_i64_trunc_sat_f32_u_outOfBoundsTruncSatMin |
| |
| move 0x5f800000, t0 # INT64_MIN * -2.0 |
| fi2f t0, ft1 |
| bfgtequn ft0, ft1, .ipint_i64_trunc_sat_f32_u_outOfBoundsTruncSatMax |
| |
| truncatef2q ft0, t0 |
| pushInt64(t0) |
| |
| .end: |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| |
| .ipint_i64_trunc_sat_f32_u_outOfBoundsTruncSatMin: |
| move 0, t0 |
| pushInt64(t0) |
| jmp .end |
| |
| .ipint_i64_trunc_sat_f32_u_outOfBoundsTruncSatMax: |
| move (constexpr UINT64_MAX), t0 |
| pushInt64(t0) |
| jmp .end |
| end) |
| |
| ipintOp(_i64_trunc_sat_f64_s, macro() |
| popFloat64(ft0) |
| move 0xc3e0000000000000, t0 # INT64_MIN |
| fq2d t0, ft1 |
| bdltun ft0, ft1, .ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMinOrNaN |
| |
| move 0x43e0000000000000, t0 # -INT64_MIN |
| fq2d t0, ft1 |
| bdgtequn ft0, ft1, .ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMax |
| |
| truncated2qs ft0, t0 |
| pushInt64(t0) |
| |
| .end: |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| |
| .ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMinOrNaN: |
| bdeq ft0, ft0, .ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMin |
| move 0, t0 |
| pushInt64(t0) |
| jmp .end |
| |
| .ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMax: |
| move (constexpr INT64_MAX), t0 |
| pushInt64(t0) |
| jmp .end |
| |
| .ipint_i64_trunc_sat_f64_s_outOfBoundsTruncSatMin: |
| move (constexpr INT64_MIN), t0 |
| pushInt64(t0) |
| jmp .end |
| end) |
| |
| ipintOp(_i64_trunc_sat_f64_u, macro() |
| popFloat64(ft0) |
| |
| move 0xbff0000000000000, t0 # -1.0 |
| fq2d t0, ft1 |
| bdltequn ft0, ft1, .ipint_i64_trunc_sat_f64_u_outOfBoundsTruncSatMin |
| |
| move 0x43f0000000000000, t0 # INT64_MIN * -2.0 |
| fq2d t0, ft1 |
| bdgtequn ft0, ft1, .ipint_i64_trunc_sat_f64_u_outOfBoundsTruncSatMax |
| |
| truncated2q ft0, t0 |
| pushInt64(t0) |
| |
| .end: |
| loadb IPInt::InstructionLengthMetadata::length[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::InstructionLengthMetadata))) |
| nextIPIntInstruction() |
| |
| .ipint_i64_trunc_sat_f64_u_outOfBoundsTruncSatMin: |
| move 0, t0 |
| pushInt64(t0) |
| jmp .end |
| |
| .ipint_i64_trunc_sat_f64_u_outOfBoundsTruncSatMax: |
| move (constexpr UINT64_MAX), t0 |
| pushInt64(t0) |
| jmp .end |
| end) |
| |
| ipintOp(_memory_init, macro() |
| # memory.init |
| loadb IPInt::MemoryInitMetadata::memoryIndex[MC], a3 |
| move sp, a2 |
| loadi IPInt::MemoryInitMetadata::dataIndex[MC], a1 |
| operationCallMayThrow(macro() cCall4(_ipint_extern_memory_init) end) |
| addq 3 * StackValueSize, sp |
| loadb IPInt::MemoryInitMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::MemoryInitMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_data_drop, macro() |
| # data.drop |
| loadi IPInt::DataAccessMetadata::index[MC], a1 |
| operationCall(macro() cCall2(_ipint_extern_data_drop) end) |
| loadb IPInt::DataAccessMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::DataAccessMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_memory_copy, macro() |
| # memory.copy |
| loadb IPInt::MemoryCopyMetadata::dstMemoryIndex[MC], a1 |
| pushQuad(a1) |
| loadb IPInt::MemoryCopyMetadata::srcMemoryIndex[MC], a1 |
| pushQuad(a1) |
| move sp, a1 |
| # starting at top of stack: src memory index, dst memory index, n, s, d |
| operationCallMayThrow(macro() cCall2(_ipint_extern_memory_copy) end) |
| addq 5 * StackValueSize, sp |
| |
| loadb IPInt::MemoryCopyMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::MemoryCopyMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_memory_fill, macro() |
| # memory.fill |
| loadb IPInt::MemoryFillMetadata::memoryIndex[MC], a1 |
| pushQuad(a1) |
| move sp, a1 |
| # starting at top of stack: memory index, n, val, d |
| operationCallMayThrow(macro() cCall2(_ipint_extern_memory_fill) end) |
| addq 4 * StackValueSize, sp |
| |
| loadb IPInt::MemoryFillMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::MemoryFillMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_table_init, macro() |
| # table.init |
| move sp, a1 |
| leap [MC], a2 # IPInt::TableInitMetadata |
| operationCallMayThrow(macro() cCall3(_ipint_extern_table_init) end) |
| addp 3 * StackValueSize, sp |
| loadb IPInt::TableInitMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::TableInitMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_elem_drop, macro() |
| # elem.drop |
| loadi IPInt::ElemDropMetadata::index[MC], a1 |
| operationCall(macro() cCall2(_ipint_extern_elem_drop) end) |
| loadb IPInt::ElemDropMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::ElemDropMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_table_copy, macro() |
| # table.copy |
| move sp, a1 |
| move MC, a2 |
| operationCallMayThrow(macro() cCall3(_ipint_extern_table_copy) end) |
| addp 3 * StackValueSize, sp |
| loadb IPInt::TableCopyMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::TableCopyMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_table_grow, macro() |
| # table.grow |
| move sp, a1 |
| move MC, a2 # IPInt::TableGrowMetadata |
| operationCall(macro() cCall3(_ipint_extern_table_grow) end) |
| addp StackValueSize * 2, sp |
| pushQuad(r0) |
| loadb IPInt::TableGrowMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::TableGrowMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_table_size, macro() |
| # table.size |
| loadi IPInt::TableAccessMetadata::index[MC], a1 |
| operationCall(macro() cCall2(_ipint_extern_table_size) end) |
| pushQuad(r0) |
| loadb IPInt::TableAccessMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::TableAccessMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_table_fill, macro() |
| # table.fill |
| move sp, a1 |
| move MC, a2 |
| operationCallMayThrow(macro() cCall3(_ipint_extern_table_fill) end) |
| addp 3 * StackValueSize, sp |
| loadb IPInt::TableFillMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::TableFillMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(misc_0x12) |
| break |
| |
| ################################## |
| ## Wide Arithmetic Instructions ## |
| ################################## |
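| # These are prefixed opcodes: t4 already points just past the decoded sub-opcode, |
| # so it is moved into PC directly instead of using advancePC. |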
| |
| ipintOp(_i64_add128, macro() |
| # i64.add128: [lhsLo lhsHi rhsLo rhsHi] -> [resultLo resultHi] |
| # Stack layout (top first): sp[0]=rhsHi, sp[1]=rhsLo, sp[2]=lhsHi, sp[3]=lhsLo |
| popQuad(t3) # rhsHi |
| popQuad(t2) # rhsLo |
| popQuad(t1) # lhsHi |
| popQuad(t0) # lhsLo |
| if ARM64 or ARM64E |
| addqs t0, t2, t0 # resultLo = lhsLo + rhsLo, sets carry flag |
| adcq t1, t3, t1 # resultHi = lhsHi + rhsHi + carry flag |
| elsif X86_64 |
| addq t2, t0 # resultLo = lhsLo + rhsLo, sets carry flag |
| adcq t3, t1 # resultHi = lhsHi + rhsHi + carry flag |
| end |
| pushQuad(t0) |
| pushQuad(t1) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_sub128, macro() |
| # i64.sub128: [lhsLo lhsHi rhsLo rhsHi] -> [resultLo resultHi] |
| # Stack layout (top first): sp[0]=rhsHi, sp[1]=rhsLo, sp[2]=lhsHi, sp[3]=lhsLo |
| popQuad(t3) # rhsHi |
| popQuad(t2) # rhsLo |
| popQuad(t1) # lhsHi |
| popQuad(t0) # lhsLo |
| if ARM64 or ARM64E |
| subqs t0, t2, t0 # resultLo = lhsLo - rhsLo, sets carry flag (borrow) |
| sbcq t1, t3, t1 # resultHi = lhsHi - rhsHi - carry flag |
| elsif X86_64 |
| subq t2, t0 # resultLo = lhsLo - rhsLo, sets carry flag (borrow) |
| sbcq t3, t1 # resultHi = lhsHi - rhsHi - carry flag |
| end |
| pushQuad(t0) |
| pushQuad(t1) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_mul_wide_s, macro() |
| # i64.mul_wide_s: [lhs rhs] -> [resultLo resultHi] |
| # Stack layout (top first): sp[0]=rhs, sp[1]=lhs |
| popQuad(t1) # rhs |
| popQuad(t0) # lhs |
| if ARM64 or ARM64E |
| smulhq t0, t1, t2 # resultHi = smulh(lhs, rhs) - must precede mulq |
| mulq t1, t0 # resultLo = lhs * rhs |
| elsif X86_64 |
| # t0 = rax |
| # t2 = rdx |
| smulhq t1 # imulq %rsi: rdx:rax = rax * rsi -> t0=resultLo, t2=resultHi |
| end |
| pushQuad(t0) |
| pushQuad(t2) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_i64_mul_wide_u, macro() |
| # i64.mul_wide_u: [lhs rhs] -> [resultLo resultHi] |
| # Stack layout (top first): sp[0]=rhs, sp[1]=lhs |
| popQuad(t1) # rhs |
| popQuad(t0) # lhs |
| if ARM64 or ARM64E |
| umulhq t0, t1, t2 # resultHi = umulh(lhs, rhs) - must precede mulq |
| mulq t1, t0 # resultLo = lhs * rhs |
| elsif X86_64 |
| # t0 = rax |
| # t2 = rdx |
| umulhq t1 # mulq %rsi: rdx:rax = rax * rsi -> t0=resultLo, t2=resultHi |
| end |
| pushQuad(t0) |
| pushQuad(t2) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ####################### |
| ## SIMD Instructions ## |
| ####################### |
| |
| const ImmLaneIdxOffset = 0 # Offset from t4 (points past the decoded SIMD opcode) |
| const ImmLaneIdx16Mask = 0xf |
| const ImmLaneIdx8Mask = 0x7 |
| const ImmLaneIdx4Mask = 0x3 |
| const ImmLaneIdx2Mask = 0x1 |
| |
| # Platform-specific SIMD load macros (shared between fast and slow paths). |
| # Input: t0 = host pointer (rax on x86_64). Output: v0 = loaded vector. |
| # Clobbers: ft0 (ARM64), t1 (splat ops on ARM64). |
| |
| macro simdLoad8x8s() |
| if ARM64 or ARM64E |
| loadd [t0], ft0 |
| emit "sxtl v16.8h, v0.8b" |
| elsif X86_64 |
| emit "pmovsxbw (%rax), %xmm0" |
| else |
| break |
| end |
| end |
| |
| macro simdLoad8x8u() |
| if ARM64 or ARM64E |
| loadd [t0], ft0 |
| emit "uxtl v16.8h, v0.8b" |
| elsif X86_64 |
| emit "pmovzxbw (%rax), %xmm0" |
| else |
| break |
| end |
| end |
| |
| macro simdLoad16x4s() |
| if ARM64 or ARM64E |
| loadd [t0], ft0 |
| emit "sxtl v16.4s, v0.4h" |
| elsif X86_64 |
| emit "pmovsxwd (%rax), %xmm0" |
| else |
| break |
| end |
| end |
| |
| macro simdLoad16x4u() |
| if ARM64 or ARM64E |
| loadd [t0], ft0 |
| emit "uxtl v16.4s, v0.4h" |
| elsif X86_64 |
| emit "pmovzxwd (%rax), %xmm0" |
| else |
| break |
| end |
| end |
| |
| macro simdLoad32x2s() |
| if ARM64 or ARM64E |
| loadd [t0], ft0 |
| emit "sxtl v16.2d, v0.2s" |
| elsif X86_64 |
| emit "pmovsxdq (%rax), %xmm0" |
| else |
| break |
| end |
| end |
| |
| macro simdLoad32x2u() |
| if ARM64 or ARM64E |
| loadd [t0], ft0 |
| emit "uxtl v16.2d, v0.2s" |
| elsif X86_64 |
| emit "pmovzxdq (%rax), %xmm0" |
| else |
| break |
| end |
| end |
| |
| macro simdLoadSplat8() |
| if ARM64 or ARM64E |
| loadb [t0], t1 |
| emit "dup v16.16b, w1" |
| elsif X86_64 |
| emit "vpinsrb $0, (%rax), %xmm0, %xmm0" |
| emit "vpxor %xmm1, %xmm1, %xmm1" |
| emit "vpshufb %xmm1, %xmm0, %xmm0" |
| else |
| break |
| end |
| end |
| |
| macro simdLoadSplat16() |
| if ARM64 or ARM64E |
| loadh [t0], t1 |
| emit "dup v16.8h, w1" |
| elsif X86_64 |
| emit "vpinsrw $0, (%rax), %xmm0, %xmm0" |
| emit "vpshuflw $0, %xmm0, %xmm0" |
| emit "vpunpcklqdq %xmm0, %xmm0, %xmm0" |
| else |
| break |
| end |
| end |
| |
| macro simdLoadSplat32() |
| if ARM64 or ARM64E |
| loadi [t0], t1 |
| emit "dup v16.4s, w1" |
| elsif X86_64 |
| emit "vbroadcastss (%rax), %xmm0" |
| else |
| break |
| end |
| end |
| |
| macro simdLoadSplat64() |
| if ARM64 or ARM64E |
| loadq [t0], t1 |
| emit "dup v16.2d, x1" |
| elsif X86_64 |
| emit "vmovddup (%rax), %xmm0" |
| else |
| break |
| end |
| end |
| |
| # 0xFD 0x00 - 0xFD 0x0B: memory |
| |
| ipintOp(_simd_v128_load_mem, macro() |
| # v128.load |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 16, t1, t2, .simd_v128_load_slow_path) |
| loadv [t0], v0 |
| pushVec(v0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_load_8x8s_mem, macro() |
| # v128.load8x8_s |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .simd_v128_load_8x8s_slow_path) |
| simdLoad8x8s() |
| pushVec(v0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_load_8x8u_mem, macro() |
| # v128.load8x8_u |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .simd_v128_load_8x8u_slow_path) |
| simdLoad8x8u() |
| pushVec(v0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_load_16x4s_mem, macro() |
| # v128.load16x4_s |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .simd_v128_load_16x4s_slow_path) |
| simdLoad16x4s() |
| pushVec(v0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_load_16x4u_mem, macro() |
| # v128.load16x4_u |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .simd_v128_load_16x4u_slow_path) |
| simdLoad16x4u() |
| pushVec(v0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_load_32x2s_mem, macro() |
| # v128.load32x2_s |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .simd_v128_load_32x2s_slow_path) |
| simdLoad32x2s() |
| pushVec(v0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_load_32x2u_mem, macro() |
| # v128.load32x2_u |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .simd_v128_load_32x2u_slow_path) |
| simdLoad32x2u() |
| pushVec(v0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_load8_splat_mem, macro() |
| # v128.load8_splat |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .simd_v128_load8_splat_slow_path) |
| simdLoadSplat8() |
| pushVec(v0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_load16_splat_mem, macro() |
| # v128.load16_splat |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .simd_v128_load16_splat_slow_path) |
| simdLoadSplat16() |
| pushVec(v0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_load32_splat_mem, macro() |
| # v128.load32_splat |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .simd_v128_load32_splat_slow_path) |
| simdLoadSplat32() |
| pushVec(v0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_load64_splat_mem, macro() |
| # v128.load64_splat |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .simd_v128_load64_splat_slow_path) |
| simdLoadSplat64() |
| pushVec(v0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_store_mem, macro() |
| # v128.store |
| popVec(v0) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 16, t1, t2, .simd_v128_store_slow_path) |
| storev v0, [t0] |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x0C: v128.const |
| ipintOp(_simd_v128_const, macro() |
| # v128.const |
| loadv [t4], v0 |
| pushVec(v0) |
| leap 16[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x0D - 0xFD 0x14: splat (+ shuffle/swizzle) |
| |
| ipintOp(_simd_i8x16_shuffle, macro() |
| # i8x16.shuffle - shuffle bytes from two vectors using 16 immediate indices |
| if ARM64 or ARM64E |
| popVec(v1) |
| popVec(v0) |
| loadv [t4], v2 |
| emit "tbl v16.16b, {v16.16b, v17.16b}, v18.16b" |
| pushVec(v0) |
| else |
| # X86_64 has no two-source variable byte shuffle instruction, so emulate it with a scalar loop |
| subp V128ISize, sp # Allocate temp result |
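| # Stack layout from here: [sp] = 16-byte temp result, 16[sp] = second source |
| # vector (lane indices 16-31), 32[sp] = first source vector (lane indices 0-15). |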
| |
| # Loop through 16 output positions |
| move 0, t0 |
| |
| .shuffleLoop: |
| loadb [t4, t0, 1], t1 |
| |
| bigt t1, 31, .outOfBounds |
| bigt t1, 15, .useRightVector |
| |
| .useLeftVector: |
| loadb 32[sp, t1], t2 |
| jmp .storeByte |
| |
| .useRightVector: |
| subq t1, 16, t3 |
| loadb 16[sp, t3], t2 |
| jmp .storeByte |
| |
| .outOfBounds: |
| move 0, t2 |
| |
| .storeByte: |
| storeb t2, [sp, t0] # Store to temp result |
| addq 1, t0 # Increment loop counter |
| bilt t0, 16, .shuffleLoop |
| |
| # Copy temp result to final result location |
| loadq [sp], t0 |
| loadq 8[sp], t1 |
| storeq t0, 32[sp] |
| storeq t1, 40[sp] |
| |
| addp 2 * V128ISize, sp # Pop temp result and right vector |
| end |
| |
| leap 16[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_swizzle, macro() |
| # i8x16.swizzle - swizzle bytes from first vector using indices from second vector |
| popVec(v1) |
| popVec(v0) |
| |
| if ARM64 or ARM64E |
| emit "tbl v16.16b, {v16.16b}, v17.16b" |
| elsif X86_64 |
| # vpshufb only checks bit 7 for out-of-bounds (returns 0 if bit 7 is set) |
| # WebAssembly requires returning 0 for any index >= 16 |
| # Add 0x70 with unsigned saturation, so any index > 15 ends up with bit 7 set |
| # (15 + 0x70 = 0x7F; 16 and above reach at least 0x80, saturating at 0xFF) |
| # See BBQJIT::fixupOutOfBoundsIndicesForSwizzle |
| emit "movabsq $0x7070707070707070, %rax" |
| emit "vmovq %rax, %xmm2" |
| emit "vpunpcklqdq %xmm2, %xmm2, %xmm2" # xmm2 = [0x70, 0x70, ..., 0x70] (16 bytes) |
| emit "vpaddusb %xmm2, %xmm1, %xmm1" # Saturating add to set bit 7 for indices > 15 |
| emit "vpshufb %xmm1, %xmm0, %xmm0" # Now vpshufb will return 0 for out-of-bounds |
| else |
| break # Not implemented |
| end |
| |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_splat, macro() |
| # i8x16.splat - splat i32 value to all 16 8-bit lanes |
| popInt32(t0) |
| |
| if ARM64 or ARM64E |
| emit "dup v16.16b, w0" |
| elsif X86_64 |
| # t0 is eax on X86_64, move to xmm0 and broadcast to all 16 bytes |
| emit "vmovd %eax, %xmm0" |
| emit "vpinsrb $1, %eax, %xmm0, %xmm0" |
| emit "vpshuflw $0, %xmm0, %xmm0" |
| emit "vpunpcklqdq %xmm0, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_splat, macro() |
| # i16x8.splat - splat i32 value to all 8 16-bit lanes |
| popInt32(t0) |
| |
| if ARM64 or ARM64E |
| emit "dup v16.8h, w0" |
| elsif X86_64 |
| # t0 is eax on X86_64, move to xmm0 and broadcast to all 8 words |
| emit "vmovd %eax, %xmm0" |
| emit "vpshuflw $0, %xmm0, %xmm0" |
| emit "vpunpcklqdq %xmm0, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_splat, macro() |
| # i32x4.splat - splat i32 value to all 4 32-bit lanes |
| popInt32(t0) |
| |
| if ARM64 or ARM64E |
| emit "dup v16.4s, w0" |
| elsif X86_64 |
| # t0 is eax on X86_64, move to xmm0 and broadcast to all 4 dwords |
| emit "vmovd %eax, %xmm0" |
| emit "vshufps $0, %xmm0, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_splat, macro() |
| # i64x2.splat - splat i64 value to all 2 64-bit lanes |
| popInt64(t0) |
| |
| if ARM64 or ARM64E |
| emit "dup v16.2d, x0" |
| elsif X86_64 |
| # t0 is rax on X86_64 |
| emit "vmovq %rax, %xmm0" |
| emit "vmovddup %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_splat, macro() |
| # f32x4.splat - splat f32 value to all 4 32-bit float lanes |
| popFloat32(ft0) |
| |
| if ARM64 or ARM64E |
| emit "dup v16.4s, v0.s[0]" |
| elsif X86_64 |
| # ft0 is xmm0 on X86_64, broadcast to all 4 float lanes |
| emit "vshufps $0x00, %xmm0, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_splat, macro() |
| # f64x2.splat - splat f64 value to all 2 64-bit float lanes |
| popFloat64(ft0) |
| |
| if ARM64 or ARM64E |
| emit "dup v16.2d, v0.d[0]" |
| elsif X86_64 |
| # ft0 is xmm0 on X86_64, duplicate lower 64-bit to both lanes |
| emit "vmovddup %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x15 - 0xFD 0x22: extract and replace lanes |
| ipintOp(_simd_i8x16_extract_lane_s, macro() |
| # i8x16.extract_lane_s (lane) |
| loadb ImmLaneIdxOffset[t4], t0 |
| andi ImmLaneIdx16Mask, t0 |
| loadbsi [sp, t0], t0 |
| addp V128ISize, sp |
| pushInt32(t0) |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_extract_lane_u, macro() |
| # i8x16.extract_lane_u (lane) |
| loadb ImmLaneIdxOffset[t4], t0 |
| andi ImmLaneIdx16Mask, t0 |
| loadb [sp, t0], t0 |
| addp V128ISize, sp |
| pushInt32(t0) |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_replace_lane, macro() |
| # i8x16.replace_lane (lane) |
| loadb ImmLaneIdxOffset[t4], t0 |
| andi ImmLaneIdx16Mask, t0 |
| popInt32(t1) # value to replace with |
| storeb t1, [sp, t0] # replace the byte at lane index |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_extract_lane_s, macro() |
| # i16x8.extract_lane_s (lane) |
| loadb ImmLaneIdxOffset[t4], t0 |
| andi ImmLaneIdx8Mask, t0 |
| loadhsi [sp, t0, 2], t0 |
| addp V128ISize, sp |
| pushInt32(t0) |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_extract_lane_u, macro() |
| # i16x8.extract_lane_u (lane) |
| loadb ImmLaneIdxOffset[t4], t0 |
| andi ImmLaneIdx8Mask, t0 |
| loadh [sp, t0, 2], t0 |
| addp V128ISize, sp |
| pushInt32(t0) |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_replace_lane, macro() |
| # i16x8.replace_lane (lane) |
| loadb ImmLaneIdxOffset[t4], t0 |
| andi ImmLaneIdx8Mask, t0 |
| popInt32(t1) # value to replace with |
| storeh t1, [sp, t0, 2] # replace the 16-bit value at lane index |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_extract_lane, macro() |
| # i32x4.extract_lane (lane) |
| loadb ImmLaneIdxOffset[t4], t0 |
| andi ImmLaneIdx4Mask, t0 |
| loadi [sp, t0, 4], t0 |
| addp V128ISize, sp |
| pushInt32(t0) |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_replace_lane, macro() |
| # i32x4.replace_lane (lane) |
| loadb ImmLaneIdxOffset[t4], t0 |
| andi ImmLaneIdx4Mask, t0 |
| popInt32(t1) # value to replace with |
| storei t1, [sp, t0, 4] # replace the 32-bit value at lane index |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_extract_lane, macro() |
| # i64x2.extract_lane (lane) |
| loadb ImmLaneIdxOffset[t4], t0 |
| andi ImmLaneIdx2Mask, t0 |
| loadq [sp, t0, 8], t0 |
| addp V128ISize, sp |
| pushInt64(t0) |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_replace_lane, macro() |
| # i64x2.replace_lane (lane) |
| loadb ImmLaneIdxOffset[t4], t0 |
| andi ImmLaneIdx2Mask, t0 |
| popInt64(t1) # value to replace with |
| storeq t1, [sp, t0, 8] # replace the 64-bit value at lane index |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_extract_lane, macro() |
| # f32x4.extract_lane (lane) |
| loadb ImmLaneIdxOffset[t4], t0 |
| andi ImmLaneIdx4Mask, t0 |
| loadf [sp, t0, 4], ft0 |
| addp V128ISize, sp |
| pushFloat32(ft0) |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_replace_lane, macro() |
| # f32x4.replace_lane (lane) |
| loadb ImmLaneIdxOffset[t4], t0 |
| andi ImmLaneIdx4Mask, t0 |
| popFloat32(ft0) # value to replace with |
| storef ft0, [sp, t0, 4] # replace the 32-bit float at lane index |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_extract_lane, macro() |
| # f64x2.extract_lane (lane) |
| loadb ImmLaneIdxOffset[t4], t0 |
| andi ImmLaneIdx2Mask, t0 |
| loadd [sp, t0, 8], ft0 |
| addp V128ISize, sp |
| pushFloat64(ft0) |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_replace_lane, macro() |
| # f64x2.replace_lane (lane) |
| loadb ImmLaneIdxOffset[t4], t0 |
| andi ImmLaneIdx2Mask, t0 |
| popFloat64(ft0) # value to replace with |
| stored ft0, [sp, t0, 8] # replace the 64-bit float at lane index |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x23 - 0xFD 0x2C: i8x16 operations |
| ipintOp(_simd_i8x16_eq, macro() |
| # i8x16.eq - compare 16 8-bit integers for equality |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmeq v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpcmpeqb %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_ne, macro() |
| # i8x16.ne - compare 16 8-bit integers for inequality |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # Compare 16 bytes for equality, then invert the result |
| emit "cmeq v16.16b, v16.16b, v17.16b" |
| emit "mvn v16.16b, v16.16b" |
| elsif X86_64 |
| # Compare for equality, then invert the result |
| emit "vpcmpeqb %xmm1, %xmm0, %xmm0" |
| emit "vpcmpeqb %xmm2, %xmm2, %xmm2" # Set all bits to 1 |
| emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_lt_s, macro() |
| # i8x16.lt_s - compare 16 8-bit signed integers for less than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # cmgt v17, v16 gives us v1 > v0, which is equivalent to v0 < v1 |
| emit "cmgt v16.16b, v17.16b, v16.16b" |
| elsif X86_64 |
| # vpcmpgtb xmm1, xmm0 gives us xmm1 > xmm0, which is equivalent to xmm0 < xmm1 |
| emit "vpcmpgtb %xmm0, %xmm1, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_lt_u, macro() |
| # i8x16.lt_u - compare 16 8-bit unsigned integers for less than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # cmhi v17, v16 gives us v1 > v0 (unsigned), which is equivalent to v0 < v1 |
| emit "cmhi v16.16b, v17.16b, v16.16b" |
| elsif X86_64 |
        # For unsigned comparison, we use a min/max approach since there's no direct unsigned byte compare
| emit "vpminub %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2 |
| emit "vpcmpeqb %xmm0, %xmm2, %xmm2" # xmm0 == min ? (xmm0 <= xmm1) |
| emit "vpcmpeqb %xmm1, %xmm0, %xmm0" # xmm0 == xmm1 ? |
| emit "vpandn %xmm2, %xmm0, %xmm0" # (xmm0 <= xmm1) && (xmm0 != xmm1) = (xmm0 < xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_gt_s, macro() |
| # i8x16.gt_s - compare 16 8-bit signed integers for greater than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmgt v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpcmpgtb %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_gt_u, macro() |
| # i8x16.gt_u - compare 16 8-bit unsigned integers for greater than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmhi v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| # For unsigned comparison: xmm0 > xmm1 iff min(xmm0, xmm1) == xmm1 && xmm0 != xmm1 |
| emit "vpminub %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2 |
| emit "vpcmpeqb %xmm1, %xmm2, %xmm2" # xmm1 == min ? (xmm1 <= xmm0) |
| emit "vpcmpeqb %xmm1, %xmm0, %xmm0" # xmm0 == xmm1 ? |
| emit "vpandn %xmm2, %xmm0, %xmm0" # (xmm1 <= xmm0) && (xmm0 != xmm1) = (xmm0 > xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_le_s, macro() |
| # i8x16.le_s - compare 16 8-bit signed integers for less than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # cmge v17, v16 gives us v1 >= v0, which is equivalent to v0 <= v1 |
| emit "cmge v16.16b, v17.16b, v16.16b" |
| elsif X86_64 |
| # xmm0 <= xmm1 iff !(xmm0 > xmm1) |
| emit "vpcmpgtb %xmm1, %xmm0, %xmm0" # xmm0 > xmm1 |
| emit "vpcmpeqb %xmm2, %xmm2, %xmm2" # Set all bits to 1 |
| emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm0 > xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_le_u, macro() |
| # i8x16.le_u - compare 16 8-bit unsigned integers for less than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # cmhs v17, v16 gives us v1 >= v0 (unsigned), which is equivalent to v0 <= v1 |
| emit "cmhs v16.16b, v17.16b, v16.16b" |
| elsif X86_64 |
| # xmm0 <= xmm1 iff min(xmm0, xmm1) == xmm0 |
| emit "vpminub %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2 |
| emit "vpcmpeqb %xmm0, %xmm2, %xmm0" # xmm0 == min ? (xmm0 <= xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_ge_s, macro() |
| # i8x16.ge_s - compare 16 8-bit signed integers for greater than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmge v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| # xmm0 >= xmm1 iff !(xmm0 < xmm1) iff !(xmm1 > xmm0) |
| emit "vpcmpgtb %xmm0, %xmm1, %xmm0" # xmm1 > xmm0 |
| emit "vpcmpeqb %xmm2, %xmm2, %xmm2" # Set all bits to 1 |
| emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm1 > xmm0) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_ge_u, macro() |
| # i8x16.ge_u - compare 16 8-bit unsigned integers for greater than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmhs v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| # xmm0 >= xmm1 iff min(xmm0, xmm1) == xmm1 |
| emit "vpminub %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2 |
| emit "vpcmpeqb %xmm1, %xmm2, %xmm0" # xmm1 == min ? (xmm1 <= xmm0) = (xmm0 >= xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
# 0xFD 0x2D - 0xFD 0x36: i16x8 operations
| |
| ipintOp(_simd_i16x8_eq, macro() |
| # i16x8.eq - compare 8 16-bit integers for equality |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmeq v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| emit "vpcmpeqw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_ne, macro() |
| # i16x8.ne - compare 8 16-bit integers for inequality |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmeq v16.8h, v16.8h, v17.8h" |
| emit "mvn v16.16b, v16.16b" |
| elsif X86_64 |
| # Compare for equality, then invert the result |
| emit "vpcmpeqw %xmm1, %xmm0, %xmm0" |
| emit "vpcmpeqw %xmm2, %xmm2, %xmm2" # Set all bits to 1 |
| emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_lt_s, macro() |
| # i16x8.lt_s - compare 8 16-bit signed integers for less than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # cmgt v17, v16 gives us v1 > v0, which is equivalent to v0 < v1 |
| emit "cmgt v16.8h, v17.8h, v16.8h" |
| elsif X86_64 |
| # vpcmpgtw xmm1, xmm0 gives us xmm1 > xmm0, which is equivalent to xmm0 < xmm1 |
| emit "vpcmpgtw %xmm0, %xmm1, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_lt_u, macro() |
| # i16x8.lt_u - compare 8 16-bit unsigned integers for less than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # cmhi v17, v16 gives us v1 > v0 (unsigned), which is equivalent to v0 < v1 |
| emit "cmhi v16.8h, v17.8h, v16.8h" |
| elsif X86_64 |
        # For unsigned comparison, we use a min/max approach since there's no direct unsigned word compare
| emit "vpminuw %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2 |
| emit "vpcmpeqw %xmm0, %xmm2, %xmm2" # xmm0 == min ? (xmm0 <= xmm1) |
| emit "vpcmpeqw %xmm1, %xmm0, %xmm0" # xmm0 == xmm1 ? |
| emit "vpandn %xmm2, %xmm0, %xmm0" # (xmm0 <= xmm1) && (xmm0 != xmm1) = (xmm0 < xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_gt_s, macro() |
| # i16x8.gt_s - compare 8 16-bit signed integers for greater than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmgt v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| emit "vpcmpgtw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_gt_u, macro() |
| # i16x8.gt_u - compare 8 16-bit unsigned integers for greater than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmhi v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| # For unsigned comparison: xmm0 > xmm1 iff min(xmm0, xmm1) == xmm1 && xmm0 != xmm1 |
| emit "vpminuw %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2 |
| emit "vpcmpeqw %xmm1, %xmm2, %xmm2" # xmm1 == min ? (xmm1 <= xmm0) |
| emit "vpcmpeqw %xmm1, %xmm0, %xmm0" # xmm0 == xmm1 ? |
| emit "vpandn %xmm2, %xmm0, %xmm0" # (xmm1 <= xmm0) && (xmm0 != xmm1) = (xmm0 > xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_le_s, macro() |
| # i16x8.le_s - compare 8 16-bit signed integers for less than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # cmge v17, v16 gives us v1 >= v0, which is equivalent to v0 <= v1 |
| emit "cmge v16.8h, v17.8h, v16.8h" |
| elsif X86_64 |
| # xmm0 <= xmm1 iff !(xmm0 > xmm1) |
| emit "vpcmpgtw %xmm1, %xmm0, %xmm0" # xmm0 > xmm1 |
| emit "vpcmpeqw %xmm2, %xmm2, %xmm2" # Set all bits to 1 |
| emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm0 > xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_le_u, macro() |
| # i16x8.le_u - compare 8 16-bit unsigned integers for less than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # cmhs v17, v16 gives us v1 >= v0 (unsigned), which is equivalent to v0 <= v1 |
| emit "cmhs v16.8h, v17.8h, v16.8h" |
| elsif X86_64 |
| # xmm0 <= xmm1 iff min(xmm0, xmm1) == xmm0 |
| emit "vpminuw %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2 |
| emit "vpcmpeqw %xmm0, %xmm2, %xmm0" # xmm0 == min ? (xmm0 <= xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_ge_s, macro() |
| # i16x8.ge_s - compare 8 16-bit signed integers for greater than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmge v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| # xmm0 >= xmm1 iff !(xmm0 < xmm1) iff !(xmm1 > xmm0) |
| emit "vpcmpgtw %xmm0, %xmm1, %xmm0" # xmm1 > xmm0 |
| emit "vpcmpeqw %xmm2, %xmm2, %xmm2" # Set all bits to 1 |
| emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm1 > xmm0) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_ge_u, macro() |
| # i16x8.ge_u - compare 8 16-bit unsigned integers for greater than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmhs v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| # xmm0 >= xmm1 iff min(xmm0, xmm1) == xmm1 |
| emit "vpminuw %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2 |
| emit "vpcmpeqw %xmm1, %xmm2, %xmm0" # xmm1 == min ? (xmm1 <= xmm0) = (xmm0 >= xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x37 - 0xFD 0x40: i32x4 operations |
| ipintOp(_simd_i32x4_eq, macro() |
| # i32x4.eq - compare 4 32-bit integers for equality |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmeq v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vpcmpeqd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_ne, macro() |
| # i32x4.ne - compare 4 32-bit integers for inequality |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmeq v16.4s, v16.4s, v17.4s" |
| emit "mvn v16.16b, v16.16b" |
| elsif X86_64 |
| # Compare for equality, then invert the result |
| emit "vpcmpeqd %xmm1, %xmm0, %xmm0" |
| emit "vpcmpeqd %xmm2, %xmm2, %xmm2" # Set all bits to 1 |
| emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_lt_s, macro() |
| # i32x4.lt_s - compare 4 32-bit signed integers for less than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # cmgt v17, v16 gives us v1 > v0, which is equivalent to v0 < v1 |
| emit "cmgt v16.4s, v17.4s, v16.4s" |
| elsif X86_64 |
| # vpcmpgtd xmm1, xmm0 gives us xmm1 > xmm0, which is equivalent to xmm0 < xmm1 |
| emit "vpcmpgtd %xmm0, %xmm1, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_lt_u, macro() |
| # i32x4.lt_u - compare 4 32-bit unsigned integers for less than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # cmhi v17, v16 gives us v1 > v0 (unsigned), which is equivalent to v0 < v1 |
| emit "cmhi v16.4s, v17.4s, v16.4s" |
| elsif X86_64 |
        # For unsigned comparison, we use a min/max approach since there's no direct unsigned dword compare
| emit "vpminud %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2 |
| emit "vpcmpeqd %xmm0, %xmm2, %xmm2" # xmm0 == min ? (xmm0 <= xmm1) |
| emit "vpcmpeqd %xmm1, %xmm0, %xmm0" # xmm0 == xmm1 ? |
| emit "vpandn %xmm2, %xmm0, %xmm0" # (xmm0 <= xmm1) && (xmm0 != xmm1) = (xmm0 < xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_gt_s, macro() |
| # i32x4.gt_s - compare 4 32-bit signed integers for greater than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmgt v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vpcmpgtd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_gt_u, macro() |
| # i32x4.gt_u - compare 4 32-bit unsigned integers for greater than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmhi v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| # For unsigned comparison: xmm0 > xmm1 iff min(xmm0, xmm1) == xmm1 && xmm0 != xmm1 |
| emit "vpminud %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2 |
| emit "vpcmpeqd %xmm1, %xmm2, %xmm2" # xmm1 == min ? (xmm1 <= xmm0) |
| emit "vpcmpeqd %xmm1, %xmm0, %xmm0" # xmm0 == xmm1 ? |
| emit "vpandn %xmm2, %xmm0, %xmm0" # (xmm1 <= xmm0) && (xmm0 != xmm1) = (xmm0 > xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_le_s, macro() |
| # i32x4.le_s - compare 4 32-bit signed integers for less than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # cmge v17, v16 gives us v1 >= v0, which is equivalent to v0 <= v1 |
| emit "cmge v16.4s, v17.4s, v16.4s" |
| elsif X86_64 |
| # xmm0 <= xmm1 iff !(xmm0 > xmm1) |
| emit "vpcmpgtd %xmm1, %xmm0, %xmm0" # xmm0 > xmm1 |
| emit "vpcmpeqd %xmm2, %xmm2, %xmm2" # Set all bits to 1 |
| emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm0 > xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_le_u, macro() |
| # i32x4.le_u - compare 4 32-bit unsigned integers for less than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # cmhs v17, v16 gives us v1 >= v0 (unsigned), which is equivalent to v0 <= v1 |
| emit "cmhs v16.4s, v17.4s, v16.4s" |
| elsif X86_64 |
| # xmm0 <= xmm1 iff min(xmm0, xmm1) == xmm0 |
| emit "vpminud %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2 |
| emit "vpcmpeqd %xmm0, %xmm2, %xmm0" # xmm0 == min ? (xmm0 <= xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_ge_s, macro() |
| # i32x4.ge_s - compare 4 32-bit signed integers for greater than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmge v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| # xmm0 >= xmm1 iff !(xmm0 < xmm1) iff !(xmm1 > xmm0) |
| emit "vpcmpgtd %xmm0, %xmm1, %xmm0" # xmm1 > xmm0 |
| emit "vpcmpeqd %xmm2, %xmm2, %xmm2" # Set all bits to 1 |
| emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm1 > xmm0) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_ge_u, macro() |
| # i32x4.ge_u - compare 4 32-bit unsigned integers for greater than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmhs v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| # xmm0 >= xmm1 iff min(xmm0, xmm1) == xmm1 |
| emit "vpminud %xmm1, %xmm0, %xmm2" # min(xmm0, xmm1) -> xmm2 |
| emit "vpcmpeqd %xmm1, %xmm2, %xmm0" # xmm1 == min ? (xmm1 <= xmm0) = (xmm0 >= xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x41 - 0xFD 0x46: f32x4 operations |
| ipintOp(_simd_f32x4_eq, macro() |
| # f32x4.eq - compare 4 32-bit floats for equality |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fcmeq v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vcmpeqps %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_ne, macro() |
| # f32x4.ne - compare 4 32-bit floats for inequality |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fcmeq v16.4s, v16.4s, v17.4s" |
| emit "mvn v16.16b, v16.16b" |
| elsif X86_64 |
| emit "vcmpneqps %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_lt, macro() |
| # f32x4.lt - compare 4 32-bit floats for less than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # fcmgt v17, v16 gives us v1 > v0, which is equivalent to v0 < v1 |
| emit "fcmgt v16.4s, v17.4s, v16.4s" |
| elsif X86_64 |
| emit "vcmpltps %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_gt, macro() |
| # f32x4.gt - compare 4 32-bit floats for greater than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fcmgt v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vcmpgtps %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_le, macro() |
| # f32x4.le - compare 4 32-bit floats for less than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # fcmge v17, v16 gives us v1 >= v0, which is equivalent to v0 <= v1 |
| emit "fcmge v16.4s, v17.4s, v16.4s" |
| elsif X86_64 |
| emit "vcmpleps %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_ge, macro() |
| # f32x4.ge - compare 4 32-bit floats for greater than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fcmge v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vcmpgeps %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x47 - 0xFD 0x4c: f64x2 operations |
| ipintOp(_simd_f64x2_eq, macro() |
| # f64x2.eq - compare 2 64-bit floats for equality |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fcmeq v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| emit "vcmpeqpd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_ne, macro() |
| # f64x2.ne - compare 2 64-bit floats for inequality |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fcmeq v16.2d, v16.2d, v17.2d" |
| emit "mvn v16.16b, v16.16b" |
| elsif X86_64 |
| emit "vcmpneqpd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_lt, macro() |
| # f64x2.lt - compare 2 64-bit floats for less than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # fcmgt v17, v16 gives us v1 > v0, which is equivalent to v0 < v1 |
| emit "fcmgt v16.2d, v17.2d, v16.2d" |
| elsif X86_64 |
| emit "vcmpltpd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_gt, macro() |
| # f64x2.gt - compare 2 64-bit floats for greater than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fcmgt v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| emit "vcmpgtpd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_le, macro() |
| # f64x2.le - compare 2 64-bit floats for less than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # fcmge v17, v16 gives us v1 >= v0, which is equivalent to v0 <= v1 |
| emit "fcmge v16.2d, v17.2d, v16.2d" |
| elsif X86_64 |
| emit "vcmplepd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_ge, macro() |
| # f64x2.ge - compare 2 64-bit floats for greater than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fcmge v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| emit "vcmpgepd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x4D - 0xFD 0x53: v128 operations |
| |
| ipintOp(_simd_v128_not, macro() |
| # v128.not - bitwise NOT of 128-bit vector |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "mvn v16.16b, v16.16b" |
| elsif X86_64 |
| emit "vpcmpeqb %xmm1, %xmm1, %xmm1" # Set all bits to 1 |
| emit "vpxor %xmm1, %xmm0, %xmm0" # Invert all bits |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_and, macro() |
| # v128.and - bitwise AND of two 128-bit vectors |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "and v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpand %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_andnot, macro() |
| # v128.andnot - bitwise AND NOT of two 128-bit vectors (v0 & ~v1) |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "bic v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpandn %xmm0, %xmm1, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_or, macro() |
| # v128.or - bitwise OR of two 128-bit vectors |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "orr v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpor %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_xor, macro() |
| # v128.xor - bitwise XOR of two 128-bit vectors |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "eor v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpxor %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_bitselect, macro() |
| # v128.bitselect - bitwise select: (a & c) | (b & ~c) |
| popVec(v2) # selector c |
| popVec(v1) # b |
| popVec(v0) # a |
| if ARM64 or ARM64E |
| # Use BSL (Bit Select) instruction: bsl vd, vn, vm |
| # BSL performs: vd = (vd & vn) | (~vd & vm) |
| # We need: result = (a & c) | (b & ~c) |
        # The selector c already lives in v18 (v2), so BSL v18 with a (v16) and b (v17)
| emit "bsl v18.16b, v16.16b, v17.16b" # (c & a) | (~c & b) |
| emit "mov v16.16b, v18.16b" # result -> v0 |
| elsif X86_64 |
| emit "vpand %xmm2, %xmm0, %xmm3" # xmm3 = a & c |
| emit "vpandn %xmm1, %xmm2, %xmm2" # xmm2 = b & ~c (vpandn does ~src1 & src2) |
| emit "vpor %xmm2, %xmm3, %xmm0" # xmm0 = (a & c) | (b & ~c) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_any_true, macro() |
| # v128.any_true - return 1 if any bit is set, 0 otherwise |
| popVec(v0) |
| if ARM64 or ARM64E |
| # Use UMAXV to find maximum across all bytes |
| emit "umaxv b16, v16.16b" |
| # Extract the result to general purpose register |
| emit "fmov w0, s16" |
| # Convert non-zero to 1 |
| emit "cmp w0, #0" |
| emit "cset w0, ne" |
| elsif X86_64 |
| emit "vptest %xmm0, %xmm0" |
| emit "setne %al" # Set AL to 1 if ZF=0 (any bit set), 0 if ZF=1 (all zero) |
| emit "movzbl %al, %eax" # Zero-extend AL to EAX |
| else |
| break # Not implemented |
| end |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x54 - 0xFD 0x5D: v128 load/store lane |
# For load_lane: stack is [..., addr, v128] with the v128 on top. Pop the v128, then the
# address; apply the memarg, load the scalar from memory, read the lane index, push the
# v128 back, and overwrite the selected lane in place.
# For store_lane: same stack layout. Pop the v128, then the address; apply the memarg,
# read the lane index, extract that lane's value from the v128, and store it to memory.
| # Lane index is the last byte of the instruction, right after the memarg. |
| |
| ipintOp(_simd_v128_load8_lane_mem, macro() |
| popVec(v0) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .simd_v128_load8_lane_slow_path) |
| loadb [t0], t0 |
| loadb 2[t4], t1 |
| andi ImmLaneIdx16Mask, t1 |
| pushVec(v0) |
| storeb t0, [sp, t1] |
| leap 3[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_load16_lane_mem, macro() |
| popVec(v0) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .simd_v128_load16_lane_slow_path) |
| loadh [t0], t0 |
| loadb 2[t4], t1 |
| andi ImmLaneIdx8Mask, t1 |
| pushVec(v0) |
| storeh t0, [sp, t1, 2] |
| leap 3[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_load32_lane_mem, macro() |
| popVec(v0) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .simd_v128_load32_lane_slow_path) |
| loadi [t0], t0 |
| loadb 2[t4], t1 |
| andi ImmLaneIdx4Mask, t1 |
| pushVec(v0) |
| storei t0, [sp, t1, 4] |
| leap 3[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_load64_lane_mem, macro() |
| popVec(v0) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .simd_v128_load64_lane_slow_path) |
| loadq [t0], t0 |
| loadb 2[t4], t1 |
| andi ImmLaneIdx2Mask, t1 |
| pushVec(v0) |
| storeq t0, [sp, t1, 8] |
| leap 3[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_store8_lane_mem, macro() |
| # Stack: [addr, v128] with v128 on top. Pop both, parse memarg, extract lane, store. |
| popVec(v0) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .simd_v128_store8_lane_slow_path) |
| loadb 2[t4], t1 |
| andi ImmLaneIdx16Mask, t1 |
| # Extract byte from v0 via temp push |
| pushVec(v0) |
| loadb [sp, t1], t1 |
| addp V128ISize, sp |
| storeb t1, [t0] |
| leap 3[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_store16_lane_mem, macro() |
| popVec(v0) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .simd_v128_store16_lane_slow_path) |
| loadb 2[t4], t1 |
| andi ImmLaneIdx8Mask, t1 |
| pushVec(v0) |
| loadh [sp, t1, 2], t1 |
| addp V128ISize, sp |
| storeh t1, [t0] |
| leap 3[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_store32_lane_mem, macro() |
| popVec(v0) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .simd_v128_store32_lane_slow_path) |
| loadb 2[t4], t1 |
| andi ImmLaneIdx4Mask, t1 |
| pushVec(v0) |
| loadi [sp, t1, 4], t1 |
| addp V128ISize, sp |
| storei t1, [t0] |
| leap 3[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_store64_lane_mem, macro() |
| popVec(v0) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .simd_v128_store64_lane_slow_path) |
| loadb 2[t4], t1 |
| andi ImmLaneIdx2Mask, t1 |
| pushVec(v0) |
| loadq [sp, t1, 8], t1 |
| addp V128ISize, sp |
| storeq t1, [t0] |
| leap 3[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_load32_zero_mem, macro() |
| # v128.load32_zero - load 32-bit value from memory and zero-pad to 128 bits |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .simd_v128_load32_zero_slow_path) |
| loadi [t0], t0 |
| subp V128ISize, sp |
| storei t0, [sp] |
| storei 0, 4[sp] |
| storeq 0, 8[sp] |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_v128_load64_zero_mem, macro() |
| # v128.load64_zero - load 64-bit value from memory and zero-pad to 128 bits |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .simd_v128_load64_zero_slow_path) |
| loadq [t0], t0 |
| subp V128ISize, sp |
| storeq t0, [sp] |
| storeq 0, 8[sp] |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x5E - 0xFD 0x5F: f32x4/f64x2 conversion |
| |
| ipintOp(_simd_f32x4_demote_f64x2_zero, macro() |
| # f32x4.demote_f64x2_zero - demote 2 f64 values to f32, zero upper 2 lanes |
| popVec(v0) |
| if ARM64 or ARM64E |
| # Convert the two f64 values in lanes 0,1 to f32 and store in lanes 0,1 |
| emit "fcvtn v16.2s, v16.2d" |
| # Zero the upper 64 bits (lanes 2,3) |
| emit "mov v16.d[1], xzr" |
| elsif X86_64 |
| emit "vcvtpd2ps %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_promote_low_f32x4, macro() |
| # f64x2.promote_low_f32x4 - promote lower 2 f32 values to f64 |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fcvtl v16.2d, v16.2s" |
| elsif X86_64 |
| emit "vcvtps2pd %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
# 0xFD 0x60 - 0xFD 0x66: i8x16 operations
| |
| ipintOp(_simd_i8x16_abs, macro() |
| # i8x16.abs - absolute value of 16 8-bit signed integers |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "abs v16.16b, v16.16b" |
| elsif X86_64 |
| emit "vpabsb %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_neg, macro() |
| # i8x16.neg - negate 16 8-bit integers |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "neg v16.16b, v16.16b" |
| elsif X86_64 |
| # Negate by subtracting from zero |
| emit "vpxor %xmm1, %xmm1, %xmm1" |
| emit "vpsubb %xmm0, %xmm1, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_popcnt, macro() |
| # i8x16.popcnt - population count (count set bits) for 16 8-bit integers |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cnt v16.16b, v16.16b" |
| elsif X86_64 |
        # x86_64 has no native lanewise vector popcount, so we emulate it with an in-register
        # nibble lookup table, similar to the BBQ JIT implementation
| |
| # Create bottom nibble mask (0x0f repeated 16 times) |
| emit "movabsq $0x0f0f0f0f0f0f0f0f, %rax" |
| emit "vmovq %rax, %xmm1" |
| emit "vmovq %rax, %xmm4" |
| emit "vpunpcklqdq %xmm4, %xmm1, %xmm1" # xmm1 = bottom nibble mask |
| |
| # Create popcount lookup table |
| emit "movabsq $0x0302020102010100, %rax" # Low 64 bits of lookup table |
| emit "vmovq %rax, %xmm2" |
| emit "movabsq $0x0403030203020201, %rax" # High 64 bits of lookup table |
| emit "vmovq %rax, %xmm4" |
| emit "vpunpcklqdq %xmm4, %xmm2, %xmm2" # xmm2 = popcount lookup table |
| |
| # Split input into low and high nibbles |
| emit "vmovdqa %xmm0, %xmm3" # xmm3 = copy of input |
| emit "vpand %xmm1, %xmm0, %xmm0" # xmm0 = low nibbles (input & mask) |
| emit "vpsrlw $4, %xmm3, %xmm3" # Shift right 4 bits |
| emit "vpand %xmm1, %xmm3, %xmm3" # xmm3 = high nibbles ((input >> 4) & mask) |
| |
| # Lookup popcount for both nibbles using pshufb |
| emit "vpshufb %xmm0, %xmm2, %xmm0" # Lookup low nibbles |
| emit "vpshufb %xmm3, %xmm2, %xmm3" # Lookup high nibbles |
| |
| # Add the results |
| emit "vpaddb %xmm3, %xmm0, %xmm0" # Add popcount of low and high nibbles |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_all_true, macro() |
| # i8x16.all_true - return 1 if all 16 8-bit lanes are non-zero, 0 otherwise |
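    # e.g. an all-0x01 vector yields 1, while a vector with even one zero byte yields 0.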
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmeq v17.16b, v16.16b, #0" # Compare each lane with 0 |
| emit "umaxv b17, v17.16b" # Find maximum (any zero lane will make this non-zero) |
| emit "fmov w0, s17" # Move to general register |
| emit "cmp w0, #0" # Compare with 0 |
| emit "cset w0, eq" # Set to 1 if equal (all lanes non-zero), 0 otherwise |
| elsif X86_64 |
| # Compare each byte with zero to create mask of zero lanes |
| emit "vpxor %xmm1, %xmm1, %xmm1" # Create zero vector |
| emit "vpcmpeqb %xmm1, %xmm0, %xmm0" # Compare each byte with 0 (0xFF if zero, 0x00 if non-zero) |
| emit "vpmovmskb %xmm0, %eax" # Extract sign bits to create 16-bit mask |
| emit "test %eax, %eax" # Test if any bit is set (any lane was zero) |
| emit "sete %al" # Set AL to 1 if no bits set (all lanes non-zero), 0 otherwise |
| emit "movzbl %al, %eax" # Zero-extend to full 32-bit register |
| else |
| break # Not implemented |
| end |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_bitmask, macro() |
| # i8x16.bitmask - extract most significant bit from each 8-bit lane into a 16-bit integer |
| # Simple loop over the 16 bytes on the stack |
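    # e.g. if only lanes 0 and 3 have their sign bits set, the loop ORs in (1 << 0) and (1 << 3),
    # producing the bitmask 0b1001 = 9.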
| |
| move 0, t0 # Initialize result |
| move 0, t3 # Byte counter |
| |
| .bitmask_i8x16_loop: |
| # Load byte and check sign bit |
| loadb [sp, t3], t1 |
| andq 0x80, t1 # Extract sign bit |
| btiz t1, .bitmask_i8x16_next |
| |
| # Set corresponding bit in result |
| move 1, t1 |
| lshiftq t3, t1 # Shift to bit position |
| orq t1, t0 |
| |
| .bitmask_i8x16_next: |
| addq 1, t3 # Next byte |
| bilt t3, 16, .bitmask_i8x16_loop |
| |
| addp V128ISize, sp # Pop the vector |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_narrow_i16x8_s, macro() |
| # i8x16.narrow_i16x8_s - narrow 2 i16x8 vectors to 1 i8x16 vector with signed saturation |
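    # e.g. a source word of 300 saturates to 127 and -300 saturates to -128 on the way down to i8.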
| popVec(v1) # Second operand |
| popVec(v0) # First operand |
| if ARM64 or ARM64E |
| # Signed saturating extract narrow: combine v0.8h and v1.8h into v16.16b |
| emit "sqxtn v16.8b, v16.8h" # Narrow first vector (v0) to lower 8 bytes |
| emit "sqxtn2 v16.16b, v17.8h" # Narrow second vector (v1) to upper 8 bytes |
| elsif X86_64 |
| emit "vpacksswb %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_narrow_i16x8_u, macro() |
| # i8x16.narrow_i16x8_u - narrow 2 i16x8 vectors to 1 i8x16 vector with unsigned saturation |
| popVec(v1) # Second operand |
| popVec(v0) # First operand |
| if ARM64 or ARM64E |
        # Signed saturating extract unsigned narrow: combine v0.8h and v1.8h into v16.16b
| emit "sqxtun v16.8b, v16.8h" # Narrow first vector (v0) to lower 8 bytes |
| emit "sqxtun2 v16.16b, v17.8h" # Narrow second vector (v1) to upper 8 bytes |
| elsif X86_64 |
| emit "vpackuswb %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x67 - 0xFD 0x6A: f32x4 operations |
| |
| ipintOp(_simd_f32x4_ceil, macro() |
| # f32x4.ceil - ceiling of 4 32-bit floats |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "frintp v16.4s, v16.4s" |
| elsif X86_64 |
| emit "vroundps $0x2, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_floor, macro() |
| # f32x4.floor - floor of 4 32-bit floats |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "frintm v16.4s, v16.4s" |
| elsif X86_64 |
| emit "vroundps $0x1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_trunc, macro() |
| # f32x4.trunc - truncate 4 32-bit floats |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "frintz v16.4s, v16.4s" |
| elsif X86_64 |
| emit "vroundps $0x3, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_nearest, macro() |
| # f32x4.nearest - round to nearest integer (ties to even) for 4 32-bit floats |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "frintn v16.4s, v16.4s" |
| elsif X86_64 |
| emit "vroundps $0x0, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x6B - 0xFD 0x73: i8x16 binary operations |
| |
| ipintOp(_simd_i8x16_shl, macro() |
| # i8x16.shl - left shift 16 8-bit integers |
| popInt32(t0) # shift count |
| popVec(v0) # vector |
| if ARM64 or ARM64E |
| # Mask shift count to 0-7 range for 8-bit elements |
| andi 7, t0 |
| # Duplicate shift count to all lanes of vector register |
| emit "dup v17.16b, w0" |
| # Perform left shift |
| emit "ushl v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| andi 7, t0 |
| emit "movd %eax, %xmm1" |
| |
| # See MacroAssemblerX86_64::vectorUshl8() |
| |
| # Unpack and zero-extend low input bytes to words |
| emit "vxorps %xmm3, %xmm3, %xmm3" |
| emit "vpunpcklbw %xmm3, %xmm0, %xmm2" |
| |
| # Word-wise shift low input bytes |
| emit "vpsllw %xmm1, %xmm2, %xmm2" |
| |
| # Unpack and zero-extend high input bytes to words |
| emit "vpunpckhbw %xmm3, %xmm0, %xmm3" |
| |
| # Word-wise shift high input bytes |
| emit "vpsllw %xmm1, %xmm3, %xmm3" |
| |
| # Mask away higher bits of left-shifted results |
| emit "vpsllw $8, %xmm2, %xmm2" |
| emit "vpsllw $8, %xmm3, %xmm3" |
| emit "vpsrlw $8, %xmm2, %xmm2" |
| emit "vpsrlw $8, %xmm3, %xmm3" |
| |
| # Pack low and high results back to bytes |
| emit "vpackuswb %xmm3, %xmm2, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_shr_s, macro() |
| # i8x16.shr_s - arithmetic right shift 16 8-bit signed integers |
| popInt32(t0) # shift count |
| popVec(v0) # vector |
| if ARM64 or ARM64E |
| # Mask shift count to 0-7 range for 8-bit elements |
| andi 7, t0 |
| # Negate for right shift |
| negi t0 |
| # Duplicate shift count to all lanes of vector register |
| emit "dup v17.16b, w0" |
| # Perform arithmetic right shift |
| emit "sshl v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| andi 7, t0 |
| emit "movd %eax, %xmm1" |
| |
| # See MacroAssemblerX86_64::vectorSshr8() |
| |
| # Unpack and sign-extend low input bytes to words |
| emit "vpmovsxbw %xmm0, %xmm2" |
| |
| # Word-wise shift low input bytes |
| emit "vpsraw %xmm1, %xmm2, %xmm2" |
| |
| # Unpack and sign-extend high input bytes |
| emit "vpshufd $0x0e, %xmm0, %xmm3" # Move high 8 bytes to low position |
| emit "vpmovsxbw %xmm3, %xmm3" |
| |
| # Word-wise shift high input bytes |
| emit "vpsraw %xmm1, %xmm3, %xmm3" |
| |
| # Pack low and high results back to signed bytes |
| emit "vpacksswb %xmm3, %xmm2, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_shr_u, macro() |
| # i8x16.shr_u - logical right shift 16 8-bit unsigned integers |
| popInt32(t0) # shift count |
| popVec(v0) # vector |
| if ARM64 or ARM64E |
| # Mask shift count to 0-7 range for 8-bit elements |
| andi 7, t0 |
| # Negate for right shift |
| negi t0 |
| # Duplicate shift count to all lanes of vector register |
| emit "dup v17.16b, w0" |
| # Perform logical right shift |
| emit "ushl v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| andi 7, t0 |
| emit "movd %eax, %xmm1" |
| |
| # See MacroAssemblerX86_64::vectorUshr8() |
| |
| # Unpack and zero-extend low input bytes to words |
| emit "vxorps %xmm3, %xmm3, %xmm3" |
| emit "vpunpcklbw %xmm3, %xmm0, %xmm2" |
| |
| # Word-wise shift low input bytes |
| emit "vpsrlw %xmm1, %xmm2, %xmm2" |
| |
| # Unpack and zero-extend high input bytes to words |
| emit "vpunpckhbw %xmm3, %xmm0, %xmm3" |
| |
| # Word-wise shift high input bytes |
| emit "vpsrlw %xmm1, %xmm3, %xmm3" |
| |
| # Pack low and high results back to unsigned bytes |
| emit "vpackuswb %xmm3, %xmm2, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_add, macro() |
| # i8x16.add - add 16 8-bit integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "add v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpaddb %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_add_sat_s, macro() |
| # i8x16.add_sat_s - add 16 8-bit signed integers with saturation |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sqadd v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpaddsb %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_add_sat_u, macro() |
| # i8x16.add_sat_u - add 16 8-bit unsigned integers with saturation |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "uqadd v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpaddusb %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_sub, macro() |
| # i8x16.sub - subtract 16 8-bit integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sub v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpsubb %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_sub_sat_s, macro() |
| # i8x16.sub_sat_s - subtract 16 8-bit signed integers with saturation |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sqsub v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpsubsb %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_sub_sat_u, macro() |
| # i8x16.sub_sat_u - subtract 16 8-bit unsigned integers with saturation |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "uqsub v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpsubusb %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x74 - 0xFD 0x75: f64x2 operations |
| |
| ipintOp(_simd_f64x2_ceil, macro() |
| # f64x2.ceil - ceiling of 2 64-bit floats |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "frintp v16.2d, v16.2d" |
| elsif X86_64 |
| emit "vroundpd $0x2, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_floor, macro() |
| # f64x2.floor - floor of 2 64-bit floats |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "frintm v16.2d, v16.2d" |
| elsif X86_64 |
| emit "vroundpd $0x1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x76 - 0xFD 0x79: i8x16 binary operations |
| ipintOp(_simd_i8x16_min_s, macro() |
| # i8x16.min_s - minimum of 16 8-bit signed integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "smin v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpminsb %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_min_u, macro() |
| # i8x16.min_u - minimum of 16 8-bit unsigned integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "umin v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpminub %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_max_s, macro() |
| # i8x16.max_s - maximum of 16 8-bit signed integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "smax v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpmaxsb %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_max_u, macro() |
| # i8x16.max_u - maximum of 16 8-bit unsigned integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "umax v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpmaxub %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x7A: f64x2 trunc |
| |
| ipintOp(_simd_f64x2_trunc, macro() |
| # f64x2.trunc - truncate 2 64-bit floats |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "frintz v16.2d, v16.2d" |
| elsif X86_64 |
| emit "vroundpd $0x3, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x7B: i8x16 avgr_u |
| |
| ipintOp(_simd_i8x16_avgr_u, macro() |
| # i8x16.avgr_u - average of 16 8-bit unsigned integers with rounding |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "urhadd v16.16b, v16.16b, v17.16b" |
| elsif X86_64 |
| emit "vpavgb %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x7C - 0xFD 0x7F: extadd_pairwise |
| |
| ipintOp(_simd_i16x8_extadd_pairwise_i8x16_s, macro() |
| # i16x8.extadd_pairwise_i8x16_s - pairwise addition of signed 8-bit integers to 16-bit |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "saddlp v16.8h, v16.16b" |
| elsif X86_64 |
| emit "vpcmpeqd %xmm1, %xmm1, %xmm1" # Set all bits to 1 |
| emit "vpsrlw $15, %xmm1, %xmm1" # Shift to get 0x0001 in each 16-bit lane |
| emit "vpackuswb %xmm1, %xmm1, %xmm1" # Pack to get 0x01 in each 8-bit lane |
| emit "vpmaddubsw %xmm0, %xmm1, %xmm0" # Pairwise multiply-add (signed) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_extadd_pairwise_i8x16_u, macro() |
| # i16x8.extadd_pairwise_i8x16_u - pairwise addition of unsigned 8-bit integers to 16-bit |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "uaddlp v16.8h, v16.16b" |
| elsif X86_64 |
| emit "vpcmpeqd %xmm1, %xmm1, %xmm1" # Set all bits to 1 |
| emit "vpsrlw $15, %xmm1, %xmm1" # Shift to get 0x0001 in each 16-bit lane |
| emit "vpackuswb %xmm1, %xmm1, %xmm1" # Pack to get 0x01 in each 8-bit lane |
| emit "vpmaddubsw %xmm1, %xmm0, %xmm0" # Pairwise multiply-add (unsigned) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_extadd_pairwise_i16x8_s, macro() |
| # i32x4.extadd_pairwise_i16x8_s - pairwise addition of signed 16-bit integers to 32-bit |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "saddlp v16.4s, v16.8h" |
| elsif X86_64 |
| emit "vpcmpeqd %xmm1, %xmm1, %xmm1" # Set all bits to 1 |
| emit "vpsrld $31, %xmm1, %xmm1" # Shift to get 0x00000001 in each 32-bit lane |
| emit "vpackssdw %xmm1, %xmm1, %xmm1" # Pack to get 0x0001 in each 16-bit lane |
| emit "vpmaddwd %xmm0, %xmm1, %xmm0" # Pairwise multiply-add |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_extadd_pairwise_i16x8_u, macro() |
| # i32x4.extadd_pairwise_i16x8_u - pairwise addition of unsigned 16-bit integers to 32-bit |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "uaddlp v16.4s, v16.8h" |
| elsif X86_64 |
| emit "vpsrld $16, %xmm0, %xmm1" # Shift right to get high 16-bits in low position |
| emit "vpblendw $0xAA, %xmm1, %xmm0, %xmm0" # Blend: keep low 16-bits from src, high 16-bits from shifted |
| emit "vpaddd %xmm1, %xmm0, %xmm0" # Add the pairs |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x80 0x01 - 0xFD 0x93 0x01: i16x8 operations |
| |
| ipintOp(_simd_i16x8_abs, macro() |
| # i16x8.abs - absolute value of 8 16-bit signed integers |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "abs v16.8h, v16.8h" |
| elsif X86_64 |
| emit "vpabsw %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_neg, macro() |
| # i16x8.neg - negate 8 16-bit integers |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "neg v16.8h, v16.8h" |
| elsif X86_64 |
| # Negate by subtracting from zero |
| emit "vpxor %xmm1, %xmm1, %xmm1" |
| emit "vpsubw %xmm0, %xmm1, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_q15mulr_sat_s, macro() |
| # i16x8.q15mulr_sat_s - Q15 multiply with rounding and saturation |
| # Q15 format: multiply two 16-bit values, shift right by 15, round and saturate |
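    # e.g. 0x4000 * 0x4000 (0.5 * 0.5 in Q15) -> (0x10000000 + 0x4000) >> 15 = 0x2000 (0.25);
    # only -32768 * -32768 overflows to 0x8000, which must saturate to 0x7FFF.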
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sqrdmulh v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| # See MacroAssemblerX86_64::vectorMulSat |
| emit "vpmulhrsw %xmm1, %xmm0, %xmm0" # Q15 multiply with rounding |
| emit "mov $0x8000, %eax" # Load -32768 (0x8000) |
| emit "vmovd %eax, %xmm2" # Move to XMM register |
| emit "vpshuflw $0x00, %xmm2, %xmm2" # Splat to low 4 words |
| emit "vpshufd $0x00, %xmm2, %xmm2" # Splat to all 8 words |
| emit "vpcmpeqw %xmm2, %xmm0, %xmm2" # Compare result with -32768 |
| emit "vpxor %xmm2, %xmm0, %xmm0" # Fix saturation: -32768 becomes 32767 |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_all_true, macro() |
| # i16x8.all_true - return 1 if all 8 16-bit lanes are non-zero, 0 otherwise |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmeq v17.8h, v16.8h, #0" # Compare each lane with 0 |
| emit "umaxv h17, v17.8h" # Find maximum (any zero lane will make this non-zero) |
| emit "fmov w0, s17" # Move to general register |
| emit "cmp w0, #0" # Compare with 0 |
| emit "cset w0, eq" # Set to 1 if equal (all lanes non-zero), 0 otherwise |
| elsif X86_64 |
| # Compare each 16-bit lane with zero |
| emit "vpxor %xmm1, %xmm1, %xmm1" # Create zero vector |
| emit "vpcmpeqw %xmm1, %xmm0, %xmm1" # Compare each word with 0 (1 if zero, 0 if non-zero) |
| |
| # Test if any lane is zero |
| emit "vpmovmskb %xmm1, %eax" # Extract sign bits |
| emit "testl %eax, %eax" # Test if any bits are set |
| emit "sete %al" # Set AL to 1 if no bits set (all lanes non-zero), 0 otherwise |
| emit "movzbl %al, %eax" # Zero-extend to 32-bit |
| else |
| break # Not implemented |
| end |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_bitmask, macro() |
| # i16x8.bitmask - extract most significant bit from each 16-bit lane into an 8-bit integer |
| # Simple loop over the 8 16-bit values on the stack |
| |
| move 0, t0 # Initialize result |
| move 0, t3 # Lane counter |
| |
| .bitmask_i16x8_loop: |
| # Load 16-bit value and check sign bit |
    loadh [sp, t3, 2], t1 # Load 16-bit value at offset t3*2
| andq 0x8000, t1 # Extract sign bit (bit 15) |
| btiz t1, .bitmask_i16x8_next |
| |
| # Set corresponding bit in result |
| move 1, t1 |
| lshiftq t3, t1 # Shift to bit position |
| orq t1, t0 |
| |
| .bitmask_i16x8_next: |
| addq 1, t3 # Next lane |
| bilt t3, 8, .bitmask_i16x8_loop |
| |
| addp V128ISize, sp # Pop the vector |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_narrow_i32x4_s, macro() |
| # i16x8.narrow_i32x4_s - narrow 2 i32x4 vectors to 1 i16x8 vector with signed saturation |
| popVec(v1) # Second operand |
| popVec(v0) # First operand |
| if ARM64 or ARM64E |
| # Signed saturating extract narrow: combine v0.4s and v1.4s into v16.8h |
| emit "sqxtn v16.4h, v16.4s" # Narrow first vector (v0) to lower 4 halfwords |
| emit "sqxtn2 v16.8h, v17.4s" # Narrow second vector (v1) to upper 4 halfwords |
| elsif X86_64 |
| emit "vpackssdw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_narrow_i32x4_u, macro() |
| # i16x8.narrow_i32x4_u - narrow 2 i32x4 vectors to 1 i16x8 vector with unsigned saturation |
| popVec(v1) # Second operand |
| popVec(v0) # First operand |
| if ARM64 or ARM64E |
        # Signed saturating extract unsigned narrow: combine v0.4s and v1.4s into v16.8h
| emit "sqxtun v16.4h, v16.4s" # Narrow first vector (v0) to lower 4 halfwords |
| emit "sqxtun2 v16.8h, v17.4s" # Narrow second vector (v1) to upper 4 halfwords |
| elsif X86_64 |
| emit "vpackusdw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_extend_low_i8x16_s, macro() |
| # i16x8.extend_low_i8x16_s - sign-extend lower 8 i8 values to i16 |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sxtl v16.8h, v16.8b" |
| elsif X86_64 |
| emit "vpmovsxbw %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_extend_high_i8x16_s, macro() |
| # i16x8.extend_high_i8x16_s - sign-extend upper 8 i8 values to i16 |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sxtl2 v16.8h, v16.16b" |
| elsif X86_64 |
| # Move high 64 bits to low, then sign extend |
| emit "vpsrldq $8, %xmm0, %xmm0" # Shift right 8 bytes to get high half |
| emit "vpmovsxbw %xmm0, %xmm0" # Sign extend |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_extend_low_i8x16_u, macro() |
| # i16x8.extend_low_i8x16_u - zero-extend lower 8 i8 values to i16 |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "uxtl v16.8h, v16.8b" |
| elsif X86_64 |
| emit "vpmovzxbw %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_extend_high_i8x16_u, macro() |
| # i16x8.extend_high_i8x16_u - zero-extend upper 8 i8 values to i16 |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "uxtl2 v16.8h, v16.16b" |
| elsif X86_64 |
| # Move high 64 bits to low, then zero extend |
| emit "vpsrldq $8, %xmm0, %xmm0" # Shift right 8 bytes to get high half |
| emit "vpmovzxbw %xmm0, %xmm0" # Zero extend |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_shl, macro() |
| # i16x8.shl - left shift 8 16-bit integers |
| popInt32(t0) # shift count |
| popVec(v0) # vector |
| if ARM64 or ARM64E |
| # Mask shift count to 0-15 range for 16-bit elements |
| andi 15, t0 |
| # Duplicate shift count to all lanes of vector register |
| emit "dup v17.8h, w0" |
| # Perform left shift |
| emit "ushl v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| # Mask shift count to 0-15 range for 16-bit elements |
| andi 15, t0 |
| emit "movd %eax, %xmm1" |
| # Perform left shift on 16-bit words |
| emit "vpsllw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
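| # Reference note: NEON has no variable right-shift instruction; sshl/ushl shift |
| # each lane left by a signed per-lane amount, and a negative amount shifts right. |
| # That is why the right-shift ops below negate the masked count before the dup. |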
| ipintOp(_simd_i16x8_shr_s, macro() |
| # i16x8.shr_s - arithmetic right shift 8 16-bit signed integers |
| popInt32(t0) # shift count |
| popVec(v0) # vector |
| if ARM64 or ARM64E |
| # Mask shift count to 0-15 range for 16-bit elements |
| andi 15, t0 |
| # Negate for right shift |
| negi t0 |
| # Duplicate shift count to all lanes of vector register |
| emit "dup v17.8h, w0" |
| # Perform arithmetic right shift |
| emit "sshl v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| # Mask shift count to 0-15 range for 16-bit elements |
| andi 15, t0 |
| emit "movd %eax, %xmm1" |
| # Perform arithmetic right shift on 16-bit words |
| emit "vpsraw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_shr_u, macro() |
| # i16x8.shr_u - logical right shift 8 16-bit unsigned integers |
| popInt32(t0) # shift count |
| popVec(v0) # vector |
| if ARM64 or ARM64E |
| # Mask shift count to 0-15 range for 16-bit elements |
| andi 15, t0 |
| # Negate for right shift |
| negi t0 |
| # Duplicate shift count to all lanes of vector register |
| emit "dup v17.8h, w0" |
| # Perform logical right shift |
| emit "ushl v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| andi 15, t0 |
| emit "movd %eax, %xmm1" |
| emit "vpsrlw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_add, macro() |
| # i16x8.add - add 8 16-bit integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "add v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| emit "vpaddw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_add_sat_s, macro() |
| # i16x8.add_sat_s - add 8 16-bit signed integers with saturation |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sqadd v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| emit "vpaddsw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_add_sat_u, macro() |
| # i16x8.add_sat_u - add 8 16-bit unsigned integers with saturation |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "uqadd v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| emit "vpaddusw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_sub, macro() |
| # i16x8.sub - subtract 8 16-bit integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sub v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| emit "vpsubw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_sub_sat_s, macro() |
| # i16x8.sub_sat_s - subtract 8 16-bit signed integers with saturation |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sqsub v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| emit "vpsubsw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_sub_sat_u, macro() |
| # i16x8.sub_sat_u - subtract 8 16-bit unsigned integers with saturation |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "uqsub v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| emit "vpsubusw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x94 0x01: f64x2.nearest |
| |
| ipintOp(_simd_f64x2_nearest, macro() |
| # f64x2.nearest - round to nearest integer (ties to even) for 2 64-bit floats |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "frintn v16.2d, v16.2d" |
| elsif X86_64 |
| emit "vroundpd $0x0, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0x95 0x01 - 0xFD 0x9F 0x01: i16x8 operations |
| |
| ipintOp(_simd_i16x8_mul, macro() |
| # i16x8.mul - multiply 8 16-bit integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "mul v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| emit "vpmullw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_min_s, macro() |
| # i16x8.min_s - minimum of 8 16-bit signed integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "smin v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| emit "vpminsw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_min_u, macro() |
| # i16x8.min_u - minimum of 8 16-bit unsigned integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "umin v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| emit "vpminuw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_max_s, macro() |
| # i16x8.max_s - maximum of 8 16-bit signed integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "smax v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| emit "vpmaxsw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_max_u, macro() |
| # i16x8.max_u - maximum of 8 16-bit unsigned integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "umax v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| emit "vpmaxuw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(0xfd9a01) |
| |
| ipintOp(_simd_i16x8_avgr_u, macro() |
| # i16x8.avgr_u - average of 8 16-bit unsigned integers with rounding |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "urhadd v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| emit "vpavgw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
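| # Reference note: extmul first widens both operands (i8 -> i16 here), then |
| # multiplies, so the full product is retained, e.g. 100 * 100 = 10000, which |
| # would have wrapped in an 8-bit multiply. |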
| ipintOp(_simd_i16x8_extmul_low_i8x16_s, macro() |
| # i16x8.extmul_low_i8x16_s - multiply lower 8 i8 elements and extend to i16 |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "smull v16.8h, v16.8b, v17.8b" |
| elsif X86_64 |
| # See MacroAssemblerX86_64::vectorMulLow |
| emit "vpmovsxbw %xmm0, %xmm2" # Sign extend left to scratch |
| emit "vpmovsxbw %xmm1, %xmm0" # Sign extend right to dest |
| emit "vpmullw %xmm2, %xmm0, %xmm0" # Multiply |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_extmul_high_i8x16_s, macro() |
| # i16x8.extmul_high_i8x16_s - multiply upper 8 i8 elements and extend to i16 |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "smull2 v16.8h, v16.16b, v17.16b" |
| elsif X86_64 |
| # See MacroAssemblerX86_64::vectorMulHigh |
| emit "vpunpckhbw %xmm0, %xmm0, %xmm2" # Unpack high bytes of left |
| emit "vpsraw $8, %xmm2, %xmm2" # Arithmetic shift to sign extend |
| emit "vpunpckhbw %xmm1, %xmm1, %xmm0" # Unpack high bytes of right |
| emit "vpsraw $8, %xmm0, %xmm0" # Arithmetic shift to sign extend |
| emit "vpmullw %xmm2, %xmm0, %xmm0" # Multiply |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_extmul_low_i8x16_u, macro() |
| # i16x8.extmul_low_i8x16_u - multiply lower 8 u8 elements and extend to i16 |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "umull v16.8h, v16.8b, v17.8b" |
| elsif X86_64 |
| # See MacroAssemblerX86_64::vectorMulLow |
| emit "vpmovzxbw %xmm0, %xmm2" # Zero extend left to scratch |
| emit "vpmovzxbw %xmm1, %xmm0" # Zero extend right to dest |
| emit "vpmullw %xmm2, %xmm0, %xmm0" # Multiply |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_extmul_high_i8x16_u, macro() |
| # i16x8.extmul_high_i8x16_u - multiply upper 8 u8 elements and extend to i16 |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "umull2 v16.8h, v16.16b, v17.16b" |
| elsif X86_64 |
| # See MacroAssemblerX86_64::vectorMulHigh |
| emit "vpxor %xmm2, %xmm2, %xmm2" # Zero scratch register |
| emit "vpunpckhbw %xmm2, %xmm1, %xmm1" # Unpack high bytes of right with zeros |
| emit "vpunpckhbw %xmm2, %xmm0, %xmm0" # Unpack high bytes of left with zeros |
| emit "vpmullw %xmm1, %xmm0, %xmm0" # Multiply |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0xA0 0x01 - 0xFD 0xBF 0x01: i32x4 operations |
| |
| ipintOp(_simd_i32x4_abs, macro() |
| # i32x4.abs - absolute value of 4 32-bit signed integers |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "abs v16.4s, v16.4s" |
| elsif X86_64 |
| emit "vpabsd %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_neg, macro() |
| # i32x4.neg - negate 4 32-bit integers |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "neg v16.4s, v16.4s" |
| elsif X86_64 |
| # Negate by subtracting from zero |
| emit "vpxor %xmm1, %xmm1, %xmm1" |
| emit "vpsubd %xmm0, %xmm1, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(0xfda201) |
| |
| ipintOp(_simd_i32x4_all_true, macro() |
| # i32x4.all_true - return 1 if all 4 32-bit lanes are non-zero, 0 otherwise |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmeq v17.4s, v16.4s, #0" # Compare each lane with 0 |
| emit "umaxv s17, v17.4s" # Find maximum (any zero lane will make this non-zero) |
| emit "fmov w0, s17" # Move to general register |
| emit "cmp w0, #0" # Compare with 0 |
| emit "cset w0, eq" # Set to 1 if equal (all lanes non-zero), 0 otherwise |
| elsif X86_64 |
| # Compare each 32-bit lane with zero |
| emit "vpxor %xmm1, %xmm1, %xmm1" # Create zero vector |
| emit "vpcmpeqd %xmm1, %xmm0, %xmm1" # Compare each dword with 0 (1 if zero, 0 if non-zero) |
| |
| # Test if any lane is zero |
| emit "vpmovmskb %xmm1, %eax" # Extract sign bits |
| emit "testl %eax, %eax" # Test if any bits are set |
| emit "sete %al" # Set AL to 1 if no bits set (all lanes non-zero), 0 otherwise |
| emit "movzbl %al, %eax" # Zero-extend to 32-bit |
| else |
| break # Not implemented |
| end |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_bitmask, macro() |
| # i32x4.bitmask - extract most significant bit from each 32-bit lane into a 4-bit integer |
| # Simple loop over the 4 32-bit values on the stack |
| |
| move 0, t0 # Initialize result |
| move 0, t3 # Lane counter |
| |
| .bitmask_i32x4_loop: |
| # Load 32-bit value and check sign bit |
| loadi [sp, t3, 4], t1 # Load 32-bit value at offset t3*4 |
| andq 0x80000000, t1 # Extract sign bit (bit 31) |
| btiz t1, .bitmask_i32x4_next |
| |
| # Set corresponding bit in result |
| move 1, t1 |
| lshiftq t3, t1 # Shift to bit position |
| orq t1, t0 |
| |
| .bitmask_i32x4_next: |
| addq 1, t3 # Next lane |
| bilt t3, 4, .bitmask_i32x4_loop |
| |
| addp V128ISize, sp # Pop the vector |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(0xfda501) |
| reservedOpcode(0xfda601) |
| |
| ipintOp(_simd_i32x4_extend_low_i16x8_s, macro() |
| # i32x4.extend_low_i16x8_s - sign-extend lower 4 i16 values to i32 |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sxtl v16.4s, v16.4h" |
| elsif X86_64 |
| emit "vpmovsxwd %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_extend_high_i16x8_s, macro() |
| # i32x4.extend_high_i16x8_s - sign-extend upper 4 i16 values to i32 |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sxtl2 v16.4s, v16.8h" |
| elsif X86_64 |
| # Move high 64 bits to low, then sign extend |
| emit "vpsrldq $8, %xmm0, %xmm0" # Shift right 8 bytes to get high half |
| emit "vpmovsxwd %xmm0, %xmm0" # Sign extend |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_extend_low_i16x8_u, macro() |
| # i32x4.extend_low_i16x8_u - zero-extend lower 4 i16 values to i32 |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "uxtl v16.4s, v16.4h" |
| elsif X86_64 |
| emit "vpmovzxwd %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_extend_high_i16x8_u, macro() |
| # i32x4.extend_high_i16x8_u - zero-extend upper 4 i16 values to i32 |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "uxtl2 v16.4s, v16.8h" |
| elsif X86_64 |
| # Move high 64 bits to low, then zero extend |
| emit "vpsrldq $8, %xmm0, %xmm0" # Shift right 8 bytes to get high half |
| emit "vpmovzxwd %xmm0, %xmm0" # Zero extend |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_shl, macro() |
| # i32x4.shl - left shift 4 32-bit integers |
| popInt32(t0) # shift count |
| popVec(v0) # vector |
| if ARM64 or ARM64E |
| # Mask shift count to 0-31 range for 32-bit elements |
| andi 31, t0 |
| # Duplicate shift count to all lanes of vector register |
| emit "dup v17.4s, w0" |
| # Perform left shift |
| emit "ushl v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| andi 31, t0 |
| emit "vmovd %eax, %xmm1" |
| emit "vpslld %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_shr_s, macro() |
| # i32x4.shr_s - arithmetic right shift 4 32-bit signed integers |
| popInt32(t0) # shift count |
| popVec(v0) # vector |
| if ARM64 or ARM64E |
| # Mask shift count to 0-31 range for 32-bit elements |
| andi 31, t0 |
| # Negate for right shift |
| negi t0 |
| # Duplicate shift count to all lanes of vector register |
| emit "dup v17.4s, w0" |
| # Perform arithmetic right shift |
| emit "sshl v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| andi 31, t0 |
| emit "vmovd %eax, %xmm1" |
| emit "vpsrad %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_shr_u, macro() |
| # i32x4.shr_u - logical right shift 4 32-bit unsigned integers |
| popInt32(t0) # shift count |
| popVec(v0) # vector |
| if ARM64 or ARM64E |
| # Mask shift count to 0-31 range for 32-bit elements |
| andi 31, t0 |
| # Negate for right shift |
| negi t0 |
| # Duplicate shift count to all lanes of vector register |
| emit "dup v17.4s, w0" |
| # Perform logical right shift |
| emit "ushl v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| andi 31, t0 |
| emit "vmovd %eax, %xmm1" |
| emit "vpsrld %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_add, macro() |
| # i32x4.add - add 4 32-bit integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "add v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vpaddd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(0xfdaf01) |
| reservedOpcode(0xfdb001) |
| |
| ipintOp(_simd_i32x4_sub, macro() |
| # i32x4.sub - subtract 4 32-bit integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sub v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vpsubd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(0xfdb201) |
| reservedOpcode(0xfdb301) |
| reservedOpcode(0xfdb401) |
| |
| ipintOp(_simd_i32x4_mul, macro() |
| # i32x4.mul - multiply 4 32-bit integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "mul v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vpmulld %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_min_s, macro() |
| # i32x4.min_s - minimum of 4 32-bit signed integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "smin v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vpminsd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_min_u, macro() |
| # i32x4.min_u - minimum of 4 32-bit unsigned integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "umin v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vpminud %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_max_s, macro() |
| # i32x4.max_s - maximum of 4 32-bit signed integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "smax v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vpmaxsd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_max_u, macro() |
| # i32x4.max_u - maximum of 4 32-bit unsigned integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "umax v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vpmaxud %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
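| # Worked example (reference only): with i16 lanes a = [1, 2, ...] and |
| # b = [3, 4, ...], result lane k is a[2k]*b[2k] + a[2k+1]*b[2k+1], so lane 0 is |
| # 1*3 + 2*4 = 11. On x86 this is exactly what vpmaddwd computes in one step. |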
| ipintOp(_simd_i32x4_dot_i16x8_s, macro() |
| # i32x4.dot_i16x8_s - dot product of signed 16-bit integers to 32-bit |
| # Multiplies pairs of adjacent 16-bit elements and adds the results |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # Use signed multiply long to multiply adjacent pairs, then pairwise add |
| emit "smull v18.4s, v16.4h, v17.4h" # multiply low 4 pairs to v18 |
| emit "smull2 v16.4s, v16.8h, v17.8h" # multiply high 4 pairs to v19 |
| # Now pairwise add adjacent elements within each vector to get dot products |
| emit "addp v16.4s, v18.4s, v16.4s" # pairwise add to get final dot product result |
| elsif X86_64 |
| emit "vpmaddwd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| reservedOpcode(0xfdbb01) |
| |
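| # Reference note on the x86 sequences below: vpmullw produces the low 16 bits |
| # and vpmulhw/vpmulhuw the high 16 bits of each 16x16 product; interleaving the |
| # two with vpunpcklwd/vpunpckhwd reassembles the full 32-bit products for the |
| # low or high half of the input lanes. |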
| ipintOp(_simd_i32x4_extmul_low_i16x8_s, macro() |
| # i32x4.extmul_low_i16x8_s - multiply lower 4 i16 elements and extend to i32 |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "smull v16.4s, v16.4h, v17.4h" |
| elsif X86_64 |
| # See MacroAssemblerX86_64::vectorMulLow |
| emit "vpmullw %xmm1, %xmm0, %xmm2" # Low multiply to scratch |
| emit "vpmulhw %xmm1, %xmm0, %xmm0" # High multiply (signed) to dest |
| emit "vpunpcklwd %xmm0, %xmm2, %xmm0" # Interleave low words |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_extmul_high_i16x8_s, macro() |
| # i32x4.extmul_high_i16x8_s - multiply upper 4 i16 elements and extend to i32 |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "smull2 v16.4s, v16.8h, v17.8h" |
| elsif X86_64 |
| # See MacroAssemblerX86_64::vectorMulHigh |
| emit "vpmullw %xmm1, %xmm0, %xmm2" # Low multiply to scratch |
| emit "vpmulhw %xmm1, %xmm0, %xmm0" # High multiply (signed) to dest |
| emit "vpunpckhwd %xmm0, %xmm2, %xmm0" # Interleave high words |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_extmul_low_i16x8_u, macro() |
| # i32x4.extmul_low_i16x8_u - multiply lower 4 u16 elements and extend to i32 |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "umull v16.4s, v16.4h, v17.4h" |
| elsif X86_64 |
| # See MacroAssemblerX86_64::vectorMulLow |
| emit "vpmullw %xmm1, %xmm0, %xmm2" # Low multiply to scratch |
| emit "vpmulhuw %xmm1, %xmm0, %xmm0" # High multiply (unsigned) to dest |
| emit "vpunpcklwd %xmm0, %xmm2, %xmm0" # Interleave low words |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_extmul_high_i16x8_u, macro() |
| # i32x4.extmul_high_i16x8_u - multiply upper 4 u16 elements and extend to i32 |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "umull2 v16.4s, v16.8h, v17.8h" |
| elsif X86_64 |
| # See MacroAssemblerX86_64::vectorMulHigh |
| emit "vpmullw %xmm1, %xmm0, %xmm2" # Low multiply to scratch |
| emit "vpmulhuw %xmm1, %xmm0, %xmm0" # High multiply (unsigned) to dest |
| emit "vpunpckhwd %xmm0, %xmm2, %xmm0" # Interleave high words |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0xC0 0x01 - 0xFD 0xDF 0x01: i64x2 operations |
| |
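| # Reference note on the x86 blend trick used below: for x = -5, vpcmpgtq(0, x) |
| # yields an all-ones lane mask, vpsubq computes -x = 5, and vpblendvb selects 5; |
| # for non-negative lanes the mask is zero and the original value is kept. |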
| ipintOp(_simd_i64x2_abs, macro() |
| # i64x2.abs - absolute value of 2 64-bit signed integers |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "abs v16.2d, v16.2d" |
| elsif X86_64 |
| # No direct vpabsq instruction, implement manually |
| # For each 64-bit lane: result = (x < 0) ? -x : x |
| emit "vpxor %xmm1, %xmm1, %xmm1" # xmm1 = 0 |
| emit "vpcmpgtq %xmm0, %xmm1, %xmm2" # xmm2 = mask where x < 0 (0 > x) |
| emit "vpsubq %xmm0, %xmm1, %xmm1" # xmm1 = -x |
| emit "vpblendvb %xmm2, %xmm1, %xmm0, %xmm0" # blend: use -x where mask is true, x otherwise |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_neg, macro() |
| # i64x2.neg - negate 2 64-bit integers |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "neg v16.2d, v16.2d" |
| elsif X86_64 |
| # Negate by subtracting from zero |
| emit "vpxor %xmm1, %xmm1, %xmm1" |
| emit "vpsubq %xmm0, %xmm1, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(0xfdc201) |
| |
| ipintOp(_simd_i64x2_all_true, macro() |
| # i64x2.all_true - return 1 if all 2 64-bit lanes are non-zero, 0 otherwise |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmeq v17.2d, v16.2d, #0" # Compare each lane with 0 |
| emit "addp d17, v17.2d" # Add pair - if any lane was 0, result will be non-zero |
| emit "fmov x0, d17" # Move to general register |
| emit "cmp x0, #0" # Compare with 0 |
| emit "cset w0, eq" # Set to 1 if equal (all lanes non-zero), 0 otherwise |
| elsif X86_64 |
| # Compare each 64-bit lane with zero |
| emit "vpxor %xmm1, %xmm1, %xmm1" # Create zero vector |
| emit "vpcmpeqq %xmm1, %xmm0, %xmm1" # Compare each qword with 0 (1 if zero, 0 if non-zero) |
| |
| # Test if any lane is zero |
| emit "vpmovmskb %xmm1, %eax" # Extract sign bits |
| emit "testl %eax, %eax" # Test if any bits are set |
| emit "sete %al" # Set AL to 1 if no bits set (all lanes non-zero), 0 otherwise |
| emit "movzbl %al, %eax" # Zero-extend to 32-bit |
| else |
| break # Not implemented |
| end |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_bitmask, macro() |
| # i64x2.bitmask - extract most significant bit from each 64-bit lane into a 2-bit integer |
| # Handle both 64-bit values directly |
| |
| # Load both 64-bit values |
| loadq [sp], t0 # Load lane 0 |
| loadq 8[sp], t1 # Load lane 1 |
| addp V128ISize, sp # Pop the vector |
| |
| # Initialize result |
| move 0, t2 |
| |
| # Check lane 0 sign bit (bit 63) |
| move 0x8000000000000000, t3 |
| andq t3, t0 |
| btqz t0, .bitmask_i64x2_lane1 |
| orq 1, t2 # Set bit 0 |
| |
| .bitmask_i64x2_lane1: |
| # Check lane 1 sign bit (bit 63) |
| andq t3, t1 |
| btqz t1, .bitmask_i64x2_done |
| orq 2, t2 # Set bit 1 |
| |
| .bitmask_i64x2_done: |
| pushInt32(t2) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(0xfdc501) |
| reservedOpcode(0xfdc601) |
| |
| ipintOp(_simd_i64x2_extend_low_i32x4_s, macro() |
| # i64x2.extend_low_i32x4_s - sign-extend lower 2 i32 values to i64 |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sxtl v16.2d, v16.2s" |
| elsif X86_64 |
| emit "vpmovsxdq %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_extend_high_i32x4_s, macro() |
| # i64x2.extend_high_i32x4_s - sign-extend upper 2 i32 values to i64 |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sxtl2 v16.2d, v16.4s" |
| elsif X86_64 |
| # Move high 64 bits to low, then sign extend |
| emit "vpsrldq $8, %xmm0, %xmm0" # Shift right 8 bytes to get high half |
| emit "vpmovsxdq %xmm0, %xmm0" # Sign extend |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_extend_low_i32x4_u, macro() |
| # i64x2.extend_low_i32x4_u - zero-extend lower 2 i32 values to i64 |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "uxtl v16.2d, v16.2s" |
| elsif X86_64 |
| emit "vpmovzxdq %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_extend_high_i32x4_u, macro() |
| # i64x2.extend_high_i32x4_u - zero-extend upper 2 i32 values to i64 |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "uxtl2 v16.2d, v16.4s" |
| elsif X86_64 |
| # Move high 64 bits to low, then zero extend |
| emit "vpsrldq $8, %xmm0, %xmm0" # Shift right 8 bytes to get high half |
| emit "vpmovzxdq %xmm0, %xmm0" # Zero extend |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_shl, macro() |
| # i64x2.shl - left shift 2 64-bit integers |
| popInt32(t0) # shift count |
| popVec(v0) # vector |
| if ARM64 or ARM64E |
| # Mask shift count to 0-63 range for 64-bit elements |
| andi 63, t0 |
| # Duplicate shift count to all lanes of vector register |
| emit "dup v17.2d, x0" |
| # Perform left shift |
| emit "ushl v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| andi 63, t0 |
| emit "movd %eax, %xmm1" |
| emit "vpsllq %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_shr_s, macro() |
| # i64x2.shr_s - arithmetic right shift 2 64-bit signed integers |
| popInt32(t0) # shift count |
| # Mask shift count to 0-63 range for 64-bit elements |
| andi 63, t0 |
| |
| # The vector is left on the stack and shifted in place, one 64-bit lane at a time. |
| loadq 8[sp], t1 # Lane 1 |
| rshiftq t0, t1 # rshiftq is an arithmetic (sign-preserving) shift |
| storeq t1, 8[sp] |
| |
| loadq [sp], t1 # Lane 0 |
| rshiftq t0, t1 |
| storeq t1, [sp] |
| |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_shr_u, macro() |
| # i64x2.shr_u - logical right shift 2 64-bit unsigned integers |
| popInt32(t0) # shift count |
| popVec(v0) # vector |
| if ARM64 or ARM64E |
| # Mask shift count to 0-63 range for 64-bit elements |
| andi 63, t0 |
| # Negate for right shift |
| negq t0 |
| # Duplicate shift count to all lanes of vector register |
| emit "dup v17.2d, x0" |
| # Perform logical right shift |
| emit "ushl v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| andi 63, t0 |
| emit "movd %eax, %xmm1" |
| emit "vpsrlq %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_add, macro() |
| # i64x2.add - add 2 64-bit integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "add v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| emit "vpaddq %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(0xfdcf01) |
| reservedOpcode(0xfdd001) |
| |
| ipintOp(_simd_i64x2_sub, macro() |
| # i64x2.sub - subtract 2 64-bit integers |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sub v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| emit "vpsubq %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(0xfdd201) |
| reservedOpcode(0xfdd301) |
| reservedOpcode(0xfdd401) |
| |
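| # Reference note: neither NEON nor the SSE/AVX level used here has a 64x64-bit |
| # lane multiply, so both lanes are multiplied with scalar mulq directly on the |
| # stack; [sp] holds the most recently pushed vector (vector1) and 16[sp] holds |
| # vector0. |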
| ipintOp(_simd_i64x2_mul, macro() |
| # i64x2.mul - multiply 2 64-bit integers (low 64 bits of result) |
| |
| # Extract and multiply lane 0 (first 64-bit element) |
| loadq [sp], t0 # Load lane 0 of vector1 |
| loadq 16[sp], t1 # Load lane 0 of vector0 |
| mulq t1, t0 # Multiply: t0 = t0 * t1 |
| storeq t0, 16[sp] # Store result back to vector0 |
| |
| # Extract and multiply lane 1 (second 64-bit element) |
| loadq 8[sp], t0 # Load lane 1 of vector1 |
| loadq 24[sp], t1 # Load lane 1 of vector0 |
| mulq t1, t0 # Multiply: t0 = t0 * t1 |
| storeq t0, 24[sp] # Store result back to vector0 |
| |
| # Pop vector1, result in vector0 |
| addp V128ISize, sp # Remove first vector from stack, leaving result |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_eq, macro() |
| # i64x2.eq - compare 2 64-bit integers for equality |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmeq v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| emit "vpcmpeqq %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
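| # Reference note: SSE/AVX only provides eq and gt integer comparisons, so ne, |
| # le_s, and ge_s below are built by comparing and then inverting: vpcmpeqq of a |
| # register with itself yields all ones, and vpxor with that flips every bit. |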
| ipintOp(_simd_i64x2_ne, macro() |
| # i64x2.ne - compare 2 64-bit integers for inequality |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmeq v16.2d, v16.2d, v17.2d" |
| emit "mvn v16.16b, v16.16b" |
| elsif X86_64 |
| # Compare for equality, then invert the result |
| emit "vpcmpeqq %xmm1, %xmm0, %xmm0" |
| emit "vpcmpeqq %xmm2, %xmm2, %xmm2" # Set all bits to 1 |
| emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_lt_s, macro() |
| # i64x2.lt_s - compare 2 64-bit signed integers for less than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # cmgt v17, v16 gives us v1 > v0, which is equivalent to v0 < v1 |
| emit "cmgt v16.2d, v17.2d, v16.2d" |
| elsif X86_64 |
| # vpcmpgtq xmm1, xmm0 gives us xmm1 > xmm0, which is equivalent to xmm0 < xmm1 |
| emit "vpcmpgtq %xmm0, %xmm1, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_gt_s, macro() |
| # i64x2.gt_s - compare 2 64-bit signed integers for greater than |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmgt v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| emit "vpcmpgtq %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_le_s, macro() |
| # i64x2.le_s - compare 2 64-bit signed integers for less than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # cmge v17, v16 gives us v1 >= v0, which is equivalent to v0 <= v1 |
| emit "cmge v16.2d, v17.2d, v16.2d" |
| elsif X86_64 |
| # xmm0 <= xmm1 iff !(xmm0 > xmm1) |
| emit "vpcmpgtq %xmm1, %xmm0, %xmm0" # xmm0 > xmm1 |
| emit "vpcmpeqq %xmm2, %xmm2, %xmm2" # Set all bits to 1 |
| emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm0 > xmm1) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_ge_s, macro() |
| # i64x2.ge_s - compare 2 64-bit signed integers for greater than or equal |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "cmge v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| # xmm0 >= xmm1 iff !(xmm0 < xmm1) iff !(xmm1 > xmm0) |
| emit "vpcmpgtq %xmm0, %xmm1, %xmm0" # xmm1 > xmm0 |
| emit "vpcmpeqq %xmm2, %xmm2, %xmm2" # Set all bits to 1 |
| emit "vpxor %xmm2, %xmm0, %xmm0" # Invert result: !(xmm1 > xmm0) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_extmul_low_i32x4_s, macro() |
| # i64x2.extmul_low_i32x4_s - multiply lower 2 i32 elements and extend to i64 |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "smull v16.2d, v16.2s, v17.2s" |
| elsif X86_64 |
| # See MacroAssemblerX86_64::vectorMulLow |
| emit "vpunpckldq %xmm0, %xmm0, %xmm2" # Duplicate low dwords of left |
| emit "vpunpckldq %xmm1, %xmm1, %xmm0" # Duplicate low dwords of right |
| emit "vpmuldq %xmm2, %xmm0, %xmm0" # Signed multiply |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_extmul_high_i32x4_s, macro() |
| # i64x2.extmul_high_i32x4_s - multiply upper 2 i32 elements and extend to i64 |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "smull2 v16.2d, v16.4s, v17.4s" |
| elsif X86_64 |
| # See MacroAssemblerX86_64::vectorMulHigh |
| emit "vpunpckhdq %xmm0, %xmm0, %xmm2" # Duplicate high dwords of left |
| emit "vpunpckhdq %xmm1, %xmm1, %xmm0" # Duplicate high dwords of right |
| emit "vpmuldq %xmm2, %xmm0, %xmm0" # Signed multiply |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_extmul_low_i32x4_u, macro() |
| # i64x2.extmul_low_i32x4_u - multiply lower 2 u32 elements and extend to i64 |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "umull v16.2d, v16.2s, v17.2s" |
| elsif X86_64 |
| # See MacroAssemblerX86_64::vectorMulLow |
| emit "vpunpckldq %xmm0, %xmm0, %xmm2" # Duplicate low dwords of left |
| emit "vpunpckldq %xmm1, %xmm1, %xmm0" # Duplicate low dwords of right |
| emit "vpmuludq %xmm2, %xmm0, %xmm0" # Unsigned multiply |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_extmul_high_i32x4_u, macro() |
| # i64x2.extmul_high_i32x4_u - multiply upper 2 u32 elements and extend to i64 |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "umull2 v16.2d, v16.4s, v17.4s" |
| elsif X86_64 |
| # See MacroAssemblerX86_64::vectorMulHigh |
| emit "vpunpckhdq %xmm0, %xmm0, %xmm2" # Duplicate high dwords of left |
| emit "vpunpckhdq %xmm1, %xmm1, %xmm0" # Duplicate high dwords of right |
| emit "vpmuludq %xmm2, %xmm0, %xmm0" # Unsigned multiply |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0xE0 0x01 - 0xFD 0xEB 0x01: f32x4 operations |
| |
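| # Reference note: the x86 paths below materialize the sign-bit masks via %rax; |
| # 0x7fffffff per f32 lane clears the sign bit (abs) and 0x80000000 flips it |
| # (neg), with vpunpcklqdq duplicating the 64-bit pattern into all four lanes. |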
| ipintOp(_simd_f32x4_abs, macro() |
| # f32x4.abs - absolute value of 4 32-bit floats |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fabs v16.4s, v16.4s" |
| elsif X86_64 |
| # Clear sign bit by AND with 0x7FFFFFFF mask |
| emit "movabsq $0x7fffffff7fffffff, %rax" |
| emit "vmovq %rax, %xmm1" |
| emit "vpunpcklqdq %xmm1, %xmm1, %xmm1" |
| emit "vandps %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_neg, macro() |
| # f32x4.neg - negate 4 32-bit floats |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fneg v16.4s, v16.4s" |
| elsif X86_64 |
| # Flip sign bit by XOR with 0x80000000 mask |
| emit "movabsq $0x8000000080000000, %rax" |
| emit "vmovq %rax, %xmm1" |
| emit "vpunpcklqdq %xmm1, %xmm1, %xmm1" |
| emit "vxorps %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(0xfde201) |
| |
| ipintOp(_simd_f32x4_sqrt, macro() |
| # f32x4.sqrt - square root of 4 32-bit floats |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fsqrt v16.4s, v16.4s" |
| elsif X86_64 |
| emit "vsqrtps %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_add, macro() |
| # f32x4.add - add 4 32-bit floats |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fadd v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vaddps %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_sub, macro() |
| # f32x4.sub - subtract 4 32-bit floats |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fsub v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vsubps %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_mul, macro() |
| # f32x4.mul - multiply 4 32-bit floats |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fmul v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vmulps %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_div, macro() |
| # f32x4.div - divide 4 32-bit floats |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fdiv v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vdivps %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
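| # Reference note on the NaN handling below: vcmpunordps leaves all-ones in NaN |
| # lanes; after ORing that mask into the result, shifting it right by 10 and |
| # applying vpandn keeps only the top 10 bits (sign, exponent, quiet bit), so a |
| # NaN lane collapses to a canonical quiet-NaN bit pattern. |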
| ipintOp(_simd_f32x4_min, macro() |
| # f32x4.min - minimum of 4 32-bit floats (IEEE 754-2008 semantics) |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fmin v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| # Wasm differs from X86_64 in terms of signed zero values and propagating NaNs |
| # so some special handling of those cases is needed. |
| # Compute result in both directions to handle NaN asymmetry |
| emit "vminps %xmm1, %xmm0, %xmm2" # xmm2 = min(xmm0, xmm1) |
| emit "vminps %xmm0, %xmm1, %xmm0" # xmm0 = min(xmm1, xmm0) |
| |
| # OR results to propagate sign bits and NaN bits |
| emit "vorps %xmm0, %xmm2, %xmm2" # xmm2 = xmm0 | xmm2 |
| |
| # Canonicalize NaNs by checking for unordered values and clearing mantissa |
| emit "vcmpunordps %xmm2, %xmm0, %xmm0" # xmm0 = NaN mask (all 1's where NaN) |
| emit "vorps %xmm0, %xmm2, %xmm2" # xmm2 |= NaN mask |
| emit "vpsrld $10, %xmm0, %xmm0" # Shift mask to clear mantissa bits (f32 uses 10) |
| emit "vpandn %xmm2, %xmm0, %xmm0" # Clear mantissa to canonicalize NaN |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_max, macro() |
| # f32x4.max - maximum of 4 32-bit floats (IEEE 754-2008 semantics) |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fmax v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| # Wasm differs from X86_64 in terms of signed zero values and propagating NaNs |
| # so some special handling of those cases is needed. |
| # Compute result in both directions to handle NaN asymmetry |
| emit "vmaxps %xmm1, %xmm0, %xmm2" # xmm2 = max(xmm0, xmm1) |
| emit "vmaxps %xmm0, %xmm1, %xmm0" # xmm0 = max(xmm1, xmm0) |
| |
| # Check for discrepancies by XORing the results |
| emit "vxorps %xmm0, %xmm2, %xmm0" # xmm0 = xmm0 ^ xmm2 |
| |
| # OR results to propagate sign bits and NaN bits |
| emit "vorps %xmm0, %xmm2, %xmm2" # xmm2 = xmm0 | xmm2 |
| |
| # Propagate discrepancies in sign bit |
| emit "vsubps %xmm0, %xmm2, %xmm2" # xmm2 = xmm2 - xmm0 |
| |
| # Canonicalize NaNs by checking for unordered values and clearing mantissa |
| emit "vcmpunordps %xmm2, %xmm0, %xmm0" # xmm0 = NaN mask (all 1's where NaN) |
| emit "vpsrld $10, %xmm0, %xmm0" # Shift mask to clear mantissa bits (f32 uses 10) |
| emit "vpandn %xmm2, %xmm0, %xmm0" # Clear mantissa to canonicalize NaN |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
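| # Reference note: pmin/pmax are defined as a plain select (b < a ? b : a and |
| # a < b ? b : a respectively), not IEEE min/max, so NaN and signed-zero lanes |
| # just take whichever operand the compare-and-blend picks; no NaN |
| # canonicalization pass is needed here. |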
| ipintOp(_simd_f32x4_pmin, macro() |
| # f32x4.pmin - pseudo-minimum of 4 32-bit floats (b < a ? b : a) |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # Use fcmgt to compare v0 > v1, then use bsl to select |
| emit "fcmgt v18.4s, v16.4s, v17.4s" |
| emit "bsl v18.16b, v17.16b, v16.16b" |
| emit "mov v16.16b, v18.16b" |
| elsif X86_64 |
| emit "vcmpgtps %xmm1, %xmm0, %xmm2" # xmm2 = (a > b) ? 0xFFFFFFFF : 0x00000000 |
| emit "vblendvps %xmm2, %xmm1, %xmm0, %xmm0" # select b if mask is true, a if false |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_pmax, macro() |
| # f32x4.pmax - pseudo-maximum of 4 32-bit floats (a < b ? b : a) |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # Use fcmgt to compare v1 > v0, then use bsl to select |
| emit "fcmgt v18.4s, v17.4s, v16.4s" |
| emit "bsl v18.16b, v17.16b, v16.16b" |
| emit "mov v16.16b, v18.16b" |
| elsif X86_64 |
| emit "vcmpgtps %xmm0, %xmm1, %xmm2" # xmm2 = (b > a) ? 0xFFFFFFFF : 0x00000000 |
| emit "vblendvps %xmm2, %xmm1, %xmm0, %xmm0" # select b if mask is true, a if false |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0xEC 0x01 - 0xFD 0xF7 0x01: f64x2 operations |
| |
| ipintOp(_simd_f64x2_abs, macro() |
| # f64x2.abs - absolute value of 2 64-bit floats |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fabs v16.2d, v16.2d" |
| elsif X86_64 |
| # Clear sign bit by AND with 0x7FFFFFFFFFFFFFFF mask |
| emit "movabsq $0x7fffffffffffffff, %rax" |
| emit "vmovq %rax, %xmm1" |
| emit "vpunpcklqdq %xmm1, %xmm1, %xmm1" |
| emit "vandpd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_neg, macro() |
| # f64x2.neg - negate 2 64-bit floats |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fneg v16.2d, v16.2d" |
| elsif X86_64 |
| # Flip sign bit by XOR with 0x8000000000000000 mask |
| emit "movabsq $0x8000000000000000, %rax" |
| emit "vmovq %rax, %xmm1" |
| emit "vpunpcklqdq %xmm1, %xmm1, %xmm1" |
| emit "vxorpd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| reservedOpcode(0xfdee01) |
| |
| ipintOp(_simd_f64x2_sqrt, macro() |
| # f64x2.sqrt - square root of 2 64-bit floats |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fsqrt v16.2d, v16.2d" |
| elsif X86_64 |
| emit "vsqrtpd %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_add, macro() |
| # f64x2.add - add 2 64-bit floats |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fadd v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| emit "vaddpd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_sub, macro() |
| # f64x2.sub - subtract 2 64-bit floats |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fsub v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| emit "vsubpd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_mul, macro() |
| # f64x2.mul - multiply 2 64-bit floats |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fmul v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| emit "vmulpd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_div, macro() |
| # f64x2.div - divide 2 64-bit floats |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fdiv v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| emit "vdivpd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_min, macro() |
| # f64x2.min - minimum of 2 64-bit floats (IEEE 754-2008 semantics) |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fmin v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| # Wasm differs from X86_64 in terms of signed zero values and propagating NaNs |
| # so some special handling of those cases is needed. |
| # Compute result in both directions to handle NaN asymmetry |
| emit "vminpd %xmm1, %xmm0, %xmm2" # xmm2 = min(xmm0, xmm1) |
| emit "vminpd %xmm0, %xmm1, %xmm0" # xmm0 = min(xmm1, xmm0) |
| |
| # OR results to propagate sign bits and NaN bits |
| emit "vorpd %xmm0, %xmm2, %xmm2" # xmm2 = xmm0 | xmm2 |
| |
| # Canonicalize NaNs by checking for unordered values and clearing mantissa |
| emit "vcmpunordpd %xmm2, %xmm0, %xmm0" # xmm0 = NaN mask (all 1's where NaN) |
| emit "vorpd %xmm0, %xmm2, %xmm2" # xmm2 |= NaN mask |
| emit "vpsrlq $13, %xmm0, %xmm0" # Shift mask to clear mantissa bits |
| emit "vpandn %xmm2, %xmm0, %xmm0" # Clear mantissa to canonicalize NaN |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_max, macro() |
| # f64x2.max - maximum of 2 64-bit floats (IEEE 754-2008 semantics) |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fmax v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| # Wasm differs from X86_64 in terms of signed zero values and propagating NaNs |
| # so some special handling of those cases is needed. |
| # Compute result in both directions to handle NaN asymmetry |
| emit "vmaxpd %xmm1, %xmm0, %xmm2" # xmm2 = max(xmm0, xmm1) |
| emit "vmaxpd %xmm0, %xmm1, %xmm0" # xmm0 = max(xmm1, xmm0) |
| |
| # Check for discrepancies by XORing the results |
| emit "vxorpd %xmm0, %xmm2, %xmm0" # xmm0 = xmm0 ^ xmm2 |
| |
| # OR results to propagate sign bits and NaN bits |
| emit "vorpd %xmm0, %xmm2, %xmm2" # xmm2 = xmm0 | xmm2 |
| |
| # Propagate discrepancies in sign bit |
| emit "vsubpd %xmm0, %xmm2, %xmm2" # xmm2 = xmm2 - xmm0 |
| |
| # Canonicalize NaNs by checking for unordered values and clearing mantissa |
| emit "vcmpunordpd %xmm2, %xmm0, %xmm0" # xmm0 = NaN mask (all 1's where NaN) |
| emit "vpsrlq $13, %xmm0, %xmm0" # Shift mask to clear mantissa bits |
| emit "vpandn %xmm2, %xmm0, %xmm0" # Clear mantissa to canonicalize NaN |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_pmin, macro() |
| # f64x2.pmin - pseudo-minimum of 2 64-bit floats (b < a ? b : a) |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # Use fcmgt to compare v0 > v1, then use bsl to select |
| emit "fcmgt v18.2d, v16.2d, v17.2d" |
| emit "bsl v18.16b, v17.16b, v16.16b" |
| emit "mov v16.16b, v18.16b" |
| elsif X86_64 |
| emit "vcmpgtpd %xmm1, %xmm0, %xmm2" # xmm2 = (a > b) ? 0xFFFFFFFF : 0x00000000 |
| emit "vblendvpd %xmm2, %xmm1, %xmm0, %xmm0" # select b if mask is true, a if false |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_pmax, macro() |
| # f64x2.pmax - pseudo-maximum of 2 64-bit floats (a < b ? b : a) |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| # Use fcmgt to compare v1 > v0, then use bsl to select |
| emit "fcmgt v18.2d, v17.2d, v16.2d" |
| emit "bsl v18.16b, v17.16b, v16.16b" |
| emit "mov v16.16b, v18.16b" |
| elsif X86_64 |
| emit "vcmpgtpd %xmm0, %xmm1, %xmm2" # xmm2 = (b > a) ? 0xFFFFFFFF : 0x00000000 |
| emit "vblendvpd %xmm2, %xmm1, %xmm0, %xmm0" # select b if mask is true, a if false |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| # 0xFD 0xF8 0x01 - 0xFD 0xFF 0x01: trunc/convert |
| |
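| # Reference note on the x86 path below: cvttps2dq already returns 0x80000000 for |
| # NaN and out-of-range inputs, so NaN lanes are zeroed up front, lanes at or |
| # above 0x1.0p+31 are recorded in a mask, and XORing that mask into the |
| # 0x80000000 results flips just those lanes to the positive saturation value |
| # 0x7FFFFFFF; negative overflow correctly stays at 0x80000000. |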
| ipintOp(_simd_i32x4_trunc_sat_f32x4_s, macro() |
| # i32x4.trunc_sat_f32x4_s - truncate 4 f32 values to signed i32 with saturation |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fcvtzs v16.4s, v16.4s" |
| elsif X86_64 |
| # Saturation logic following MacroAssembler implementation |
| emit "vmovaps %xmm0, %xmm1" # xmm1 = src |
| emit "vcmpunordps %xmm1, %xmm1, %xmm1" # xmm1 = NaN mask |
| emit "vandnps %xmm0, %xmm1, %xmm1" # xmm1 = src with NaN lanes cleared |
| |
| # Load 0x1.0p+31f (2147483648.0f) constant |
| emit "movl $0x4f000000, %eax" # 0x1.0p+31f |
| emit "vmovd %eax, %xmm2" |
| emit "vshufps $0, %xmm2, %xmm2, %xmm2" # Broadcast to all 4 lanes |
| |
| emit "vcmpnltps %xmm2, %xmm1, %xmm3" # xmm3 = positive overflow mask (src >= 0x80000000) |
| emit "vcvttps2dq %xmm1, %xmm1" # Convert with overflow saturated to 0x80000000 |
| emit "vpxor %xmm3, %xmm1, %xmm0" # Convert positive overflow to 0x7FFFFFFF |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_trunc_sat_f32x4_u, macro() |
| # i32x4.trunc_sat_f32x4_u - truncate 4 f32 values to unsigned i32 with saturation |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fcvtzu v16.4s, v16.4s" |
| elsif X86_64 |
| # Unsigned saturation logic following MacroAssembler implementation |
| emit "vxorps %xmm1, %xmm1, %xmm1" # xmm1 = 0 |
| emit "vmaxps %xmm1, %xmm0, %xmm0" # Clear NaN and negatives |
| |
| # Load the saturation bound: INT32_MAX is not exactly representable in float32, |
| # so the constant used is 0x1.0p+31f (2147483648.0f) |
| emit "movl $0x4f000000, %eax" # 0x1.0p+31f |
| emit "vmovd %eax, %xmm2" |
| emit "vshufps $0, %xmm2, %xmm2, %xmm2" # Broadcast to all 4 lanes |
| |
| emit "vmovaps %xmm0, %xmm3" # xmm3 = src copy |
| emit "vsubps %xmm2, %xmm3, %xmm3" # xmm3 = src - 0x1.0p+31f |
| emit "vcmpnltps %xmm2, %xmm3, %xmm1" # xmm1 = mask for overflow |
| emit "vcvttps2dq %xmm3, %xmm3" # Convert (src - 0x1.0p+31f) |
| emit "vpxor %xmm1, %xmm3, %xmm3" # Saturate positive overflow to 0x7FFFFFFF |
| |
| emit "vpxor %xmm4, %xmm4, %xmm4" # xmm4 = 0 |
| emit "vpmaxsd %xmm4, %xmm3, %xmm3" # Clear negatives |
| |
| emit "vcvttps2dq %xmm0, %xmm0" # Convert original src |
| emit "vpaddd %xmm3, %xmm0, %xmm0" # Add correction |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_convert_i32x4_s, macro() |
| # f32x4.convert_i32x4_s - convert 4 signed i32 values to f32 |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "scvtf v16.4s, v16.4s" |
| elsif X86_64 |
| emit "vcvtdq2ps %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
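| # Reference note on the x86 path below: vcvtdq2ps treats lanes as signed, so an |
| # unsigned value >= 2^31 would convert wrong. Each lane is therefore split into |
| # its low 16 bits and the remaining high part; the high part (a multiple of |
| # 2^16) is halved to fit the signed range, converted, doubled, and added to the |
| # converted low part. |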
| ipintOp(_simd_f32x4_convert_i32x4_u, macro() |
| # f32x4.convert_i32x4_u - convert 4 unsigned i32 values to f32 |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "ucvtf v16.4s, v16.4s" |
| elsif X86_64 |
| # See MacroAssembler::vectorConvertUnsigned |
| emit "vpxor %xmm1, %xmm1, %xmm1" # clear scratch |
| emit "vpblendw $0x55, %xmm0, %xmm1, %xmm1" # i_low = low 16 bits of src |
| emit "vpsubd %xmm1, %xmm0, %xmm0" # i_high = high 16 bits of src |
| emit "vcvtdq2ps %xmm1, %xmm1" # f_low = convertToF32(i_low) |
| emit "vpsrld $1, %xmm0, %xmm0" # i_half_high = i_high / 2 |
| emit "vcvtdq2ps %xmm0, %xmm0" # f_half_high = convertToF32(i_half_high) |
| emit "vaddps %xmm0, %xmm0, %xmm0" # dst = f_half_high + f_half_high + f_low |
| emit "vaddps %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_trunc_sat_f64x2_s_zero, macro() |
| # i32x4.trunc_sat_f64x2_s_zero - truncate 2 f64 values to signed i32, zero upper 2 lanes |
| popVec(v0) |
| if ARM64 or ARM64E |
| # Convert f64 to signed i64 first |
| emit "fcvtzs v16.2d, v16.2d" |
| # Signed saturating extract narrow from i64 to i32 |
| emit "sqxtn v16.2s, v16.2d" |
| # Zero the upper 64 bits (lanes 2,3) |
| emit "mov v16.d[1], xzr" |
| elsif X86_64 |
| emit "vcmppd $0, %xmm0, %xmm0, %xmm1" # xmm1 = ordered comparison mask (not NaN) |
| |
| # Load 2147483647.0 constant |
| emit "movabsq $0x41dfffffffc00000, %rax" # 2147483647.0 as double |
| emit "vmovq %rax, %xmm2" |
| emit "vpunpcklqdq %xmm2, %xmm2, %xmm2" # Broadcast to both lanes |
| |
| emit "vandpd %xmm2, %xmm1, %xmm1" # xmm1 = 2147483647.0 where not NaN, 0 where NaN |
| emit "vminpd %xmm1, %xmm0, %xmm0" # Clamp to max value and handle NaN |
| emit "vcvttpd2dq %xmm0, %xmm0" # Convert to i32 (result in lower 64 bits, upper zeroed) |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
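| # Reference note on the magic-number conversion below: for 0 <= x < 2^32, the |
| # double x + 0x1.0p+52 carries x verbatim in its low 32 bits (e.g. 5.0 + 2^52 |
| # has bit pattern 0x4330000000000005), so after clamping and truncating, adding |
| # 2^52 and keeping the low dword of each lane yields the unsigned i32 result. |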
| ipintOp(_simd_i32x4_trunc_sat_f64x2_u_zero, macro() |
| # i32x4.trunc_sat_f64x2_u_zero - truncate 2 f64 values to unsigned i32, zero upper 2 lanes |
| popVec(v0) |
| if ARM64 or ARM64E |
| # Convert f64 to unsigned i64 first |
| emit "fcvtzu v16.2d, v16.2d" |
| # Unsigned saturating extract narrow from i64 to i32 |
| emit "uqxtn v16.2s, v16.2d" |
| # Zero the upper 64 bits (lanes 2,3) |
| emit "mov v16.d[1], xzr" |
| elsif X86_64 |
| # See MacroAssembler::vectorTruncSatUnsignedFloat64 |
| # Load constants: 4294967295.0 and 0x1.0p+52 |
| emit "movabsq $0x41efffffffe00000, %rax" # 4294967295.0 as double |
| emit "vmovq %rax, %xmm2" |
| emit "vpunpcklqdq %xmm2, %xmm2, %xmm2" # xmm2 = [4294967295.0, 4294967295.0] |
| |
| emit "movabsq $0x4330000000000000, %rax" # 0x1.0p+52 as double |
| emit "vmovq %rax, %xmm3" |
| emit "vpunpcklqdq %xmm3, %xmm3, %xmm3" # xmm3 = [0x1.0p+52, 0x1.0p+52] |
| |
| emit "vxorpd %xmm1, %xmm1, %xmm1" # xmm1 = 0.0 |
| emit "vmaxpd %xmm1, %xmm0, %xmm0" # Clear negatives |
| emit "vminpd %xmm2, %xmm0, %xmm0" # Clamp to 4294967295.0 |
| emit "vroundpd $3, %xmm0, %xmm0" # Truncate toward zero |
| emit "vaddpd %xmm3, %xmm0, %xmm0" # Add 0x1.0p+52 (magic number conversion) |
| emit "vshufps $0x88, %xmm1, %xmm0, %xmm0" # Pack to i32 and zero upper |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_convert_low_i32x4_s, macro() |
| # f64x2.convert_low_i32x4_s - convert lower 2 signed i32 values to f64 |
| popVec(v0) |
| if ARM64 or ARM64E |
| # Sign-extend lower 2 i32 values to i64, then convert to f64 |
| emit "sxtl v16.2d, v16.2s" |
| emit "scvtf v16.2d, v16.2d" |
| elsif X86_64 |
| emit "vcvtdq2pd %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
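| # Reference note on the x86 path below: interleaving a u32 with the high word |
| # 0x43300000 directly forms the double 0x1.0p+52 + u32 (the inverse of the |
| # magic-number trick above), so subtracting 0x1.0p+52 leaves the exactly |
| # converted value without any 64-bit integer conversion. |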
| ipintOp(_simd_f64x2_convert_low_i32x4_u, macro() |
| # f64x2.convert_low_i32x4_u - convert lower 2 unsigned i32 values to f64 |
| popVec(v0) |
| if ARM64 or ARM64E |
| # Zero-extend lower 2 i32 values to i64, then convert to f64 |
| emit "uxtl v16.2d, v16.2s" |
| emit "ucvtf v16.2d, v16.2d" |
| elsif X86_64 |
| # See MacroAssembler::vectorConvertLowUnsignedInt32 |
| # Load 0x43300000 (high32Bits) and splat to all lanes |
| emit "movl $0x43300000, %eax" |
| emit "vmovd %eax, %xmm1" |
| emit "vpshufd $0, %xmm1, %xmm1" |
| |
| # Unpack lower 2 i32 with high32Bits |
| emit "vunpcklps %xmm1, %xmm0, %xmm0" # Interleave: [i32_0, 0x43300000, i32_1, 0x43300000] |
| |
| # Load 0x1.0p+52 mask |
| emit "movabsq $0x4330000000000000, %rax" # 0x1.0p+52 as double |
| emit "vmovq %rax, %xmm1" |
| emit "vpunpcklqdq %xmm1, %xmm1, %xmm1" # xmm1 = [0x1.0p+52, 0x1.0p+52] |
| |
| # Subtract to get the correct unsigned values |
| emit "vsubpd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
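
# Note on the conversion above: it is the reverse of the 0x1.0p+52 trick. A double whose
# high 32 bits are 0x43300000 and whose low 32 bits are an unsigned integer x has the
# value 0x1.0p+52 + x exactly, so interleaving the two low i32 lanes with 0x43300000 and
# then subtracting 0x1.0p+52 produces each lane's exact f64 value without any integer
# conversion instruction.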
| |
| ################################### |
| ## Relaxed SIMD instructions ## |
| ## Opcodes 0x100 - 0x113 ## |
| ################################### |
| |
| ipintOp(_simd_i8x16_relaxed_swizzle, macro() |
| # i8x16.relaxed_swizzle - swizzle bytes (relaxed semantics: out-of-range indices are implementation defined) |
| popVec(v1) # indices |
| popVec(v0) # table |
| if ARM64 or ARM64E |
| # ARM64 tbl instruction returns 0 for out-of-range indices |
| emit "tbl v16.16b, {v16.16b}, v17.16b" |
| elsif X86_64 |
| # x86-64 vpshufb returns 0 for indices with bit 7 set |
| # For relaxed semantics, we can use it directly |
| emit "vpshufb %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_relaxed_trunc_f32x4_s, macro() |
| # i32x4.relaxed_trunc_f32x4_s - truncate f32 to signed i32 (relaxed: NaN/overflow is implementation defined) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fcvtzs v16.4s, v16.4s" |
| elsif X86_64 |
| emit "vcvttps2dq %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_relaxed_trunc_f32x4_u, macro() |
| # i32x4.relaxed_trunc_f32x4_u - truncate f32 to unsigned i32 (relaxed semantics) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fcvtzu v16.4s, v16.4s" |
| elsif X86_64 |
| # Relaxed semantics: vcvttps2dq converts to signed i32, so values >= 2^31 |
| # produce 0x80000000 (implementation-defined under relaxed spec). |
| emit "vxorps %xmm1, %xmm1, %xmm1" |
| emit "vmaxps %xmm1, %xmm0, %xmm0" |
| emit "vcvttps2dq %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_relaxed_trunc_f64x2_s_zero, macro() |
| # i32x4.relaxed_trunc_f64x2_s_zero - truncate 2 f64 to signed i32, zero upper lanes |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fcvtzs v16.2d, v16.2d" |
| emit "sqxtn v16.2s, v16.2d" |
| elsif X86_64 |
| emit "vcvttpd2dq %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_relaxed_trunc_f64x2_u_zero, macro() |
| # i32x4.relaxed_trunc_f64x2_u_zero - truncate 2 f64 to unsigned i32, zero upper lanes |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fcvtzu v16.2d, v16.2d" |
| emit "uqxtn v16.2s, v16.2d" |
| elsif X86_64 |
        # Relaxed semantics: same magic-number conversion as i32x4.trunc_sat_f64x2_u_zero,
        # but without clamping to 4294967295.0 (out-of-range results are implementation defined)
        emit "vxorpd %xmm1, %xmm1, %xmm1" # xmm1 = 0.0
        emit "vmaxpd %xmm1, %xmm0, %xmm0" # Clear negatives
        # Use magic number conversion
        emit "movabsq $0x4330000000000000, %rax" # 0x1.0p+52 as double
        emit "vmovq %rax, %xmm1"
        emit "vpunpcklqdq %xmm1, %xmm1, %xmm1" # xmm1 = [0x1.0p+52, 0x1.0p+52]
        emit "vroundpd $3, %xmm0, %xmm0" # Truncate toward zero
        emit "vaddpd %xmm1, %xmm0, %xmm0" # Add 0x1.0p+52; integers now in low 32 bits of each lane
        emit "vshufps $0x88, %xmm1, %xmm0, %xmm0" # Pack low dwords; lanes 2-3 take xmm1's low dwords, which are zero
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_relaxed_madd, macro() |
| # f32x4.relaxed_madd - fused multiply-add: a * b + c (or unfused) |
| popVec(v2) # c (addend) |
| popVec(v1) # b |
| popVec(v0) # a |
| if ARM64 or ARM64E |
| # fmla vd, vn, vm performs: vd = vd + vn * vm |
| # We want: a * b + c = v16 * v17 + v18 |
| emit "fmla v18.4s, v16.4s, v17.4s" |
| emit "mov v16.16b, v18.16b" |
| elsif X86_64 |
        # Relaxed semantics permit an unfused result, so use plain mul + add with no FMA feature check.
        # (A fused form would be vfmadd213ps: dest = (dest * src1) + src2.)
        # We have: xmm0 = a, xmm1 = b, xmm2 = c; we want a * b + c
| emit "vmulps %xmm1, %xmm0, %xmm0" |
| emit "vaddps %xmm2, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
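
# Relaxed madd note: the relaxed SIMD spec allows either a fused result (a * b + c rounded
# once) or an unfused one (a * b rounded, then + c rounded again). The ARM64 path above is
# fused (fmla); the x86_64 path is unfused, so the two can legitimately differ in the last
# bit of a lane when the intermediate product needs rounding.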
| |
| ipintOp(_simd_f32x4_relaxed_nmadd, macro() |
| # f32x4.relaxed_nmadd - fused negative multiply-add: -(a * b) + c (or unfused) |
| popVec(v2) # c (addend) |
| popVec(v1) # b |
| popVec(v0) # a |
| if ARM64 or ARM64E |
| # fmls vd, vn, vm performs: vd = vd - vn * vm |
| # We want: -(a * b) + c = c - (a * b) = v18 - v16 * v17 |
| emit "fmls v18.4s, v16.4s, v17.4s" |
| emit "mov v16.16b, v18.16b" |
| elsif X86_64 |
| # vfnmadd213ps does: dest = -(dest * src1) + src2 |
| emit "vmulps %xmm1, %xmm0, %xmm0" |
| emit "vsubps %xmm0, %xmm2, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_relaxed_madd, macro() |
| # f64x2.relaxed_madd - fused multiply-add for f64 |
| popVec(v2) # c (addend) |
| popVec(v1) # b |
| popVec(v0) # a |
| if ARM64 or ARM64E |
| emit "fmla v18.2d, v16.2d, v17.2d" |
| emit "mov v16.16b, v18.16b" |
| elsif X86_64 |
| emit "vmulpd %xmm1, %xmm0, %xmm0" |
| emit "vaddpd %xmm2, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_relaxed_nmadd, macro() |
| # f64x2.relaxed_nmadd - fused negative multiply-add for f64 |
| popVec(v2) # c (addend) |
| popVec(v1) # b |
| popVec(v0) # a |
| if ARM64 or ARM64E |
| emit "fmls v18.2d, v16.2d, v17.2d" |
| emit "mov v16.16b, v18.16b" |
| elsif X86_64 |
| emit "vmulpd %xmm1, %xmm0, %xmm0" |
| emit "vsubpd %xmm0, %xmm2, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i8x16_relaxed_laneselect, macro() |
| # i8x16.relaxed_laneselect - select lanes based on mask (relaxed: may use top bit only) |
| popVec(v2) # mask (c) |
| popVec(v1) # b (false lanes) |
| popVec(v0) # a (true lanes) |
| if ARM64 or ARM64E |
| # bsl: dest = (dest & src1) | (~dest & src2) |
| # We want: (mask & a) | (~mask & b) = (c & a) | (~c & b) |
| # Put mask in dest, then bsl with a, b |
| emit "bsl v18.16b, v16.16b, v17.16b" |
| emit "mov v16.16b, v18.16b" |
| elsif X86_64 |
| # vpblendvb uses high bit of each byte in mask |
| emit "vpblendvb %xmm2, %xmm0, %xmm1, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
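
# Laneselect computes result = (a & mask) | (b & ~mask) bitwise. When each mask lane is
# all-ones or all-zeros (the only inputs with fully specified results), ARM64's bsl and
# x86_64's vpblendvb agree; for other masks, bsl selects individual bits while vpblendvb
# looks only at the top bit of each byte, which is exactly the latitude the relaxed spec
# grants. The same sequences are reused for the 16/32/64-bit lane variants below.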
| |
| ipintOp(_simd_i16x8_relaxed_laneselect, macro() |
| # i16x8.relaxed_laneselect - same as i8x16 (works on bits) |
| popVec(v2) # mask |
| popVec(v1) # b |
| popVec(v0) # a |
| if ARM64 or ARM64E |
| emit "bsl v18.16b, v16.16b, v17.16b" |
| emit "mov v16.16b, v18.16b" |
| elsif X86_64 |
| emit "vpblendvb %xmm2, %xmm0, %xmm1, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i32x4_relaxed_laneselect, macro() |
| # i32x4.relaxed_laneselect - same as i8x16 (works on bits) |
| popVec(v2) # mask |
| popVec(v1) # b |
| popVec(v0) # a |
| if ARM64 or ARM64E |
| emit "bsl v18.16b, v16.16b, v17.16b" |
| emit "mov v16.16b, v18.16b" |
| elsif X86_64 |
| emit "vpblendvb %xmm2, %xmm0, %xmm1, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i64x2_relaxed_laneselect, macro() |
| # i64x2.relaxed_laneselect - same as i8x16 (works on bits) |
| popVec(v2) # mask |
| popVec(v1) # b |
| popVec(v0) # a |
| if ARM64 or ARM64E |
| emit "bsl v18.16b, v16.16b, v17.16b" |
| emit "mov v16.16b, v18.16b" |
| elsif X86_64 |
| emit "vpblendvb %xmm2, %xmm0, %xmm1, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_relaxed_min, macro() |
| # f32x4.relaxed_min - minimum (relaxed: NaN behavior is implementation defined) |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fmin v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vminps %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f32x4_relaxed_max, macro() |
| # f32x4.relaxed_max - maximum (relaxed: NaN behavior is implementation defined) |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fmax v16.4s, v16.4s, v17.4s" |
| elsif X86_64 |
| emit "vmaxps %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_relaxed_min, macro() |
| # f64x2.relaxed_min - minimum for f64 |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fmin v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| emit "vminpd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_f64x2_relaxed_max, macro() |
| # f64x2.relaxed_max - maximum for f64 |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "fmax v16.2d, v16.2d, v17.2d" |
| elsif X86_64 |
| emit "vmaxpd %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintOp(_simd_i16x8_relaxed_q15mulr_s, macro() |
| # i16x8.relaxed_q15mulr_s - Q15 multiply with rounding (relaxed: saturation behavior is implementation defined) |
| popVec(v1) |
| popVec(v0) |
| if ARM64 or ARM64E |
| emit "sqrdmulh v16.8h, v16.8h, v17.8h" |
| elsif X86_64 |
| emit "vpmulhrsw %xmm1, %xmm0, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
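
# q15mulr computes roughly (a * b + 0x4000) >> 15 per i16 lane. sqrdmulh and vpmulhrsw
# both implement this rounding multiply; the one divergence is INT16_MIN * INT16_MIN,
# where ARM64 saturates to 0x7fff and x86_64 wraps to 0x8000. That is precisely the case
# the relaxed spec leaves implementation defined.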
| |
| ipintOp(_simd_i16x8_relaxed_dot_i8x16_i7x16_s, macro() |
| # i16x8.relaxed_dot_i8x16_i7x16_s - dot product of signed i8 and unsigned i7, producing i16 |
    popVec(v1) # b (i7x16: lanes are in [0, 127], so signed and unsigned byte interpretations agree)
| popVec(v0) # a (signed i8x16) |
| if ARM64 or ARM64E |
        # ARM64 has no direct equivalent, so use a widening multiply + pairwise add:
        # smull:  signed multiply long (lower half)
        # smull2: signed multiply long (upper half)
        # addp:   pairwise add
| emit "smull v18.8h, v16.8b, v17.8b" |
| emit "smull2 v19.8h, v16.16b, v17.16b" |
| emit "addp v16.8h, v18.8h, v19.8h" |
| elsif X86_64 |
        # vpmaddubsw treats its first source as unsigned bytes and its second as signed bytes,
        # so pass b (the i7x16 operand, always non-negative) as the unsigned source and a as the signed one.
| emit "vpmaddubsw %xmm0, %xmm1, %xmm0" |
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
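
# For each of the 8 output lanes k, this op computes a[2k] * b[2k] + a[2k+1] * b[2k+1] as
# an i16. The b operand is i7x16 (lanes in [0, 127]), so treating it as unsigned bytes, as
# vpmaddubsw requires, yields the same products as a signed interpretation; for in-range
# inputs no i16 saturation can occur, since each pair of products is at most 2 * 127 * 127.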
| |
| ipintOp(_simd_i32x4_relaxed_dot_i8x16_i7x16_add_s, macro() |
| # i32x4.relaxed_dot_i8x16_i7x16_add_s - dot product + add |
| # Computes: sum of (a[i] * b[i]) for groups of 4, then adds c |
| popVec(v2) # c (addend) |
| popVec(v1) # b |
| popVec(v0) # a |
    if ARM64 or ARM64E
| # Fallback for generic ARM64 without guaranteed DotProd |
| emit "smull v19.8h, v16.8b, v17.8b" |
| emit "smull2 v20.8h, v16.16b, v17.16b" |
| emit "addp v19.8h, v19.8h, v20.8h" |
| emit "saddlp v19.4s, v19.8h" |
| emit "add v16.4s, v19.4s, v18.4s" |
| elsif X86_64 |
        # vpmaddubsw + vpmaddwd + vpaddd
        emit "vpmaddubsw %xmm0, %xmm1, %xmm0" # pairs of byte products summed into i16 lanes
        # Build a vector of i16 1s, then use vpmaddwd to horizontally add adjacent i16 pairs into i32
        emit "vpcmpeqd %xmm3, %xmm3, %xmm3" # xmm3 = all ones
        emit "vpsrlw $15, %xmm3, %xmm3" # xmm3 = 0x0001 in each i16 lane
        emit "vpmaddwd %xmm3, %xmm0, %xmm0" # sum adjacent i16 pairs into i32
        emit "vpaddd %xmm2, %xmm0, %xmm0" # add the accumulator c
| else |
| break # Not implemented |
| end |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| end) |
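
# The x86_64 sequence above forms the byte products with vpmaddubsw (pairs summed into i16
# lanes), then multiplies by a vector of i16 1s with vpmaddwd, which horizontally adds each
# adjacent pair of i16 values into an i32. The net effect is four lane sums of four byte
# products each, to which the accumulator c is then added with vpaddd.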
| |
| ######################### |
| ## Atomic instructions ## |
| ######################### |
| |
macro noAlignmentCheck(mem, label)
    # Intentionally empty: single-byte accesses are always naturally aligned, so there is nothing to check.
end
| |
| macro checkAlignment2(mem, label) |
| btpnz mem, 1, label |
| end |
| |
| macro checkAlignment4(mem, label) |
| btpnz mem, 3, label |
| end |
| |
| macro checkAlignment8(mem, label) |
| btpnz mem, 7, label |
| end |
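
# Weak CAS read-modify-write loops, one per access width. The x86_64 path loads the old
# value, applies fn to produce the new value, and retries on a failed weak compare-and-swap
# (which also refreshes the old value). The other path uses a load-linked /
# store-conditional loop and retries while the store-conditional reports failure in ws2.
# Roughly (illustrative C; these names are not part of this file):
#     do { old = *mem; new = fn(val, old); } while (!weakCAS(mem, old, new));
# Note that fn takes (value, dst) on x86_64 and (value, oldValue, newValue) elsewhere.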
| |
| macro weakCASLoopByte(mem, value, scratch1AndOldValue, scratch2, fn) |
| validateOpcodeConfig(scratch1AndOldValue) |
| if X86_64 |
| loadb [mem], scratch1AndOldValue |
| .loop: |
| move scratch1AndOldValue, scratch2 |
| fn(value, scratch2) |
| batomicweakcasb scratch1AndOldValue, scratch2, [mem], .loop |
| else |
| .loop: |
| loadlinkacqb [mem], scratch1AndOldValue |
| fn(value, scratch1AndOldValue, scratch2) |
| storecondrelb ws2, scratch2, [mem] |
| bineq ws2, 0, .loop |
| end |
| end |
| |
| macro weakCASLoopHalf(mem, value, scratch1AndOldValue, scratch2, fn) |
| validateOpcodeConfig(scratch1AndOldValue) |
| if X86_64 |
| loadh [mem], scratch1AndOldValue |
| .loop: |
| move scratch1AndOldValue, scratch2 |
| fn(value, scratch2) |
| batomicweakcash scratch1AndOldValue, scratch2, [mem], .loop |
| else |
| .loop: |
| loadlinkacqh [mem], scratch1AndOldValue |
| fn(value, scratch1AndOldValue, scratch2) |
| storecondrelh ws2, scratch2, [mem] |
| bineq ws2, 0, .loop |
| end |
| end |
| |
| macro weakCASLoopInt(mem, value, scratch1AndOldValue, scratch2, fn) |
| validateOpcodeConfig(scratch1AndOldValue) |
| if X86_64 |
| loadi [mem], scratch1AndOldValue |
| .loop: |
| move scratch1AndOldValue, scratch2 |
| fn(value, scratch2) |
| batomicweakcasi scratch1AndOldValue, scratch2, [mem], .loop |
| else |
| .loop: |
| loadlinkacqi [mem], scratch1AndOldValue |
| fn(value, scratch1AndOldValue, scratch2) |
| storecondreli ws2, scratch2, [mem] |
| bineq ws2, 0, .loop |
| end |
| end |
| |
| macro weakCASLoopQuad(mem, value, scratch1AndOldValue, scratch2, fn) |
| validateOpcodeConfig(scratch1AndOldValue) |
| if X86_64 |
| loadq [mem], scratch1AndOldValue |
| .loop: |
| move scratch1AndOldValue, scratch2 |
| fn(value, scratch2) |
| batomicweakcasq scratch1AndOldValue, scratch2, [mem], .loop |
| else |
| .loop: |
| loadlinkacqq [mem], scratch1AndOldValue |
| fn(value, scratch1AndOldValue, scratch2) |
| storecondrelq ws2, scratch2, [mem] |
| bineq ws2, 0, .loop |
| end |
| end |
| |
| macro doI32AtomicLoad(mem, dst) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| if ARM64 or ARM64E or X86_64 |
| atomicloadi [mem], dst |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicLoad(mem, dst) |
| checkAlignment8(mem, _ipint_throw_UnalignedMemoryAccess) |
| if ARM64 or ARM64E or X86_64 |
| atomicloadq [mem], dst |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicLoad8(mem, dst) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| if ARM64 or ARM64E or X86_64 |
| atomicloadb [mem], dst |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicLoad16(mem, dst) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| if ARM64 or ARM64E or X86_64 |
| atomicloadh [mem], dst |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicLoad8(mem, dst) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| if ARM64 or ARM64E or X86_64 |
| atomicloadb [mem], dst |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicLoad16(mem, dst) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| if ARM64 or ARM64E or X86_64 |
| atomicloadh [mem], dst |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicLoad32(mem, dst) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| if ARM64 or ARM64E or X86_64 |
| atomicloadi [mem], dst |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicStore(mem, val, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgi val, [memCopy], val |
| elsif X86_64 |
| atomicxchgi val, [memCopy] |
| elsif ARM64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end |
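
# In the store and RMW helpers, the ARM64E branch uses a single atomic instruction
# (atomicxchg*, atomicxchgadd*, ...), while plain ARM64 uses the weak-CAS loops above.
# The likely reason (an assumption, not stated in this file) is that ARM64E targets can
# rely on ARMv8.1 LSE atomics, whereas a generic ARM64 build may not.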
| |
| macro doI64AtomicStore(mem, val, memCopy, scratch) |
| checkAlignment8(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgq val, [memCopy], val |
| elsif X86_64 |
| atomicxchgq val, [memCopy] |
| elsif ARM64 |
| weakCASLoopQuad(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicStore8(mem, val, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgb val, [memCopy], val |
| elsif X86_64 |
| atomicxchgb val, [memCopy] |
| elsif ARM64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicStore16(mem, val, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgh val, [memCopy], val |
| elsif X86_64 |
| atomicxchgh val, [memCopy] |
| elsif ARM64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicStore8(mem, val, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgb val, [memCopy], val |
| elsif X86_64 |
| atomicxchgb val, [memCopy] |
| elsif ARM64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicStore16(mem, val, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgh val, [memCopy], val |
| elsif X86_64 |
| atomicxchgh val, [memCopy] |
| elsif ARM64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicStore32(mem, val, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgi val, [memCopy], val |
| elsif X86_64 |
| atomicxchgi val, [memCopy] |
| elsif ARM64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwAdd(mem, val, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgaddi val, [memCopy], mem |
| elsif X86_64 |
| atomicxchgaddi val, [memCopy] |
| move val, mem |
| elsif ARM64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| addi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwAdd(mem, val, memCopy, scratch) |
| checkAlignment8(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgaddq val, [memCopy], mem |
| elsif X86_64 |
| atomicxchgaddq val, [memCopy] |
| move val, mem |
| elsif ARM64 |
| weakCASLoopQuad(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| addq value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwAdd8(mem, val, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgaddb val, [memCopy], mem |
| elsif X86_64 |
| atomicxchgaddb val, [memCopy] |
| move val, mem |
| andi 0xff, mem |
| elsif ARM64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| addi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwAdd16(mem, val, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgaddh val, [memCopy], mem |
| elsif X86_64 |
| atomicxchgaddh val, [memCopy] |
| move val, mem |
| andi 0xffff, mem |
| elsif ARM64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| addi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwAdd8(mem, val, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgaddb val, [memCopy], mem |
| elsif X86_64 |
| atomicxchgaddb val, [memCopy] |
| move val, mem |
| andi 0xff, mem |
| elsif ARM64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| addi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwAdd16(mem, val, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgaddh val, [memCopy], mem |
| elsif X86_64 |
| atomicxchgaddh val, [memCopy] |
| move val, mem |
| andi 0xffff, mem |
| elsif ARM64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| addi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwAdd32(mem, val, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgaddi val, [memCopy], mem |
| elsif X86_64 |
| atomicxchgaddi val, [memCopy] |
| move val, mem |
| ori 0, mem |
| elsif ARM64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| addi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwSub(mem, val, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| negi val |
| atomicxchgaddi val, [memCopy], mem |
| elsif X86_64 |
| negi val |
| atomicxchgaddi val, [memCopy] |
| move val, mem |
| elsif ARM64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| subi oldValue, value, newValue |
| end) |
| else |
| error |
| end |
| end |
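
# Atomic sub is implemented as an atomic add of the negated operand: in two's complement,
# fetch_sub(mem, x) and fetch_add(mem, -x) store the same new value and return the same
# old value, so negating val and reusing the add path is equivalent.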
| |
| macro doI64AtomicRmwSub(mem, val, memCopy, scratch) |
| checkAlignment8(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| negq val |
| atomicxchgaddq val, [memCopy], mem |
| elsif X86_64 |
| negq val |
| atomicxchgaddq val, [memCopy] |
| move val, mem |
| elsif ARM64 |
| weakCASLoopQuad(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| subq oldValue, value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwSub8(mem, val, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| negi val |
| atomicxchgaddb val, [memCopy], mem |
| elsif X86_64 |
| negi val |
| atomicxchgaddb val, [memCopy] |
| move val, mem |
| andi 0xff, mem |
| elsif ARM64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| subi oldValue, value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwSub16(mem, val, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| negi val |
| atomicxchgaddh val, [memCopy], mem |
| elsif X86_64 |
| negi val |
| atomicxchgaddh val, [memCopy] |
| move val, mem |
| andi 0xffff, mem |
| elsif ARM64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| subi oldValue, value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwSub8(mem, val, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| negq val |
| atomicxchgaddb val, [memCopy], mem |
| elsif X86_64 |
| negq val |
| atomicxchgaddb val, [memCopy] |
| move val, mem |
| andi 0xff, mem |
| elsif ARM64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| subi oldValue, value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwSub16(mem, val, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| negq val |
| atomicxchgaddh val, [memCopy], mem |
| elsif X86_64 |
| negq val |
| atomicxchgaddh val, [memCopy] |
| move val, mem |
| andi 0xffff, mem |
| elsif ARM64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| subi oldValue, value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwSub32(mem, val, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| negq val |
| atomicxchgaddi val, [memCopy], mem |
| elsif X86_64 |
| negq val |
| atomicxchgaddi val, [memCopy] |
| move val, mem |
| ori 0, mem |
| elsif ARM64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| subi oldValue, value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwAnd(mem, val, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| noti val |
| atomicxchgcleari val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro (value, dst) |
| andq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| andi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
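
# Atomic and: the ARM64E path complements val and uses atomicxchgclear, which performs
# mem &= ~operand, so the stored value is mem & val as required (the underlying LSE
# instruction set provides an atomic bit-clear but no atomic AND). On x86_64 a
# lock-prefixed AND does not return the old value, so the weak-CAS loop is used instead.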
| |
| macro doI64AtomicRmwAnd(mem, val, memCopy, scratch) |
| checkAlignment8(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| notq val |
| atomicxchgclearq val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopQuad(memCopy, val, mem, scratch, macro (value, dst) |
| andq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopQuad(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| andq value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwAnd8(mem, val, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| noti val |
| atomicxchgclearb val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro (value, dst) |
| andq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| andi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwAnd16(mem, val, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| noti val |
| atomicxchgclearh val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro (value, dst) |
| andq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| andi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwAnd8(mem, val, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| notq val |
| atomicxchgclearb val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro (value, dst) |
| andq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| andi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwAnd16(mem, val, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| notq val |
| atomicxchgclearh val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro (value, dst) |
| andq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| andi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwAnd32(mem, val, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| notq val |
| atomicxchgcleari val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro (value, dst) |
| andq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| andi value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwOr(mem, val, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgori val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro (value, dst) |
| ori value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| ori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwOr(mem, val, memCopy, scratch) |
| checkAlignment8(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgorq val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopQuad(memCopy, val, mem, scratch, macro (value, dst) |
| orq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopQuad(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| orq value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwOr8(mem, val, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgorb val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro (value, dst) |
| orq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| ori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwOr16(mem, val, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgorh val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro (value, dst) |
| orq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| ori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwOr8(mem, val, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgorb val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro (value, dst) |
| orq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| ori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwOr16(mem, val, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgorh val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro (value, dst) |
| orq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| ori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwOr32(mem, val, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgori val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro (value, dst) |
| orq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| ori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwXor(mem, val, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgxori val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro (value, dst) |
| xorq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| xori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwXor(mem, val, memCopy, scratch) |
| checkAlignment8(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgxorq val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopQuad(memCopy, val, mem, scratch, macro (value, dst) |
| xorq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopQuad(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| xorq value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwXor8(mem, val, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgxorb val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro (value, dst) |
| xorq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| xori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwXor16(mem, val, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgxorh val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro (value, dst) |
| xorq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| xori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwXor8(mem, val, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgxorb val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro (value, dst) |
| xorq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| xori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwXor16(mem, val, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgxorh val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro (value, dst) |
| xorq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| xori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwXor32(mem, val, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgxori val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro (value, dst) |
| xorq value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| xori value, oldValue, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwXchg(mem, val, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgi val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro (value, dst) |
| move value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwXchg(mem, val, memCopy, scratch) |
| checkAlignment8(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgq val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopQuad(memCopy, val, mem, scratch, macro (value, dst) |
| move value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopQuad(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwXchg8(mem, val, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgb val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro (value, dst) |
| move value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicRmwXchg16(mem, val, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgh val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro (value, dst) |
| move value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwXchg8(mem, val, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgb val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro (value, dst) |
| move value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopByte(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwXchg16(mem, val, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgh val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro (value, dst) |
| move value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopHalf(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicRmwXchg32(mem, val, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| if ARM64E |
| atomicxchgi val, [memCopy], mem |
| elsif X86_64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro (value, dst) |
| move value, dst |
| end) |
| elsif ARM64 |
| weakCASLoopInt(memCopy, val, mem, scratch, macro(value, oldValue, newValue) |
| move value, newValue |
| end) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicCmpxchg(mem, expected, newVal, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| move expected, mem |
| andq 0xffffffff, mem |
| if ARM64E or X86_64 |
| atomicweakcasi mem, newVal, [memCopy] |
| elsif ARM64 |
| weakCASExchangeInt(memCopy, newVal, mem, scratch, expected) |
| else |
| error |
| end |
| end |
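
# For the cmpxchg helpers here and below, the expected value is moved into the scratch
# register and masked to the access width before the compare-and-swap: memory holds a
# zero-extended value at that width, so the expected operand must be zero-extended the
# same way for the comparison to behave as Wasm specifies.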
| |
| macro doI64AtomicCmpxchg(mem, expected, newVal, memCopy, scratch) |
| checkAlignment8(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| move expected, mem |
| if ARM64E or X86_64 |
| atomicweakcasq mem, newVal, [memCopy] |
| elsif ARM64 |
| weakCASExchangeQuad(memCopy, newVal, mem, scratch, expected) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicCmpxchg8(mem, expected, newVal, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| move expected, mem |
| andq 0xff, mem |
| if ARM64E or X86_64 |
| atomicweakcasb mem, newVal, [memCopy] |
| elsif ARM64 |
| weakCASExchangeByte(memCopy, newVal, mem, scratch, expected) |
| else |
| error |
| end |
| end |
| |
| macro doI32AtomicCmpxchg16(mem, expected, newVal, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| move expected, mem |
| andq 0xffff, mem |
| if ARM64E or X86_64 |
| atomicweakcash mem, newVal, [memCopy] |
| elsif ARM64 |
| weakCASExchangeHalf(memCopy, newVal, mem, scratch, expected) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicCmpxchg8(mem, expected, newVal, memCopy, scratch) |
| noAlignmentCheck(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| move expected, mem |
| andq 0xff, mem |
| if ARM64E or X86_64 |
| atomicweakcasb mem, newVal, [memCopy] |
| elsif ARM64 |
| weakCASExchangeByte(memCopy, newVal, mem, scratch, expected) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicCmpxchg16(mem, expected, newVal, memCopy, scratch) |
| checkAlignment2(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| move expected, mem |
| andq 0xffff, mem |
| if ARM64E or X86_64 |
| atomicweakcash mem, newVal, [memCopy] |
| elsif ARM64 |
| weakCASExchangeHalf(memCopy, newVal, mem, scratch, expected) |
| else |
| error |
| end |
| end |
| |
| macro doI64AtomicCmpxchg32(mem, expected, newVal, memCopy, scratch) |
| checkAlignment4(mem, _ipint_throw_UnalignedMemoryAccess) |
| move mem, memCopy |
| move expected, mem |
| andq 0xffffffff, mem |
| if ARM64E or X86_64 |
| atomicweakcasi mem, newVal, [memCopy] |
| elsif ARM64 |
| weakCASExchangeInt(memCopy, newVal, mem, scratch, expected) |
| else |
| error |
| end |
| end |
| |
| ipintAtomicOp(_memory_atomic_notify, macro() |
| # starting at sp: count, pointer |
| loadb IPInt::AtomicMemoryAccessMetadata::memoryIndex[MC], t0 |
| pushInt32(t0) |
| loadq IPInt::AtomicMemoryAccessMetadata::offset[MC], t0 |
| pushInt32(t0) # offset |
| |
| move sp, a1 |
| |
| operationCall(macro() cCall2(_ipint_extern_memory_atomic_notify) end) |
| bilt r0, 0, _ipint_throw_OutOfBoundsMemoryAccess |
| |
| addq (StackValueSize * 4), sp |
| |
| pushInt32(r0) |
| loadb IPInt::AtomicMemoryAccessMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::AtomicMemoryAccessMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_memory_atomic_wait32, macro() |
| # starting at sp: timeout, value, pointer |
| loadb IPInt::AtomicMemoryAccessMetadata::memoryIndex[MC], t0 |
| pushInt32(t0) |
| loadq (StackValueSize * 3)[sp], t0 |
| loadq IPInt::AtomicMemoryAccessMetadata::offset[MC], t1 |
| addq t1, t0 |
| storeq t0, (StackValueSize * 3)[sp] # replace pointer with pointer + offset |
| |
| # Push callee/cfr/PC/MC for debugger; operands shift to args[4..7]. |
| subq (StackValueSize * 4), sp |
| storeq ws0, (StackValueSize * 0)[sp] # args[0] = IPIntCallee* |
| storeq cfr, (StackValueSize * 1)[sp] # args[1] = cfr |
| storeq PC, (StackValueSize * 2)[sp] # args[2] = PC |
| storeq MC, (StackValueSize * 3)[sp] # args[3] = MC |
| |
| move sp, a1 |
| |
| operationCall(macro() cCall2(_ipint_extern_memory_atomic_wait32) end) |
| bilt r0, 0, _ipint_throw_OutOfBoundsMemoryAccess |
| |
| addq (StackValueSize * 8), sp |
| |
| pushInt32(r0) |
| loadb IPInt::AtomicMemoryAccessMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::AtomicMemoryAccessMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_memory_atomic_wait64, macro() |
| # starting at sp: timeout, value, pointer |
| loadb IPInt::AtomicMemoryAccessMetadata::memoryIndex[MC], t0 |
| pushInt32(t0) |
| loadq (StackValueSize * 3)[sp], t0 |
| loadq IPInt::AtomicMemoryAccessMetadata::offset[MC], t1 |
| addq t1, t0 |
| storeq t0, (StackValueSize * 3)[sp] # replace pointer with pointer + offset |
| |
| # Push callee/cfr/PC/MC for debugger; operands shift to args[4..7]. |
| subq (StackValueSize * 4), sp |
| storeq ws0, (StackValueSize * 0)[sp] # args[0] = IPIntCallee* |
| storeq cfr, (StackValueSize * 1)[sp] # args[1] = cfr |
| storeq PC, (StackValueSize * 2)[sp] # args[2] = PC |
| storeq MC, (StackValueSize * 3)[sp] # args[3] = MC |
| |
| move sp, a1 |
| |
| operationCall(macro() cCall2(_ipint_extern_memory_atomic_wait64) end) |
| bilt r0, 0, _ipint_throw_OutOfBoundsMemoryAccess |
| |
| addq (StackValueSize * 8), sp |
| |
| pushInt32(r0) |
| loadb IPInt::AtomicMemoryAccessMetadata::instructionLength[MC], t0 |
| advancePCByReg(t0) |
| advanceMC(constexpr (sizeof(IPInt::AtomicMemoryAccessMetadata))) |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_atomic_fence, macro() |
| fence |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| reservedAtomicOpcode(atomic_0x4) |
| reservedAtomicOpcode(atomic_0x5) |
| reservedAtomicOpcode(atomic_0x6) |
| reservedAtomicOpcode(atomic_0x7) |
| reservedAtomicOpcode(atomic_0x8) |
| reservedAtomicOpcode(atomic_0x9) |
| reservedAtomicOpcode(atomic_0xa) |
| reservedAtomicOpcode(atomic_0xb) |
| reservedAtomicOpcode(atomic_0xc) |
| reservedAtomicOpcode(atomic_0xd) |
| reservedAtomicOpcode(atomic_0xe) |
| reservedAtomicOpcode(atomic_0xf) |
| |
| ipintAtomicOp(_i32_atomic_load, macro() |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i32_atomic_load_slow_path) |
| doI32AtomicLoad(t0, t2) |
| pushInt32(t2) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_load, macro() |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .ipint_i64_atomic_load_slow_path) |
| doI64AtomicLoad(t0, t2) |
| pushInt64(t2) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_load8_u, macro() |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i32_atomic_load8_u_slow_path) |
| doI32AtomicLoad8(t0, t2) |
| pushInt32(t2) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_load16_u, macro() |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i32_atomic_load16_u_slow_path) |
| doI32AtomicLoad16(t0, t2) |
| pushInt32(t2) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_load8_u, macro() |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i64_atomic_load8_u_slow_path) |
| doI64AtomicLoad8(t0, t2) |
| pushInt64(t2) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_load16_u, macro() |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i64_atomic_load16_u_slow_path) |
| doI64AtomicLoad16(t0, t2) |
| pushInt64(t2) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_load32_u, macro() |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i64_atomic_load32_u_slow_path) |
| doI64AtomicLoad32(t0, t2) |
| pushInt64(t2) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_store, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i32_atomic_store_slow_path) |
| doI32AtomicStore(t0, t3, t2, t1) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_store, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .ipint_i64_atomic_store_slow_path) |
| doI64AtomicStore(t0, t3, t2, t1) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_store8_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i32_atomic_store8_u_slow_path) |
| doI32AtomicStore8(t0, t3, t2, t1) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_store16_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i32_atomic_store16_u_slow_path) |
| doI32AtomicStore16(t0, t3, t2, t1) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_store8_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i64_atomic_store8_u_slow_path) |
| doI64AtomicStore8(t0, t3, t2, t1) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_store16_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i64_atomic_store16_u_slow_path) |
| doI64AtomicStore16(t0, t3, t2, t1) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_store32_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i64_atomic_store32_u_slow_path) |
| doI64AtomicStore32(t0, t3, t2, t1) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw_add, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i32_atomic_rmw_add_slow_path) |
| doI32AtomicRmwAdd(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw_add, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .ipint_i64_atomic_rmw_add_slow_path) |
| doI64AtomicRmwAdd(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw8_add_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i32_atomic_rmw8_add_u_slow_path) |
| doI32AtomicRmwAdd8(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw16_add_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i32_atomic_rmw16_add_u_slow_path) |
| doI32AtomicRmwAdd16(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw8_add_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i64_atomic_rmw8_add_u_slow_path) |
| doI64AtomicRmwAdd8(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw16_add_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i64_atomic_rmw16_add_u_slow_path) |
| doI64AtomicRmwAdd16(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw32_add_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i64_atomic_rmw32_add_u_slow_path) |
| doI64AtomicRmwAdd32(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw_sub, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i32_atomic_rmw_sub_slow_path) |
| doI32AtomicRmwSub(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw_sub, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .ipint_i64_atomic_rmw_sub_slow_path) |
| doI64AtomicRmwSub(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw8_sub_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i32_atomic_rmw8_sub_u_slow_path) |
| doI32AtomicRmwSub8(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw16_sub_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i32_atomic_rmw16_sub_u_slow_path) |
| doI32AtomicRmwSub16(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw8_sub_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i64_atomic_rmw8_sub_u_slow_path) |
| doI64AtomicRmwSub8(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw16_sub_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i64_atomic_rmw16_sub_u_slow_path) |
| doI64AtomicRmwSub16(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw32_sub_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i64_atomic_rmw32_sub_u_slow_path) |
| doI64AtomicRmwSub32(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw_and, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i32_atomic_rmw_and_slow_path) |
| doI32AtomicRmwAnd(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw_and, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .ipint_i64_atomic_rmw_and_slow_path) |
| doI64AtomicRmwAnd(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw8_and_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i32_atomic_rmw8_and_u_slow_path) |
| doI32AtomicRmwAnd8(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw16_and_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i32_atomic_rmw16_and_u_slow_path) |
| doI32AtomicRmwAnd16(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw8_and_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i64_atomic_rmw8_and_u_slow_path) |
| doI64AtomicRmwAnd8(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw16_and_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i64_atomic_rmw16_and_u_slow_path) |
| doI64AtomicRmwAnd16(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw32_and_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i64_atomic_rmw32_and_u_slow_path) |
| doI64AtomicRmwAnd32(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw_or, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i32_atomic_rmw_or_slow_path) |
| doI32AtomicRmwOr(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw_or, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .ipint_i64_atomic_rmw_or_slow_path) |
| doI64AtomicRmwOr(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw8_or_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i32_atomic_rmw8_or_u_slow_path) |
| doI32AtomicRmwOr8(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw16_or_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i32_atomic_rmw16_or_u_slow_path) |
| doI32AtomicRmwOr16(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw8_or_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i64_atomic_rmw8_or_u_slow_path) |
| doI64AtomicRmwOr8(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw16_or_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i64_atomic_rmw16_or_u_slow_path) |
| doI64AtomicRmwOr16(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw32_or_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i64_atomic_rmw32_or_u_slow_path) |
| doI64AtomicRmwOr32(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw_xor, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i32_atomic_rmw_xor_slow_path) |
| doI32AtomicRmwXor(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw_xor, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .ipint_i64_atomic_rmw_xor_slow_path) |
| doI64AtomicRmwXor(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw8_xor_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i32_atomic_rmw8_xor_u_slow_path) |
| doI32AtomicRmwXor8(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw16_xor_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i32_atomic_rmw16_xor_u_slow_path) |
| doI32AtomicRmwXor16(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw8_xor_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i64_atomic_rmw8_xor_u_slow_path) |
| doI64AtomicRmwXor8(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw16_xor_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i64_atomic_rmw16_xor_u_slow_path) |
| doI64AtomicRmwXor16(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw32_xor_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i64_atomic_rmw32_xor_u_slow_path) |
| doI64AtomicRmwXor32(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw_xchg, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i32_atomic_rmw_xchg_slow_path) |
| doI32AtomicRmwXchg(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw_xchg, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .ipint_i64_atomic_rmw_xchg_slow_path) |
| doI64AtomicRmwXchg(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw8_xchg_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i32_atomic_rmw8_xchg_u_slow_path) |
| doI32AtomicRmwXchg8(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw16_xchg_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i32_atomic_rmw16_xchg_u_slow_path) |
| doI32AtomicRmwXchg16(t0, t3, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw8_xchg_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i64_atomic_rmw8_xchg_u_slow_path) |
| doI64AtomicRmwXchg8(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw16_xchg_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i64_atomic_rmw16_xchg_u_slow_path) |
| doI64AtomicRmwXchg16(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw32_xchg_u, macro() |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i64_atomic_rmw32_xchg_u_slow_path) |
| doI64AtomicRmwXchg32(t0, t3, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
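
# The weakCASExchange* macros below implement an atomic compare-and-exchange using ARM64
# load-linked / store-conditional pairs. The value observed in memory ends up in `expected` on
# both paths; on a mismatch, the observed value is written back via store-conditional and the
# loop retries until the store-conditional succeeds. A hedged C11 analogue of the contract
# callers rely on (illustrative only, not the actual codegen):
#
#   #include <stdatomic.h>
#   #include <stdint.h>
#
#   // Memory becomes `value` only if it held `expected`; the value observed in memory
#   // is returned (the macros leave it in the `expected` register).
#   static uint64_t casExchange(_Atomic uint64_t* mem, uint64_t value, uint64_t expected)
#   {
#       uint64_t observed = expected;
#       atomic_compare_exchange_strong(mem, &observed, value);
#       return observed;
#   }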
| |
| macro weakCASExchangeByte(mem, value, expected, scratch, scratch2) |
| if ARM64 |
| validateOpcodeConfig(scratch2) |
| .loop: |
| loadlinkacqb [mem], scratch2 |
| bqneq expected, scratch2, .fail |
| storecondrelb scratch, value, [mem] |
| bieq scratch, 0, .done |
| jmp .loop |
| .fail: |
| storecondrelb scratch, scratch2, [mem] |
| bieq scratch, 0, .done |
| jmp .loop |
| .done: |
| move scratch2, expected |
| else |
| error |
| end |
| end |
| |
| macro weakCASExchangeHalf(mem, value, expected, scratch, scratch2) |
| if ARM64 |
| validateOpcodeConfig(scratch2) |
| .loop: |
| loadlinkacqh [mem], scratch2 |
| bqneq expected, scratch2, .fail |
| storecondrelh scratch, value, [mem] |
| bieq scratch, 0, .done |
| jmp .loop |
| .fail: |
| storecondrelh scratch, scratch2, [mem] |
| bieq scratch, 0, .done |
| jmp .loop |
| .done: |
| move scratch2, expected |
| else |
| error |
| end |
| end |
| |
| macro weakCASExchangeInt(mem, value, expected, scratch, scratch2) |
| if ARM64 |
| validateOpcodeConfig(scratch2) |
| .loop: |
| loadlinkacqi [mem], scratch2 |
| bqneq expected, scratch2, .fail |
| storecondreli scratch, value, [mem] |
| bieq scratch, 0, .done |
| jmp .loop |
| .fail: |
| storecondreli scratch, scratch2, [mem] |
| bieq scratch, 0, .done |
| jmp .loop |
| .done: |
| move scratch2, expected |
| else |
| error |
| end |
| end |
| |
| macro weakCASExchangeQuad(mem, value, expected, scratch, scratch2) |
| if ARM64 |
| validateOpcodeConfig(scratch2) |
| .loop: |
| loadlinkacqq [mem], scratch2 |
| bqneq expected, scratch2, .fail |
| storecondrelq scratch, value, [mem] |
| bieq scratch, 0, .done |
| jmp .loop |
| .fail: |
| storecondrelq scratch, scratch2, [mem] |
| bieq scratch, 0, .done |
| jmp .loop |
| .done: |
| move scratch2, expected |
| else |
| error |
| end |
| end |
| |
| ipintAtomicOp(_i32_atomic_rmw_cmpxchg, macro() |
| popInt64(t7) |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i32_atomic_rmw_cmpxchg_slow_path) |
| doI32AtomicCmpxchg(t0, t3, t7, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw_cmpxchg, macro() |
| popInt64(t7) |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 8, t1, t2, .ipint_i64_atomic_rmw_cmpxchg_slow_path) |
| doI64AtomicCmpxchg(t0, t3, t7, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw8_cmpxchg_u, macro() |
| popInt64(t7) |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i32_atomic_rmw8_cmpxchg_u_slow_path) |
| doI32AtomicCmpxchg8(t0, t3, t7, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i32_atomic_rmw16_cmpxchg_u, macro() |
| popInt64(t7) |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i32_atomic_rmw16_cmpxchg_u_slow_path) |
| doI32AtomicCmpxchg16(t0, t3, t7, t2, t1) |
| pushInt32(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw8_cmpxchg_u, macro() |
| popInt64(t7) |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 1, t1, t2, .ipint_i64_atomic_rmw8_cmpxchg_u_slow_path) |
| doI64AtomicCmpxchg8(t0, t3, t7, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw16_cmpxchg_u, macro() |
| popInt64(t7) |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 2, t1, t2, .ipint_i64_atomic_rmw16_cmpxchg_u_slow_path) |
| doI64AtomicCmpxchg16(t0, t3, t7, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ipintAtomicOp(_i64_atomic_rmw32_cmpxchg_u, macro() |
| popInt64(t7) |
| popInt64(t3) |
| popMemoryIndex(t0) |
| loadStoreMakePointerFast([t4], 1[t4], t0, 4, t1, t2, .ipint_i64_atomic_rmw32_cmpxchg_u_slow_path) |
| doI64AtomicCmpxchg32(t0, t3, t7, t2, t1) |
| pushInt64(t0) |
| leap 2[t4], PC |
| nextIPIntInstruction() |
| end) |
| |
| ####################################### |
| ## ULEB128 decoding logic for locals ## |
| ####################################### |
| |
| .ipint_local_get_slow_path: |
| leap 1[PC], t4 |
| decodeLEBVarUInt(t0, t4, t1, t2) |
| localGet() |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_local_set_slow_path: |
| leap 1[PC], t4 |
| decodeLEBVarUInt(t0, t4, t1, t2) |
| popVec(v0) |
| localSet() |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_local_tee_slow_path: |
| leap 1[PC], t4 |
| decodeLEBVarUInt(t0, t4, t1, t2) |
| loadv [sp], v0 |
| localSet() |
| move t4, PC |
| nextIPIntInstruction() |
| |
| ########################################## |
| ## Out-of-line LEB128 decode slow paths ## |
| ########################################## |
| |
| .ipint_i32_const_slow_path: |
| leap 1[PC], t4 |
| decodeLEBVarSInt32(t0, t4, t1, t2) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_const_slow_path: |
| leap 1[PC], t4 |
| decodeLEBVarSInt64(t0, t4, t1, t2) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| ################################################## |
| ## Out-of-line slow paths for memory load/store ## |
| ################################################## |
| |
| # The handler's fast path pops values and branches here on multi-byte memarg. |
| # t0 = wasm address (from popMemoryIndex), t3 = data value (for int stores), |
| # ft0 = data value (for float stores). These must survive loadStoreMakePointerSlow. |
| # For int stores, t3 is saved/restored around the macro since t3 is used as scratch. |
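#
# Conceptually, each slow path decodes the multi-byte memarg offset, adds it to the popped
# address, bounds-checks the access, and leaves a native pointer in t0. A hedged C sketch of
# the addressing rule (names are illustrative, not the macro's internals):
#
#   #include <stdint.h>
#
#   static uint8_t* makePointer(uint8_t* base, uint64_t memSize, uint64_t addr,
#                               uint64_t offset, uint64_t width)
#   {
#       uint64_t ea = addr + offset;
#       if (ea < addr)                                // 64-bit overflow
#           trap(OutOfBoundsMemoryAccess);            // hypothetical trap helper
#       if (memSize < width || ea > memSize - width)  // access would run past the end
#           trap(OutOfBoundsMemoryAccess);
#       return base + ea;
#   }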
| |
| .ipint_i32_load_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| loadi [t0], t1 |
| pushInt32(t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_load_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| loadq [t0], t1 |
| pushInt64(t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_f32_load_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| loadf [t0], ft0 |
| pushFloat32(ft0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_f64_load_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| loadd [t0], ft0 |
| pushFloat64(ft0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_load8s_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| loadbsi [t0], t1 |
| pushInt32(t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_load8u_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| loadb [t0], t1 |
| pushInt32(t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_load16s_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| loadhsi [t0], t1 |
| pushInt32(t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_load16u_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| loadh [t0], t1 |
| pushInt32(t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_load8s_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| loadbsq [t0], t1 |
| pushInt64(t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_load8u_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| loadb [t0], t1 |
| pushInt64(t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_load16s_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| loadhsq [t0], t1 |
| pushInt64(t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_load16u_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| loadh [t0], t1 |
| pushInt64(t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_load32s_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| loadi [t0], t1 |
| sxi2q t1, t1 |
| pushInt64(t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_load32u_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| loadi [t0], t1 |
| pushInt64(t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_store_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| storei t3, [t0] |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_store_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| storeq t3, [t0] |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_f32_store_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| storef ft0, [t0] |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_f64_store_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| stored ft0, [t0] |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_store8_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| storeb t3, [t0] |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_store16_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| storeh t3, [t0] |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_store8_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| storeb t3, [t0] |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_store16_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| storeh t3, [t0] |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_store32_mem_slow_path: |
| leap 1[PC], t4 |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| storei t3, [t0] |
| move t4, PC |
| nextIPIntInstruction() |
| |
| ################################################### |
| ## Out-of-line slow paths for SIMD memory access ## |
| ################################################### |
| |
| # t0 = wasm address (from popMemoryIndex before branching). |
| # t4 = cursor pointing to start of memarg (past SIMD opcode, set by simd_prefix). |
| # After loadStoreMakePointerSlow, t4 points past the memarg. |
| |
| .simd_v128_load_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 16, t1, t2, t5, t6) |
| loadv [t0], v0 |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .simd_v128_load_8x8s_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| simdLoad8x8s() |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .simd_v128_load_8x8u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| simdLoad8x8u() |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .simd_v128_load_16x4s_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| simdLoad16x4s() |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .simd_v128_load_16x4u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| simdLoad16x4u() |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .simd_v128_load_32x2s_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| simdLoad32x2s() |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .simd_v128_load_32x2u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| simdLoad32x2u() |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .simd_v128_load8_splat_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| simdLoadSplat8() |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .simd_v128_load16_splat_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| simdLoadSplat16() |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .simd_v128_load32_splat_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| simdLoadSplat32() |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .simd_v128_load64_splat_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| simdLoadSplat64() |
| pushVec(v0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .simd_v128_store_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 16, t1, t2, t5, t6) |
| storev v0, [t0] |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .simd_v128_load32_zero_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| loadi [t0], t0 |
| subp V128ISize, sp |
| storei t0, [sp] |
| storei 0, 4[sp] |
| storeq 0, 8[sp] |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .simd_v128_load64_zero_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| loadq [t0], t0 |
| subp V128ISize, sp |
| storeq t0, [sp] |
| storeq 0, 8[sp] |
| move t4, PC |
| nextIPIntInstruction() |
| |
| # Load lane slow paths: v0 = vector (already popped), t0 = wasm addr. |
| # t4 points past memarg after loadStoreMakePointerSlow. Lane index is at [t4]. |
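#
# v128.loadN_lane replaces one lane of the popped vector with a value loaded from memory and
# pushes the result. A hedged C sketch of the 32-bit case (illustrative only):
#
#   #include <stdint.h>
#
#   typedef struct { uint32_t lane[4]; } v128_u32;
#
#   static v128_u32 load32Lane(const uint32_t* addr, v128_u32 v, unsigned laneIndex)
#   {
#       v.lane[laneIndex & 3] = *addr;    // mask corresponds to ImmLaneIdx4Mask
#       return v;
#   }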
| |
| .simd_v128_load8_lane_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| loadb [t0], t0 |
| loadb [t4], t1 |
| andi ImmLaneIdx16Mask, t1 |
| pushVec(v0) |
| storeb t0, [sp, t1] |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| |
| .simd_v128_load16_lane_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| loadh [t0], t0 |
| loadb [t4], t1 |
| andi ImmLaneIdx8Mask, t1 |
| pushVec(v0) |
| storeh t0, [sp, t1, 2] |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| |
| .simd_v128_load32_lane_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| loadi [t0], t0 |
| loadb [t4], t1 |
| andi ImmLaneIdx4Mask, t1 |
| pushVec(v0) |
| storei t0, [sp, t1, 4] |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| |
| .simd_v128_load64_lane_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| loadq [t0], t0 |
| loadb [t4], t1 |
| andi ImmLaneIdx2Mask, t1 |
| pushVec(v0) |
| storeq t0, [sp, t1, 8] |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| |
| # Store lane slow paths: v0 = vector (already popped), t0 = wasm addr. |
| # t4 points past memarg. Lane index is at [t4]. |
| |
| .simd_v128_store8_lane_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| loadb [t4], t1 |
| andi ImmLaneIdx16Mask, t1 |
| pushVec(v0) |
| loadb [sp, t1], t1 |
| addp V128ISize, sp |
| storeb t1, [t0] |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| |
| .simd_v128_store16_lane_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| loadb [t4], t1 |
| andi ImmLaneIdx8Mask, t1 |
| pushVec(v0) |
| loadh [sp, t1, 2], t1 |
| addp V128ISize, sp |
| storeh t1, [t0] |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| |
| .simd_v128_store32_lane_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| loadb [t4], t1 |
| andi ImmLaneIdx4Mask, t1 |
| pushVec(v0) |
| loadi [sp, t1, 4], t1 |
| addp V128ISize, sp |
| storei t1, [t0] |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| |
| .simd_v128_store64_lane_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| loadb [t4], t1 |
| andi ImmLaneIdx2Mask, t1 |
| pushVec(v0) |
| loadq [sp, t1, 8], t1 |
| addp V128ISize, sp |
| storeq t1, [t0] |
| leap 1[t4], PC |
| nextIPIntInstruction() |
| |
| ######################################################### |
| ## Out-of-line slow paths for atomic memory operations ## |
| ######################################################### |
| |
| # t0 = wasm address (from popMemoryIndex before branching). |
| # t4 = cursor pointing to start of memarg (past atomic sub-opcode, set by atomic_prefix). |
| # t3 = data value (for store/RMW ops, survives loadStoreMakePointerSlow). |
| # t7 = new value for CAS (must be push/popped around loadStoreMakePointerSlow). |
| # After loadStoreMakePointerSlow, t4 points past the memarg. |
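#
# Each atomic RMW helper performs the read-modify-write at the checked address and leaves the
# previously stored value in t0, which the slow path then pushes. A hedged C11 sketch of the
# i32 add case (the do*Atomic* helpers themselves are defined elsewhere):
#
#   #include <stdatomic.h>
#   #include <stdint.h>
#
#   static uint32_t i32AtomicRmwAdd(_Atomic uint32_t* addr, uint32_t operand)
#   {
#       return atomic_fetch_add(addr, operand);    // returns the old value, per the Wasm spec
#   }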
| |
| .ipint_i32_atomic_load_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI32AtomicLoad(t0, t2) |
| pushInt32(t2) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_load_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| doI64AtomicLoad(t0, t2) |
| pushInt64(t2) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_load8_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI32AtomicLoad8(t0, t2) |
| pushInt32(t2) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_load16_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI32AtomicLoad16(t0, t2) |
| pushInt32(t2) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_load8_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI64AtomicLoad8(t0, t2) |
| pushInt64(t2) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_load16_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI64AtomicLoad16(t0, t2) |
| pushInt64(t2) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_load32_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI64AtomicLoad32(t0, t2) |
| pushInt64(t2) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_store_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI32AtomicStore(t0, t3, t2, t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_store_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| doI64AtomicStore(t0, t3, t2, t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_store8_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI32AtomicStore8(t0, t3, t2, t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_store16_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI32AtomicStore16(t0, t3, t2, t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_store8_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI64AtomicStore8(t0, t3, t2, t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_store16_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI64AtomicStore16(t0, t3, t2, t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_store32_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI64AtomicStore32(t0, t3, t2, t1) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw_add_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI32AtomicRmwAdd(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw_add_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| doI64AtomicRmwAdd(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw8_add_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI32AtomicRmwAdd8(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw16_add_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI32AtomicRmwAdd16(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw8_add_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI64AtomicRmwAdd8(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw16_add_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI64AtomicRmwAdd16(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw32_add_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI64AtomicRmwAdd32(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw_sub_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI32AtomicRmwSub(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw_sub_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| doI64AtomicRmwSub(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw8_sub_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI32AtomicRmwSub8(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw16_sub_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI32AtomicRmwSub16(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw8_sub_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI64AtomicRmwSub8(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw16_sub_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI64AtomicRmwSub16(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw32_sub_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI64AtomicRmwSub32(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw_and_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI32AtomicRmwAnd(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw_and_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| doI64AtomicRmwAnd(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw8_and_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI32AtomicRmwAnd8(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw16_and_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI32AtomicRmwAnd16(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw8_and_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI64AtomicRmwAnd8(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw16_and_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI64AtomicRmwAnd16(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw32_and_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI64AtomicRmwAnd32(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw_or_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI32AtomicRmwOr(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw_or_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| doI64AtomicRmwOr(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw8_or_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI32AtomicRmwOr8(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw16_or_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI32AtomicRmwOr16(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw8_or_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI64AtomicRmwOr8(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw16_or_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI64AtomicRmwOr16(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw32_or_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI64AtomicRmwOr32(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw_xor_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI32AtomicRmwXor(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw_xor_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| doI64AtomicRmwXor(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw8_xor_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI32AtomicRmwXor8(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw16_xor_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI32AtomicRmwXor16(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw8_xor_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI64AtomicRmwXor8(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw16_xor_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI64AtomicRmwXor16(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw32_xor_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI64AtomicRmwXor32(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw_xchg_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI32AtomicRmwXchg(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw_xchg_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| doI64AtomicRmwXchg(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw8_xchg_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI32AtomicRmwXchg8(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw16_xchg_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI32AtomicRmwXchg16(t0, t3, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw8_xchg_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI64AtomicRmwXchg8(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw16_xchg_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI64AtomicRmwXchg16(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw32_xchg_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI64AtomicRmwXchg32(t0, t3, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw_cmpxchg_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI32AtomicCmpxchg(t0, t3, t7, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw_cmpxchg_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 8, t1, t2, t5, t6) |
| doI64AtomicCmpxchg(t0, t3, t7, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw8_cmpxchg_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI32AtomicCmpxchg8(t0, t3, t7, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i32_atomic_rmw16_cmpxchg_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI32AtomicCmpxchg16(t0, t3, t7, t2, t1) |
| pushInt32(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw8_cmpxchg_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 1, t1, t2, t5, t6) |
| doI64AtomicCmpxchg8(t0, t3, t7, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw16_cmpxchg_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 2, t1, t2, t5, t6) |
| doI64AtomicCmpxchg16(t0, t3, t7, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| .ipint_i64_atomic_rmw32_cmpxchg_u_slow_path: |
| loadStoreMakePointerSlow(t4, t0, 4, t1, t2, t5, t6) |
| doI64AtomicCmpxchg32(t0, t3, t7, t2, t1) |
| pushInt64(t0) |
| move t4, PC |
| nextIPIntInstruction() |
| |
| ################################## |
| ## "Out of line" logic for call ## |
| ################################## |
| |
| const mintSS = sc1 |
| |
| macro mintPop(reg) |
| loadq [mintSS], reg |
| addq V128ISize, mintSS |
| end |
| |
| macro mintPopV(reg) |
| loadv [mintSS], reg |
| addq V128ISize, mintSS |
| end |
| |
| macro mintArgDispatch() |
| loadb [MC], sc0 |
| addq 1, MC |
| bigteq sc0, (constexpr IPInt::CallArgumentBytecode::NumOpcodes), _ipint_mint_arg_dispatch_err |
| lshiftq (constexpr (WTF::fastLog2(JSC::IPInt::alignMInt))), sc0 |
| if ARM64 or ARM64E |
| pcrtoaddr _mint_begin, csr4 |
| addq sc0, csr4 |
| jmp csr4 |
| elsif X86_64 |
| pcrtoaddr _mint_begin, PC |
| addq PC, sc0 |
| jmp sc0 |
| end |
| end |
| |
| macro mintRetDispatch() |
| loadb [MC], sc0 |
| addq 1, MC |
| bigteq sc0, (constexpr IPInt::CallResultBytecode::NumOpcodes), _ipint_mint_ret_dispatch_err |
| lshiftq (constexpr (WTF::fastLog2(JSC::IPInt::alignMInt))), sc0 |
| if ARM64 or ARM64E |
| pcrtoaddr _mint_begin_return, csr4 |
| addq sc0, csr4 |
| jmp csr4 |
| elsif X86_64 |
| pcrtoaddr _mint_begin_return, PC |
| addq PC, sc0 |
| jmp sc0 |
| end |
| end |
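
# mINT is a small bytecode interpreter for marshalling call arguments and results: each
# CallArgumentBytecode / CallResultBytecode handler is padded to a fixed size, so the target
# address is base + (opcode << log2(alignMInt)). A hedged C sketch of the dispatch idea
# (a handler table stands in for the computed jump):
#
#   #include <stdint.h>
#   #include <stdlib.h>
#
#   typedef void (*Handler)(void);
#
#   static void dispatch(const uint8_t** mc, const Handler* handlers, unsigned numOpcodes)
#   {
#       uint8_t op = *(*mc)++;
#       if (op >= numOpcodes)
#           abort();        // corresponds to _ipint_mint_arg_dispatch_err / _ipint_mint_ret_dispatch_err
#       handlers[op]();
#   }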
| |
| .ipint_call_common: |
# we need to plan ahead a little so we don't step on our own values later
# step 1: save all the stuff we had earlier
# step 2: calling
# - if we have more results than arguments, we need to reserve extra stack space in advance; otherwise,
#   pushing 16B values onto the stack would overtake cleaning up the 8B return values. we get this value
#   from CallSignatureMetadata::numExtraResults
| # - set up the stack frame (with size CallSignatureMetadata::stackFrameSize) |
| # step 2.5: saving registers: |
| # - push our important data onto the stack here, after the saved space |
| # step 3: jump to called function |
| # - swap out instances, reload memory, and call |
| # step 4: returning |
| # - pop the registers from step 2.5 |
| # - we've left enough space for us to push our new values starting at the original stack pointer now! yay! |
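#
# A hedged sketch of the sizes computed below (StackValueSize is the per-value IPInt stack slot
# size; field names follow CallSignatureMetadata):
#
#   size_t extraSpaceForReturns = metadata->numExtraResults * StackValueSize;
#   size_t argumentBytes        = metadata->numArguments * StackValueSize;
#   // sp is then lowered by extraSpaceForReturns so pushed results can't overrun arguments.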
| |
# Free up r0 to be used as an argument register
| |
| const targetEntrypoint = sc2 |
| const targetInstance = sc3 |
| |
| move r0, targetEntrypoint |
| move r1, targetInstance |
| |
| const extraSpaceForReturns = t0 |
| const stackFrameSize = t1 |
| const numArguments = t2 |
| |
| loadi IPInt::CallSignatureMetadata::stackFrameSize[MC], stackFrameSize |
| loadh IPInt::CallSignatureMetadata::numExtraResults[MC], extraSpaceForReturns |
| mulq StackValueSize, extraSpaceForReturns |
| loadh IPInt::CallSignatureMetadata::numArguments[MC], numArguments |
| mulq StackValueSize, numArguments |
| advanceMC(constexpr (sizeof(IPInt::CallSignatureMetadata))) |
| |
| # calculate the SP after popping all arguments |
| move sp, t3 |
| addp numArguments, t3 |
| |
| # (down = decreasing address) |
| # <first non-arg> <- t3 = SP after all arguments |
| # arg |
| # ... |
| # arg |
| # arg <- initial SP (wasm stack) |
| |
| # store sp as our shadow stack for arguments later |
| move sp, t4 |
| # make extra space if necessary |
| subp extraSpaceForReturns, sp |
| |
| # <first non-arg> <- t3 |
| # arg |
| # ... |
| # arg |
| # arg <- t4 = initial SP (wasm stack) |
| # reserved |
| # reserved <- sp |
| |
| # save t3 as a frame-relative value so stack data can be moved easily for JSPI |
| # t3 is not used after this |
| subp cfr, t3 |
| push t3, PC |
| push t3, wasmInstance |
| |
| # set up the call frame |
| move sp, t2 |
| subp stackFrameSize, sp |
| |
| # <first non-arg> <- first_non_arg_addr |
| # arg |
| # ... |
| # arg |
| # arg <- t4 = initial SP (wasm stack) |
| # reserved |
| # reserved |
| # (first_non_arg_addr - cfr), PC |
| # unused, wasmInstance <- t2 = native argument stack (pushed by mINT) |
| # call frame |
| # call frame |
| # call frame |
| # call frame |
| # call frame |
| # call frame <- sp |
| |
| # set up the Callee slot |
| storeq IPIntCallCallee, Callee - CallerFrameAndPCSize[sp] |
| storep IPIntCallFunctionSlot, CodeBlock - CallerFrameAndPCSize[sp] |
| |
| push targetEntrypoint, targetInstance |
| |
| move t2, sc3 |
| move t4, mintSS |
| |
# we need a common entrypoint because of the x86 PC base register
| jmp .ipint_mint_arg_dispatch |
| |
| .ipint_tail_call_common: |
| # Check if we need to insert a restore frame for cross-instance tail calls. |
| # Registers on entry: |
| # r0 = entrypoint, r1 = targetInstance, wasmInstance = current instance, |
| # t3 = callerStackArgSize, IPIntCallCallee = callee, IPIntCallFunctionSlot = func info |
| # Scratch: t4, t5. Do not clobber anything else. |
| # On x86_64 r1==t2 (both rdx) and on ARM64 r1==t1, so neither t2 nor t1 |
| # may be used freely. The ARM64 copy loop uses t2 for pair loads; that is |
| # safe because r1==t1 there (t2 is a distinct register). |
| bpeq r1, wasmInstance, .ipint_tail_call_no_restore_frame |
| |
| loadp ReturnPC[cfr], t4 |
| removeCodePtrTag t4 |
| if ARM64E |
| leap _g_config, t5 |
| loadp JSCConfigGateMapOffset + (constexpr Gate::wasmRestoreFrame) * PtrSize[t5], t5 |
| removeCodePtrTag t5 |
| else |
| pcrtoaddr _wasm_restore_frame_return, t5 |
| end |
| bpeq t4, t5, .ipint_tail_call_no_restore_frame |
| |
| const RestoreFrameSize = constexpr Wasm::RestoreFrameCallee::restoreFrameSizeInBytes |
| |
| # Step 1: Write the restore frame at cfr + 16 + t3. |
| # cfr hasn't shifted yet, so this is cfr_old + (FirstArgumentOffset - RestoreFrameSize) + t3. |
| leap (FirstArgumentOffset - RestoreFrameSize)[cfr, t3], t4 # t4 = restore_cfr |
| |
| loadp [cfr], t5 |
| storep t5, [t4] # originalCallerFrame |
| if ARM64E |
| loadp ReturnPC[cfr], lr |
| addp CallerFrameAndPCSize, cfr, t5 |
| untagReturnAddress t5 |
| addp CallerFrameAndPCSize, t4, t5 |
| tagReturnAddress t5 |
| storep lr, ReturnPC[t4] # originalReturnPC (resigned for restore_cfr) |
| else |
| loadp ReturnPC[cfr], t5 |
| storep t5, ReturnPC[t4] # originalReturnPC |
| end |
| storep wasmInstance, CodeBlock[t4] # saved instance |
| leap _g_restoreFrameCalleeBoxed, t5 |
| loadp [t5], t5 |
| storep t5, Callee[t4] # RestoreFrameCallee |
| |
| # Step 2: Copy [sp, cfr) down by RestoreFrameSize bytes. |
| move sp, t4 |
| |
# Move sp down now so we don't write below it in the copy loop.
| subp RestoreFrameSize, sp |
| |
| .ipint_restore_frame_copy_loop: |
| bpaeq t4, cfr, .ipint_restore_frame_copy_loop_done |
| if ARM64 or ARM64E |
| # t2 is safe here because r1==t1 on ARM64 (t2 is x2, a distinct register). |
| loadpairq [t4], t2, t5 |
| storepairq t2, t5, -RestoreFrameSize[t4] |
| elsif X86_64 |
| loadp [t4], t5 |
| storep t5, -RestoreFrameSize[t4] |
| loadp 8[t4], t5 |
| storep t5, (8 - RestoreFrameSize)[t4] |
| end |
| addp 16, t4 |
| jmp .ipint_restore_frame_copy_loop |
| |
| .ipint_restore_frame_copy_loop_done: |
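
# In effect, the loop above copies the live region [sp_old, cfr) down by RestoreFrameSize
# bytes, 16 bytes at a time. A hedged C analogue (pointer names are illustrative; assumes the
# region size is a multiple of 16):
#
#   #include <stdint.h>
#   #include <string.h>
#
#   static void shiftFrameDown(uint8_t* spOld, uint8_t* cfrPtr, size_t restoreFrameSize)
#   {
#       // The destination is below the source, so a forward copy (memmove) is safe.
#       memmove(spOld - restoreFrameSize, spOld, (size_t)(cfrPtr - spOld));
#   }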
| |
# Step 3: Shift cfr down by RestoreFrameSize; sp was handled before the copy loop.
| subp RestoreFrameSize, cfr |
| |
| # Step 4: Write redirect at cfr_new. |
| # restore_cfr = cfr_new + FirstArgumentOffset + t3 |
| leap FirstArgumentOffset[cfr, t3], t4 # t4 = restore_cfr |
| storep t4, [cfr] # CallerFrame = restore_cfr |
| |
| if ARM64E |
| leap _g_config, t4 |
| loadp JSCConfigGateMapOffset + (constexpr Gate::wasmRestoreFrame) * PtrSize[t4], t4 |
| removeCodePtrTag t4 |
| addp CallerFrameAndPCSize, cfr, t5 |
| tagCodePtr t4, t5 |
| else |
| pcrtoaddr _wasm_restore_frame_return, t4 |
| end |
| storep t4, ReturnPC[cfr] # ReturnPC = wasmRestoreFrame gate |
| |
| .ipint_tail_call_no_restore_frame: |
# Free up r0 to be used as an argument register
| |
| # <caller frame> |
| # return val |
| # return val |
| # argument |
| # argument |
| # argument |
| # argument |
| # call frame |
| # call frame <- cfr |
| # (IPInt locals) |
| # (IPInt stack) |
| # argument 0 |
| # ... |
| # argument n-1 |
| # argument n <- sp |
| |
| # sc1 = target callee => wasmInstance to free up sc1 |
| const savedCallee = wasmInstance |
| |
| # store entrypoint and target instance on the stack for now |
| push r0, r1 |
| push IPIntCallCallee, IPIntCallFunctionSlot |
| |
| # keep the top of IPInt stack in sc1 as shadow stack |
| move sp, sc1 |
# we pushed four values previously, so account for that offset
| addq 32, sc1 |
| |
| # <caller frame> |
| # return val |
| # return val |
| # argument |
| # argument |
| # argument |
| # argument |
| # call frame |
| # call frame <- cfr |
| # (IPInt locals) |
| # (IPInt stack) |
| # argument 0 |
| # ... |
| # argument n-1 |
| # argument n <- sc1 |
| # entrypoint, targetInstance |
| # callee, function info <- sp |
| |
| # determine the location to begin copying stack arguments, starting from the last |
| move cfr, sc2 |
| addp FirstArgumentOffset, sc2 |
| addp t3, sc2 # t3 = callerStackArgSize from the metadata |
| |
| # <caller frame> <- sc2 |
| # return val |
| # return val |
| # argument |
| # argument |
| # argument |
| # argument |
| # call frame |
| # call frame <- cfr |
| # (IPInt locals) |
| # (IPInt stack) |
| # argument 0 |
| # ... |
| # argument n-1 |
| # argument n <- sc1 |
| # entrypoint, targetInstance |
| # callee, function info <- sp |
| |
| # get saved MC and PC |
| |
| if ARM64 or ARM64E |
| loadpairq -0x10[cfr], t0, t1 |
| elsif X86_64 or RISCV64 |
| loadp -0x8[cfr], t1 |
| loadp -0x10[cfr], t0 |
| end |
| |
| push t0, t1 |
| |
# store the return address and CFR on the stack so we don't lose them
| loadp ReturnPC[cfr], t0 |
| loadp [cfr], t1 |
| |
| push t0, t1 |
| |
| # <caller frame> <- sc2 |
| # return val |
| # return val |
| # argument |
| # argument |
| # argument |
| # argument |
| # call frame |
| # call frame <- cfr |
| # (IPInt locals) |
| # (IPInt stack) |
| # argument 0 |
| # ... |
| # argument n-1 |
| # argument n <- sc1 |
| # entrypoint, targetInstance |
| # callee, function info |
| # saved MC/PC |
| # return address, saved CFR <- sp |
| |
| .ipint_mint_arg_dispatch: |
# We've already called validateOpcodeConfig() in all the Wasm call opcodes.
| mintArgDispatch() |
| |
| # tail calls reuse most of mINT's argument logic, but exit into a different tail call stub. |
# we use sc2 to keep track of the new stack frame
| |
| mintAlign(_a0) |
| _mint_begin: |
| mintPop(a0) |
| mintArgDispatch() |
| |
| mintAlign(_a1) |
| mintPop(a1) |
| mintArgDispatch() |
| |
| mintAlign(_a2) |
| if ARM64 or ARM64E or X86_64 |
| mintPop(a2) |
| mintArgDispatch() |
| else |
| break |
| end |
| |
| mintAlign(_a3) |
| if ARM64 or ARM64E or X86_64 |
| mintPop(a3) |
| mintArgDispatch() |
| else |
| break |
| end |
| |
| mintAlign(_a4) |
| if ARM64 or ARM64E or X86_64 |
| mintPop(a4) |
| mintArgDispatch() |
| else |
| break |
| end |
| |
| mintAlign(_a5) |
| if ARM64 or ARM64E or X86_64 |
| mintPop(a5) |
| mintArgDispatch() |
| else |
| break |
| end |
| |
| mintAlign(_a6) |
| if ARM64 or ARM64E |
| mintPop(a6) |
| mintArgDispatch() |
| else |
| break |
| end |
| |
| mintAlign(_a7) |
| if ARM64 or ARM64E |
| mintPop(a7) |
| mintArgDispatch() |
| else |
| break |
| end |
| |
| mintAlign(_fa0) |
| mintPopV(wfa0) |
| mintArgDispatch() |
| |
| mintAlign(_fa1) |
| mintPopV(wfa1) |
| mintArgDispatch() |
| |
| mintAlign(_fa2) |
| mintPopV(wfa2) |
| mintArgDispatch() |
| |
| mintAlign(_fa3) |
| mintPopV(wfa3) |
| mintArgDispatch() |
| |
| mintAlign(_fa4) |
| mintPopV(wfa4) |
| mintArgDispatch() |
| |
| mintAlign(_fa5) |
| mintPopV(wfa5) |
| mintArgDispatch() |
| |
| mintAlign(_fa6) |
| mintPopV(wfa6) |
| mintArgDispatch() |
| |
| mintAlign(_fa7) |
| mintPopV(wfa7) |
| mintArgDispatch() |
| |
| # Note that the regular call and tail call opcodes will be implemented slightly differently. |
| # Regular calls have to save space for return values, while tail calls are reusing the stack frame |
| # and thus do not have to care. |
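#
# A hedged sketch of the non-vector stack-argument bytecodes below (sc3 is the native argument
# cursor, mintSS the IPInt shadow stack; names in the sketch are illustrative):
#
#   // CallArgDecSP: reserve one 2 * SlotSize slot pair
#   argCursor -= 2 * SlotSize;
#   // CallArgStore0: store the next shadow-stack value at offset 0 of the current slot pair
#   *(uint64_t*)argCursor = mintPop();
#   // CallArgDecSPStore8: reserve a slot pair, then store at offset 8
#   argCursor -= 2 * SlotSize;
#   *(uint64_t*)(argCursor + 8) = mintPop();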
| |
| # CallArgumentBytecode::CallArgDecSP (0x10) |
| mintAlign(_call_argument_dec_sp) |
| subp 2 * SlotSize, sc3 |
| mintArgDispatch() |
| |
| # CallArgumentBytecode::CallArgStore0 (0x11) |
| mintAlign(_call_argument_store_0) |
| mintPop(sc2) |
| storeq sc2, [sc3] |
| mintArgDispatch() |
| |
| # CallArgumentBytecode::CallArgDecSPStore8 (0x12) |
| mintAlign(_call_argument_dec_sp_store_8) |
| mintPop(sc2) |
| subp 2 * SlotSize, sc3 |
| storeq sc2, 8[sc3] |
| mintArgDispatch() |
| |
| # CallArgumentBytecode::CallArgDecSPStoreVector0 (0x13) |
| mintAlign(_call_argument_dec_sp_store_vector_0) |
| subp 2 * SlotSize, sc3 |
| loadq [mintSS], sc2 |
| storeq sc2, [sc3] |
| loadq 8[mintSS], sc2 |
| storeq sc2, 8[sc3] |
| addq StackValueSize, mintSS |
| mintArgDispatch() |
| |
# CallArgumentBytecode::CallArgDecSPStoreVector8 (0x14)
| mintAlign(_call_argument_dec_sp_store_vector_8) |
| subp 2 * SlotSize, sc3 |
| loadq [mintSS], sc2 |
| storeq sc2, 8[sc3] |
| loadq 8[mintSS], sc2 |
| storeq sc2, 16[sc3] |
| addq StackValueSize, mintSS |
| mintArgDispatch() |
| |
| # For tail calls, we're writing into the same frame. We're going to first push stack arguments onto the stack. |
| # Once we're done, we'll copy them back down into the new frame, to avoid having to deal with writing over |
| # arguments lower down on the stack. |
| |
| # CallArgumentBytecode::TailCallArgDecSP (0x15) |
| mintAlign(_tail_call_argument_dec_sp) |
| subp 2 * SlotSize, sp |
| mintArgDispatch() |
| |
| # CallArgumentBytecode::TailCallArgStore0 (0x16) |
| mintAlign(_tail_call_argument_store_0) |
| mintPop(sc3) |
| storeq sc3, [sp] |
| mintArgDispatch() |
| |
| # CallArgumentBytecode::TailCallArgDecSPStore8 (0x17) |
| mintAlign(_tail_call_argument_dec_sp_store_8) |
| mintPop(sc3) |
| subp 2 * SlotSize, sp |
| storeq sc3, 8[sp] |
| mintArgDispatch() |
| |
| # CallArgumentBytecode::TailCallArgDecSPStoreVector0 (0x18) |
| mintAlign(_tail_call_argument_dec_sp_store_vector_0) |
| subp 2 * SlotSize, sp |
| loadq [mintSS], sc3 |
| storeq sc3, [sp] |
| loadq 8[mintSS], sc3 |
| storeq sc3, 8[sp] |
| addq StackValueSize, mintSS |
| mintArgDispatch() |
| |
| # CallArgumentBytecode::TailCallArgDecSPStoreVector8 (0x19) |
| mintAlign(_tail_call_argument_dec_sp_store_vector_8) |
| subp 2 * SlotSize, sp |
| loadq [mintSS], sc3 |
| storeq sc3, 8[sp] |
| loadq 8[mintSS], sc3 |
| storeq sc3, 16[sp] |
| addq StackValueSize, mintSS |
| mintArgDispatch() |
| |
| # CallArgumentBytecode::TailCall (0x1a) |
| mintAlign(_tail_call) |
| jmp .ipint_perform_tail_call |
| |
| # CallArgumentBytecode::Call (0x1b) |
| mintAlign(_call) |
| pop wasmInstance, ws0 |
| |
# Save the stack pointer, in case we tail call someone who changes the stack argument size of the frame above.
# Store its value relative to cfr so stack frames can easily be relocated for JSPI.
| move sp, sc1 |
| subp cfr, sc1 |
| storep sc1, ThisArgumentOffset[cfr] |
| |
| # Set up memory |
| ipintReloadMemory(ws1) |
| |
| # Make the call |
| if ARM64E |
| leap _g_config, ws1 |
| jmp JSCConfigGateMapOffset + (constexpr Gate::wasm_ipint_call) * PtrSize[ws1], NativeToJITGatePtrTag # WasmEntryPtrTag |
| end |
| |
| _wasm_trampoline_wasm_ipint_call: |
| _wasm_trampoline_wasm_ipint_call_wide16: |
| _wasm_trampoline_wasm_ipint_call_wide32: |
| call ws0, WasmEntryPtrTag |
| |
| _wasm_ipint_call_return_location: |
| _wasm_ipint_call_return_location_wide16: |
| _wasm_ipint_call_return_location_wide32: |
# Compute sc3 (pointing to the saved caller info) using the saved SP value,
# without restoring SP yet. We need the callee's SP to read stack results,
# which are at the bottom of the arg/result area (SP + headerSize).
| loadi IPInt::CallReturnMetadata::stackFrameSize[MC], sc3 |
| loadp ThisArgumentOffset[cfr], sc0 |
| addp cfr, sc0 |
| addp sc0, sc3 |
| |
| const mintRetSrc = sc1 |
| const mintRetDst = sc2 |
| |
| # mintRetSrc: read stack results from the callee's SP (current SP) |
| loadi IPInt::CallReturnMetadata::firstStackResultSPOffset[MC], mintRetSrc |
| advanceMC(IPInt::CallReturnMetadata::resultBytecode) |
| leap [sp, mintRetSrc], mintRetSrc |
| |
| # load (first_non_arg_addr - cfr) from the stack and make it absolute |
| if ARM64 or ARM64E |
| loadp (2 * SlotSize)[sc3], mintRetDst |
| elsif X86_64 |
| loadp (3 * SlotSize)[sc3], mintRetDst |
| end |
| addp cfr, mintRetDst |
| |
# We've already called validateOpcodeConfig() in all the Wasm call opcodes, and
# that is the only way to get here.
| mintRetDispatch() |
| |
| mintAlign(_r0) |
| _mint_begin_return: |
| subp StackValueSize, mintRetDst |
| storeq wa0, [mintRetDst] |
| mintRetDispatch() |
| |
| mintAlign(_r1) |
| subp StackValueSize, mintRetDst |
| storeq wa1, [mintRetDst] |
| mintRetDispatch() |
| |
| mintAlign(_r2) |
| if ARM64 or ARM64E or X86_64 |
| subp StackValueSize, mintRetDst |
| storeq wa2, [mintRetDst] |
| mintRetDispatch() |
| else |
| break |
| end |
| |
| mintAlign(_r3) |
| if ARM64 or ARM64E or X86_64 |
| subp StackValueSize, mintRetDst |
| storeq wa3, [mintRetDst] |
| mintRetDispatch() |
| else |
| break |
| end |
| |
| mintAlign(_r4) |
| if ARM64 or ARM64E or X86_64 |
| subp StackValueSize, mintRetDst |
| storeq wa4, [mintRetDst] |
| mintRetDispatch() |
| else |
| break |
| end |
| |
| mintAlign(_r5) |
| if ARM64 or ARM64E or X86_64 |
| subp StackValueSize, mintRetDst |
| storeq wa5, [mintRetDst] |
| mintRetDispatch() |
| else |
| break |
| end |
| |
| mintAlign(_r6) |
| if ARM64 or ARM64E |
| subp StackValueSize, mintRetDst |
| storeq wa6, [mintRetDst] |
| mintRetDispatch() |
| else |
| break |
| end |
| |
| mintAlign(_r7) |
| if ARM64 or ARM64E |
| subp StackValueSize, mintRetDst |
| storeq wa7, [mintRetDst] |
| mintRetDispatch() |
| else |
| break |
| end |
| |
| mintAlign(_fr0) |
| subp StackValueSize, mintRetDst |
| storev wfa0, [mintRetDst] |
| mintRetDispatch() |
| |
| mintAlign(_fr1) |
| subp StackValueSize, mintRetDst |
| storev wfa1, [mintRetDst] |
| mintRetDispatch() |
| |
| mintAlign(_fr2) |
| subp StackValueSize, mintRetDst |
| storev wfa2, [mintRetDst] |
| mintRetDispatch() |
| |
| mintAlign(_fr3) |
| subp StackValueSize, mintRetDst |
| storev wfa3, [mintRetDst] |
| mintRetDispatch() |
| |
| mintAlign(_fr4) |
| subp StackValueSize, mintRetDst |
| storev wfa4, [mintRetDst] |
| mintRetDispatch() |
| |
| mintAlign(_fr5) |
| subp StackValueSize, mintRetDst |
| storev wfa5, [mintRetDst] |
| mintRetDispatch() |
| |
| mintAlign(_fr6) |
| subp StackValueSize, mintRetDst |
| storev wfa6, [mintRetDst] |
| mintRetDispatch() |
| |
| mintAlign(_fr7) |
| subp StackValueSize, mintRetDst |
| storev wfa7, [mintRetDst] |
| mintRetDispatch() |
| |
| # CallResultBytecode::ResultStack (0x10) |
| mintAlign(_result_stack) |
| loadq [mintRetSrc], sc0 |
| addp SlotSize, mintRetSrc |
| subp StackValueSize, mintRetDst |
| storeq sc0, [mintRetDst] |
| mintRetDispatch() |
| |
| # CallResultBytecode::ResultStackVector (0x11) |
| mintAlign(_result_stack_vector) |
| subp StackValueSize, mintRetDst |
| loadq [mintRetSrc], sc0 |
| storeq sc0, [mintRetDst] |
| loadq 8[mintRetSrc], sc0 |
| storeq sc0, 8[mintRetDst] |
| addp 2 * SlotSize, mintRetSrc |
| mintRetDispatch() |
| |
| mintAlign(_end) |
| |
| # <first non-arg> <- first_non_arg_addr |
| # return result |
| # ... |
| # return result |
| # return result |
| # return result |
| # return result <- mintRetDst => new SP |
| # (first_non_arg_addr - cfr), PC |
| # unused, wasmInstance <- sc3 |
| # call frame |
| # call frame |
| # call frame |
| # call frame <- callee's SP (not yet restored) |
| |
| # note: we don't care about t3 anymore |
| if ARM64 or ARM64E |
| loadpairq [sc3], t3, wasmInstance |
| elsif X86_64 |
| loadq [sc3], wasmInstance |
| loadq 8[sc3], t3 |
| loadp (2 * SlotSize)[sc3], PC |
| end |
| move mintRetDst, sp |
| |
| # Restore PC / MC |
| loadp Callee[cfr], ws0 |
| unboxWasmCallee(ws0, ws1) |
| storep ws0, UnboxedWasmCalleeStackSlot[cfr] |
| |
| # Restore memory |
| ipintReloadMemory(t2) |
| nextIPIntInstruction() |
| |
| .ipint_perform_tail_call: |
| |
| # <caller frame> <- sc2 |
| # return val |
| # return val |
| # argument |
| # argument |
| # argument |
| # argument |
| # call frame |
| # call frame <- cfr |
| # (IPInt locals) |
# (IPInt stack) <- sc1 (was the shadow stack, now dead and can be reused)
| # argument 0 |
| # ... |
| # argument n-1 |
| # argument n |
| # entrypoint, targetInstance |
| # callee, function info |
| # saved MC/PC |
| # return address, saved CFR |
| # stack arguments |
| # stack arguments |
| # stack arguments |
| # stack arguments <- sp |
| |
| # load the size of the arguments and results space, and subtract that from sc2 |
| loadi [MC], sc3 |
| negq sc3 |
| |
| # copy args to sc2 region |
| validateOpcodeConfig(sc0) |
| .ipint_tail_call_copy_stackargs_loop: |
| bqgteq sc3, 0, .ipint_tail_call_copy_stackargs_loop_end |
| if ARM64 or ARM64E |
| loadpairq [sp], sc0, sc1 |
| storepairq sc0, sc1, [sc2, sc3] |
| else |
| loadq [sp], sc0 |
| loadq 8[sp], sc1 |
| storeq sc0, [sc2, sc3] |
| storeq sc1, 8[sc2, sc3] |
| end |
| |
| addp 16, sc3 |
| addp 16, sp |
| jmp .ipint_tail_call_copy_stackargs_loop |
| |
| .ipint_tail_call_copy_stackargs_loop_end: |
| |
# reload it here, which isn't optimal, but we don't really have spare registers
| loadi [MC], sc3 |
| subp sc3, sc2 |
| |
# re-set up the call frame, and load in our return address
| subp FirstArgumentOffset, sc2 |
| if X86_64 |
| pop sc1, sc0 |
| storep sc0, ReturnPC[sc2] |
| elsif ARM64 or ARM64E or ARMv7 or RISCV64 |
| pop sc1, lr |
| end |
| |
| pop PC, MC |
| |
| # function info, callee |
| pop sc3, sc0 |
| |
| # save new Callee |
| storeq sc0, Callee[sc2] |
| storep sc3, CodeBlock[sc2] |
| |
| # take off the last two values we stored, and move SP down to make it look like a fresh frame |
| pop targetInstance, ws0 |
| |
| # <caller frame> |
| # return val |
| # return val |
| # ... |
| # argument |
| # argument |
| # argument |
| # argument |
| # argument <- cfr |
| # argument |
| # argument |
| # <to be frame> |
| # <to be frame> <- NEW SP |
| # <to be frame> <- sc2 |
| # argument 0 |
| # ... |
| # argument n-1 |
| # argument n |
| |
| # on ARM: lr = return address |
| |
| move sc2, sp |
| if ARM64E |
| addp CallerFrameAndPCSize, cfr, ws2 |
| end |
| # saved cfr |
| move sc1, cfr |
| |
| # swap instances |
| move targetInstance, wasmInstance |
| |
| # set up memory |
| ipintReloadMemory(ws1) |
| |
| addp CallerFrameAndPCSize, sp |
| |
| if X86_64 |
| subp 8, sp |
| end |
| |
| # go! |
| if ARM64E |
| leap _g_config, ws1 |
| jmp JSCConfigGateMapOffset + (constexpr Gate::wasmIPIntTailCallWasmEntryPtrTag) * PtrSize[ws1], NativeToJITGatePtrTag # WasmEntryPtrTag |
| end |
| |
| _wasm_trampoline_wasm_ipint_tail_call: |
| _wasm_trampoline_wasm_ipint_tail_call_wide16: |
| _wasm_trampoline_wasm_ipint_tail_call_wide32: |
| jmp ws0, WasmEntryPtrTag |
| |
| _ipint_argument_dispatch_err: |
| move 0x55, a0 |
| break |
| _ipint_uint_dispatch_err: |
| move 0x66, a0 |
| break |
| _ipint_mint_arg_dispatch_err: |
| move 0x77, a0 |
| break |
| _ipint_mint_ret_dispatch_err: |
| move 0x88, a0 |
| break |
| |
| _ipint_throw_Unreachable: |
| handleDebuggerTrapIfNeededAndThrowWasmTrap(Unreachable) |
| |
| _ipint_throw_NullExnrefReference: |
| handleDebuggerTrapIfNeededAndThrowWasmTrap(NullExnrefReference) |
| |
| _ipint_throw_OutOfBoundsMemoryAccess: |
| handleDebuggerTrapIfNeededAndThrowWasmTrap(OutOfBoundsMemoryAccess) |
| |
| _ipint_throw_DivisionByZero: |
| handleDebuggerTrapIfNeededAndThrowWasmTrap(DivisionByZero) |
| |
| _ipint_throw_IntegerOverflow: |
| handleDebuggerTrapIfNeededAndThrowWasmTrap(IntegerOverflow) |
| |
| _ipint_throw_OutOfBoundsTrunc: |
| handleDebuggerTrapIfNeededAndThrowWasmTrap(OutOfBoundsTrunc) |
| |
| _ipint_throw_NullRefAsNonNull: |
| handleDebuggerTrapIfNeededAndThrowWasmTrap(NullRefAsNonNull) |
| |
| _ipint_throw_NullAccess: |
| handleDebuggerTrapIfNeededAndThrowWasmTrap(NullAccess) |
| |
| _ipint_throw_NullI31Get: |
| handleDebuggerTrapIfNeededAndThrowWasmTrap(NullI31Get) |
| |
| _ipint_throw_UnalignedMemoryAccess: |
| handleDebuggerTrapIfNeededAndThrowWasmTrap(UnalignedMemoryAccess) |
| |
| ########################################### |
| # uINT: function return value interpreter # |
| ########################################### |
| |
| uintAlign(_r0) |
| _uint_begin: |
| popQuad(wa0) |
| uintDispatch() |
| |
| uintAlign(_r1) |
| popQuad(wa1) |
| uintDispatch() |
| |
| uintAlign(_r2) |
| popQuad(wa2) |
| uintDispatch() |
| |
| uintAlign(_r3) |
| popQuad(wa3) |
| uintDispatch() |
| |
| uintAlign(_r4) |
| popQuad(wa4) |
| uintDispatch() |
| |
| uintAlign(_r5) |
| popQuad(wa5) |
| uintDispatch() |
| |
| uintAlign(_r6) |
| if ARM64 or ARM64E |
| popQuad(wa6) |
| uintDispatch() |
| else |
| break |
| end |
| |
| uintAlign(_r7) |
| if ARM64 or ARM64E |
| popQuad(wa7) |
| uintDispatch() |
| else |
| break |
| end |
| |
| uintAlign(_fr0) |
| popVec(wfa0) |
| uintDispatch() |
| |
| uintAlign(_fr1) |
| popVec(wfa1) |
| uintDispatch() |
| |
| uintAlign(_fr2) |
| popVec(wfa2) |
| uintDispatch() |
| |
| uintAlign(_fr3) |
| popVec(wfa3) |
| uintDispatch() |
| |
| uintAlign(_fr4) |
| popVec(wfa4) |
| uintDispatch() |
| |
| uintAlign(_fr5) |
| popVec(wfa5) |
| uintDispatch() |
| |
| uintAlign(_fr6) |
| popVec(wfa6) |
| uintDispatch() |
| |
| uintAlign(_fr7) |
| popVec(wfa7) |
| uintDispatch() |
| |
| # sc0 holds the destination pointer for return values passed on the stack |
| |
| uintAlign(_stack) |
| popInt64(sc1) |
| subp SlotSize, sc0 |
| storeq sc1, [sc0] |
| uintDispatch() |
| |
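| # _stack_vector copies a full 16-byte value (a vector) from the IPInt stack |
| # into the next return stack slot |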
| uintAlign(_stack_vector) |
| subp 2 * SlotSize, sc0 |
| loadq [sp], sc1 |
| storeq sc1, [sc0] |
| loadq 8[sp], sc1 |
| storeq sc1, 8[sc0] |
| addq StackValueSize, sp |
| uintDispatch() |
| |
| uintAlign(_ret) |
| jmp .ipint_exit |
| |
| # MC = location in argumINT bytecode |
| # csr0 = tmp |
| # csr1 = dst |
| # csr2 = src |
| # csr3 |
| # csr4 = for dispatch |
| |
| # const argumINTDst = csr3 |
| # const argumINTSrc = PB |
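| # argumINT walks the argument metadata bytecode: each handler below copies one incoming |
| # argument from its register (or from the caller's stack via argumINTSrc) into the next |
| # local slot at argumINTDst, stepping the destination down by LocalSize per local |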
| |
| argumINTAlign(_a0) |
| _argumINT_begin: |
| storeq wa0, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| |
| argumINTAlign(_a1) |
| storeq wa1, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| |
| argumINTAlign(_a2) |
| if ARM64 or ARM64E or X86_64 |
| storeq wa2, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| else |
| break |
| end |
| |
| |
| argumINTAlign(_a3) |
| if ARM64 or ARM64E or X86_64 |
| storeq wa3, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| else |
| break |
| end |
| |
| argumINTAlign(_a4) |
| if ARM64 or ARM64E or X86_64 |
| storeq wa4, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| else |
| break |
| end |
| |
| argumINTAlign(_a5) |
| if ARM64 or ARM64E or X86_64 |
| storeq wa5, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| else |
| break |
| end |
| |
| argumINTAlign(_a6) |
| if ARM64 or ARM64E |
| storeq wa6, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| else |
| break |
| end |
| |
| argumINTAlign(_a7) |
| if ARM64 or ARM64E |
| storeq wa7, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| else |
| break |
| end |
| |
| argumINTAlign(_fa0) |
| storev wfa0, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| |
| argumINTAlign(_fa1) |
| storev wfa1, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| |
| argumINTAlign(_fa2) |
| storev wfa2, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| |
| argumINTAlign(_fa3) |
| storev wfa3, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| |
| argumINTAlign(_fa4) |
| storev wfa4, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| |
| argumINTAlign(_fa5) |
| storev wfa5, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| |
| argumINTAlign(_fa6) |
| storev wfa6, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| |
| argumINTAlign(_fa7) |
| storev wfa7, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| |
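| # stack-passed arguments: copy from the caller's stack (argumINTSrc) into the next local slot |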
| argumINTAlign(_stack) |
| loadq [argumINTSrc], csr0 |
| addp SlotSize, argumINTSrc |
| storeq csr0, [argumINTDst] |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| |
| argumINTAlign(_stack_vector) |
| loadq [argumINTSrc], csr0 |
| storeq csr0, [argumINTDst] |
| loadq 8[argumINTSrc], csr0 |
| storeq csr0, 8[argumINTDst] |
| addp 2 * SlotSize, argumINTSrc |
| subp LocalSize, argumINTDst |
| argumINTDispatch() |
| |
| argumINTAlign(_end) |
| jmp .ipint_entry_end_local |
| |
| if ARM64E |
| global _wasmTailCallTrampoline |
| _wasmTailCallTrampoline: |
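| # untag the return address against ws2 (cfr + CallerFrameAndPCSize, computed before the |
| # gate jump above), then jump to the callee entrypoint in ws0 |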
| untagReturnAddress ws2 |
| jmp ws0, WasmEntryPtrTag |
| end |
| |
| # Restore frame return stub: only used when the JIT cage is disabled. |
| # When the JIT cage is enabled, the wasmRestoreFrame gate thunk handles this instead. |
| # At entry, return values are in the wa/wfa registers and at sp (these must not be clobbered). |
| global _wasm_restore_frame_return |
| _wasm_restore_frame_return: |
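| # reload the instance from the frame's CodeBlock slot and re-derive the memory registers |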
| loadp CodeBlock[cfr], wasmInstance |
| ipintReloadMemory(ws0) |
| |
| if ARM64E |
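| # untag lr against the sp it was signed with (cfr + CallerFrameAndPCSize), restore cfr, |
| # then re-sign against the current sp before returning |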
| loadp ReturnPC[cfr], lr |
| addp CallerFrameAndPCSize, cfr, ws0 |
| untagReturnAddress ws0 |
| loadp [cfr], cfr |
| tagReturnAddress sp |
| ret |
| elsif ARM64 |
| loadpairq [cfr], cfr, lr |
| ret |
| elsif X86_64 |
| loadp ReturnPC[cfr], ws1 |
| loadp [cfr], cfr |
| jmp ws1 |
| end |