| //===-- mulsf3.S - single-precision floating point multiplication ---------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements single-precision soft-float multiplication with the |
| // IEEE-754 default rounding (to nearest, ties to even), in optimized AArch32 |
| // assembly language suitable to be built as either Arm or Thumb2. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "../assembly.h" |
| |
| |
| .syntax unified |
| .text |
| .p2align 2 |
| |
| #if __ARM_PCS_VFP |
| DEFINE_COMPILERRT_FUNCTION(__mulsf3) |
| push {r4, lr} |
| vmov r0, s0 |
| vmov r1, s1 |
| bl __aeabi_fmul |
| vmov s0, r0 |
| pop {r4, pc} |
| #else |
| DEFINE_COMPILERRT_FUNCTION_ALIAS(__mulsf3, __aeabi_fmul) |
| #endif |
| |
| DEFINE_COMPILERRT_FUNCTION(__aeabi_fmul) |
| |
| // Check if either input exponent is 00 or FF (i.e. not a normalized number), |
| // and if so, branch out of line. If we don't branch out of line, then we've |
| // also extracted the exponents of the input values r0/r1 into bits 16..23 of |
| // r2/r3. But if we do, then that hasn't necessarily been done (because the |
| // second AND might have been skipped). |
| mov r12, #0xFF0000 |
| ands r2, r12, r0, lsr #7 // sets Z if exponent of x is 0 |
| andsne r3, r12, r1, lsr #7 // otherwise, sets Z if exponent of y is 0 |
| teqne r2, r12 // otherwise, sets Z if exponent of x is FF |
| teqne r3, r12 // otherwise, sets Z if exponent of y is FF |
| beq LOCAL_LABEL(uncommon) // branch out of line to handle inf/NaN/0/denorm |
| |
| // Calculate the sign of the result, and put it in an unused bit of r2. |
| teq r0, r1 // sets N to the XOR of x and y's sign bits |
| orrmi r2, r2, #0x100 // if N set, set bit 8 of r2 |
| |
| // Move the input mantissas to the high end of r0/r1, each with its leading |
| // bit set explicitly, so that they're in the right form to be multiplied. |
| mov r12, #0x80000000 |
| orr r0, r12, r0, lsl #8 |
| orr r1, r12, r1, lsl #8 |
| |
| // Now we're ready to multiply mantissas. This is also the place we'll come |
| // back to after decoding denormal inputs. The denormal decoding will also |
| // have to set up the same register contents: |
| // - decoded fractions at the top of r0 and r1 |
| // - exponents in r2 and r3, starting at bit 16 |
| // - output sign in r2 bit 8 |
| LOCAL_LABEL(mul): |
| |
| // Here we multiply the mantissas, and compute the output exponent by adding |
| // the input exponents and rebiasing. These operations are interleaved to |
| // use a delay slot. |
| // |
| // The exponent is rebiased by subtracting 0x80, rather than the 0x7F you'd |
| // expect. That compensates for the leading bit of the mantissa overlapping |
| // it, when we recombine the exponent and mantissa by addition. |
| add r2, r2, r3 // r2 has sum of exponents, freeing up r3 |
| umull r1, r3, r0, r1 // r3:r1 has the double-width product |
| sub r2, r2, #(0x80 << 16) // rebias the summed exponent |
| |
| // Compress the double-word product into just the high-order word r3, by |
| // setting its bit 0 if any bit of the low-order word is nonzero. This |
| // changes the represented value, but not by nearly enough to affect |
| // rounding, because rounding only depends on the bit below the last output |
| // bit, and the general question of whether _any_ nonzero bit exists below |
| // that. |
| cmp r1, #0 // if low word of full product is nonzero |
| orrne r3, r3, #1 // then set LSB of high word |
| |
| // The two inputs to UMULL had their high bits set, that is, were at least |
| // 0x80000000. So the 64-bit product was at least 0x4000000000000000, i.e. |
| // the high bit of the product could be at the top of the word or one bit |
| // below. Check which, by experimentally shifting left, and then undoing it |
| // via RRX if we turned out to have shifted off a 1 bit. |
| lsls r3, r3, #1 // shift left, setting C to the bit shifted off |
| rrxcs r3, r3 // if that bit was 1, put it back again |
| |
| // That ensured the leading 1 bit of the product is now the top of r3, but |
| // also, set C if the leading 1 was _already_ in the top bit. So now we know |
| // whether to increment the exponent. The following instruction does the |
| // conditional increment (because it's ADC), but also, copies the exponent |
| // field from bit 16 of r2 into bit 0, so as to place it just below the |
| // output sign bit. |
| // |
| // So, if the number hasn't overflowed or underflowed, the low 9 bits of r2 |
| // are exactly what we need to combine with the rounded mantissa. But the |
| // full output exponent (with extra bits) is still available in the high half |
| // of r2, so that we can check _whether_ we overflowed or underflowed. |
| adc r2, r2, r2, asr #16 |
| |
| // Recombine the exponent and mantissa, doing most of the rounding as a side |
| // effect: we shift the mantissa right so as to put the round bit into C, and |
| // then we recombine with the exponent using ADC, to increment the mantissa |
| // if C was set. |
| movs r12, r3, lsr #8 |
| adc r0, r12, r2, lsl #23 |
| |
| // To complete the rounding, we must check for the round-to-even tiebreaking |
| // case, by checking if we're in the exact halfway case, which occurs if and |
| // only if we _did_ round up (we can tell this because C is still set from |
| // the MOVS), and also, no bit of r3 is set _below_ the round bit. |
| // |
| // We combine this with an overflow check, so that C ends up set if anything |
| // weird happened, and clear if we're completely finished and can return. |
| // |
| // The best instruction sequence for this part varies between Arm and Thumb. |
| #if !__thumb__ |
| // Arm state: if C was set then we check the low bits of r3, so that Z ends |
| // up set if we need to round to even. |
| // |
| // (We rely here on Z reliably being clear to begin with, because shifting |
| // down the output mantissa definitely gave a nonzero output. Also, the TST |
| // doesn't change C, so if Z does end up set, then C was also set.) |
| // |
| // Then, if we're not rounding to even, we do a CMP which sets C if there's |
| // been an overflow or an underflow. An overflow could occur for an output |
| // exponent as low as 0xFC, because we might increment the exponent by 1 when |
| // renormalizing, by another when recombining with the mantissa, and by one |
| // more if rounding up causes a carry off the top of the mantissa. An |
| // underflow occurs only if the output exponent is negative (because it's |
| // offset by 1, so an exponent of 0 will be incremented to 1), in which case |
| // the top 8 bits of r2 will all be set. Therefore, an unsigned comparison to |
| // see if r2 > 0xFC0000 will catch all overflow and underflow cases. It also |
| // catches a few very large cases that _don't_ quite overflow (exponents of |
| // 0xFC and above that don't get maximally unlucky); those will also be |
| // handled by the slow path. |
| tstcs r3, #0x7F |
| cmpne r2, #0xFC0000 |
| #else |
| // In Thumb, switching between different conditions has a higher cost due to |
| // the (implicit in this code) IT instructions, so we prefer a strategy that |
| // uses CC and CS conditions throughout, at the cost of requiring some extra |
| // cleanup instructions on the slow path. |
| // |
| // If C is set (and hence round-to-even is a possibility), the basic idea is |
| // to shift the full result word (r3) left by 25, leaving only its bottom 7 |
| // bits, which are now the top 7 bits; then we want to set C iff these are 0. |
| // |
| // The "CMP x,y" instruction sets C if y > x (as unsigned integers). So this |
| // could be done in one instruction if only we had a register to use as x, |
| // which has 0 in the top 7 bits and at least one nonzero. Then we could |
| // compare that against the shifted-up value of r3, setting C precisely if |
| // the top 7 bits of y are greater than 0. And happily, we _do_ have such a |
| // register! r12 contains the shifted-down mantissa, which is guaranteed to |
| // have a 1 in bit 23, and 0 above that. |
| // |
| // The shift of r3 happens only in the second operand of the compare, so we |
| // don't lose the original value of r3 in this process. |
| // |
| // The check for over/underflow is exactly as in the Arm branch above, except |
| // based on a different condition. |
| cmpcs r12, r3, lsl #25 // now C is set iff we're rounding to even |
| cmpcc r2, #0xFC0000 // and now it's also set if we've over/underflowed |
| #endif |
| |
| // That's all the checks for difficult cases done. If C is clear, we can |
| // return. |
| bxcc lr |
| |
| // Now the slower path begins. We have to recover enough information to |
| // handle all of round-to-even, overflow and underflow. |
| // |
| // Round to even is the most likely of these, so we detect it first and |
| // handle it as fast as possible. |
| |
| #if __thumb__ |
| // First, Thumb-specific compensation code. The Arm branch of the #if above |
| // will have set Z=0 to indicate round to even, but the Thumb branch didn't |
| // leave any unambiguous indicator of RTE, so we must retest by checking all |
| // the bits shifted off the bottom of the mantissa to see if they're exactly |
| // the half-way value. |
| lsl r12, r3, #24 // r12 = round bit and everything below |
| cmp r12, #0x80000000 // set Z if that is exactly 0x80000000 |
| #endif |
| |
| // Now Z is clear iff we have already rounded up and now must replace that |
| // with rounding to even, which is done by just clearing the low bit of the |
| // mantissa. |
| biceq r0, r0, #1 |
| |
| // Redo the over/underflow check (the same way as in both branches above), |
| // and if it doesn't report a danger, we can return the rounded-to-even |
| // answer. |
| cmp r2, #0xFC0000 // check for over/underflow |
| bxcc lr // and return if none. |
| |
| // Now we only have overflow and underflow left to handle. First, find out |
| // which we're looking at. This is easy by testing the top bit of r2, but |
| // even easier by using the fact that the possible positive and negative |
| // values of r2 are widely enough separated that the 0xFC0000 subtracted by |
| // the CMP above won't have made any difference. So the N flag output from |
| // that comparison _already_ tells us which condition we have: if N is set we |
| // have underflow, and if N is clear, overflow. |
| bpl LOCAL_LABEL(overflow) |
| |
| // Here we're handling underflow. |
| |
| // Add the IEEE 754:1985 exponent bias which funder will expect. This also |
| // brings the exponent back into a range where it can't possibly have carried |
| // into the sign bit, so the output sign will now be right. |
| add r0, r0, #(0xC0 << 23) |
| |
| // Determine whether we rounded up, down or not at all. |
| lsls r2, r3, #1 // input mantissa, without its leading 1 |
| subs r1, r2, r0, lsl #9 // subtract the output mantissa (likewise) |
| |
| // And let funder handle the rest. |
| b SYMBOL_NAME(__compiler_rt_funder) |
| |
| LOCAL_LABEL(overflow): |
| // We come here to handle overflow, but it's not guaranteed that an overflow |
| // has actually happened: our check on the fast path erred on the side of |
| // caution, by catching any output exponent that _could_ cause an overflow. |
| // So first check whether this really is an overflow, by extracting the |
| // output exponent. Exponent 0xFF, or anything that wrapped round to having |
| // the high bit clear, are overflows; 0xFE down to 0xFC are not overflows. |
| // |
| // The value in r0 is correct to return, if there's no overflow. |
| add r12, r0, #(1 << 23) // add 1 to the exponent so 0xFF wraps to 0 |
| movs r12, r12, lsl #1 // test the top bit of the modified value |
| bxmi lr // if top bit is still 1, not an overflow |
| |
| // This is an overflow, so we need to replace it with an appropriately signed |
| // infinity. First we correct the sign by applying a downward bias to the |
| // exponent (the one suggested in IEEE 754:1985, which was chosen to bring |
| // all possible overflowed results back into range). |
| subs r0, r0, #(0xC0 << 23) |
| |
| // Now the sign bit of r0 is correct. Replace everything else with the |
| // encoding of an infinity. |
| mov r1, #0xFF |
| and r0, r0, #0x80000000 |
| orr r0, r0, r1, lsl #23 |
| bx lr |
| |
| LOCAL_LABEL(uncommon): |
| // Handle zeros, denorms, infinities and NaNs. We arrive here knowing that |
| // we've at least done the first _two_ instructions from the entry point, |
| // even if all the rest were skipped. So r2 contains the sign and exponent of |
| // x in bits 16..23, and r12 = 0xFF << 16. |
| // |
| // So, first repeat some instructions from the prologue, which were either |
| // conditionally skipped in the sequence leading to the branch, or skipped |
| // because they happened after the branch. |
| and r3, r12, r1, lsr #7 // get exponent of y in r3 bits 16..23 |
| teq r0, r1 // calculate the sign of the result |
| orrmi r2, r2, #0x100 // and put it in bit 8 of r2 as before |
| |
| // Check for infinities and NaNs, by testing each of r2,r3 to see if it's at |
| // least 0xFF0000 (hence the exponent field is equal to 0xFF). |
| cmp r2, r12 |
| cmplo r3, r12 |
| bhs LOCAL_LABEL(inf_NaN) |
| |
| // If we didn't take that branch, then we have only finite numbers, but at |
| // least one is denormal or zero. A zero makes the result easy (and also is a |
| // more likely input than a denormal), so check those first, as fast as |
| // possible. |
| movs r12, r0, lsl #1 // Z set if x == 0 |
| movsne r12, r1, lsl #1 // now Z set if either input is 0 |
| moveq r0, r2, lsl #23 // in either case, make 0 of the output sign |
| bxeq lr // and return it |
| |
| // Now we know we only have denormals to deal with. Call fnorm2 to sort |
| // them out, and rejoin the main code path above. |
| and r12, r2, #0x100 // save the result sign from r2 |
| lsr r2, #16 // shift extracted exponents down to bit 0 |
| lsr r3, #16 // where fnorm2 will expect them |
| push {r0, r1, r2, r3, r12, lr} |
| mov r0, sp // tell fnorm2 where to find its data |
| bl SYMBOL_NAME(__compiler_rt_fnorm2) |
| pop {r0, r1, r2, r3, r12, lr} |
| lsl r3, #16 // shift exponents back up to bit 16 |
| orr r2, r12, r2, lsl #16 // and put the result sign back in r2 |
| b LOCAL_LABEL(mul) |
| |
| LOCAL_LABEL(inf_NaN): |
| // We come here if at least one input is a NaN or infinity. If either or both |
| // inputs are NaN then we hand off to fnan2 which will propagate a NaN from |
| // the input; otherwise any multiplication involving infinity returns |
| // infinity, unless it's infinity * 0 which is an invalid operation and |
| // returns NaN again. |
| mov r12, #0xFF000000 |
| cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN |
| blo SYMBOL_NAME(__compiler_rt_fnan2) |
| cmp r12, r1, lsl #1 |
| blo SYMBOL_NAME(__compiler_rt_fnan2) |
| |
| // NaNs are dealt with, so now we have at least one infinity. Check if the |
| // other operand is 0. This is conveniently done by XORing the two: because |
| // we know that the low 31 bits of one operand are exactly 0x7F800000, we can |
| // test if the low 31 bits of the other one are all 0 by checking whether the |
| // low 31 bits of (x XOR y) equal 0x7F800000. |
| eor r3, r0, r1 |
| cmp r12, r3, lsl #1 // if inf * 0, this sets Z |
| lsr r0, r12, #1 // set up return value of +infinity |
| orrne r0, r0, r2, lsl #23 // if not inf * 0, put on the output sign |
| orreq r0, r0, #0x400000 // otherwise, set the 'quiet NaN' bit |
| bx lr // and return |
| |
| END_COMPILERRT_FUNCTION(__aeabi_fmul) |
| |
| NO_EXEC_STACK_DIRECTIVE |