| //===-- divsf3.S - single-precision floating point division ---------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements single-precision soft-float division with the IEEE-754 |
| // default rounding (to nearest, ties to even), in optimized AArch32 assembly |
| // language suitable to be built as either Arm or Thumb2. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "../assembly.h" |
| |
| |
| .syntax unified |
| .text |
| .p2align 2 |
| |
| #if __ARM_PCS_VFP |
| DEFINE_COMPILERRT_FUNCTION(__divsf3) |
| push {r4, lr} |
| vmov r0, s0 |
| vmov r1, s1 |
| bl __aeabi_fdiv |
| vmov s0, r0 |
| pop {r4, pc} |
| #else |
| DEFINE_COMPILERRT_FUNCTION_ALIAS(__divsf3, __aeabi_fdiv) |
| #endif |
| |
| DEFINE_COMPILERRT_FUNCTION(__aeabi_fdiv) |
| // Extract the exponents of the inputs into r2 and r3, occupying bits 16-23 |
| // of each register so that there will be space lower down to store extra |
| // data without exponent arithmetic carrying into it. In the process, check |
| // both exponents for 00 or FF and branch out of line to handle all the |
| // uncommon types of value (infinity, NaN, zero, denormals). |
| // |
| // Chaining conditional instructions like this means that the second |
| // instruction (setting up r3) might not be executed at all, so fdiv_uncommon |
| // will have to redo it just in case. That saves an instruction here, |
| // executed for _all_ inputs, and moves it to the uncommon path run for only |
| // some inputs. |
| mov r12, #0xFF0000 |
| ands r2, r12, r0, lsr #7 // r2 has exponent of numerator. (Is it 0?) |
| andsne r3, r12, r1, lsr #7 // r3 has exponent of denominator. (Is it 0?) |
| teqne r2, r12 // if neither was 0, is one FF? |
| teqne r3, r12 // or the other? |
| beq LOCAL_LABEL(uncommon) // branch out of line if any answer was yes |
| |
| // Calculate the output sign, which is always just the XOR of the input |
| // signs. Store it in bit 8 of r2, below the numerator exponent. |
| teq r0, r1 // is the output sign bit 1? |
| orrmi r2, r2, #0x100 // if so, set bit 8 of r2 |
| |
| // Isolate the mantissas of both values, by setting bit 23 of each one and |
| // clearing the 8 bits above that. |
| // |
| // In the process, swap the register allocations (which doesn't cost extra |
| // instructions if we do it as part of this manipulation). We want the |
| // numerator not to be in r0, because r0 is where we'll build up the quotient |
| // while subtracting things from the numerator. |
| orr r12, r0, #1 << 23 |
| orr r0, r1, #1 << 23 |
| bic r1, r12, #0xFF000000 |
| bic r0, r0, #0xFF000000 |
| |
| LOCAL_LABEL(div): |
| // Start of the main division. We get here knowing that: |
| // |
| // r0 = mantissa of denominator, with the leading 1 at bit 23 |
| // r1 = mantissa of numerator, similarly |
| // r2 = (exponent of numerator << 16) + (result sign << 8) |
| // r3 = (exponent of denominator << 16) |
| |
| push {r14} // we'll need an extra register |
| |
| // Calculate the initial result exponent by just subtracting the two input |
| // exponents. This doesn't affect the sign bit lower down in r2. |
| sub r2, r2, r3 |
| |
| // That initial exponent might need to be adjusted by 1, depending on whether |
| // dividing the mantissas gives a value >=1 or <1. We don't need to wait |
| // until the division is finished to work that out: we can tell immediately |
| // by just comparing the mantissas. |
| // |
| // The basic idea is to do the comparison in a way that sets the C flag if |
| // numerator >= denominator. Then we recombine the sign and exponent by doing |
| // "ADC r2, r2, r2, asr #16": the exponent in the top half of r2 is shifted |
| // down to the low 8 bits, just below the sign bit, and using ADC rather than |
| // ADD folds in the conditional increment from the mantissa comparison. |
| // |
| // If we're not incrementing the output exponent, we instead shift the |
| // numerator mantissa left by 1, so that it _is_ greater than the denominator |
| // mantissa. Otherwise we'd generate only a 22-bit quotient, instead of 23. |
| // |
| // The exponent also needs to be rebiased, so that dividing two numbers the |
| // same gives an output exponent of 0x7F. If the two inputs have the same |
| // exponent then we'll have computed an exponent of 0 via the SUB instruction |
| // above; if the mantissas are the same as well then the ADC will increment |
| // it; also, the leading bit of the quotient will increment the exponent |
| // again when we recombine it with the output mantissa later. So we need to |
| // add (0x7F - 2) to the mantissa now, to make an exponent of 0 from the SUB |
| // come to 0x7F after both of those increments. |
| // |
| // Putting all of that together, what we _want_ to do is this: |
| // |
| // [#1] CMP r1, r0 // set C if num >= den |
| // [#2] MOVLO r1, r1, lsl #1 // if num < den, shift num left |
| // [#3] ADD r2, r2, #0x7D0000 // rebias exponent |
| // [#4] ADC r2, r2, r2, asr #16 // combine sign + exp + adjustment |
| // |
| // However, we only do the first of those four instructions right here. The |
| // other three are distributed through the code below, after unrelated load |
| // or multiply instructions which will have a result delay slot on simple |
| // CPUs. Each is labelled "exponent setup [#n]" in a comment. |
| // |
| // (Since instruction #4 depends on the flags set up by #2, we must avoid |
| // clobbering the flags in _any_ of the instructions interleaved with this!) |
| cmp r1, r0 // exponent setup [#1] |
| |
| // Start the mantissa division by making an approximation to the reciprocal |
| // of the denominator. We first obtain an 8-bit approximation using a table |
| // lookup indexed by the top 7 denominator bits (counting the leading 1, so |
| // really there are only 6 bits in the table index). |
| // |
| // (r0 >> 17) is the table index, and its top bit is always set, so it ranges |
| // from 64 to 127 inclusive. So we point the base register 64 bytes before |
| // the actual table. |
| adr r12, LOCAL_LABEL(tab) - 64 |
| #if __thumb__ |
| // Thumb can't do this particular shift+add+load in one instruction - it only |
| // supports left shifts of 0 to 3 bits, not right shifts of 17. So we must |
| // calculate the load offset separately. |
| add r14, r12, r0, lsr #17 |
| ldrb r14, [r14] |
| #else |
| ldrb r14, [r12, r0, lsr #17] |
| #endif |
| |
| // Now do an iteration of Newton-Raphson to improve that 8-bit approximation |
| // to have 15-16 accurate bits. |
| // |
| // Basics of Newton-Raphson for finding a reciprocal: if you want to find 1/d |
| // and you have some approximation x, your next approximation is X = x(2-dx). |
| // Looked at one way, this is the result of applying the N-R formula |
| // X=x-f(x)/f'(x) to the function f(x) = 1/x - d. Another way to look at it |
| // is to suppose that dx = 1 - e, for some e which is small (because dx is |
| // already reasonably close to 1). Then you want to double the number of |
| // correct bits in the next approximation, i.e. square the error. So you want |
| // dX = 1-e^2 = (1-e)(1+e) = dx(2-dx). Cancelling d gives X = x(2-dx) again. |
| // |
| // In this situation, we're working in fixed-point integers rather than real |
| // numbers, and all the scales are different: |
| // * our input denominator d is in the range [2^23,2^24) |
| // * our input approximation x is in the range [2^7,2^8) |
| // * we want the output approximation to be in the range [2^15,2^16) |
| // Those factors combine to mean that we want |
| // x(2^32-dx) / 2^23 |
| // = (2^9 x) - (dx^2 / 2^23) |
| // |
| // But we also want to compute this using ordinary MUL, not a long multiply |
| // instruction (those are slower). So we need to worry about the product |
| // overflowing. dx fits in 32 bits, because it's the product of something |
| // <2^24 with something <2^8; but we must shift it right before multiplying |
| // by x again. |
| |
| mul r12, r0, r14 // r12 = dx |
| movlo r1, r1, lsl #1 // exponent setup [#2] in the MUL delay slot |
| mvn r12, r12, lsr #8 // r12 ~= -dx/2^8 |
| mul r3, r12, r14 // r3 ~= -dx^2/2^8 |
| mov r14, r14, lsl #9 // r14 = 2^9 x |
| add r14, r14, r3, asr #15 // r14 ~= 2^9 x - dx^2 / 2^23 |
| |
| // Now r14 is a 16-bit approximation to the reciprocal of the input mantissa, |
| // scaled by 2^39 (so that the min mantissa 2^23 would have reciprocal 2^16 |
| // in principle, and the max mantissa 2^24-1 would have reciprocal just over |
| // 2^15). The error is always negative (r14 is an underestimate of the true |
| // value), and the maximum error is 6 and a bit ULP (that is, the true |
| // reciprocal is strictly less than (r14+7)). Also, r14 is always strictly |
| // less than 0x10000 (even in the case of the min mantissa, where the true |
| // value would be _exactly_ 0x10000), which eliminates a case of integer |
| // overflow. |
| // |
| // All of these properties of the reciprocal approximation are checked by |
| // exhaustively iterating over all 2^23 possible input mantissas. (The nice |
| // thing about doing this in single rather than double precision!) |
| // |
| // Now we extract most of the quotient by two steps of long division, using |
| // the reciprocal estimate to identify a multiple of the denominator to |
| // subtract from the numerator. To avoid integer overflow, the numerator |
| // mantissa is shifted down 8 bits so that it's less than 0x10000. After we |
| // calculate an approximate quotient, we shift the numerator left and |
| // subtract that multiple of the denominator, moving the next portion of the |
| // numerator into range for the next iteration. |
| |
| // First iteration of long division. We shift the numerator left 11 bits, and |
| // since the quotient approximation is scaled by 2^31, we must shift that |
| // right by 20 to make the right product to subtract from the numerator. |
| mov r12, r1, lsr #8 // shift the numerator down |
| mul r12, r14, r12 // make the quotient approximation |
| mov r1, r1, lsl #11 // shift numerator left, ready for subtraction |
| mov r3, r12, lsr #20 // make first 12-bit block of quotient bits |
| mls r1, r0, r3, r1 // subtract that multiple of den from num |
| |
| add r2, r2, #0x7D0000 // exponent setup [#3] in the MLS delay slot |
| |
| // Second iteration of long division. Differences from the first step: this |
| // time we shift the numerator 12 bits instead of 11, so that the total of |
| // both steps is 23 bits, i.e. we've shifted up by exactly the full width of |
| // the output mantissa. Also, the block of output quotient bits is left in a |
| // different register: it was in r3 the first time, and this time it's in |
| // r12, so that we still have both available at the end of the process. |
| mov r12, r1, lsr #8 // shift the numerator down |
| mul r12, r14, r12 // make the quotient approximation |
| mov r1, r1, lsl #12 // shift numerator left, ready for subtraction |
| mov r12, r12, lsr #19 // make second 11-bit block of quotient |
| mls r1, r0, r12, r1 // subtract that multiple of den from num |
| |
| adc r2, r2, r2, asr #16 // exponent setup [#4] in the MLS delay slot |
| |
| // Now r1 contains the original numerator, shifted left 23, minus _some_ |
| // multiple of the original denominator (which is still in r0). The bounds on |
| // the error in the above steps should make the error at most 1: that is, we |
| // may have to subtract the denominator one more time to make r1 < r0, and |
| // increment the quotient by one more. |
| // |
| // Our quotient is still in two pieces, computed separately in the above long |
| // division steps. We fold the final increment into the same instruction that |
| // recombines them, by doing the comparison in such a way that it sets the |
| // carry flag if the increment is needed. |
| |
| cmp r1, r0 // Set carry flag if num >= den |
| subhs r1, r1, r0 // If so, subtract den from num |
| adc r3, r12, r3, lsl #12 // Recombine quotient halves, plus optional +1 |
| |
| // We've finished with r14 as a temporary register, so we can unstack it now. |
| pop {r14} |
| |
| // Now r3 contains the _rounded-down_ output quotient, and r1 contains the |
| // remainder. That is, (denominator * r3 + r1) = (numerator << 23), and |
| // 0 <= r1 < denominator. |
| // |
| // Next we must round to nearest, by checking if r1 is greater than half the |
| // denominator. In division, it's not possible to hit an exact round-to-even |
| // halfway case, so we don't need to spend any time checking for it. |
| // |
| // Proof of no round-to-even: define the 'width' of a dyadic rational to be |
| // the distance between the lowest and highest 1 bits in its binary |
| // representation, or equivalently, the index of its high bit if you scale it |
| // by a power of 2 to make it an odd integer. E.g. any actual power of 2 has |
| // width 0, and all of 0b11110, 0b1111, 0b11.11 and 0b0.01111 have width 3. |
| // Then for any dyadic rationals a,b, width(ab) >= width(a)+width(b). Let w |
| // be the maximum width that the input precision supports (so that for single |
| // precision, w=23). Then if some division n/d were a round-to-even case, the |
| // true quotient q=n/d would have width exactly w+1. But we have qd=n, so |
| // width(n) >= width(q)+width(d) > w, which can't happen, because n is in the |
| // input precision, hence had width <= w.) |
| // |
| // So we don't need to check for an exact _halfway_ case and clear the low |
| // bit of the quotient after rounding up, as addition and multiplication both |
| // need to do. But we do need to remember if the quotient itself was exact, |
| // that is, if there was no remainder at all. That's needed in underflow |
| // handling. |
| |
| // The rounding check wants to compare remainder with denominator/2. But of |
| // course in integers it's easier to compare 2*remainder with denominator. So |
| // we start by shifting the remainder left by 1, and in the process, set Z if |
| // it's exactly 0 (i.e. the result needs no rounding at all). |
| lsls r1, r1, #1 |
| // Now trial-subtract the denominator. We don't do this at all if the result |
| // was exact. If we do do it, r1 goes negative precisely if we need to round |
| // up, which sets the C flag. (The previous instruction will have left C |
| // clear, since r1 had its top 8 bits all clear. So now C is set _only_ if |
| // we're rounding up.) |
| subsne r1, r1, r0 |
| // Recombine the quotient with the sign + exponent, and use the C flag from |
| // the previous instruction to increment the quotient if we're rounding up. |
| adc r0, r3, r2, lsl #23 |
| |
| // If we haven't either overflowed or underflowed, we're done. We can |
| // identify most of the safe cases by doing an unsigned comparison of the |
| // initial output exponent (in the top half of r2) with 0xFC: if 0 <= r2 < |
| // 0xFC0000 then we have neither underflow nor overflow. |
| // |
| // Rationale: the value in the top half of r2 had three chances to be |
| // incremented before becoming the exponent field of the actual output float. |
| // It was incremented if we found the numerator mantissa was >= the |
| // denominator (producing the value in the _bottom_ half of r2, which we just |
| // ADCed into the output). Then it gets unconditionally incremented again |
| // when the ADC combines it with the leading mantissa bit. And finally, |
| // round-up might increment it a third time. So 0xFC is the smallest value |
| // that can possibly turn into the overflowed value 0xFF after all those |
| // increments. |
| // |
| // On the underflow side, (top half of r2) = 0 corresponds to a value of 1 in |
| // the final result's exponent field (and then rounding might increase it |
| // further); if the exponent was less than that then r2 wraps round and looks |
| // like a very large positive integer from the point of view of this unsigned |
| // comparison. |
| cmp r2, #0xFC0000 |
| bxlo lr |
| |
| // The same comparison will have set the N and V flags to reflect the result |
| // of comparing r2 with 0xFC0000 as a _signed_ integer. That reliably |
| // distinguishes potential underflow (r2 is negative) from potential overflow |
| // (r2 is positive and at least 0xFC0000) |
| bge LOCAL_LABEL(overflow) |
| |
| // Here we might or might not have underflow (but we know we don't have |
| // overflow). To check more carefully, we look at the _bottom_ half of r2, |
| // which contains the exponent after the first adjustment (for num >= denom), |
| // That is, it's still off by 1 (compensating for the leading quotient bit), |
| // and is also before rounding. |
| // |
| // We neglect the effect of rounding: division results that are tiny (less |
| // than the smallest normalised number) before rounding, but then round up to |
| // the smallest normal number, are an acceptable edge case to handle slowly. |
| // We pass those to funder without worrying about them. |
| // |
| // So we want to check whether the bottom half of r2 was negative. It would |
| // be nice to check bits 8-15 of it, but unfortunately, it's already been |
| // combined with the sign (at bit 8), so those bits don't tell us anything |
| // useful. Instead we look at the top 4 bits of the exponent field, i.e. the |
| // 0xF0 bits. The largest _non_-overflowing exponent that might reach here is |
| // less than 3, so it doesn't reach those bits; the smallest possible |
| // underflow, obtained by dividing the smallest denormal by the largest |
| // finite number, is -151 (before the leading bit increments it), which will |
| // set the low 8 bits of r2 to 0x69. That is, the 0xF0 nibble of r2 will be |
| // 0x60 or greater for a (pre-rounding) underflow, and zero for a |
| // non-underflow. |
| |
| tst r2, #0xF0 |
| bxeq lr // no underflow after all; return |
| |
| // Rebias the exponent for funder, which also corrects the sign bit. |
| add r0, r0, #192 << 23 |
| // Tell funder whether the true value is greater or less than the number in |
| // r0. This is obtained from the sign of the remainder (still in r1), with |
| // the only problem being that it's currently reversed. So negate r1 (leaving |
| // 0 at 0 to indicate exactness). |
| rsbs r1, r1, #0 |
| b SYMBOL_NAME(__compiler_rt_funder) |
| |
| LOCAL_LABEL(overflow): |
| // Here we might or might not have overflow (but we know we don't have |
| // underflow). We must check whether we really have overflowed. |
| // |
| // For this it's easiest to check the exponent field in the actual output |
| // value in r0, after _all_ the adjustments have been completed. The largest |
| // overflowed exponent is 0x193, and the smallest exponent that can reach |
| // this is 0xFD (we checked against 0xFC above, but then the leading quotient |
| // bit incremented it). So it's enough to shift the output left by one |
| // (moving the exponent field to the top), increment it once more (so that |
| // the smallest overflowed exponent 0xFF wraps round to 0), and then compare |
| // against 0xFE000000 as an unsigned integer. |
| mov r12, r0, lsl #1 |
| add r12, r12, #1 << 24 |
| cmp r12, #0xFE << 24 // Check for exp = 253 or 254 |
| bxhs lr |
| // We have actual overflow. Rebias r0 to bring the exponent back into range, |
| // which ensures its sign is correct. Then make an infinity of that sign to |
| // return. |
| subs r0, r0, #0xC0 << 23 |
| movs r12, #0xFF // exponent of infinity |
| orrs r12, r12, r0, lsr #23 // exponent and sign at bottom of r12 |
| movs r0, r12, lsl #23 // shift it up to the top of r0 to return |
| bx lr |
| |
| LOCAL_LABEL(uncommon): |
| // We come here from the start of the function if either input is an uncommon |
| // value: zero, denormal, infinity or NaN. |
| // |
| // We arrive here with r12 = 0xFF000000, and r2 containing the exponent of x |
| // in bits 16..23. But r3 doesn't necessarily contain the exponent of y, |
| // because the instruction that set it up was conditional. So first we |
| // unconditionally repeat it. |
| and r3, r12, r1, lsr #7 |
| |
| // In all cases not involving a NaN as output, the sign of the output is made |
| // in the same way as for finite numbers, as the XOR of the input signs. So |
| // repeat the sign setup from the main branch. |
| teq r0, r1 // is the output sign bit 1? |
| orrmi r2, r2, #0x100 // if so, set bit 8 of r2 |
| |
| // Detect infinities and NaNs, by checking if either of r2 or r3 is at least |
| // 0xFF0000. |
| cmp r2, #0xFF0000 |
| cmplo r3, #0xFF0000 |
| bhs LOCAL_LABEL(inf_NaN) |
| |
| // Now we know there are no infinities or NaNs, but there's at least one zero |
| // or denormal. |
| movs r12, r1, lsl #1 // is y zero? |
| beq LOCAL_LABEL(divbyzero) // if so, go and handle division by zero |
| movs r12, r0, lsl #1 // is x zero? (now we know that y is not) |
| moveq r0, r2, lsl #23 // if so, 0/nonzero is just 0 (of right sign) |
| bxeq lr |
| |
| // Now we've eliminated zeroes as well, leaving only denormals: either x or |
| // y, or both, is a denormal. Call fnorm2 to convert both into a normalised |
| // mantissa and a (potentially small) exponent. |
| and r12, r2, #0x100 // save the result sign from r2 |
| lsr r2, #16 // shift extracted exponents down to bit 0 |
| lsr r3, #16 // where fnorm2 will expect them |
| push {r0, r1, r2, r3, r12, lr} |
| mov r0, sp // tell fnorm2 where to find its data |
| bl SYMBOL_NAME(__compiler_rt_fnorm2) |
| pop {r0, r1, r2, r3, r12, lr} |
| lsl r3, #16 // shift exponents back up to bit 16 |
| orr r2, r12, r2, lsl #16 // and put the result sign back in r2 |
| |
| // Now rejoin the main code path, having finished the setup it will expect: |
| // swap x and y, and shift the fractions back down to the low 24 bits. |
| mov r12, r0, lsr #8 |
| mov r0, r1, lsr #8 |
| mov r1, r12 |
| b LOCAL_LABEL(div) |
| |
| LOCAL_LABEL(inf_NaN): |
| // We come here if at least one input is a NaN or infinity. If either or both |
| // inputs are NaN then we hand off to fnan2 to propagate a NaN from the |
| // input. |
| mov r12, #0xFF000000 |
| cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN |
| blo SYMBOL_NAME(__compiler_rt_fnan2) |
| cmp r12, r1, lsl #1 |
| blo SYMBOL_NAME(__compiler_rt_fnan2) |
| |
| // No NaNs, so we have three options: inf/inf = NaN, inf/finite = inf, and |
| // finite/inf = 0. |
| |
| // If both operands are infinity, we return a NaN. Since we know at |
| // least _one_ is infinity, we can test this by checking if they're |
| // equal apart from the sign bits. |
| eor r3, r0, r1 |
| lsls r3, #1 // were all bits of XOR zero other than top? |
| beq LOCAL_LABEL(invalid) // if so, both operands are infinity |
| |
| // See if x is infinite |
| cmp r12, r0, lsl #1 // (r0 << 1) == 0xFF000000? |
| beq LOCAL_LABEL(infret) // if so, infinity/finite = infinity |
| |
| // y is infinite and x is not, so we return a zero of the |
| // combined sign. |
| eor r0, r0, r1 // calculate the right sign |
| and r0, r0, #0x80000000 // throw away everything else |
| bx lr |
| |
| LOCAL_LABEL(divbyzero): |
| // Here, we know y is zero. But we don't know if x is zero or nonzero. So we |
| // might be calculating 0/0 (invalid operation, generating a NaN), or |
| // nonzero/0 (the IEEE "division by zero" exception, generating infinity). |
| movs r12, r0, lsl #1 // is x zero too? |
| beq LOCAL_LABEL(invalid) // if so, go and return a NaN |
| |
| LOCAL_LABEL(infret): |
| // Here, we're either dividing infinity by a finite number, or dividing a |
| // nonzero number by 0. (Or both, if we're dividing infinity by 0.) In all |
| // these cases we return infinity with the sign from r2. |
| // |
| // If we were implementing IEEE exceptions, we'd have to separate these |
| // cases: infinity / finite is not an _exception_, it just returns infinity, |
| // whereas (finite and nonzero) / 0 is a division-by-zero exception. But here |
| // we're not implementing exceptions, so we can treat all three cases the |
| // same. |
| // |
| // r2 contains the output sign in bit 8, which is a convenient place to find |
| // it when making an infinity, because we can fill in the 8 exponent bits |
| // below that and then shift it left. |
| orr r2, r2, #0xff // sign + maximum exponent |
| lsl r0, r2, #23 // shift up to the top |
| bx lr |
| |
| LOCAL_LABEL(invalid): |
| // Return the default NaN, from an invalid operation (either dividing |
| // infinity by infinity, or 0 by 0). |
| ldr r0, =0x7FC00000 |
| bx lr |
| |
| // Finally, the lookup table for the initial reciprocal approximation. |
| // |
| // The table index is made from the top 7 bits of the denominator mantissa. But |
| // the topmost bit is always 1, so only the other 6 bits vary. So it only has |
| // 64 entries, not 128. |
| // |
| // Each table entry is a single byte, with its top bit set. So the table |
| // entries correspond to the reciprocal of a 7-bit mantissa prefix scaled up by |
| // 2^14, or the reciprocal of a whole 24-bit mantissa scaled up by 2^31. |
| // |
| // Each of these 64 entries corresponds to a large interval of possible |
| // mantissas. For example, if the top 7 bits are 1000001 then the overall |
| // mantissa could be anything from 0x820000 to 0x83FFFF. And because the output |
| // of this table provides more bits than the input, there are several choices |
| // of 8-bit reciprocal approximation for a number in that interval. The |
| // reciprocal of 0x820000 starts with 0xFC plus a fraction, and the reciprocal |
| // of 0x83FFFF starts with 0xF9 minus a fraction, so there are four reasonable |
| // choices for that table entry: F9, FA, FB or FC. Which do we pick? |
| // |
| // The table below is generated by choosing whichever value minimises the |
| // maximum possible error _after_ the approximation is improved by the |
| // Newton-Raphson step. In the example above, we end up with FA. |
| // |
| // The Python code below will regenerate the table, complete with the per-entry |
| // comments. |
| |
| /* |
| |
| for prefix in range(64, 128): |
| best = None |
| |
| # Max and min 23-bit mantissas with this 7-bit prefix |
| mmin, mmax = prefix * 2**17, (prefix + 1) * 2**17 - 1 |
| |
| # Max and min table entry corresponding to the reciprocal of something in |
| # that range of mantissas: round up the reciprocal of mmax, and round down |
| # the reciprocal of mmin. Also clamp to the range [0x80,0xff], because |
| # 0x100 can't be used as a table entry due to not fitting in a byte, even |
| # though it's the exact reciprocal of the overall-smallest mantissa |
| # 0x800000. |
| gmin = max(128, (2**31 + mmin - 1) // mmax) |
| gmax = min(255, 2**31 // mmin) |
| |
| # For each of those table entries, compute the result of starting from that |
| # value and doing a Newton-Raphson iteration, with the mantissa at each end |
| # of the mantissa interval. One of these will be the worst possible error. |
| # Choose the table entry whose worst error is as small as possible. |
| # |
| # (To find the extreme values of a more general function on an interval, |
| # you must consider its values not only at the interval endpoints but also |
| # any turning points within the interval. Here, the function has only one |
| # turning point, and by construction it takes value 0 there, so we needn't |
| # worry.) |
| g = max( |
| range(gmin, gmax + 1), |
| key=lambda g: min( |
| (g * (2**32 - d * g) / 2**23 - 2**39 / d) for d in [mmin, mmax] |
| ), |
| ) |
| |
| print(f" .byte 0x{g:02x} // input [0x{mmin:06x},0x{mmax:06x}]" |
| f", candidate outputs [0x{gmin:02x},0x{gmax:02x}]" |
| ) |
| |
| */ |
| |
| .p2align 2 // make sure we start on a 4-byte boundary, even in Thumb |
| LOCAL_LABEL(tab): |
| .byte 0xfe // input [0x800000,0x81ffff], candidate outputs [0xfd,0xff] |
| .byte 0xfa // input [0x820000,0x83ffff], candidate outputs [0xf9,0xfc] |
| .byte 0xf6 // input [0x840000,0x85ffff], candidate outputs [0xf5,0xf8] |
| .byte 0xf3 // input [0x860000,0x87ffff], candidate outputs [0xf1,0xf4] |
| .byte 0xef // input [0x880000,0x89ffff], candidate outputs [0xee,0xf0] |
| .byte 0xec // input [0x8a0000,0x8bffff], candidate outputs [0xeb,0xed] |
| .byte 0xe8 // input [0x8c0000,0x8dffff], candidate outputs [0xe7,0xea] |
| .byte 0xe5 // input [0x8e0000,0x8fffff], candidate outputs [0xe4,0xe6] |
| .byte 0xe2 // input [0x900000,0x91ffff], candidate outputs [0xe1,0xe3] |
| .byte 0xdf // input [0x920000,0x93ffff], candidate outputs [0xde,0xe0] |
| .byte 0xdc // input [0x940000,0x95ffff], candidate outputs [0xdb,0xdd] |
| .byte 0xd9 // input [0x960000,0x97ffff], candidate outputs [0xd8,0xda] |
| .byte 0xd6 // input [0x980000,0x99ffff], candidate outputs [0xd5,0xd7] |
| .byte 0xd3 // input [0x9a0000,0x9bffff], candidate outputs [0xd3,0xd4] |
| .byte 0xd1 // input [0x9c0000,0x9dffff], candidate outputs [0xd0,0xd2] |
| .byte 0xce // input [0x9e0000,0x9fffff], candidate outputs [0xcd,0xcf] |
| .byte 0xcc // input [0xa00000,0xa1ffff], candidate outputs [0xcb,0xcc] |
| .byte 0xc9 // input [0xa20000,0xa3ffff], candidate outputs [0xc8,0xca] |
| .byte 0xc7 // input [0xa40000,0xa5ffff], candidate outputs [0xc6,0xc7] |
| .byte 0xc4 // input [0xa60000,0xa7ffff], candidate outputs [0xc4,0xc5] |
| .byte 0xc2 // input [0xa80000,0xa9ffff], candidate outputs [0xc1,0xc3] |
| .byte 0xc0 // input [0xaa0000,0xabffff], candidate outputs [0xbf,0xc0] |
| .byte 0xbd // input [0xac0000,0xadffff], candidate outputs [0xbd,0xbe] |
| .byte 0xbb // input [0xae0000,0xafffff], candidate outputs [0xbb,0xbc] |
| .byte 0xb9 // input [0xb00000,0xb1ffff], candidate outputs [0xb9,0xba] |
| .byte 0xb7 // input [0xb20000,0xb3ffff], candidate outputs [0xb7,0xb8] |
| .byte 0xb5 // input [0xb40000,0xb5ffff], candidate outputs [0xb5,0xb6] |
| .byte 0xb3 // input [0xb60000,0xb7ffff], candidate outputs [0xb3,0xb4] |
| .byte 0xb1 // input [0xb80000,0xb9ffff], candidate outputs [0xb1,0xb2] |
| .byte 0xaf // input [0xba0000,0xbbffff], candidate outputs [0xaf,0xb0] |
| .byte 0xad // input [0xbc0000,0xbdffff], candidate outputs [0xad,0xae] |
| .byte 0xac // input [0xbe0000,0xbfffff], candidate outputs [0xab,0xac] |
| .byte 0xaa // input [0xc00000,0xc1ffff], candidate outputs [0xa9,0xaa] |
| .byte 0xa8 // input [0xc20000,0xc3ffff], candidate outputs [0xa8,0xa8] |
| .byte 0xa6 // input [0xc40000,0xc5ffff], candidate outputs [0xa6,0xa7] |
| .byte 0xa5 // input [0xc60000,0xc7ffff], candidate outputs [0xa4,0xa5] |
| .byte 0xa3 // input [0xc80000,0xc9ffff], candidate outputs [0xa3,0xa3] |
| .byte 0xa1 // input [0xca0000,0xcbffff], candidate outputs [0xa1,0xa2] |
| .byte 0xa0 // input [0xcc0000,0xcdffff], candidate outputs [0xa0,0xa0] |
| .byte 0x9e // input [0xce0000,0xcfffff], candidate outputs [0x9e,0x9f] |
| .byte 0x9d // input [0xd00000,0xd1ffff], candidate outputs [0x9d,0x9d] |
| .byte 0x9b // input [0xd20000,0xd3ffff], candidate outputs [0x9b,0x9c] |
| .byte 0x9a // input [0xd40000,0xd5ffff], candidate outputs [0x9a,0x9a] |
| .byte 0x98 // input [0xd60000,0xd7ffff], candidate outputs [0x98,0x99] |
| .byte 0x97 // input [0xd80000,0xd9ffff], candidate outputs [0x97,0x97] |
| .byte 0x96 // input [0xda0000,0xdbffff], candidate outputs [0x95,0x96] |
| .byte 0x94 // input [0xdc0000,0xddffff], candidate outputs [0x94,0x94] |
| .byte 0x93 // input [0xde0000,0xdfffff], candidate outputs [0x93,0x93] |
| .byte 0x92 // input [0xe00000,0xe1ffff], candidate outputs [0x91,0x92] |
| .byte 0x90 // input [0xe20000,0xe3ffff], candidate outputs [0x90,0x90] |
| .byte 0x8f // input [0xe40000,0xe5ffff], candidate outputs [0x8f,0x8f] |
| .byte 0x8e // input [0xe60000,0xe7ffff], candidate outputs [0x8e,0x8e] |
| .byte 0x8d // input [0xe80000,0xe9ffff], candidate outputs [0x8d,0x8d] |
| .byte 0x8b // input [0xea0000,0xebffff], candidate outputs [0x8b,0x8c] |
| .byte 0x8a // input [0xec0000,0xedffff], candidate outputs [0x8a,0x8a] |
| .byte 0x89 // input [0xee0000,0xefffff], candidate outputs [0x89,0x89] |
| .byte 0x88 // input [0xf00000,0xf1ffff], candidate outputs [0x88,0x88] |
| .byte 0x87 // input [0xf20000,0xf3ffff], candidate outputs [0x87,0x87] |
| .byte 0x86 // input [0xf40000,0xf5ffff], candidate outputs [0x86,0x86] |
| .byte 0x85 // input [0xf60000,0xf7ffff], candidate outputs [0x85,0x85] |
| .byte 0x84 // input [0xf80000,0xf9ffff], candidate outputs [0x84,0x84] |
| .byte 0x83 // input [0xfa0000,0xfbffff], candidate outputs [0x83,0x83] |
| .byte 0x82 // input [0xfc0000,0xfdffff], candidate outputs [0x82,0x82] |
| .byte 0x81 // input [0xfe0000,0xffffff], candidate outputs [0x80,0x81] |
| |
| END_COMPILERRT_FUNCTION(__aeabi_fdiv) |
| |
| NO_EXEC_STACK_DIRECTIVE |