Annotation of sys/arch/m68k/fpe/fpu_sqrt.c, Revision 1.1
1.1 ! nbrk 1: /* $OpenBSD: fpu_sqrt.c,v 1.5 2006/06/11 20:43:28 miod Exp $ */
! 2: /* $NetBSD: fpu_sqrt.c,v 1.4 2003/08/07 16:28:12 agc Exp $ */
! 3:
! 4: /*
! 5: * Copyright (c) 1992, 1993
! 6: * The Regents of the University of California. All rights reserved.
! 7: *
! 8: * This software was developed by the Computer Systems Engineering group
! 9: * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
! 10: * contributed to Berkeley.
! 11: *
! 12: * All advertising materials mentioning features or use of this software
! 13: * must display the following acknowledgement:
! 14: * This product includes software developed by the University of
! 15: * California, Lawrence Berkeley Laboratory.
! 16: *
! 17: * Redistribution and use in source and binary forms, with or without
! 18: * modification, are permitted provided that the following conditions
! 19: * are met:
! 20: * 1. Redistributions of source code must retain the above copyright
! 21: * notice, this list of conditions and the following disclaimer.
! 22: * 2. Redistributions in binary form must reproduce the above copyright
! 23: * notice, this list of conditions and the following disclaimer in the
! 24: * documentation and/or other materials provided with the distribution.
! 25: * 3. Neither the name of the University nor the names of its contributors
! 26: * may be used to endorse or promote products derived from this software
! 27: * without specific prior written permission.
! 28: *
! 29: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
! 30: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
! 31: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
! 32: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
! 33: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
! 34: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
! 35: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
! 36: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
! 37: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
! 38: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
! 39: * SUCH DAMAGE.
! 40: *
! 41: * @(#)fpu_sqrt.c 8.1 (Berkeley) 6/11/93
! 42: */
! 43:
! 44: /*
! 45: * Perform an FPU square root (return sqrt(x)).
! 46: */
! 47:
! 48: #include <sys/types.h>
! 49:
! 50: #include <machine/reg.h>
! 51:
! 52: #include <m68k/fpe/fpu_arith.h>
! 53: #include <m68k/fpe/fpu_emulate.h>
! 54:
! 55: /*
! 56: * Our task is to calculate the square root of a floating point number x0.
! 57: * This number x normally has the form:
! 58: *
! 59: * exp
! 60: * x = mant * 2 (where 1 <= mant < 2 and exp is an integer)
! 61: *
! 62: * This can be left as it stands, or the mantissa can be doubled and the
! 63: * exponent decremented:
! 64: *
! 65: * exp-1
! 66: * x = (2 * mant) * 2 (where 2 <= 2 * mant < 4)
! 67: *
! 68: * If the exponent `exp' is even, the square root of the number is best
! 69: * handled using the first form, and is by definition equal to:
! 70: *
! 71: * exp/2
! 72: * sqrt(x) = sqrt(mant) * 2
! 73: *
! 74: * If exp is odd, on the other hand, it is convenient to use the second
! 75: * form, giving:
! 76: *
! 77: * (exp-1)/2
! 78: * sqrt(x) = sqrt(2 * mant) * 2
! 79: *
! 80: * In the first case, we have
! 81: *
! 82: * 1 <= mant < 2
! 83: *
! 84: * and therefore
! 85: *
! 86: * sqrt(1) <= sqrt(mant) < sqrt(2)
! 87: *
! 88: * while in the second case we have
! 89: *
! 90: * 2 <= 2*mant < 4
! 91: *
! 92: * and therefore
! 93: *
! 94: * sqrt(2) <= sqrt(2*mant) < sqrt(4)
! 95: *
! 96: * so that in any case, we are sure that
! 97: *
! 98: * sqrt(1) <= sqrt(n * mant) < sqrt(4), n = 1 or 2
! 99: *
! 100: * or
! 101: *
! 102: * 1 <= sqrt(n * mant) < 2, n = 1 or 2.
! 103: *
! 104: * This root is therefore a properly formed mantissa for a floating
! 105: * point number. The exponent of sqrt(x) is either exp/2 or (exp-1)/2
! 106: * as above. This leaves us with the problem of finding the square root
! 107: * of a fixed-point number in the range [1..4).
! 108: *
! 109: * Though it may not be instantly obvious, the following square root
! 110: * algorithm works for any integer x of an even number of bits, provided
! 111: * that no overflows occur:
! 112: *
! 113: * let q = 0
! 114: * for k = NBITS-1 to 0 step -1 do -- for each digit in the answer...
! 115: * x *= 2 -- multiply by radix, for next digit
! 116: * if x >= 2q + 2^k then -- if adding 2^k does not
! 117: * x -= 2q + 2^k -- exceed the correct root,
! 118: * q += 2^k -- add 2^k and adjust x
! 119: * fi
! 120: * done
! 121: * sqrt = q / 2^(NBITS/2) -- (and any remainder is in x)
! 122: *
! 123: * If NBITS is odd (so that k is initially even), we can just add another
! 124: * zero bit at the top of x. Doing so means that q is not going to acquire
! 125: * a 1 bit in the first trip around the loop (since x0 < 2^NBITS). If the
! 126: * final value in x is not needed, or can be off by a factor of 2, this is
! 127: * equivalant to moving the `x *= 2' step to the bottom of the loop:
! 128: *
! 129: * for k = NBITS-1 to 0 step -1 do if ... fi; x *= 2; done
! 130: *
! 131: * and the result q will then be sqrt(x0) * 2^floor(NBITS / 2).
! 132: * (Since the algorithm is destructive on x, we will call x's initial
! 133: * value, for which q is some power of two times its square root, x0.)
! 134: *
! 135: * If we insert a loop invariant y = 2q, we can then rewrite this using
! 136: * C notation as:
! 137: *
! 138: * q = y = 0; x = x0;
! 139: * for (k = NBITS; --k >= 0;) {
! 140: * #if (NBITS is even)
! 141: * x *= 2;
! 142: * #endif
! 143: * t = y + (1 << k);
! 144: * if (x >= t) {
! 145: * x -= t;
! 146: * q += 1 << k;
! 147: * y += 1 << (k + 1);
! 148: * }
! 149: * #if (NBITS is odd)
! 150: * x *= 2;
! 151: * #endif
! 152: * }
! 153: *
! 154: * If x0 is fixed point, rather than an integer, we can simply alter the
! 155: * scale factor between q and sqrt(x0). As it happens, we can easily arrange
! 156: * for the scale factor to be 2**0 or 1, so that sqrt(x0) == q.
! 157: *
! 158: * In our case, however, x0 (and therefore x, y, q, and t) are multiword
! 159: * integers, which adds some complication. But note that q is built one
! 160: * bit at a time, from the top down, and is not used itself in the loop
! 161: * (we use 2q as held in y instead). This means we can build our answer
! 162: * in an integer, one word at a time, which saves a bit of work. Also,
! 163: * since 1 << k is always a `new' bit in q, 1 << k and 1 << (k+1) are
! 164: * `new' bits in y and we can set them with an `or' operation rather than
! 165: * a full-blown multiword add.
! 166: *
! 167: * We are almost done, except for one snag. We must prove that none of our
! 168: * intermediate calculations can overflow. We know that x0 is in [1..4)
! 169: * and therefore the square root in q will be in [1..2), but what about x,
! 170: * y, and t?
! 171: *
! 172: * We know that y = 2q at the beginning of each loop. (The relation only
! 173: * fails temporarily while y and q are being updated.) Since q < 2, y < 4.
! 174: * The sum in t can, in our case, be as much as y+(1<<1) = y+2 < 6, and.
! 175: * Furthermore, we can prove with a bit of work that x never exceeds y by
! 176: * more than 2, so that even after doubling, 0 <= x < 8. (This is left as
! 177: * an exercise to the reader, mostly because I have become tired of working
! 178: * on this comment.)
! 179: *
! 180: * If our floating point mantissas (which are of the form 1.frac) occupy
! 181: * B+1 bits, our largest intermediary needs at most B+3 bits, or two extra.
! 182: * In fact, we want even one more bit (for a carry, to avoid compares), or
! 183: * three extra. There is a comment in fpu_emu.h reminding maintainers of
! 184: * this, so we have some justification in assuming it.
! 185: */
! 186: struct fpn *
! 187: fpu_sqrt(fe)
! 188: struct fpemu *fe;
! 189: {
! 190: struct fpn *x = &fe->fe_f2;
! 191: u_int bit, q, tt;
! 192: u_int x0, x1, x2;
! 193: u_int y0, y1, y2;
! 194: u_int d0, d1, d2;
! 195: int e;
! 196: FPU_DECL_CARRY
! 197:
! 198: /*
! 199: * Take care of special cases first. In order:
! 200: *
! 201: * sqrt(NaN) = NaN
! 202: * sqrt(+0) = +0
! 203: * sqrt(-0) = -0
! 204: * sqrt(x < 0) = NaN (including sqrt(-Inf))
! 205: * sqrt(+Inf) = +Inf
! 206: *
! 207: * Then all that remains are numbers with mantissas in [1..2).
! 208: */
! 209: if (ISNAN(x) || ISZERO(x))
! 210: return (x);
! 211: if (x->fp_sign)
! 212: return (fpu_newnan(fe));
! 213: if (ISINF(x))
! 214: return (x);
! 215:
! 216: /*
! 217: * Calculate result exponent. As noted above, this may involve
! 218: * doubling the mantissa. We will also need to double x each
! 219: * time around the loop, so we define a macro for this here, and
! 220: * we break out the multiword mantissa.
! 221: */
! 222: #ifdef FPU_SHL1_BY_ADD
! 223: #define DOUBLE_X { \
! 224: FPU_ADDS(x2, x2, x2); \
! 225: FPU_ADDCS(x1, x1, x1); FPU_ADDC(x0, x0, x0); \
! 226: }
! 227: #else
! 228: #define DOUBLE_X { \
! 229: x0 = (x0 << 1) | (x1 >> 31); x1 = (x1 << 1) | (x2 >> 31); \
! 230: x2 <<= 1; \
! 231: }
! 232: #endif
! 233: #if (FP_NMANT & 1) != 0
! 234: # define ODD_DOUBLE DOUBLE_X
! 235: # define EVEN_DOUBLE /* nothing */
! 236: #else
! 237: # define ODD_DOUBLE /* nothing */
! 238: # define EVEN_DOUBLE DOUBLE_X
! 239: #endif
! 240: x0 = x->fp_mant[0];
! 241: x1 = x->fp_mant[1];
! 242: x2 = x->fp_mant[2];
! 243: e = x->fp_exp;
! 244: if (e & 1) /* exponent is odd; use sqrt(2mant) */
! 245: DOUBLE_X;
! 246: /* THE FOLLOWING ASSUMES THAT RIGHT SHIFT DOES SIGN EXTENSION */
! 247: x->fp_exp = e >> 1; /* calculates (e&1 ? (e-1)/2 : e/2 */
! 248:
! 249: /*
! 250: * Now calculate the mantissa root. Since x is now in [1..4),
! 251: * we know that the first trip around the loop will definitely
! 252: * set the top bit in q, so we can do that manually and start
! 253: * the loop at the next bit down instead. We must be sure to
! 254: * double x correctly while doing the `known q=1.0'.
! 255: *
! 256: * We do this one mantissa-word at a time, as noted above, to
! 257: * save work. To avoid `(1 << 31) << 1', we also do the top bit
! 258: * outside of each per-word loop.
! 259: *
! 260: * The calculation `t = y + bit' breaks down into `t0 = y0, ...,
! 261: * t2 = y2, t? |= bit' for the appropriate word. Since the bit
! 262: * is always a `new' one, this means that three of the `t?'s are
! 263: * just the corresponding `y?'; we use `#define's here for this.
! 264: * The variable `tt' holds the actual `t?' variable.
! 265: */
! 266:
! 267: /* calculate q0 */
! 268: #define t0 tt
! 269: bit = FP_1;
! 270: EVEN_DOUBLE;
! 271: /* if (x >= (t0 = y0 | bit)) { */ /* always true */
! 272: q = bit;
! 273: x0 -= bit;
! 274: y0 = bit << 1;
! 275: /* } */
! 276: ODD_DOUBLE;
! 277: while ((bit >>= 1) != 0) { /* for remaining bits in q0 */
! 278: EVEN_DOUBLE;
! 279: t0 = y0 | bit; /* t = y + bit */
! 280: if (x0 >= t0) { /* if x >= t then */
! 281: x0 -= t0; /* x -= t */
! 282: q |= bit; /* q += bit */
! 283: y0 |= bit << 1; /* y += bit << 1 */
! 284: }
! 285: ODD_DOUBLE;
! 286: }
! 287: x->fp_mant[0] = q;
! 288: #undef t0
! 289:
! 290: /* calculate q1. note (y0&1)==0. */
! 291: #define t0 y0
! 292: #define t1 tt
! 293: q = 0;
! 294: y1 = 0;
! 295: bit = 1 << 31;
! 296: EVEN_DOUBLE;
! 297: t1 = bit;
! 298: FPU_SUBS(d1, x1, t1);
! 299: FPU_SUBC(d0, x0, t0); /* d = x - t */
! 300: if ((int)d0 >= 0) { /* if d >= 0 (i.e., x >= t) then */
! 301: x0 = d0, x1 = d1; /* x -= t */
! 302: q = bit; /* q += bit */
! 303: y0 |= 1; /* y += bit << 1 */
! 304: }
! 305: ODD_DOUBLE;
! 306: while ((bit >>= 1) != 0) { /* for remaining bits in q1 */
! 307: EVEN_DOUBLE; /* as before */
! 308: t1 = y1 | bit;
! 309: FPU_SUBS(d1, x1, t1);
! 310: FPU_SUBC(d0, x0, t0);
! 311: if ((int)d0 >= 0) {
! 312: x0 = d0, x1 = d1;
! 313: q |= bit;
! 314: y1 |= bit << 1;
! 315: }
! 316: ODD_DOUBLE;
! 317: }
! 318: x->fp_mant[1] = q;
! 319: #undef t1
! 320:
! 321: /* calculate q2. note (y1&1)==0; y0 (aka t0) is fixed. */
! 322: #define t1 y1
! 323: #define t2 tt
! 324: q = 0;
! 325: y2 = 0;
! 326: bit = 1 << 31;
! 327: EVEN_DOUBLE;
! 328: t2 = bit;
! 329: FPU_SUBS(d2, x2, t2);
! 330: FPU_SUBCS(d1, x1, t1);
! 331: FPU_SUBC(d0, x0, t0);
! 332: if ((int)d0 >= 0) {
! 333: x0 = d0, x1 = d1, x2 = d2;
! 334: q |= bit;
! 335: y1 |= 1; /* now t1, y1 are set in concrete */
! 336: }
! 337: ODD_DOUBLE;
! 338: while ((bit >>= 1) != 0) {
! 339: EVEN_DOUBLE;
! 340: t2 = y2 | bit;
! 341: FPU_SUBS(d2, x2, t2);
! 342: FPU_SUBCS(d1, x1, t1);
! 343: FPU_SUBC(d0, x0, t0);
! 344: if ((int)d0 >= 0) {
! 345: x0 = d0, x1 = d1, x2 = d2;
! 346: q |= bit;
! 347: y2 |= bit << 1;
! 348: }
! 349: ODD_DOUBLE;
! 350: }
! 351: x->fp_mant[2] = q;
! 352: #undef t2
! 353:
! 354: /*
! 355: * The result, which includes guard and round bits, is exact iff
! 356: * x is now zero; any nonzero bits in x represent sticky bits.
! 357: */
! 358: x->fp_sticky = x0 | x1 | x2;
! 359: return (x);
! 360: }
CVSweb