Annotation of sys/lib/libkern/arch/sparc/umul.S, Revision 1.1.1.1
1.1 nbrk 1: /* $OpenBSD: umul.S,v 1.5 2003/06/02 23:28:09 millert Exp $ */
2: /* $NetBSD: umul.S,v 1.2 1994/10/26 06:40:10 cgd Exp $ */
3:
4: /*
5: * Copyright (c) 1992, 1993
6: * The Regents of the University of California. All rights reserved.
7: *
8: * This software was developed by the Computer Systems Engineering group
9: * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
10: * contributed to Berkeley.
11: *
12: * Redistribution and use in source and binary forms, with or without
13: * modification, are permitted provided that the following conditions
14: * are met:
15: * 1. Redistributions of source code must retain the above copyright
16: * notice, this list of conditions and the following disclaimer.
17: * 2. Redistributions in binary form must reproduce the above copyright
18: * notice, this list of conditions and the following disclaimer in the
19: * documentation and/or other materials provided with the distribution.
20: * 3. Neither the name of the University nor the names of its contributors
21: * may be used to endorse or promote products derived from this software
22: * without specific prior written permission.
23: *
24: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34: * SUCH DAMAGE.
35: *
36: * Header: umul.s,v 1.4 92/06/25 13:24:05 torek Exp
37: */
38:
39: #if defined(LIBC_SCCS) && !defined(lint) /* emit version id strings only for LIBC_SCCS, non-lint builds */
40: #ifdef notdef /* presumably never defined, leaving the Berkeley SCCS id disabled -- TODO confirm */
41: .asciz "@(#)umul.s 8.1 (Berkeley) 6/4/93"
42: #endif /* notdef */
43: .asciz "$OpenBSD: umul.S,v 1.5 2003/06/02 23:28:09 millert Exp $"
44: #endif /* LIBC_SCCS and not lint */
45:
46: /*
47: * Unsigned multiply. Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
48: * upper 32 bits of the 64-bit product).
49: *
50: * This code optimizes short (less than 13-bit) multiplies. Short
51: * multiplies require 25 instruction cycles, and long ones require
52: * 45 instruction cycles.
53: *
54: * On return, overflow has occurred (%o1 is not zero) if and only if
55: * the Z condition code is clear, allowing, e.g., the following:
56: *
57: * call .umul
58: * nop
59: * bnz overflow (or tnz)
60: */
61:
62: #include "DEFS.h"
63: FUNC(.umul) /* unsigned 32x32 -> 64-bit multiply: %o1:%o0 = %o0 * %o1 */
64: or %o0, %o1, %o4 ! %o4 = bits set in either argument
65: mov %o0, %y ! multiplier -> Y
66: andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args
67: be Lmul_shortway ! if zero, can do it the short way
68: andcc %g0, %g0, %o4 ! (delay slot) zero the partial product and clear N and V
69:

70: /*
71: * Long multiply. 32 steps, followed by a final shift step.
72: */
73: mulscc %o4, %o1, %o4 ! 1
74: mulscc %o4, %o1, %o4 ! 2
75: mulscc %o4, %o1, %o4 ! 3
76: mulscc %o4, %o1, %o4 ! 4
77: mulscc %o4, %o1, %o4 ! 5
78: mulscc %o4, %o1, %o4 ! 6
79: mulscc %o4, %o1, %o4 ! 7
80: mulscc %o4, %o1, %o4 ! 8
81: mulscc %o4, %o1, %o4 ! 9
82: mulscc %o4, %o1, %o4 ! 10
83: mulscc %o4, %o1, %o4 ! 11
84: mulscc %o4, %o1, %o4 ! 12
85: mulscc %o4, %o1, %o4 ! 13
86: mulscc %o4, %o1, %o4 ! 14
87: mulscc %o4, %o1, %o4 ! 15
88: mulscc %o4, %o1, %o4 ! 16
89: mulscc %o4, %o1, %o4 ! 17
90: mulscc %o4, %o1, %o4 ! 18
91: mulscc %o4, %o1, %o4 ! 19
92: mulscc %o4, %o1, %o4 ! 20
93: mulscc %o4, %o1, %o4 ! 21
94: mulscc %o4, %o1, %o4 ! 22
95: mulscc %o4, %o1, %o4 ! 23
96: mulscc %o4, %o1, %o4 ! 24
97: mulscc %o4, %o1, %o4 ! 25
98: mulscc %o4, %o1, %o4 ! 26
99: mulscc %o4, %o1, %o4 ! 27
100: mulscc %o4, %o1, %o4 ! 28
101: mulscc %o4, %o1, %o4 ! 29
102: mulscc %o4, %o1, %o4 ! 30
103: mulscc %o4, %o1, %o4 ! 31
104: mulscc %o4, %o1, %o4 ! 32
105: mulscc %o4, %g0, %o4 ! final shift
106:
107:
108: /*
109: * Normally, with the shift-and-add approach, if both numbers are
110: * positive you get the correct result. With 32-bit two's-complement
111: * numbers, -x is represented as
112: *
113: *         x               32
114: * ( 2 - ------ ) mod 2 * 2
115: *          32
116: *         2
117: *
118: * (the `mod 2' subtracts 1 from 1.bbbb). To avoid lots of 2^32s,
119: * we can treat this as if the radix point were just to the left
120: * of the sign bit (multiply by 2^32), and get
121: *
122: * -x = (2 - x) mod 2
123: *
124: * Then, ignoring the `mod 2's for convenience:
125: *
126: * x * y = xy
127: * -x * y = 2y - xy
128: * x * -y = 2x - xy
129: * -x * -y = 4 - 2x - 2y + xy
130: *
131: * For signed multiplies, we subtract (x << 32) from the partial
132: * product to fix this problem for negative multipliers (see mul.s).
133: * Because of the way the shift into the partial product is calculated
134: * (N xor V), this term is automatically removed for the multiplicand,
135: * so we don't have to adjust.
136: *
137: * But for unsigned multiplies, the high order bit wasn't a sign bit,
138: * and the correction is wrong. So for unsigned multiplies where the
139: * high order bit is one, we end up with xy - (y << 32). To fix it
140: * we add y << 32.
141: */
142: tst %o1 ! set N from bit 31 of the multiplicand
143: bl,a 1f ! if %o1 < 0 (high order bit = 1),
144: add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half); annulled if branch not taken
145: 1: rd %y, %o0 ! get lower half of product
146: retl ! return; the addcc below executes in the delay slot
147: addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0
148:
149: Lmul_shortway:
150: /*
151: * Short multiply. 12 steps, followed by a final shift step.
152: * The resulting bits are off by 12 and (32-12) = 20 bit positions,
153: * but there is no problem with an operand being negative (unlike above),
154: * and overflow is impossible (the answer is at most 24 bits long).
155: */
156: mulscc %o4, %o1, %o4 ! 1
157: mulscc %o4, %o1, %o4 ! 2
158: mulscc %o4, %o1, %o4 ! 3
159: mulscc %o4, %o1, %o4 ! 4
160: mulscc %o4, %o1, %o4 ! 5
161: mulscc %o4, %o1, %o4 ! 6
162: mulscc %o4, %o1, %o4 ! 7
163: mulscc %o4, %o1, %o4 ! 8
164: mulscc %o4, %o1, %o4 ! 9
165: mulscc %o4, %o1, %o4 ! 10
166: mulscc %o4, %o1, %o4 ! 11
167: mulscc %o4, %o1, %o4 ! 12
168: mulscc %o4, %g0, %o4 ! final shift
169:
170: /*
171: * %o4 has 20 of the bits that should be in the result; %y has
172: * the bottom 12 (as %y's top 12). That is:
173: *
174: *        %o4               %y
175: * +----------------+----------------+
176: * | -12- |   -20-  | -12- |   -20-  |
177: * +------(---------+------)---------+
178: *         -----result-----
179: *
180: * The 12 bits of %o4 left of the `result' area are all zero;
181: * in fact, all top 20 bits of %o4 are zero.
182: */
183:
184: rd %y, %o5 ! %o5 = %y (bottom 12 result bits sit in its top 12)
185: sll %o4, 12, %o0 ! shift middle bits left 12
186: srl %o5, 20, %o5 ! shift low bits right 20
187: or %o5, %o0, %o0 ! %o0 = assembled low word of the product
188: retl ! return; the addcc below executes in the delay slot
189: addcc %g0, %g0, %o1 ! %o1 = zero, and set Z
CVSweb