Annotation of funnyos/libkern/memcpy.S, Revision 1.1.1.1
1.1 init 1: /* $Id: memcpy.S,v 1.1.1.1 2007/10/12 08:40:43 init Exp $ */
2: /* $OpenBSD: memcpy.S,v 1.2 2004/02/01 05:47:10 drahn Exp $ */
3: /* $NetBSD: memcpy.S,v 1.2 2001/11/20 00:29:20 chris Exp $ */
4:
5: /*-
6: * Copyright (c) 1997 The NetBSD Foundation, Inc.
7: * All rights reserved.
8: *
9: * This code is derived from software contributed to The NetBSD Foundation
10: * by Neil A. Carson and Mark Brinicombe
11: *
12: * Redistribution and use in source and binary forms, with or without
13: * modification, are permitted provided that the following conditions
14: * are met:
15: * 1. Redistributions of source code must retain the above copyright
16: * notice, this list of conditions and the following disclaimer.
17: * 2. Redistributions in binary form must reproduce the above copyright
18: * notice, this list of conditions and the following disclaimer in the
19: * documentation and/or other materials provided with the distribution.
20: * 3. All advertising materials mentioning features or use of this software
21: * must display the following acknowledgement:
22: * This product includes software developed by the NetBSD
23: * Foundation, Inc. and its contributors.
24: * 4. Neither the name of The NetBSD Foundation nor the names of its
25: * contributors may be used to endorse or promote products derived
26: * from this software without specific prior written permission.
27: *
28: * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
29: * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
30: * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
31: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
32: * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33: * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34: * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38: * POSSIBILITY OF SUCH DAMAGE.
39: */
40:
41: #include <libkern/asm.h>
42:
43: /*
44: * This is one fun bit of code ...
45: * Some easy listening music is suggested while trying to understand this
46: * code e.g. Iron Maiden
47: *
48: * For anyone attempting to understand it :
49: *
50: * The core code is implemented here with simple stubs for memcpy()
51: * memmove() and bcopy().
52: *
53: * All local labels are prefixed with Lmemcpy_
54: * Following the prefix a label starting f is used in the forward copy code
55: * while a label using b is used in the backwards copy code
56: * The source and destination addresses determine whether a forward or
57: * backward copy is performed.
58: * Separate bits of code are used to deal with the following situations
59: * for both the forward and backwards copy.
60: * unaligned source address
61: * unaligned destination address
62: * Separate copy routines are used to produce an optimised result for each
63: * of these cases.
64: * The copy code will use LDM/STM instructions to copy up to 32 bytes at
65: * a time where possible.
66: *
67: * Note: r12 (aka ip) can be trashed during the function along with
68: * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
69: * Additional registers are preserved prior to use i.e. r4, r5 & lr
70: *
71: * Apologies for the state of the comments ;-)
72: */
73:
74: ENTRY(memcpy)
75: ENTRY_NP(memmove)
76: /* Determine copy direction (r0 = dest, r1 = src, r2 = len) */
77: cmp r1, r0 /* EQ: src == dest, CC: src below dest (unsigned) */
78: /* Quick exit for src == dest: there is nothing to copy, and r0 is */
79: /* left untouched because memcpy()/memmove() must return dest */
80: #ifdef __APCS_26__
81: moveqs pc, lr
82: #else
83: moveq pc, lr
84: #endif
85:
86: /* save leaf functions having to store this away */
87: stmdb sp!, {r0, lr} /* memcpy() returns dest addr */
88: /* flags from the cmp above are still live: nothing since has set them */
89: bcc Lmemcpy_backwards /* src < dest: use the backwards copy */
90:
91: /* start of forwards copy; from here on r2 = bytes remaining - 4 */
92: subs r2, r2, #4
93: blt Lmemcpy_fl4 /* less than 4 bytes */
94: ands r12, r0, #3 /* r12 = dest & 3 */
95: bne Lmemcpy_fdestul /* oh unaligned destination addr */
96: ands r12, r1, #3 /* r12 = src & 3 */
97: bne Lmemcpy_fsrcul /* oh unaligned source addr */
98:
99: Lmemcpy_ft8: /* forward copy, src and dest both word aligned; r2 = bytes remaining - 4 */
100: /* We have aligned source and destination */
101: subs r2, r2, #8 /* r2 = bytes remaining - 12 */
102: blt Lmemcpy_fl12 /* less than 12 bytes (4 from above) */
103: subs r2, r2, #0x14 /* r2 = bytes remaining - 32 */
104: blt Lmemcpy_fl32 /* less than 32 bytes (12 from above) */
105: stmdb sp!, {r4} /* borrow r4 */
106:
107: /* blat 32 bytes at a time */
108: /* XXX for really big copies perhaps we should use more registers */
109: Lmemcpy_floop32:
110: ldmia r1!, {r3, r4, r12, lr} /* first 16 bytes of this pass */
111: stmia r0!, {r3, r4, r12, lr}
112: ldmia r1!, {r3, r4, r12, lr} /* second 16 bytes of this pass */
113: stmia r0!, {r3, r4, r12, lr}
114: subs r2, r2, #0x20
115: bge Lmemcpy_floop32 /* loop while at least 32 bytes remain */
116:
117: cmn r2, #0x10 /* GE when r2 + 16 >= 0, i.e. at least 16 bytes remain */
118: ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
119: stmgeia r0!, {r3, r4, r12, lr}
120: subge r2, r2, #0x10
121: ldmia sp!, {r4} /* return r4 */
122:
123: Lmemcpy_fl32:
124: adds r2, r2, #0x14 /* r2 = bytes remaining - 12; flags gate the loop below */
125:
126: /* blat 12 bytes at a time */
127: Lmemcpy_floop12:
128: ldmgeia r1!, {r3, r12, lr} /* conditional: skipped when fewer than 12 bytes remain */
129: stmgeia r0!, {r3, r12, lr}
130: subges r2, r2, #0x0c /* flags only updated when a copy was done */
131: bge Lmemcpy_floop12
132:
133: Lmemcpy_fl12:
134: adds r2, r2, #8 /* r2 = bytes remaining - 4 */
135: blt Lmemcpy_fl4 /* fewer than 4 bytes remain */
136:
137: subs r2, r2, #4 /* LT: 4-7 bytes remain, GE: 8-11 bytes remain */
138: ldrlt r3, [r1], #4 /* LT: copy a single word */
139: strlt r3, [r0], #4
140: ldmgeia r1!, {r3, r12} /* GE: copy two words */
141: stmgeia r0!, {r3, r12}
142: subge r2, r2, #4 /* either way r2 = bytes remaining - 4 for Lmemcpy_fl4 */
143:
144: Lmemcpy_fl4: /* forward tail: on entry r2 = bytes remaining - 4 */
145: /* less than 4 bytes to go */
146: adds r2, r2, #4 /* r2 = bytes remaining (0-3); EQ when nothing is left */
147: #ifdef __APCS_26__
148: ldmeqia sp!, {r0, pc}^ /* done */
149: #else
150: ldmeqia sp!, {r0, pc} /* done */
151: #endif
152: /* copy the crud byte at a time */
153: cmp r2, #2 /* LT: 1 byte, EQ: 2 bytes, GT: 3 bytes */
154: ldrb r3, [r1], #1 /* always: at least one byte remains here */
155: strb r3, [r0], #1
156: ldrgeb r3, [r1], #1 /* second byte when r2 >= 2 */
157: strgeb r3, [r0], #1
158: ldrgtb r3, [r1], #1 /* third byte when r2 == 3 */
159: strgtb r3, [r0], #1
160: #ifdef __APCS_26__
161: ldmia sp!, {r0, pc}^
162: #else
163: ldmia sp!, {r0, pc} /* pop saved dest into r0 and return */
164: #endif
165:
166: /* erg - unaligned destination */
167: Lmemcpy_fdestul: /* forward copy, dest not word aligned; r12 = dest & 3 (1-3) */
168: rsb r12, r12, #4 /* r12 = 4 - (dest & 3) = bytes needed to align dest */
169: cmp r12, #2 /* LT: 1 byte, EQ: 2 bytes, GT: 3 bytes */
170:
171: /* align destination with byte copies */
172: ldrb r3, [r1], #1
173: strb r3, [r0], #1
174: ldrgeb r3, [r1], #1
175: strgeb r3, [r0], #1
176: ldrgtb r3, [r1], #1
177: strgtb r3, [r0], #1
178: subs r2, r2, r12 /* account for the bytes just copied */
179: blt Lmemcpy_fl4 /* less than 4 bytes */
180:
181: ands r12, r1, #3 /* r12 = src & 3 now that dest is aligned */
182: beq Lmemcpy_ft8 /* we have an aligned source */
183:
184: /* erg - unaligned source */
185: /* This is where it gets nasty ... */
186: Lmemcpy_fsrcul: /* forward copy, dest aligned, src not; r12 = src & 3, r2 = bytes remaining - 4 */
187: bic r1, r1, #3 /* round src down to a word boundary */
188: ldr lr, [r1], #4 /* lr = word holding the first source bytes; r1 -> next word */
189: cmp r12, #2 /* dispatch on the source offset within the word */
190: bgt Lmemcpy_fsrcul3 /* offset 3 */
191: beq Lmemcpy_fsrcul2 /* offset 2 */
192: cmp r2, #0x0c /* offset 1 falls through to here */
193: blt Lmemcpy_fsrcul1loop4 /* fewer than 16 bytes: word-at-a-time loop only */
194: sub r2, r2, #0x0c /* r2 = bytes remaining - 16 */
195: stmdb sp!, {r4, r5} /* borrow r4 and r5 for the 16-byte loop */
196: /* NOTE(review): the lsr/lsl pairing looks little-endian only - confirm */
197: Lmemcpy_fsrcul1loop16:
198: mov r3, lr, lsr #8 /* upper 3 bytes of the previously loaded word */
199: ldmia r1!, {r4, r5, r12, lr} /* next four aligned source words */
200: orr r3, r3, r4, lsl #24 /* completed with the low byte of the next word */
201: mov r4, r4, lsr #8
202: orr r4, r4, r5, lsl #24
203: mov r5, r5, lsr #8
204: orr r5, r5, r12, lsl #24
205: mov r12, r12, lsr #8
206: orr r12, r12, lr, lsl #24 /* lr is kept as the carry-over word for the next pass */
207: stmia r0!, {r3-r5, r12} /* store four merged words */
208: subs r2, r2, #0x10
209: bge Lmemcpy_fsrcul1loop16
210: ldmia sp!, {r4, r5} /* return r4 and r5 */
211: adds r2, r2, #0x0c /* r2 = bytes remaining - 4 */
212: blt Lmemcpy_fsrcul1l4 /* fewer than 4 bytes remain */
213:
214: Lmemcpy_fsrcul1loop4:
215: mov r12, lr, lsr #8 /* same merge as above, one word per pass */
216: ldr lr, [r1], #4
217: orr r12, r12, lr, lsl #24
218: str r12, [r0], #4
219: subs r2, r2, #4
220: bge Lmemcpy_fsrcul1loop4
221:
222: Lmemcpy_fsrcul1l4:
223: sub r1, r1, #3 /* step r1 back over the 3 bytes of lr not yet stored */
224: b Lmemcpy_fl4 /* finish with the byte-at-a-time tail */
225:
226: Lmemcpy_fsrcul2: /* forward copy, src offset 2 within its word; lr = first source word, r2 = bytes remaining - 4 */
227: cmp r2, #0x0c
228: blt Lmemcpy_fsrcul2loop4 /* fewer than 16 bytes: word-at-a-time loop only */
229: sub r2, r2, #0x0c /* r2 = bytes remaining - 16 */
230: stmdb sp!, {r4, r5} /* borrow r4 and r5 for the 16-byte loop */
231: /* NOTE(review): the lsr/lsl pairing looks little-endian only - confirm */
232: Lmemcpy_fsrcul2loop16:
233: mov r3, lr, lsr #16 /* upper halfword of the previously loaded word */
234: ldmia r1!, {r4, r5, r12, lr} /* next four aligned source words */
235: orr r3, r3, r4, lsl #16 /* completed with the lower halfword of the next word */
236: mov r4, r4, lsr #16
237: orr r4, r4, r5, lsl #16
238: mov r5, r5, lsr #16
239: orr r5, r5, r12, lsl #16
240: mov r12, r12, lsr #16
241: orr r12, r12, lr, lsl #16 /* lr is kept as the carry-over word for the next pass */
242: stmia r0!, {r3-r5, r12} /* store four merged words */
243: subs r2, r2, #0x10
244: bge Lmemcpy_fsrcul2loop16
245: ldmia sp!, {r4, r5} /* return r4 and r5 */
246: adds r2, r2, #0x0c /* r2 = bytes remaining - 4 */
247: blt Lmemcpy_fsrcul2l4 /* fewer than 4 bytes remain */
248:
249: Lmemcpy_fsrcul2loop4:
250: mov r12, lr, lsr #16 /* same merge as above, one word per pass */
251: ldr lr, [r1], #4
252: orr r12, r12, lr, lsl #16
253: str r12, [r0], #4
254: subs r2, r2, #4
255: bge Lmemcpy_fsrcul2loop4
256:
257: Lmemcpy_fsrcul2l4:
258: sub r1, r1, #2 /* step r1 back over the 2 bytes of lr not yet stored */
259: b Lmemcpy_fl4 /* finish with the byte-at-a-time tail */
260:
261: Lmemcpy_fsrcul3: /* forward copy, src offset 3 within its word; lr = first source word, r2 = bytes remaining - 4 */
262: cmp r2, #0x0c
263: blt Lmemcpy_fsrcul3loop4 /* fewer than 16 bytes: word-at-a-time loop only */
264: sub r2, r2, #0x0c /* r2 = bytes remaining - 16 */
265: stmdb sp!, {r4, r5} /* borrow r4 and r5 for the 16-byte loop */
266: /* NOTE(review): the lsr/lsl pairing looks little-endian only - confirm */
267: Lmemcpy_fsrcul3loop16:
268: mov r3, lr, lsr #24 /* top byte of the previously loaded word */
269: ldmia r1!, {r4, r5, r12, lr} /* next four aligned source words */
270: orr r3, r3, r4, lsl #8 /* completed with the low 3 bytes of the next word */
271: mov r4, r4, lsr #24
272: orr r4, r4, r5, lsl #8
273: mov r5, r5, lsr #24
274: orr r5, r5, r12, lsl #8
275: mov r12, r12, lsr #24
276: orr r12, r12, lr, lsl #8 /* lr is kept as the carry-over word for the next pass */
277: stmia r0!, {r3-r5, r12} /* store four merged words */
278: subs r2, r2, #0x10
279: bge Lmemcpy_fsrcul3loop16
280: ldmia sp!, {r4, r5} /* return r4 and r5 */
281: adds r2, r2, #0x0c /* r2 = bytes remaining - 4 */
282: blt Lmemcpy_fsrcul3l4 /* fewer than 4 bytes remain */
283:
284: Lmemcpy_fsrcul3loop4:
285: mov r12, lr, lsr #24 /* same merge as above, one word per pass */
286: ldr lr, [r1], #4
287: orr r12, r12, lr, lsl #8
288: str r12, [r0], #4
289: subs r2, r2, #4
290: bge Lmemcpy_fsrcul3loop4
291:
292: Lmemcpy_fsrcul3l4:
293: sub r1, r1, #1 /* step r1 back over the 1 byte of lr not yet stored */
294: b Lmemcpy_fl4 /* finish with the byte-at-a-time tail */
295:
296: Lmemcpy_backwards: /* backwards copy: reached when src is below dest */
297: add r1, r1, r2 /* r1 = one past the last source byte */
298: add r0, r0, r2 /* r0 = one past the last destination byte */
299: subs r2, r2, #4 /* from here on r2 = bytes remaining - 4 */
300: blt Lmemcpy_bl4 /* less than 4 bytes */
301: ands r12, r0, #3 /* r12 = (end of dest) & 3 */
302: bne Lmemcpy_bdestul /* oh unaligned destination addr */
303: ands r12, r1, #3 /* r12 = (end of src) & 3 */
304: bne Lmemcpy_bsrcul /* oh unaligned source addr */
305:
306: Lmemcpy_bt8: /* backward copy, src and dest both word aligned; r2 = bytes remaining - 4 */
307: /* We have aligned source and destination */
308: subs r2, r2, #8 /* r2 = bytes remaining - 12 */
309: blt Lmemcpy_bl12 /* less than 12 bytes (4 from above) */
310: stmdb sp!, {r4} /* borrow r4; restored at the end of Lmemcpy_bl32 */
311: subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
312: blt Lmemcpy_bl32
313:
314: /* blat 32 bytes at a time */
315: /* XXX for really big copies perhaps we should use more registers */
316: Lmemcpy_bloop32:
317: ldmdb r1!, {r3, r4, r12, lr} /* first 16 bytes of this pass, descending */
318: stmdb r0!, {r3, r4, r12, lr}
319: ldmdb r1!, {r3, r4, r12, lr} /* second 16 bytes of this pass */
320: stmdb r0!, {r3, r4, r12, lr}
321: subs r2, r2, #0x20
322: bge Lmemcpy_bloop32 /* loop while at least 32 bytes remain */
323:
324: Lmemcpy_bl32: /* r2 = bytes remaining - 32, r4 still saved on the stack */
325: cmn r2, #0x10 /* GE when r2 + 16 >= 0, i.e. at least 16 bytes remain */
326: ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
327: stmgedb r0!, {r3, r4, r12, lr}
328: subge r2, r2, #0x10
329: adds r2, r2, #0x14 /* r2 = bytes remaining - 12 */
330: ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
331: stmgedb r0!, {r3, r12, lr}
332: subge r2, r2, #0x0c
333: ldmia sp!, {r4} /* return r4 */
334:
335: Lmemcpy_bl12:
336: adds r2, r2, #8 /* r2 = bytes remaining - 4 */
337: blt Lmemcpy_bl4 /* fewer than 4 bytes remain */
338: subs r2, r2, #4 /* LT: 4-7 bytes remain, GE: 8-11 bytes remain */
339: ldrlt r3, [r1, #-4]! /* LT: copy a single word */
340: strlt r3, [r0, #-4]!
341: ldmgedb r1!, {r3, r12} /* GE: copy two words */
342: stmgedb r0!, {r3, r12}
343: subge r2, r2, #4 /* either way r2 = bytes remaining - 4 for Lmemcpy_bl4 */
344:
345: Lmemcpy_bl4: /* backward tail: on entry r2 = bytes remaining - 4 */
346: /* less than 4 bytes to go */
347: adds r2, r2, #4 /* r2 = bytes remaining (0-3); EQ when nothing is left */
348: #ifdef __APCS_26__
349: ldmeqia sp!, {r0, pc}^
350: #else
351: ldmeqia sp!, {r0, pc} /* done: pop saved dest into r0 and return */
352: #endif
353:
354: /* copy the crud byte at a time */
355: cmp r2, #2 /* LT: 1 byte, EQ: 2 bytes, GT: 3 bytes */
356: ldrb r3, [r1, #-1]! /* always: at least one byte remains here */
357: strb r3, [r0, #-1]!
358: ldrgeb r3, [r1, #-1]! /* second byte when r2 >= 2 */
359: strgeb r3, [r0, #-1]!
360: ldrgtb r3, [r1, #-1]! /* third byte when r2 == 3 */
361: strgtb r3, [r0, #-1]!
362: #ifdef __APCS_26__
363: ldmia sp!, {r0, pc}^
364: #else
365: ldmia sp!, {r0, pc} /* pop saved dest into r0 and return */
366: #endif
367:
368: /* erg - unaligned destination */
369: Lmemcpy_bdestul: /* backward copy, end of dest not word aligned; r12 = (end of dest) & 3 */
370: cmp r12, #2 /* r12 is also the byte count to align: LT 1, EQ 2, GT 3 */
371:
372: /* align destination with byte copies */
373: ldrb r3, [r1, #-1]!
374: strb r3, [r0, #-1]!
375: ldrgeb r3, [r1, #-1]!
376: strgeb r3, [r0, #-1]!
377: ldrgtb r3, [r1, #-1]!
378: strgtb r3, [r0, #-1]!
379: subs r2, r2, r12 /* account for the bytes just copied */
380: blt Lmemcpy_bl4 /* less than 4 bytes to go */
381: ands r12, r1, #3 /* r12 = src & 3 now that dest is aligned */
382: beq Lmemcpy_bt8 /* we have an aligned source */
383:
384: /* erg - unaligned source */
385: /* This is where it gets nasty ... */
386: Lmemcpy_bsrcul: /* backward copy, dest aligned, src not; r12 = src & 3, r2 = bytes remaining - 4 */
387: bic r1, r1, #3 /* round src down to a word boundary */
388: ldr r3, [r1, #0] /* r3 = word holding the last source bytes; r1 is not advanced */
389: cmp r12, #2 /* dispatch on the source offset within the word */
390: blt Lmemcpy_bsrcul1 /* offset 1 */
391: beq Lmemcpy_bsrcul2 /* offset 2 */
392: cmp r2, #0x0c /* offset 3 falls through to here */
393: blt Lmemcpy_bsrcul3loop4 /* fewer than 16 bytes: word-at-a-time loop only */
394: sub r2, r2, #0x0c /* r2 = bytes remaining - 16 */
395: stmdb sp!, {r4, r5} /* borrow r4 and r5 for the 16-byte loop */
396: /* NOTE(review): the lsl/lsr pairing looks little-endian only - confirm */
397: Lmemcpy_bsrcul3loop16:
398: mov lr, r3, lsl #8 /* low 3 bytes of the previously loaded word, moved up */
399: ldmdb r1!, {r3-r5, r12} /* next four aligned source words, descending */
400: orr lr, lr, r12, lsr #24 /* completed with the top byte of the word below */
401: mov r12, r12, lsl #8
402: orr r12, r12, r5, lsr #24
403: mov r5, r5, lsl #8
404: orr r5, r5, r4, lsr #24
405: mov r4, r4, lsl #8
406: orr r4, r4, r3, lsr #24 /* r3 is kept as the carry-over word for the next pass */
407: stmdb r0!, {r4, r5, r12, lr} /* store four merged words */
408: subs r2, r2, #0x10
409: bge Lmemcpy_bsrcul3loop16
410: ldmia sp!, {r4, r5} /* return r4 and r5 */
411: adds r2, r2, #0x0c /* r2 = bytes remaining - 4 */
412: blt Lmemcpy_bsrcul3l4 /* fewer than 4 bytes remain */
413:
414: Lmemcpy_bsrcul3loop4:
415: mov r12, r3, lsl #8 /* same merge as above, one word per pass */
416: ldr r3, [r1, #-4]!
417: orr r12, r12, r3, lsr #24
418: str r12, [r0, #-4]!
419: subs r2, r2, #4
420: bge Lmemcpy_bsrcul3loop4
421:
422: Lmemcpy_bsrcul3l4:
423: add r1, r1, #3 /* step r1 forward over the 3 bytes of r3 not yet stored */
424: b Lmemcpy_bl4 /* finish with the byte-at-a-time tail */
425:
426: Lmemcpy_bsrcul2: /* backward copy, src offset 2 within its word; r3 = last source word, r2 = bytes remaining - 4 */
427: cmp r2, #0x0c
428: blt Lmemcpy_bsrcul2loop4 /* fewer than 16 bytes: word-at-a-time loop only */
429: sub r2, r2, #0x0c /* r2 = bytes remaining - 16 */
430: stmdb sp!, {r4, r5} /* borrow r4 and r5 for the 16-byte loop */
431: /* NOTE(review): the lsl/lsr pairing looks little-endian only - confirm */
432: Lmemcpy_bsrcul2loop16:
433: mov lr, r3, lsl #16 /* lower halfword of the previously loaded word, moved up */
434: ldmdb r1!, {r3-r5, r12} /* next four aligned source words, descending */
435: orr lr, lr, r12, lsr #16 /* completed with the upper halfword of the word below */
436: mov r12, r12, lsl #16
437: orr r12, r12, r5, lsr #16
438: mov r5, r5, lsl #16
439: orr r5, r5, r4, lsr #16
440: mov r4, r4, lsl #16
441: orr r4, r4, r3, lsr #16 /* r3 is kept as the carry-over word for the next pass */
442: stmdb r0!, {r4, r5, r12, lr} /* store four merged words */
443: subs r2, r2, #0x10
444: bge Lmemcpy_bsrcul2loop16
445: ldmia sp!, {r4, r5} /* return r4 and r5 */
446: adds r2, r2, #0x0c /* r2 = bytes remaining - 4 */
447: blt Lmemcpy_bsrcul2l4 /* fewer than 4 bytes remain */
448:
449: Lmemcpy_bsrcul2loop4:
450: mov r12, r3, lsl #16 /* same merge as above, one word per pass */
451: ldr r3, [r1, #-4]!
452: orr r12, r12, r3, lsr #16
453: str r12, [r0, #-4]!
454: subs r2, r2, #4
455: bge Lmemcpy_bsrcul2loop4
456:
457: Lmemcpy_bsrcul2l4:
458: add r1, r1, #2 /* step r1 forward over the 2 bytes of r3 not yet stored */
459: b Lmemcpy_bl4 /* finish with the byte-at-a-time tail */
460:
461: Lmemcpy_bsrcul1: /* backward copy, src offset 1 within its word; r3 = last source word, r2 = bytes remaining - 4 */
462: cmp r2, #0x0c
463: blt Lmemcpy_bsrcul1loop4 /* fewer than 16 bytes: word-at-a-time loop only */
464: sub r2, r2, #0x0c /* r2 = bytes remaining - 16 */
465: stmdb sp!, {r4, r5} /* borrow r4 and r5 for the bulk loop */
466: /* NOTE(review): the lsl/lsr pairing looks little-endian only - confirm */
467: Lmemcpy_bsrcul1loop32: /* despite the name, each pass moves 16 bytes (four words) */
468: mov lr, r3, lsl #24 /* low byte of the previously loaded word, moved to the top */
469: ldmdb r1!, {r3-r5, r12} /* next four aligned source words, descending */
470: orr lr, lr, r12, lsr #8 /* completed with the upper 3 bytes of the word below */
471: mov r12, r12, lsl #24
472: orr r12, r12, r5, lsr #8
473: mov r5, r5, lsl #24
474: orr r5, r5, r4, lsr #8
475: mov r4, r4, lsl #24
476: orr r4, r4, r3, lsr #8 /* r3 is kept as the carry-over word for the next pass */
477: stmdb r0!, {r4, r5, r12, lr} /* store four merged words */
478: subs r2, r2, #0x10
479: bge Lmemcpy_bsrcul1loop32
480: ldmia sp!, {r4, r5} /* return r4 and r5 */
481: adds r2, r2, #0x0c /* r2 = bytes remaining - 4 */
482: blt Lmemcpy_bsrcul1l4 /* fewer than 4 bytes remain */
483:
484: Lmemcpy_bsrcul1loop4:
485: mov r12, r3, lsl #24 /* same merge as above, one word per pass */
486: ldr r3, [r1, #-4]!
487: orr r12, r12, r3, lsr #8
488: str r12, [r0, #-4]!
489: subs r2, r2, #4
490: bge Lmemcpy_bsrcul1loop4
491:
492: Lmemcpy_bsrcul1l4:
493: add r1, r1, #1 /* step r1 forward over the 1 byte of r3 not yet stored */
494: b Lmemcpy_bl4 /* finish with the byte-at-a-time tail */
495:
CVSweb