Annotation of funnyos/libkern/memcpy.S, Revision 1.1
1.1 ! init 1: /* $Id: memcpy.S,v 1.1.1.1 2007/10/12 08:40:43 init Exp $ */
! 2: /* $OpenBSD: memcpy.S,v 1.2 2004/02/01 05:47:10 drahn Exp $ */
! 3: /* $NetBSD: memcpy.S,v 1.2 2001/11/20 00:29:20 chris Exp $ */
! 4:
! 5: /*-
! 6: * Copyright (c) 1997 The NetBSD Foundation, Inc.
! 7: * All rights reserved.
! 8: *
! 9: * This code is derived from software contributed to The NetBSD Foundation
! 10: * by Neil A. Carson and Mark Brinicombe
! 11: *
! 12: * Redistribution and use in source and binary forms, with or without
! 13: * modification, are permitted provided that the following conditions
! 14: * are met:
! 15: * 1. Redistributions of source code must retain the above copyright
! 16: * notice, this list of conditions and the following disclaimer.
! 17: * 2. Redistributions in binary form must reproduce the above copyright
! 18: * notice, this list of conditions and the following disclaimer in the
! 19: * documentation and/or other materials provided with the distribution.
! 20: * 3. All advertising materials mentioning features or use of this software
! 21: * must display the following acknowledgement:
! 22: * This product includes software developed by the NetBSD
! 23: * Foundation, Inc. and its contributors.
! 24: * 4. Neither the name of The NetBSD Foundation nor the names of its
! 25: * contributors may be used to endorse or promote products derived
! 26: * from this software without specific prior written permission.
! 27: *
! 28: * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
! 29: * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! 30: * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! 31: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
! 32: * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! 33: * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! 34: * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! 35: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! 36: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! 37: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! 38: * POSSIBILITY OF SUCH DAMAGE.
! 39: */
! 40:
! 41: #include <libkern/asm.h>
! 42:
! 43: /*
! 44: * This is one fun bit of code ...
! 45: * Some easy listening music is suggested while trying to understand this
! 46: * code e.g. Iron Maiden
! 47: *
! 48: * For anyone attempting to understand it :
! 49: *
! 50: * The core code is implemented here with simple stubs for memcpy()
! 51: * memmove() and bcopy().
! 52: *
! 53: * All local labels are prefixed with Lmemcpy_
! 54: * Following the prefix a label starting f is used in the forward copy code
! 55: * while a label using b is used in the backwards copy code
! 56: * The source and destination addresses determine whether a forward or
! 57: * backward copy is performed.
! 58: * Separate bits of code are used to deal with the following situations
! 59: * for both the forward and backwards copy.
! 60: * unaligned source address
! 61: * unaligned destination address
! 62: * Separate copy routines are used to produce an optimised result for each
! 63: * of these cases.
! 64: * The copy code will use LDM/STM instructions to copy up to 32 bytes at
! 65: * a time where possible.
! 66: *
! 67: * Note: r12 (aka ip) can be trashed during the function along with
! 68: * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
! 69: * Additional registers are preserved prior to use i.e. r4, r5 & lr
! 70: *
! 71: * Apologies for the state of the comments ;-)
! 72: */
! 73:
! 74: ENTRY(memcpy)
! 75: ENTRY_NP(memmove)
! 76: /* In: r0 = dest, r1 = src, r2 = len. Out: r0 = dest. Determine copy direction */
! 77: cmp r1, r0 /* eq: src == dest, cc: src below dest (unsigned) */
! 78:
! 79: /* src == dest: nothing to copy; return now with r0 still holding dest, as memcpy()/memmove() must */
! 80: #ifdef __APCS_26__
! 81: moveqs pc, lr
! 82: #else
! 83: moveq pc, lr
! 84: #endif
! 85:
! 86: /* save leaf functions having to store this away */
! 87: stmdb sp!, {r0, lr} /* memcpy() returns dest addr */
! 88:
! 89: bcc Lmemcpy_backwards /* flags are still those of the cmp above: src < dest, copy from the top down */
! 90:
! 91: /* start of forwards copy */
! 92: subs r2, r2, #4 /* r2 now carries (bytes left - 4) */
! 93: blt Lmemcpy_fl4 /* less than 4 bytes */
! 94: ands r12, r0, #3 /* r12 = dest & 3 */
! 95: bne Lmemcpy_fdestul /* oh unaligned destination addr */
! 96: ands r12, r1, #3 /* r12 = src & 3 */
! 97: bne Lmemcpy_fsrcul /* oh unaligned source addr */
! 98:
! 99: Lmemcpy_ft8:
! 100: /* We have aligned source and destination; on entry r2 = bytes left - 4 */
! 101: subs r2, r2, #8 /* r2 = bytes left - 12 */
! 102: blt Lmemcpy_fl12 /* less than 12 bytes (4 from above) */
! 103: subs r2, r2, #0x14 /* r2 = bytes left - 32 */
! 104: blt Lmemcpy_fl32 /* less than 32 bytes (12 from above) */
! 105: stmdb sp!, {r4} /* borrow r4 */
! 106:
! 107: /* blat 32 bytes at a time */
! 108: /* XXX for really big copies perhaps we should use more registers */
! 109: Lmemcpy_floop32:
! 110: ldmia r1!, {r3, r4, r12, lr} /* first 16 bytes of this pass */
! 111: stmia r0!, {r3, r4, r12, lr}
! 112: ldmia r1!, {r3, r4, r12, lr} /* second 16 bytes of this pass */
! 113: stmia r0!, {r3, r4, r12, lr}
! 114: subs r2, r2, #0x20
! 115: bge Lmemcpy_floop32 /* loop while another full 32 bytes remain */
! 116:
! 117: cmn r2, #0x10 /* r2 + 16 >= 0, i.e. at least 16 bytes left? */
! 118: ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
! 119: stmgeia r0!, {r3, r4, r12, lr}
! 120: subge r2, r2, #0x10
! 121: ldmia sp!, {r4} /* return r4 */
! 122:
! 123: Lmemcpy_fl32:
! 124: adds r2, r2, #0x14 /* r2 = bytes left - 12; flags drive the conditional loop below */
! 125:
! 126: /* blat 12 bytes at a time */
! 127: Lmemcpy_floop12:
! 128: ldmgeia r1!, {r3, r12, lr} /* every instruction here is skipped once r2 has gone negative */
! 129: stmgeia r0!, {r3, r12, lr}
! 130: subges r2, r2, #0x0c
! 131: bge Lmemcpy_floop12
! 132:
! 133: Lmemcpy_fl12:
! 134: adds r2, r2, #8 /* r2 = bytes left - 4 */
! 135: blt Lmemcpy_fl4
! 136:
! 137: subs r2, r2, #4 /* lt: 4..7 bytes left, ge: 8..11 bytes left */
! 138: ldrlt r3, [r1], #4 /* lt: move one word */
! 139: strlt r3, [r0], #4
! 140: ldmgeia r1!, {r3, r12} /* ge: move two words */
! 141: stmgeia r0!, {r3, r12}
! 142: subge r2, r2, #4 /* either way r2 ends up as bytes left - 4 again */
! 143:
! 144: Lmemcpy_fl4:
! 145: /* less than 4 bytes to go; on entry r2 = bytes left - 4 */
! 146: adds r2, r2, #4 /* r2 = bytes left, Z set when the copy is complete */
! 147: #ifdef __APCS_26__
! 148: ldmeqia sp!, {r0, pc}^ /* done */
! 149: #else
! 150: ldmeqia sp!, {r0, pc} /* done */
! 151: #endif
! 152: /* copy the crud byte at a time */
! 153: cmp r2, #2 /* ge: at least 2 bytes left, gt: 3 bytes left */
! 154: ldrb r3, [r1], #1 /* at least one byte is left, so this one is unconditional */
! 155: strb r3, [r0], #1
! 156: ldrgeb r3, [r1], #1
! 157: strgeb r3, [r0], #1
! 158: ldrgtb r3, [r1], #1
! 159: strgtb r3, [r0], #1
! 160: #ifdef __APCS_26__
! 161: ldmia sp!, {r0, pc}^ /* reload the saved dest into r0 and return */
! 162: #else
! 163: ldmia sp!, {r0, pc} /* reload the saved dest into r0 and return */
! 164: #endif
! 165:
! 166: /* erg - unaligned destination; on entry r12 = dest & 3 (non-zero), r2 = bytes left - 4 */
! 167: Lmemcpy_fdestul:
! 168: rsb r12, r12, #4 /* r12 = 4 - (dest & 3) = bytes needed to reach a word boundary */
! 169: cmp r12, #2 /* ge: 2 or 3 bytes needed, gt: 3 bytes needed */
! 170:
! 171: /* align destination with byte copies */
! 172: ldrb r3, [r1], #1
! 173: strb r3, [r0], #1
! 174: ldrgeb r3, [r1], #1
! 175: strgeb r3, [r0], #1
! 176: ldrgtb r3, [r1], #1
! 177: strgtb r3, [r0], #1
! 178: subs r2, r2, r12 /* account for the bytes just copied */
! 179: blt Lmemcpy_fl4 /* less than 4 bytes */
! 180:
! 181: ands r12, r1, #3 /* r12 = src & 3 */
! 182: beq Lmemcpy_ft8 /* we have an aligned source */
! 183:
! 184: /* erg - unaligned source; on entry dest is word aligned, r12 = src & 3 (non-zero), r2 = bytes left - 4 */
! 185: /* This is where it gets nasty ... */
! 186: Lmemcpy_fsrcul:
! 187: bic r1, r1, #3 /* round src down to a word boundary */
! 188: ldr lr, [r1], #4 /* lr = word holding the first source bytes; r1 now one word ahead */
! 189: cmp r12, #2
! 190: bgt Lmemcpy_fsrcul3 /* src & 3 == 3 */
! 191: beq Lmemcpy_fsrcul2 /* src & 3 == 2 */
! 192: cmp r2, #0x0c /* src & 3 == 1 falls through to here */
! 193: blt Lmemcpy_fsrcul1loop4 /* fewer than 16 bytes left: go word at a time */
! 194: sub r2, r2, #0x0c /* r2 = bytes left - 16 */
! 195: stmdb sp!, {r4, r5} /* borrow r4 and r5 */
! 196:
! 197: Lmemcpy_fsrcul1loop16:
! 198: mov r3, lr, lsr #8 /* three bytes carried over from the previous source word */
! 199: ldmia r1!, {r4, r5, r12, lr} /* next four source words; lr is carried into the next pass */
! 200: orr r3, r3, r4, lsl #24 /* each output word = upper 3 bytes of one word + low byte of the next */
! 201: mov r4, r4, lsr #8 /* NOTE(review): shift directions assume little-endian byte order - confirm */
! 202: orr r4, r4, r5, lsl #24
! 203: mov r5, r5, lsr #8
! 204: orr r5, r5, r12, lsl #24
! 205: mov r12, r12, lsr #8
! 206: orr r12, r12, lr, lsl #24
! 207: stmia r0!, {r3-r5, r12} /* store 16 merged bytes */
! 208: subs r2, r2, #0x10
! 209: bge Lmemcpy_fsrcul1loop16
! 210: ldmia sp!, {r4, r5} /* return r4 and r5 */
! 211: adds r2, r2, #0x0c /* r2 = bytes left - 4 */
! 212: blt Lmemcpy_fsrcul1l4
! 213:
! 214: Lmemcpy_fsrcul1loop4:
! 215: mov r12, lr, lsr #8 /* same merge as above, one word per pass */
! 216: ldr lr, [r1], #4
! 217: orr r12, r12, lr, lsl #24
! 218: str r12, [r0], #4
! 219: subs r2, r2, #4
! 220: bge Lmemcpy_fsrcul1loop4
! 221:
! 222: Lmemcpy_fsrcul1l4:
! 223: sub r1, r1, #3 /* r1 is past the last word loaded, of which 1 byte was used: back up to the next unread byte */
! 224: b Lmemcpy_fl4
! 225:
! 226: Lmemcpy_fsrcul2: /* forward copy, src & 3 == 2; lr = current source word, r2 = bytes left - 4 */
! 227: cmp r2, #0x0c
! 228: blt Lmemcpy_fsrcul2loop4 /* fewer than 16 bytes left: go word at a time */
! 229: sub r2, r2, #0x0c /* r2 = bytes left - 16 */
! 230: stmdb sp!, {r4, r5} /* borrow r4 and r5 */
! 231:
! 232: Lmemcpy_fsrcul2loop16:
! 233: mov r3, lr, lsr #16 /* two bytes carried over from the previous source word */
! 234: ldmia r1!, {r4, r5, r12, lr} /* next four source words; lr is carried into the next pass */
! 235: orr r3, r3, r4, lsl #16 /* each output word = upper half of one word + lower half of the next */
! 236: mov r4, r4, lsr #16
! 237: orr r4, r4, r5, lsl #16
! 238: mov r5, r5, lsr #16
! 239: orr r5, r5, r12, lsl #16
! 240: mov r12, r12, lsr #16
! 241: orr r12, r12, lr, lsl #16
! 242: stmia r0!, {r3-r5, r12} /* store 16 merged bytes */
! 243: subs r2, r2, #0x10
! 244: bge Lmemcpy_fsrcul2loop16
! 245: ldmia sp!, {r4, r5} /* return r4 and r5 */
! 246: adds r2, r2, #0x0c /* r2 = bytes left - 4 */
! 247: blt Lmemcpy_fsrcul2l4
! 248:
! 249: Lmemcpy_fsrcul2loop4:
! 250: mov r12, lr, lsr #16 /* same merge as above, one word per pass */
! 251: ldr lr, [r1], #4
! 252: orr r12, r12, lr, lsl #16
! 253: str r12, [r0], #4
! 254: subs r2, r2, #4
! 255: bge Lmemcpy_fsrcul2loop4
! 256:
! 257: Lmemcpy_fsrcul2l4:
! 258: sub r1, r1, #2 /* 2 bytes of the last word loaded were used: back up to the next unread byte */
! 259: b Lmemcpy_fl4
! 260:
! 261: Lmemcpy_fsrcul3: /* forward copy, src & 3 == 3; lr = current source word, r2 = bytes left - 4 */
! 262: cmp r2, #0x0c
! 263: blt Lmemcpy_fsrcul3loop4 /* fewer than 16 bytes left: go word at a time */
! 264: sub r2, r2, #0x0c /* r2 = bytes left - 16 */
! 265: stmdb sp!, {r4, r5} /* borrow r4 and r5 */
! 266:
! 267: Lmemcpy_fsrcul3loop16:
! 268: mov r3, lr, lsr #24 /* one byte carried over from the previous source word */
! 269: ldmia r1!, {r4, r5, r12, lr} /* next four source words; lr is carried into the next pass */
! 270: orr r3, r3, r4, lsl #8 /* each output word = top byte of one word + lower 3 bytes of the next */
! 271: mov r4, r4, lsr #24
! 272: orr r4, r4, r5, lsl #8
! 273: mov r5, r5, lsr #24
! 274: orr r5, r5, r12, lsl #8
! 275: mov r12, r12, lsr #24
! 276: orr r12, r12, lr, lsl #8
! 277: stmia r0!, {r3-r5, r12} /* store 16 merged bytes */
! 278: subs r2, r2, #0x10
! 279: bge Lmemcpy_fsrcul3loop16
! 280: ldmia sp!, {r4, r5} /* return r4 and r5 */
! 281: adds r2, r2, #0x0c /* r2 = bytes left - 4 */
! 282: blt Lmemcpy_fsrcul3l4
! 283:
! 284: Lmemcpy_fsrcul3loop4:
! 285: mov r12, lr, lsr #24 /* same merge as above, one word per pass */
! 286: ldr lr, [r1], #4
! 287: orr r12, r12, lr, lsl #8
! 288: str r12, [r0], #4
! 289: subs r2, r2, #4
! 290: bge Lmemcpy_fsrcul3loop4
! 291:
! 292: Lmemcpy_fsrcul3l4:
! 293: sub r1, r1, #1 /* 3 bytes of the last word loaded were used: back up to the next unread byte */
! 294: b Lmemcpy_fl4
! 295:
! 296: Lmemcpy_backwards: /* reached when src < dest; copy from the highest address downwards */
! 297: add r1, r1, r2 /* r1 = one past the last source byte */
! 298: add r0, r0, r2 /* r0 = one past the last destination byte */
! 299: subs r2, r2, #4 /* r2 now carries (bytes left - 4) */
! 300: blt Lmemcpy_bl4 /* less than 4 bytes */
! 301: ands r12, r0, #3 /* r12 = (end of dest) & 3 */
! 302: bne Lmemcpy_bdestul /* oh unaligned destination addr */
! 303: ands r12, r1, #3 /* r12 = (end of src) & 3 */
! 304: bne Lmemcpy_bsrcul /* oh unaligned source addr */
! 305:
! 306: Lmemcpy_bt8:
! 307: /* We have aligned source and destination; on entry r2 = bytes left - 4 */
! 308: subs r2, r2, #8 /* r2 = bytes left - 12 */
! 309: blt Lmemcpy_bl12 /* less than 12 bytes (4 from above) */
! 310: stmdb sp!, {r4} /* borrow r4; pushed before the test below because Lmemcpy_bl32 uses and restores it on both paths */
! 311: subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
! 312: blt Lmemcpy_bl32
! 313:
! 314: /* blat 32 bytes at a time */
! 315: /* XXX for really big copies perhaps we should use more registers */
! 316: Lmemcpy_bloop32:
! 317: ldmdb r1!, {r3, r4, r12, lr} /* first 16 bytes of this pass, moving downwards */
! 318: stmdb r0!, {r3, r4, r12, lr}
! 319: ldmdb r1!, {r3, r4, r12, lr} /* second 16 bytes of this pass */
! 320: stmdb r0!, {r3, r4, r12, lr}
! 321: subs r2, r2, #0x20
! 322: bge Lmemcpy_bloop32 /* loop while another full 32 bytes remain */
! 323:
! 324: Lmemcpy_bl32: /* here r2 = bytes left - 32 */
! 325: cmn r2, #0x10 /* r2 + 16 >= 0, i.e. at least 16 bytes left? */
! 326: ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
! 327: stmgedb r0!, {r3, r4, r12, lr}
! 328: subge r2, r2, #0x10
! 329: adds r2, r2, #0x14 /* r2 = bytes left - 12 */
! 330: ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
! 331: stmgedb r0!, {r3, r12, lr}
! 332: subge r2, r2, #0x0c
! 333: ldmia sp!, {r4} /* return r4 */
! 334:
! 335: Lmemcpy_bl12:
! 336: adds r2, r2, #8 /* r2 = bytes left - 4 */
! 337: blt Lmemcpy_bl4
! 338: subs r2, r2, #4 /* lt: 4..7 bytes left, ge: 8..11 bytes left */
! 339: ldrlt r3, [r1, #-4]! /* lt: move one word */
! 340: strlt r3, [r0, #-4]!
! 341: ldmgedb r1!, {r3, r12} /* ge: move two words */
! 342: stmgedb r0!, {r3, r12}
! 343: subge r2, r2, #4 /* either way r2 ends up as bytes left - 4 again */
! 344:
! 345: Lmemcpy_bl4:
! 346: /* less than 4 bytes to go; on entry r2 = bytes left - 4 */
! 347: adds r2, r2, #4 /* r2 = bytes left, Z set when the copy is complete */
! 348: #ifdef __APCS_26__
! 349: ldmeqia sp!, {r0, pc}^
! 350: #else
! 351: ldmeqia sp!, {r0, pc}
! 352: #endif
! 353:
! 354: /* copy the crud byte at a time */
! 355: cmp r2, #2 /* ge: at least 2 bytes left, gt: 3 bytes left */
! 356: ldrb r3, [r1, #-1]! /* at least one byte is left, so this one is unconditional */
! 357: strb r3, [r0, #-1]!
! 358: ldrgeb r3, [r1, #-1]!
! 359: strgeb r3, [r0, #-1]!
! 360: ldrgtb r3, [r1, #-1]!
! 361: strgtb r3, [r0, #-1]!
! 362: #ifdef __APCS_26__
! 363: ldmia sp!, {r0, pc}^ /* reload the saved dest into r0 and return */
! 364: #else
! 365: ldmia sp!, {r0, pc} /* reload the saved dest into r0 and return */
! 366: #endif
! 367:
! 368: /* erg - unaligned destination; on entry r12 = (end of dest) & 3 (non-zero), r2 = bytes left - 4 */
! 369: Lmemcpy_bdestul:
! 370: cmp r12, #2 /* r12 is already the byte count down to the word boundary; ge: 2 or 3, gt: 3 */
! 371:
! 372: /* align destination with byte copies */
! 373: ldrb r3, [r1, #-1]!
! 374: strb r3, [r0, #-1]!
! 375: ldrgeb r3, [r1, #-1]!
! 376: strgeb r3, [r0, #-1]!
! 377: ldrgtb r3, [r1, #-1]!
! 378: strgtb r3, [r0, #-1]!
! 379: subs r2, r2, r12 /* account for the bytes just copied */
! 380: blt Lmemcpy_bl4 /* less than 4 bytes to go */
! 381: ands r12, r1, #3 /* r12 = (end of src) & 3 */
! 382: beq Lmemcpy_bt8 /* we have an aligned source */
! 383:
! 384: /* erg - unaligned source; on entry dest end is word aligned, r12 = (end of src) & 3 (non-zero), r2 = bytes left - 4 */
! 385: /* This is where it gets nasty ... */
! 386: Lmemcpy_bsrcul:
! 387: bic r1, r1, #3 /* round the source end pointer down to a word boundary */
! 388: ldr r3, [r1, #0] /* r3 = word holding the topmost source bytes; r1 is not advanced */
! 389: cmp r12, #2
! 390: blt Lmemcpy_bsrcul1 /* (end of src) & 3 == 1 */
! 391: beq Lmemcpy_bsrcul2 /* (end of src) & 3 == 2 */
! 392: cmp r2, #0x0c /* (end of src) & 3 == 3 falls through to here */
! 393: blt Lmemcpy_bsrcul3loop4 /* fewer than 16 bytes left: go word at a time */
! 394: sub r2, r2, #0x0c /* r2 = bytes left - 16 */
! 395: stmdb sp!, {r4, r5} /* borrow r4 and r5 */
! 396:
! 397: Lmemcpy_bsrcul3loop16:
! 398: mov lr, r3, lsl #8 /* three bytes carried over from the word above */
! 399: ldmdb r1!, {r3-r5, r12} /* four source words below r1; r3 (lowest address) is carried into the next pass */
! 400: orr lr, lr, r12, lsr #24 /* each output word = lower 3 bytes of one word + top byte of the word below */
! 401: mov r12, r12, lsl #8 /* NOTE(review): shift directions assume little-endian byte order - confirm */
! 402: orr r12, r12, r5, lsr #24
! 403: mov r5, r5, lsl #8
! 404: orr r5, r5, r4, lsr #24
! 405: mov r4, r4, lsl #8
! 406: orr r4, r4, r3, lsr #24
! 407: stmdb r0!, {r4, r5, r12, lr} /* store 16 merged bytes below r0 */
! 408: subs r2, r2, #0x10
! 409: bge Lmemcpy_bsrcul3loop16
! 410: ldmia sp!, {r4, r5} /* return r4 and r5 */
! 411: adds r2, r2, #0x0c /* r2 = bytes left - 4 */
! 412: blt Lmemcpy_bsrcul3l4
! 413:
! 414: Lmemcpy_bsrcul3loop4:
! 415: mov r12, r3, lsl #8 /* same merge as above, one word per pass */
! 416: ldr r3, [r1, #-4]!
! 417: orr r12, r12, r3, lsr #24
! 418: str r12, [r0, #-4]!
! 419: subs r2, r2, #4
! 420: bge Lmemcpy_bsrcul3loop4
! 421:
! 422: Lmemcpy_bsrcul3l4:
! 423: add r1, r1, #3 /* only the top byte of the word at r1 was used: move up past the 3 unread bytes */
! 424: b Lmemcpy_bl4
! 425:
! 426: Lmemcpy_bsrcul2: /* backward copy, (end of src) & 3 == 2; r3 = current source word, r2 = bytes left - 4 */
! 427: cmp r2, #0x0c
! 428: blt Lmemcpy_bsrcul2loop4 /* fewer than 16 bytes left: go word at a time */
! 429: sub r2, r2, #0x0c /* r2 = bytes left - 16 */
! 430: stmdb sp!, {r4, r5} /* borrow r4 and r5 */
! 431:
! 432: Lmemcpy_bsrcul2loop16:
! 433: mov lr, r3, lsl #16 /* two bytes carried over from the word above */
! 434: ldmdb r1!, {r3-r5, r12} /* four source words below r1; r3 (lowest address) is carried into the next pass */
! 435: orr lr, lr, r12, lsr #16 /* each output word = lower half of one word + upper half of the word below */
! 436: mov r12, r12, lsl #16
! 437: orr r12, r12, r5, lsr #16
! 438: mov r5, r5, lsl #16
! 439: orr r5, r5, r4, lsr #16
! 440: mov r4, r4, lsl #16
! 441: orr r4, r4, r3, lsr #16
! 442: stmdb r0!, {r4, r5, r12, lr} /* store 16 merged bytes below r0 */
! 443: subs r2, r2, #0x10
! 444: bge Lmemcpy_bsrcul2loop16
! 445: ldmia sp!, {r4, r5} /* return r4 and r5 */
! 446: adds r2, r2, #0x0c /* r2 = bytes left - 4 */
! 447: blt Lmemcpy_bsrcul2l4
! 448:
! 449: Lmemcpy_bsrcul2loop4:
! 450: mov r12, r3, lsl #16 /* same merge as above, one word per pass */
! 451: ldr r3, [r1, #-4]!
! 452: orr r12, r12, r3, lsr #16
! 453: str r12, [r0, #-4]!
! 454: subs r2, r2, #4
! 455: bge Lmemcpy_bsrcul2loop4
! 456:
! 457: Lmemcpy_bsrcul2l4:
! 458: add r1, r1, #2 /* the upper half of the word at r1 was used: move up past the 2 unread bytes */
! 459: b Lmemcpy_bl4
! 460:
! 461: Lmemcpy_bsrcul1: /* backward copy, (end of src) & 3 == 1; r3 = current source word, r2 = bytes left - 4 */
! 462: cmp r2, #0x0c
! 463: blt Lmemcpy_bsrcul1loop4 /* fewer than 16 bytes left: go word at a time */
! 464: sub r2, r2, #0x0c /* r2 = bytes left - 16 */
! 465: stmdb sp!, {r4, r5} /* borrow r4 and r5 */
! 466:
! 467: Lmemcpy_bsrcul1loop32: /* despite the "32" in the name, each pass moves 16 bytes like the other loop16 labels */
! 468: mov lr, r3, lsl #24 /* one byte carried over from the word above */
! 469: ldmdb r1!, {r3-r5, r12} /* four source words below r1; r3 (lowest address) is carried into the next pass */
! 470: orr lr, lr, r12, lsr #8 /* each output word = low byte of one word + upper 3 bytes of the word below */
! 471: mov r12, r12, lsl #24
! 472: orr r12, r12, r5, lsr #8
! 473: mov r5, r5, lsl #24
! 474: orr r5, r5, r4, lsr #8
! 475: mov r4, r4, lsl #24
! 476: orr r4, r4, r3, lsr #8
! 477: stmdb r0!, {r4, r5, r12, lr} /* store 16 merged bytes below r0 */
! 478: subs r2, r2, #0x10
! 479: bge Lmemcpy_bsrcul1loop32
! 480: ldmia sp!, {r4, r5} /* return r4 and r5 */
! 481: adds r2, r2, #0x0c /* r2 = bytes left - 4 */
! 482: blt Lmemcpy_bsrcul1l4
! 483:
! 484: Lmemcpy_bsrcul1loop4:
! 485: mov r12, r3, lsl #24 /* same merge as above, one word per pass */
! 486: ldr r3, [r1, #-4]!
! 487: orr r12, r12, r3, lsr #8
! 488: str r12, [r0, #-4]!
! 489: subs r2, r2, #4
! 490: bge Lmemcpy_bsrcul1loop4
! 491:
! 492: Lmemcpy_bsrcul1l4:
! 493: add r1, r1, #1 /* the upper 3 bytes of the word at r1 were used: move up past the 1 unread byte */
! 494: b Lmemcpy_bl4
! 495:
CVSweb