Annotation of sys/lib/libkern/arch/arm/memcpy.S, Revision 1.1
1.1 ! nbrk 1: /* $OpenBSD: memcpy.S,v 1.2 2004/02/01 05:47:10 drahn Exp $ */
! 2: /* $NetBSD: memcpy.S,v 1.2 2001/11/20 00:29:20 chris Exp $ */
! 3:
! 4: /*-
! 5: * Copyright (c) 1997 The NetBSD Foundation, Inc.
! 6: * All rights reserved.
! 7: *
! 8: * This code is derived from software contributed to The NetBSD Foundation
! 9: * by Neil A. Carson and Mark Brinicombe
! 10: *
! 11: * Redistribution and use in source and binary forms, with or without
! 12: * modification, are permitted provided that the following conditions
! 13: * are met:
! 14: * 1. Redistributions of source code must retain the above copyright
! 15: * notice, this list of conditions and the following disclaimer.
! 16: * 2. Redistributions in binary form must reproduce the above copyright
! 17: * notice, this list of conditions and the following disclaimer in the
! 18: * documentation and/or other materials provided with the distribution.
! 19: * 3. All advertising materials mentioning features or use of this software
! 20: * must display the following acknowledgement:
! 21: * This product includes software developed by the NetBSD
! 22: * Foundation, Inc. and its contributors.
! 23: * 4. Neither the name of The NetBSD Foundation nor the names of its
! 24: * contributors may be used to endorse or promote products derived
! 25: * from this software without specific prior written permission.
! 26: *
! 27: * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
! 28: * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! 29: * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! 30: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
! 31: * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! 32: * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! 33: * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! 34: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! 35: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! 36: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! 37: * POSSIBILITY OF SUCH DAMAGE.
! 38: */
! 39:
! 40: #include <machine/asm.h>
! 41:
! 42: /*
! 43: * This is one fun bit of code ...
! 44: * Some easy listening music is suggested while trying to understand this
! 45: * code e.g. Iron Maiden
! 46: *
! 47: * For anyone attempting to understand it :
! 48: *
! 49: * The core code is implemented here with simple stubs for memcpy()
! 50: * memmove() and bcopy().
! 51: *
! 52: * All local labels are prefixed with Lmemcpy_
! 53: * Following the prefix a label starting f is used in the forward copy code
! 54: * while a label using b is used in the backwards copy code
! 55: * The source and destination addresses determine whether a forward or
! 56: * backward copy is performed.
! 57: * Separate bits of code are used to deal with the following situations
! 58: * for both the forward and backwards copy.
! 59: * unaligned source address
! 60: * unaligned destination address
! 61: * Separate copy routines are used to produce an optimised result for each
! 62: * of these cases.
! 63: * The copy code will use LDM/STM instructions to copy up to 32 bytes at
! 64: * a time where possible.
! 65: *
! 66: * Note: r12 (aka ip) can be trashed during the function along with
! 67: * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
! 68: * Additional registers are preserved prior to use i.e. r4, r5 & lr
! 69: *
! 70: * Apologies for the state of the comments ;-)
! 71: */
! 72:
! 73: ENTRY(memcpy)
! 74: ENTRY_NP(memmove)
! 75: /* In: r0 = dest, r1 = src, r2 = len. Out: r0 = dest. Determine copy direction */
! 76: cmp r1, r0
! 77: /* src == dest: nothing to copy, and r0 already holds the value to return */
! 78: /* (r0 must not be zeroed here: memcpy()/memmove() return dest, never NULL) */
! 79: #ifdef __APCS_26__
! 80: moveqs pc, lr
! 81: #else
! 82: moveq pc, lr
! 83: #endif
! 84:
! 85: /* push dest (the return value) and lr: frees lr as scratch, exits pop {r0, pc} */
! 86: stmdb sp!, {r0, lr} /* memcpy() returns dest addr */
! 87: /* flags are still those of the cmp above: stmdb does not alter them */
! 88: bcc Lmemcpy_backwards /* src < dest (unsigned): copy from the end downwards */
! 89:
! 90: /* start of forwards copy: r0 = dest, r1 = src, r2 = byte count */
! 91: subs r2, r2, #4 /* r2 = len - 4; the count is kept biased from here on */
! 92: blt Lmemcpy_fl4 /* less than 4 bytes */
! 93: ands r12, r0, #3 /* r12 = dest & 3 */
! 94: bne Lmemcpy_fdestul /* oh unaligned destination addr */
! 95: ands r12, r1, #3 /* r12 = src & 3 */
! 96: bne Lmemcpy_fsrcul /* oh unaligned source addr */
! 97:
! 98: Lmemcpy_ft8:
! 99: /* We have aligned source and destination; r2 = remaining - 4 */
! 100: subs r2, r2, #8 /* r2 = remaining - 12 */
! 101: blt Lmemcpy_fl12 /* less than 12 bytes (4 from above) */
! 102: subs r2, r2, #0x14 /* r2 = remaining - 32 */
! 103: blt Lmemcpy_fl32 /* less than 32 bytes (12 from above) */
! 104: stmdb sp!, {r4} /* borrow r4 */
! 105:
! 106: /* blat 32 bytes at a time */
! 107: /* XXX for really big copies perhaps we should use more registers */
! 108: Lmemcpy_floop32:
! 109: ldmia r1!, {r3, r4, r12, lr} /* lr is usable as scratch: it was pushed at entry */
! 110: stmia r0!, {r3, r4, r12, lr}
! 111: ldmia r1!, {r3, r4, r12, lr}
! 112: stmia r0!, {r3, r4, r12, lr}
! 113: subs r2, r2, #0x20
! 114: bge Lmemcpy_floop32 /* loop while at least 32 more bytes remain */
! 115:
! 116: cmn r2, #0x10 /* ge iff r2 + 16 >= 0, i.e. at least 16 bytes remain */
! 117: ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
! 118: stmgeia r0!, {r3, r4, r12, lr}
! 119: subge r2, r2, #0x10
! 120: ldmia sp!, {r4} /* return r4 */
! 121:
! 122: Lmemcpy_fl32:
! 123: adds r2, r2, #0x14 /* r2 = remaining - 12; sets the flags used by the loop below */
! 124:
! 125: /* blat 12 bytes at a time */
! 126: Lmemcpy_floop12:
! 127: ldmgeia r1!, {r3, r12, lr}
! 128: stmgeia r0!, {r3, r12, lr}
! 129: subges r2, r2, #0x0c /* conditional: flags change only when a chunk was copied */
! 130: bge Lmemcpy_floop12
! 131:
! 132: Lmemcpy_fl12:
! 133: adds r2, r2, #8 /* r2 = remaining - 4 */
! 134: blt Lmemcpy_fl4
! 135: /* 4..11 bytes remain: copy one word (lt) or two words (ge) */
! 136: subs r2, r2, #4 /* r2 = remaining - 8 */
! 137: ldrlt r3, [r1], #4
! 138: strlt r3, [r0], #4
! 139: ldmgeia r1!, {r3, r12}
! 140: stmgeia r0!, {r3, r12}
! 141: subge r2, r2, #4 /* either way r2 = remaining - 4 on falling into Lmemcpy_fl4 */
! 142:
! 143: Lmemcpy_fl4:
! 144: /* less than 4 bytes to go; entry: r2 = remaining - 4 */
! 145: adds r2, r2, #4 /* r2 = remaining (0..3); eq when nothing is left */
! 146: #ifdef __APCS_26__
! 147: ldmeqia sp!, {r0, pc}^ /* done */
! 148: #else
! 149: ldmeqia sp!, {r0, pc} /* done */
! 150: #endif
! 151: /* copy the crud byte at a time */
! 152: cmp r2, #2 /* 1..3 bytes left: lt = 1, eq = 2, gt = 3 */
! 153: ldrb r3, [r1], #1 /* first byte is always copied */
! 154: strb r3, [r0], #1
! 155: ldrgeb r3, [r1], #1 /* second byte when 2 or 3 remain */
! 156: strgeb r3, [r0], #1
! 157: ldrgtb r3, [r1], #1 /* third byte when 3 remain */
! 158: strgtb r3, [r0], #1
! 159: #ifdef __APCS_26__
! 160: ldmia sp!, {r0, pc}^
! 161: #else
! 162: ldmia sp!, {r0, pc} /* pop saved dest into r0 and return */
! 163: #endif
! 164:
! 165: /* erg - unaligned destination; entry: r12 = dest & 3 (1..3), r2 = len - 4 */
! 166: Lmemcpy_fdestul:
! 167: rsb r12, r12, #4 /* r12 = 4 - (dest & 3) = bytes needed to align dest */
! 168: cmp r12, #2
! 169:
! 170: /* align destination with byte copies */
! 171: ldrb r3, [r1], #1 /* first byte is always copied */
! 172: strb r3, [r0], #1
! 173: ldrgeb r3, [r1], #1 /* second byte when r12 >= 2 */
! 174: strgeb r3, [r0], #1
! 175: ldrgtb r3, [r1], #1 /* third byte when r12 == 3 */
! 176: strgtb r3, [r0], #1
! 177: subs r2, r2, r12 /* account for the bytes just copied */
! 178: blt Lmemcpy_fl4 /* less than 4 bytes */
! 179:
! 180: ands r12, r1, #3 /* r12 = src & 3 */
! 181: beq Lmemcpy_ft8 /* we have an aligned source; otherwise fall through */
! 182:
! 183: /* erg - unaligned source; entry: dest aligned, r12 = src & 3 (1..3), r2 = remaining - 4 */
! 184: /* This is where it gets nasty ... */
! 185: Lmemcpy_fsrcul:
! 186: bic r1, r1, #3 /* round src down to a word boundary */
! 187: ldr lr, [r1], #4 /* lr = first aligned word; r1 -> next word */
! 188: cmp r12, #2
! 189: bgt Lmemcpy_fsrcul3 /* src offset 3 */
! 190: beq Lmemcpy_fsrcul2 /* src offset 2 */
! 191: cmp r2, #0x0c /* src offset 1: at least 16 bytes remaining? */
! 192: blt Lmemcpy_fsrcul1loop4
! 193: sub r2, r2, #0x0c /* r2 = remaining - 16 */
! 194: stmdb sp!, {r4, r5} /* borrow r4, r5 */
! 195: /* NOTE(review): the lsr/lsl pairing below implies little-endian byte order - confirm */
! 196: Lmemcpy_fsrcul1loop16:
! 197: mov r3, lr, lsr #8 /* discard the low byte of the carried word */
! 198: ldmia r1!, {r4, r5, r12, lr} /* next four aligned source words */
! 199: orr r3, r3, r4, lsl #24 /* top byte comes from the following word */
! 200: mov r4, r4, lsr #8
! 201: orr r4, r4, r5, lsl #24
! 202: mov r5, r5, lsr #8
! 203: orr r5, r5, r12, lsl #24
! 204: mov r12, r12, lsr #8
! 205: orr r12, r12, lr, lsl #24 /* lr stays live as the carry into the next pass */
! 206: stmia r0!, {r3-r5, r12} /* store 16 merged bytes */
! 207: subs r2, r2, #0x10
! 208: bge Lmemcpy_fsrcul1loop16
! 209: ldmia sp!, {r4, r5} /* return r4, r5 */
! 210: adds r2, r2, #0x0c /* r2 = remaining - 4 */
! 211: blt Lmemcpy_fsrcul1l4
! 212:
! 213: Lmemcpy_fsrcul1loop4:
! 214: mov r12, lr, lsr #8 /* same merge, one word per pass */
! 215: ldr lr, [r1], #4
! 216: orr r12, r12, lr, lsl #24
! 217: str r12, [r0], #4
! 218: subs r2, r2, #4
! 219: bge Lmemcpy_fsrcul1loop4
! 220:
! 221: Lmemcpy_fsrcul1l4:
! 222: sub r1, r1, #3 /* r1 is a word past lr, which still holds 3 unstored bytes: back up to them */
! 223: b Lmemcpy_fl4
! 224:
! 225: Lmemcpy_fsrcul2: /* src offset 2; entry: lr = first aligned word, r2 = remaining - 4 */
! 226: cmp r2, #0x0c /* at least 16 bytes remaining? */
! 227: blt Lmemcpy_fsrcul2loop4
! 228: sub r2, r2, #0x0c /* r2 = remaining - 16 */
! 229: stmdb sp!, {r4, r5} /* borrow r4, r5 */
! 230:
! 231: Lmemcpy_fsrcul2loop16:
! 232: mov r3, lr, lsr #16 /* upper halfword of the carried word becomes the low half */
! 233: ldmia r1!, {r4, r5, r12, lr} /* next four aligned source words */
! 234: orr r3, r3, r4, lsl #16 /* upper half comes from the following word */
! 235: mov r4, r4, lsr #16
! 236: orr r4, r4, r5, lsl #16
! 237: mov r5, r5, lsr #16
! 238: orr r5, r5, r12, lsl #16
! 239: mov r12, r12, lsr #16
! 240: orr r12, r12, lr, lsl #16 /* lr stays live as the carry into the next pass */
! 241: stmia r0!, {r3-r5, r12} /* store 16 merged bytes */
! 242: subs r2, r2, #0x10
! 243: bge Lmemcpy_fsrcul2loop16
! 244: ldmia sp!, {r4, r5} /* return r4, r5 */
! 245: adds r2, r2, #0x0c /* r2 = remaining - 4 */
! 246: blt Lmemcpy_fsrcul2l4
! 247:
! 248: Lmemcpy_fsrcul2loop4:
! 249: mov r12, lr, lsr #16 /* same merge, one word per pass */
! 250: ldr lr, [r1], #4
! 251: orr r12, r12, lr, lsl #16
! 252: str r12, [r0], #4
! 253: subs r2, r2, #4
! 254: bge Lmemcpy_fsrcul2loop4
! 255:
! 256: Lmemcpy_fsrcul2l4:
! 257: sub r1, r1, #2 /* r1 is a word past lr, which still holds 2 unstored bytes: back up to them */
! 258: b Lmemcpy_fl4
! 259:
! 260: Lmemcpy_fsrcul3: /* src offset 3; entry: lr = first aligned word, r2 = remaining - 4 */
! 261: cmp r2, #0x0c /* at least 16 bytes remaining? */
! 262: blt Lmemcpy_fsrcul3loop4
! 263: sub r2, r2, #0x0c /* r2 = remaining - 16 */
! 264: stmdb sp!, {r4, r5} /* borrow r4, r5 */
! 265:
! 266: Lmemcpy_fsrcul3loop16:
! 267: mov r3, lr, lsr #24 /* only the top byte of the carried word is still needed */
! 268: ldmia r1!, {r4, r5, r12, lr} /* next four aligned source words */
! 269: orr r3, r3, r4, lsl #8 /* upper three bytes come from the following word */
! 270: mov r4, r4, lsr #24
! 271: orr r4, r4, r5, lsl #8
! 272: mov r5, r5, lsr #24
! 273: orr r5, r5, r12, lsl #8
! 274: mov r12, r12, lsr #24
! 275: orr r12, r12, lr, lsl #8 /* lr stays live as the carry into the next pass */
! 276: stmia r0!, {r3-r5, r12} /* store 16 merged bytes */
! 277: subs r2, r2, #0x10
! 278: bge Lmemcpy_fsrcul3loop16
! 279: ldmia sp!, {r4, r5} /* return r4, r5 */
! 280: adds r2, r2, #0x0c /* r2 = remaining - 4 */
! 281: blt Lmemcpy_fsrcul3l4
! 282:
! 283: Lmemcpy_fsrcul3loop4:
! 284: mov r12, lr, lsr #24 /* same merge, one word per pass */
! 285: ldr lr, [r1], #4
! 286: orr r12, r12, lr, lsl #8
! 287: str r12, [r0], #4
! 288: subs r2, r2, #4
! 289: bge Lmemcpy_fsrcul3loop4
! 290:
! 291: Lmemcpy_fsrcul3l4:
! 292: sub r1, r1, #1 /* r1 is a word past lr, which still holds 1 unstored byte: back up to it */
! 293: b Lmemcpy_fl4
! 294:
! 295: Lmemcpy_backwards: /* reached when src < dest: copy from the end of both buffers downwards */
! 296: add r1, r1, r2 /* r1 = src + len (one past the last source byte) */
! 297: add r0, r0, r2 /* r0 = dest + len */
! 298: subs r2, r2, #4 /* r2 = len - 4; the count is kept biased from here on */
! 299: blt Lmemcpy_bl4 /* less than 4 bytes */
! 300: ands r12, r0, #3 /* r12 = (dest + len) & 3 */
! 301: bne Lmemcpy_bdestul /* oh unaligned destination addr */
! 302: ands r12, r1, #3 /* r12 = (src + len) & 3 */
! 303: bne Lmemcpy_bsrcul /* oh unaligned source addr */
! 304:
! 305: Lmemcpy_bt8:
! 306: /* We have aligned source and destination; r2 = remaining - 4 */
! 307: subs r2, r2, #8 /* r2 = remaining - 12 */
! 308: blt Lmemcpy_bl12 /* less than 12 bytes (4 from above) */
! 309: stmdb sp!, {r4} /* borrow r4; Lmemcpy_bl32 uses it too, so save before branching */
! 310: subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
! 311: blt Lmemcpy_bl32
! 312:
! 313: /* blat 32 bytes at a time */
! 314: /* XXX for really big copies perhaps we should use more registers */
! 315: Lmemcpy_bloop32:
! 316: ldmdb r1!, {r3, r4, r12, lr} /* lr is usable as scratch: it was pushed at entry */
! 317: stmdb r0!, {r3, r4, r12, lr}
! 318: ldmdb r1!, {r3, r4, r12, lr}
! 319: stmdb r0!, {r3, r4, r12, lr}
! 320: subs r2, r2, #0x20
! 321: bge Lmemcpy_bloop32 /* loop while at least 32 more bytes remain */
! 322:
! 323: Lmemcpy_bl32:
! 324: cmn r2, #0x10 /* ge iff r2 + 16 >= 0, i.e. at least 16 bytes remain */
! 325: ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
! 326: stmgedb r0!, {r3, r4, r12, lr}
! 327: subge r2, r2, #0x10
! 328: adds r2, r2, #0x14 /* r2 = remaining - 12 */
! 329: ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
! 330: stmgedb r0!, {r3, r12, lr}
! 331: subge r2, r2, #0x0c
! 332: ldmia sp!, {r4} /* return r4 */
! 333:
! 334: Lmemcpy_bl12:
! 335: adds r2, r2, #8 /* r2 = remaining - 4 */
! 336: blt Lmemcpy_bl4
! 337: subs r2, r2, #4 /* 4..11 bytes remain: one word (lt) or two words (ge) */
! 338: ldrlt r3, [r1, #-4]!
! 339: strlt r3, [r0, #-4]!
! 340: ldmgedb r1!, {r3, r12}
! 341: stmgedb r0!, {r3, r12}
! 342: subge r2, r2, #4 /* either way r2 = remaining - 4 on falling into Lmemcpy_bl4 */
! 343:
! 344: Lmemcpy_bl4:
! 345: /* less than 4 bytes to go; entry: r2 = remaining - 4 */
! 346: adds r2, r2, #4 /* r2 = remaining (0..3); eq when nothing is left */
! 347: #ifdef __APCS_26__
! 348: ldmeqia sp!, {r0, pc}^
! 349: #else
! 350: ldmeqia sp!, {r0, pc} /* done: pop saved dest into r0 and return */
! 351: #endif
! 352:
! 353: /* copy the crud byte at a time */
! 354: cmp r2, #2 /* 1..3 bytes left: lt = 1, eq = 2, gt = 3 */
! 355: ldrb r3, [r1, #-1]! /* first byte is always copied (pre-decrement) */
! 356: strb r3, [r0, #-1]!
! 357: ldrgeb r3, [r1, #-1]! /* second byte when 2 or 3 remain */
! 358: strgeb r3, [r0, #-1]!
! 359: ldrgtb r3, [r1, #-1]! /* third byte when 3 remain */
! 360: strgtb r3, [r0, #-1]!
! 361: #ifdef __APCS_26__
! 362: ldmia sp!, {r0, pc}^
! 363: #else
! 364: ldmia sp!, {r0, pc} /* pop saved dest into r0 and return */
! 365: #endif
! 366:
! 367: /* erg - unaligned destination; entry: r12 = (dest + len) & 3 (1..3), r2 = len - 4 */
! 368: Lmemcpy_bdestul:
! 369: cmp r12, #2 /* r12 is also the number of bytes to copy to reach alignment */
! 370:
! 371: /* align destination with byte copies */
! 372: ldrb r3, [r1, #-1]! /* first byte is always copied */
! 373: strb r3, [r0, #-1]!
! 374: ldrgeb r3, [r1, #-1]! /* second byte when r12 >= 2 */
! 375: strgeb r3, [r0, #-1]!
! 376: ldrgtb r3, [r1, #-1]! /* third byte when r12 == 3 */
! 377: strgtb r3, [r0, #-1]!
! 378: subs r2, r2, r12 /* account for the bytes just copied */
! 379: blt Lmemcpy_bl4 /* less than 4 bytes to go */
! 380: ands r12, r1, #3 /* r12 = source pointer & 3 */
! 381: beq Lmemcpy_bt8 /* we have an aligned source; otherwise fall through */
! 382:
! 383: /* erg - unaligned source; entry: dest aligned, r12 = src end & 3 (1..3), r2 = remaining - 4 */
! 384: /* This is where it gets nasty ... */
! 385: Lmemcpy_bsrcul:
! 386: bic r1, r1, #3 /* round the source pointer down to a word boundary */
! 387: ldr r3, [r1, #0] /* r3 = aligned word holding the last r12 source bytes */
! 388: cmp r12, #2
! 389: blt Lmemcpy_bsrcul1 /* offset 1 */
! 390: beq Lmemcpy_bsrcul2 /* offset 2 */
! 391: cmp r2, #0x0c /* offset 3: at least 16 bytes remaining? */
! 392: blt Lmemcpy_bsrcul3loop4
! 393: sub r2, r2, #0x0c /* r2 = remaining - 16 */
! 394: stmdb sp!, {r4, r5} /* borrow r4, r5 */
! 395: /* NOTE(review): the lsl/lsr pairing below implies little-endian byte order - confirm */
! 396: Lmemcpy_bsrcul3loop16:
! 397: mov lr, r3, lsl #8 /* low three bytes of the carried word move to the top */
! 398: ldmdb r1!, {r3-r5, r12} /* previous four aligned source words */
! 399: orr lr, lr, r12, lsr #24 /* low byte comes from the word below */
! 400: mov r12, r12, lsl #8
! 401: orr r12, r12, r5, lsr #24
! 402: mov r5, r5, lsl #8
! 403: orr r5, r5, r4, lsr #24
! 404: mov r4, r4, lsl #8
! 405: orr r4, r4, r3, lsr #24 /* r3 stays live as the carry into the next pass */
! 406: stmdb r0!, {r4, r5, r12, lr} /* store 16 merged bytes */
! 407: subs r2, r2, #0x10
! 408: bge Lmemcpy_bsrcul3loop16
! 409: ldmia sp!, {r4, r5} /* return r4, r5 */
! 410: adds r2, r2, #0x0c /* r2 = remaining - 4 */
! 411: blt Lmemcpy_bsrcul3l4
! 412:
! 413: Lmemcpy_bsrcul3loop4:
! 414: mov r12, r3, lsl #8 /* same merge, one word per pass */
! 415: ldr r3, [r1, #-4]!
! 416: orr r12, r12, r3, lsr #24
! 417: str r12, [r0, #-4]!
! 418: subs r2, r2, #4
! 419: bge Lmemcpy_bsrcul3loop4
! 420:
! 421: Lmemcpy_bsrcul3l4:
! 422: add r1, r1, #3 /* r3 (the word at r1) still holds 3 unstored low bytes: step r1 past them */
! 423: b Lmemcpy_bl4
! 424:
! 425: Lmemcpy_bsrcul2: /* source end offset 2; entry: r3 = word at r1, r2 = remaining - 4 */
! 426: cmp r2, #0x0c /* at least 16 bytes remaining? */
! 427: blt Lmemcpy_bsrcul2loop4
! 428: sub r2, r2, #0x0c /* r2 = remaining - 16 */
! 429: stmdb sp!, {r4, r5} /* borrow r4, r5 */
! 430:
! 431: Lmemcpy_bsrcul2loop16:
! 432: mov lr, r3, lsl #16 /* low halfword of the carried word becomes the upper half */
! 433: ldmdb r1!, {r3-r5, r12} /* previous four aligned source words */
! 434: orr lr, lr, r12, lsr #16 /* low half comes from the word below */
! 435: mov r12, r12, lsl #16
! 436: orr r12, r12, r5, lsr #16
! 437: mov r5, r5, lsl #16
! 438: orr r5, r5, r4, lsr #16
! 439: mov r4, r4, lsl #16
! 440: orr r4, r4, r3, lsr #16 /* r3 stays live as the carry into the next pass */
! 441: stmdb r0!, {r4, r5, r12, lr} /* store 16 merged bytes */
! 442: subs r2, r2, #0x10
! 443: bge Lmemcpy_bsrcul2loop16
! 444: ldmia sp!, {r4, r5} /* return r4, r5 */
! 445: adds r2, r2, #0x0c /* r2 = remaining - 4 */
! 446: blt Lmemcpy_bsrcul2l4
! 447:
! 448: Lmemcpy_bsrcul2loop4:
! 449: mov r12, r3, lsl #16 /* same merge, one word per pass */
! 450: ldr r3, [r1, #-4]!
! 451: orr r12, r12, r3, lsr #16
! 452: str r12, [r0, #-4]!
! 453: subs r2, r2, #4
! 454: bge Lmemcpy_bsrcul2loop4
! 455:
! 456: Lmemcpy_bsrcul2l4:
! 457: add r1, r1, #2 /* r3 (the word at r1) still holds 2 unstored low bytes: step r1 past them */
! 458: b Lmemcpy_bl4
! 459:
! 460: Lmemcpy_bsrcul1: /* source end offset 1; entry: r3 = word at r1, r2 = remaining - 4 */
! 461: cmp r2, #0x0c /* at least 16 bytes remaining? */
! 462: blt Lmemcpy_bsrcul1loop4
! 463: sub r2, r2, #0x0c /* r2 = remaining - 16 */
! 464: stmdb sp!, {r4, r5} /* borrow r4, r5 */
! 465: /* 16 bytes per pass, like the other ...loop16 labels */
! 466: Lmemcpy_bsrcul1loop16:
! 467: mov lr, r3, lsl #24 /* low byte of the carried word moves to the top */
! 468: ldmdb r1!, {r3-r5, r12} /* previous four aligned source words */
! 469: orr lr, lr, r12, lsr #8 /* low three bytes come from the word below */
! 470: mov r12, r12, lsl #24
! 471: orr r12, r12, r5, lsr #8
! 472: mov r5, r5, lsl #24
! 473: orr r5, r5, r4, lsr #8
! 474: mov r4, r4, lsl #24
! 475: orr r4, r4, r3, lsr #8 /* r3 stays live as the carry into the next pass */
! 476: stmdb r0!, {r4, r5, r12, lr} /* store 16 merged bytes */
! 477: subs r2, r2, #0x10
! 478: bge Lmemcpy_bsrcul1loop16
! 479: ldmia sp!, {r4, r5} /* return r4, r5 */
! 480: adds r2, r2, #0x0c /* r2 = remaining - 4 */
! 481: blt Lmemcpy_bsrcul1l4
! 482:
! 483: Lmemcpy_bsrcul1loop4:
! 484: mov r12, r3, lsl #24 /* same merge, one word per pass */
! 485: ldr r3, [r1, #-4]!
! 486: orr r12, r12, r3, lsr #8
! 487: str r12, [r0, #-4]!
! 488: subs r2, r2, #4
! 489: bge Lmemcpy_bsrcul1loop4
! 490:
! 491: Lmemcpy_bsrcul1l4:
! 492: add r1, r1, #1 /* r3 (the word at r1) still holds 1 unstored low byte: step r1 past it */
! 493: b Lmemcpy_bl4
! 494:
CVSweb