funnyos/libkern/memcpy.S - annotate

Return to memcpy.S CVS log
Up to [local] / funnyos / libkern
Annotation of funnyos/libkern/memcpy.S, Revision 1.1.1.1

1.1       init        1: /*  $Id: memcpy.S,v 1.1.1.1 2007/10/12 08:40:43 init Exp $ */
                      2: /*     $OpenBSD: memcpy.S,v 1.2 2004/02/01 05:47:10 drahn Exp $        */
                      3: /*     $NetBSD: memcpy.S,v 1.2 2001/11/20 00:29:20 chris Exp $ */
                      4:
                      5: /*-
                      6:  * Copyright (c) 1997 The NetBSD Foundation, Inc.
                      7:  * All rights reserved.
                      8:  *
                      9:  * This code is derived from software contributed to The NetBSD Foundation
                     10:  * by Neil A. Carson and Mark Brinicombe
                     11:  *
                     12:  * Redistribution and use in source and binary forms, with or without
                     13:  * modification, are permitted provided that the following conditions
                     14:  * are met:
                     15:  * 1. Redistributions of source code must retain the above copyright
                     16:  *    notice, this list of conditions and the following disclaimer.
                     17:  * 2. Redistributions in binary form must reproduce the above copyright
                     18:  *    notice, this list of conditions and the following disclaimer in the
                     19:  *    documentation and/or other materials provided with the distribution.
                     20:  * 3. All advertising materials mentioning features or use of this software
                     21:  *    must display the following acknowledgement:
                     22:  *        This product includes software developed by the NetBSD
                     23:  *        Foundation, Inc. and its contributors.
                     24:  * 4. Neither the name of The NetBSD Foundation nor the names of its
                     25:  *    contributors may be used to endorse or promote products derived
                     26:  *    from this software without specific prior written permission.
                     27:  *
                     28:  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
                     29:  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
                     30:  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                     31:  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
                     32:  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     33:  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     34:  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     35:  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     36:  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     37:  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     38:  * POSSIBILITY OF SUCH DAMAGE.
                     39:  */
                     40:
                     41: #include <libkern/asm.h>
                     42:
                     43: /*
                     44:  * This is one fun bit of code ...
                     45:  * Some easy listening music is suggested while trying to understand this
                     46:  * code e.g. Iron Maiden
                     47:  *
                     48:  * For anyone attempting to understand it :
                     49:  *
                     50:  * The core code is implemented here with simple stubs for memcpy()
                     51:  * memmove() and bcopy().
                     52:  *
                     53:  * All local labels are prefixed with Lmemcpy_
                     54:  * Following the prefix a label starting f is used in the forward copy code
                     55:  * while a label using b is used in the backwards copy code
                     56:  * The source and destination addresses determine whether a forward or
                     57:  * backward copy is performed.
                     58:  * Separate bits of code are used to deal with the following situations
                     59:  * for both the forward and backwards copy.
                     60:  * unaligned source address
                     61:  * unaligned destination address
                     62:  * Separate copy routines are used to produce an optimised result for each
                     63:  * of these cases.
                     64:  * The copy code will use LDM/STM instructions to copy up to 32 bytes at
                     65:  * a time where possible.
                     66:  *
                     67:  * Note: r12 (aka ip) can be trashed during the function along with
                     68:  * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
                     69:  * Additional registers are preserved prior to use i.e. r4, r5 & lr
                     70:  *
                     71:  * Apologies for the state of the comments ;-)
                     72:  */
                     73:
                     74: ENTRY(memcpy)
                     75: ENTRY_NP(memmove)
                     76:        /* In: r0 = dest, r1 = src, r2 = len.  Determine copy direction */
                     77:        cmp     r1, r0                  /* eq: src == dest, cc: src < dest */
                     78:
                     79:        /* Quick exit for src == dest: r0 still holds dest, the return value */
                     80: #ifdef __APCS_26__
                     81:        moveqs  pc, lr
                     82: #else
                     83:        moveq   pc, lr
                     84: #endif
                     85:
                     86:        /* save leaf functions having to store this away */
                     87:        stmdb   sp!, {r0, lr}           /* memcpy() returns dest addr */
                     88:
                     89:        bcc     Lmemcpy_backwards       /* flags are still those of the cmp */
                     90:
                     91:        /* start of forwards copy; from here on r2 = bytes remaining - 4 */
                     92:        subs    r2, r2, #4
                     93:        blt     Lmemcpy_fl4             /* less than 4 bytes */
                     94:        ands    r12, r0, #3             /* r12 = dest & 3 */
                     95:        bne     Lmemcpy_fdestul         /* oh unaligned destination addr */
                     96:        ands    r12, r1, #3             /* r12 = src & 3 */
                     97:        bne     Lmemcpy_fsrcul          /* oh unaligned source addr */
                     98:
                     99: Lmemcpy_ft8:
                    100:        /* We have aligned source and destination; r2 = bytes remaining - 4 */
                    101:        subs    r2, r2, #8
                    102:        blt     Lmemcpy_fl12            /* less than 12 bytes (4 from above) */
                    103:        subs    r2, r2, #0x14
                    104:        blt     Lmemcpy_fl32            /* less than 32 bytes (12 from above) */
                    105:        stmdb   sp!, {r4}               /* borrow r4 */
                    106:
                    107:        /* blat 32 bytes at a time; r2 = bytes remaining - 32 in this loop */
                    108:        /* XXX for really big copies perhaps we should use more registers */
                    109: Lmemcpy_floop32:
                    110:        ldmia   r1!, {r3, r4, r12, lr}  /* lr is free: it was stacked at entry */
                    111:        stmia   r0!, {r3, r4, r12, lr}
                    112:        ldmia   r1!, {r3, r4, r12, lr}
                    113:        stmia   r0!, {r3, r4, r12, lr}
                    114:        subs    r2, r2, #0x20
                    115:        bge     Lmemcpy_floop32         /* loop while 32 or more bytes remain */
                    116:
                    117:        cmn     r2, #0x10               /* ge if at least 16 bytes remain */
                    118:        ldmgeia r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
                    119:        stmgeia r0!, {r3, r4, r12, lr}
                    120:        subge   r2, r2, #0x10
                    121:        ldmia   sp!, {r4}               /* return r4 */
                    122:
                    123: Lmemcpy_fl32:
                    124:        adds    r2, r2, #0x14           /* r2 = bytes remaining - 12 */
                    125:
                    126:        /* blat 12 bytes at a time; all four steps are skipped when flags say lt */
                    127: Lmemcpy_floop12:
                    128:        ldmgeia r1!, {r3, r12, lr}
                    129:        stmgeia r0!, {r3, r12, lr}
                    130:        subges  r2, r2, #0x0c
                    131:        bge     Lmemcpy_floop12         /* loop while 12 or more bytes remain */
                    132:
                    133: Lmemcpy_fl12:
                    134:        adds    r2, r2, #8              /* r2 = bytes remaining - 4 */
                    135:        blt     Lmemcpy_fl4
                    136:
                    137:        subs    r2, r2, #4              /* lt: 4-7 bytes remain, ge: 8-11 remain */
                    138:        ldrlt   r3, [r1], #4            /* lt: copy one word */
                    139:        strlt   r3, [r0], #4
                    140:        ldmgeia r1!, {r3, r12}          /* ge: copy two words */
                    141:        stmgeia r0!, {r3, r12}
                    142:        subge   r2, r2, #4
                    143:        /* falls through with r2 = bytes remaining - 4 */
                    144: Lmemcpy_fl4:
                    145:        /* less than 4 bytes to go; on entry r2 = bytes remaining - 4 */
                    146:        adds    r2, r2, #4              /* r2 = bytes remaining (0-3) */
                    147: #ifdef __APCS_26__
                    148:        ldmeqia sp!, {r0, pc}^          /* done */
                    149: #else
                    150:        ldmeqia sp!, {r0, pc}           /* done */
                    151: #endif
                    152:        /* copy the crud byte at a time */
                    153:        cmp     r2, #2
                    154:        ldrb    r3, [r1], #1            /* always: at least one byte is left */
                    155:        strb    r3, [r0], #1
                    156:        ldrgeb  r3, [r1], #1            /* ge: two or three bytes were left */
                    157:        strgeb  r3, [r0], #1
                    158:        ldrgtb  r3, [r1], #1            /* gt: three bytes were left */
                    159:        strgtb  r3, [r0], #1
                    160: #ifdef __APCS_26__
                    161:        ldmia   sp!, {r0, pc}^          /* pop saved dest into r0 and return */
                    162: #else
                    163:        ldmia   sp!, {r0, pc}           /* pop saved dest into r0 and return */
                    164: #endif
                    165:
                    166:        /* erg - unaligned destination; on entry r12 = dest & 3 (1-3) */
                    167: Lmemcpy_fdestul:
                    168:        rsb     r12, r12, #4            /* r12 = bytes needed to word-align dest */
                    169:        cmp     r12, #2
                    170:
                    171:        /* align destination with byte copies */
                    172:        ldrb    r3, [r1], #1            /* always: at least one byte needed */
                    173:        strb    r3, [r0], #1
                    174:        ldrgeb  r3, [r1], #1            /* ge: two or three bytes needed */
                    175:        strgeb  r3, [r0], #1
                    176:        ldrgtb  r3, [r1], #1            /* gt: three bytes needed */
                    177:        strgtb  r3, [r0], #1
                    178:        subs    r2, r2, r12             /* account for the bytes just copied */
                    179:        blt     Lmemcpy_fl4             /* less than 4 bytes */
                    180:
                    181:        ands    r12, r1, #3             /* r12 = src & 3 for the code below */
                    182:        beq     Lmemcpy_ft8             /* we have an aligned source */
                    183:
                    184:        /* erg - unaligned source; dest is word aligned, r12 = src & 3 (1-3) */
                    185:        /* This is where it gets nasty ... */
                    186: Lmemcpy_fsrcul:
                    187:        bic     r1, r1, #3              /* round src down to a word boundary */
                    188:        ldr     lr, [r1], #4            /* lr = word holding the first source bytes */
                    189:        cmp     r12, #2
                    190:        bgt     Lmemcpy_fsrcul3         /* src & 3 == 3 */
                    191:        beq     Lmemcpy_fsrcul2         /* src & 3 == 2 */
                    192:        cmp     r2, #0x0c               /* src & 3 == 1; r2 = bytes remaining - 4 */
                    193:        blt     Lmemcpy_fsrcul1loop4    /* fewer than 16 bytes remain */
                    194:        sub     r2, r2, #0x0c           /* r2 = bytes remaining - 16 */
                    195:        stmdb   sp!, {r4, r5}           /* borrow r4 and r5 */
                    196:        /* NOTE(review): the lsr/lsl pairing looks little-endian only - confirm */
                    197: Lmemcpy_fsrcul1loop16:
                    198:        mov     r3, lr, lsr #8          /* top 24 bits of the previous word */
                    199:        ldmia   r1!, {r4, r5, r12, lr}
                    200:        orr     r3, r3, r4, lsl #24     /* low 8 bits of the next word go on top */
                    201:        mov     r4, r4, lsr #8
                    202:        orr     r4, r4, r5, lsl #24
                    203:        mov     r5, r5, lsr #8
                    204:        orr     r5, r5, r12, lsl #24
                    205:        mov     r12, r12, lsr #8
                    206:        orr     r12, r12, lr, lsl #24   /* lr is carried into the next pass */
                    207:        stmia   r0!, {r3-r5, r12}       /* store 16 realigned bytes */
                    208:        subs    r2, r2, #0x10
                    209:        bge     Lmemcpy_fsrcul1loop16
                    210:        ldmia   sp!, {r4, r5}           /* return r4 and r5 */
                    211:        adds    r2, r2, #0x0c           /* back to r2 = bytes remaining - 4 */
                    212:        blt     Lmemcpy_fsrcul1l4
                    213:
                    214: Lmemcpy_fsrcul1loop4:
                    215:        mov     r12, lr, lsr #8         /* same merge, one word at a time */
                    216:        ldr     lr, [r1], #4
                    217:        orr     r12, r12, lr, lsl #24
                    218:        str     r12, [r0], #4
                    219:        subs    r2, r2, #4
                    220:        bge     Lmemcpy_fsrcul1loop4
                    221:
                    222: Lmemcpy_fsrcul1l4:
                    223:        sub     r1, r1, #3              /* 3 bytes of lr are still uncopied */
                    224:        b       Lmemcpy_fl4
                    225:
                    226: Lmemcpy_fsrcul2:                /* src & 3 == 2; lr = first source word */
                    227:        cmp     r2, #0x0c               /* r2 = bytes remaining - 4 */
                    228:        blt     Lmemcpy_fsrcul2loop4    /* fewer than 16 bytes remain */
                    229:        sub     r2, r2, #0x0c           /* r2 = bytes remaining - 16 */
                    230:        stmdb   sp!, {r4, r5}           /* borrow r4 and r5 */
                    231:
                    232: Lmemcpy_fsrcul2loop16:
                    233:        mov     r3, lr, lsr #16         /* top 16 bits of the previous word */
                    234:        ldmia   r1!, {r4, r5, r12, lr}
                    235:        orr     r3, r3, r4, lsl #16     /* low 16 bits of the next word go on top */
                    236:        mov     r4, r4, lsr #16
                    237:        orr     r4, r4, r5, lsl #16
                    238:        mov     r5, r5, lsr #16
                    239:        orr     r5, r5, r12, lsl #16
                    240:        mov     r12, r12, lsr #16
                    241:        orr     r12, r12, lr, lsl #16   /* lr is carried into the next pass */
                    242:        stmia   r0!, {r3-r5, r12}       /* store 16 realigned bytes */
                    243:        subs    r2, r2, #0x10
                    244:        bge     Lmemcpy_fsrcul2loop16
                    245:        ldmia   sp!, {r4, r5}           /* return r4 and r5 */
                    246:        adds    r2, r2, #0x0c           /* back to r2 = bytes remaining - 4 */
                    247:        blt     Lmemcpy_fsrcul2l4
                    248:
                    249: Lmemcpy_fsrcul2loop4:
                    250:        mov     r12, lr, lsr #16        /* same merge, one word at a time */
                    251:        ldr     lr, [r1], #4
                    252:        orr     r12, r12, lr, lsl #16
                    253:        str     r12, [r0], #4
                    254:        subs    r2, r2, #4
                    255:        bge     Lmemcpy_fsrcul2loop4
                    256:
                    257: Lmemcpy_fsrcul2l4:
                    258:        sub     r1, r1, #2              /* 2 bytes of lr are still uncopied */
                    259:        b       Lmemcpy_fl4
                    260:
                    261: Lmemcpy_fsrcul3:                /* src & 3 == 3; lr = first source word */
                    262:        cmp     r2, #0x0c               /* r2 = bytes remaining - 4 */
                    263:        blt     Lmemcpy_fsrcul3loop4    /* fewer than 16 bytes remain */
                    264:        sub     r2, r2, #0x0c           /* r2 = bytes remaining - 16 */
                    265:        stmdb   sp!, {r4, r5}           /* borrow r4 and r5 */
                    266:
                    267: Lmemcpy_fsrcul3loop16:
                    268:        mov     r3, lr, lsr #24         /* top 8 bits of the previous word */
                    269:        ldmia   r1!, {r4, r5, r12, lr}
                    270:        orr     r3, r3, r4, lsl #8      /* low 24 bits of the next word go on top */
                    271:        mov     r4, r4, lsr #24
                    272:        orr     r4, r4, r5, lsl #8
                    273:        mov     r5, r5, lsr #24
                    274:        orr     r5, r5, r12, lsl #8
                    275:        mov     r12, r12, lsr #24
                    276:        orr     r12, r12, lr, lsl #8    /* lr is carried into the next pass */
                    277:        stmia   r0!, {r3-r5, r12}       /* store 16 realigned bytes */
                    278:        subs    r2, r2, #0x10
                    279:        bge     Lmemcpy_fsrcul3loop16
                    280:        ldmia   sp!, {r4, r5}           /* return r4 and r5 */
                    281:        adds    r2, r2, #0x0c           /* back to r2 = bytes remaining - 4 */
                    282:        blt     Lmemcpy_fsrcul3l4
                    283:
                    284: Lmemcpy_fsrcul3loop4:
                    285:        mov     r12, lr, lsr #24        /* same merge, one word at a time */
                    286:        ldr     lr, [r1], #4
                    287:        orr     r12, r12, lr, lsl #8
                    288:        str     r12, [r0], #4
                    289:        subs    r2, r2, #4
                    290:        bge     Lmemcpy_fsrcul3loop4
                    291:
                    292: Lmemcpy_fsrcul3l4:
                    293:        sub     r1, r1, #1              /* 1 byte of lr is still uncopied */
                    294:        b       Lmemcpy_fl4
                    295:
                    296: Lmemcpy_backwards:              /* src < dest: copy from the top down */
                    297:        add     r1, r1, r2              /* r1 = src + len (one past the end) */
                    298:        add     r0, r0, r2              /* r0 = dest + len (one past the end) */
                    299:        subs    r2, r2, #4              /* from here on r2 = bytes remaining - 4 */
                    300:        blt     Lmemcpy_bl4             /* less than 4 bytes */
                    301:        ands    r12, r0, #3             /* r12 = dest end & 3 */
                    302:        bne     Lmemcpy_bdestul         /* oh unaligned destination addr */
                    303:        ands    r12, r1, #3             /* r12 = src end & 3 */
                    304:        bne     Lmemcpy_bsrcul          /* oh unaligned source addr */
                    305:
                    306: Lmemcpy_bt8:
                    307:        /* We have aligned source and destination; r2 = bytes remaining - 4 */
                    308:        subs    r2, r2, #8
                    309:        blt     Lmemcpy_bl12            /* less than 12 bytes (4 from above) */
                    310:        stmdb   sp!, {r4}               /* borrow r4; popped again after bl32 */
                    311:        subs    r2, r2, #0x14           /* less than 32 bytes (12 from above) */
                    312:        blt     Lmemcpy_bl32
                    313:
                    314:        /* blat 32 bytes at a time; r2 = bytes remaining - 32 in this loop */
                    315:        /* XXX for really big copies perhaps we should use more registers */
                    316: Lmemcpy_bloop32:
                    317:        ldmdb   r1!, {r3, r4, r12, lr}  /* lr is free: it was stacked at entry */
                    318:        stmdb   r0!, {r3, r4, r12, lr}
                    319:        ldmdb   r1!, {r3, r4, r12, lr}
                    320:        stmdb   r0!, {r3, r4, r12, lr}
                    321:        subs    r2, r2, #0x20
                    322:        bge     Lmemcpy_bloop32         /* loop while 32 or more bytes remain */
                    323:
                    324: Lmemcpy_bl32:
                    325:        cmn     r2, #0x10               /* ge if at least 16 bytes remain */
                    326:        ldmgedb r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
                    327:        stmgedb r0!, {r3, r4, r12, lr}
                    328:        subge   r2, r2, #0x10
                    329:        adds    r2, r2, #0x14           /* r2 = bytes remaining - 12 */
                    330:        ldmgedb r1!, {r3, r12, lr}      /* blat a remaining 12 bytes */
                    331:        stmgedb r0!, {r3, r12, lr}
                    332:        subge   r2, r2, #0x0c
                    333:        ldmia   sp!, {r4}               /* return r4 */
                    334:
                    335: Lmemcpy_bl12:
                    336:        adds    r2, r2, #8              /* r2 = bytes remaining - 4 */
                    337:        blt     Lmemcpy_bl4
                    338:        subs    r2, r2, #4              /* lt: 4-7 bytes remain, ge: 8-11 remain */
                    339:        ldrlt   r3, [r1, #-4]!          /* lt: copy one word */
                    340:        strlt   r3, [r0, #-4]!
                    341:        ldmgedb r1!, {r3, r12}          /* ge: copy two words */
                    342:        stmgedb r0!, {r3, r12}
                    343:        subge   r2, r2, #4
                    344:
                    345: Lmemcpy_bl4:
                    346:        /* less than 4 bytes to go; on entry r2 = bytes remaining - 4 */
                    347:        adds    r2, r2, #4              /* r2 = bytes remaining (0-3) */
                    348: #ifdef __APCS_26__
                    349:        ldmeqia sp!, {r0, pc}^
                    350: #else
                    351:        ldmeqia sp!, {r0, pc}
                    352: #endif
                    353:
                    354:        /* copy the crud byte at a time */
                    355:        cmp     r2, #2
                    356:        ldrb    r3, [r1, #-1]!          /* always: at least one byte is left */
                    357:        strb    r3, [r0, #-1]!
                    358:        ldrgeb  r3, [r1, #-1]!          /* ge: two or three bytes were left */
                    359:        strgeb  r3, [r0, #-1]!
                    360:        ldrgtb  r3, [r1, #-1]!          /* gt: three bytes were left */
                    361:        strgtb  r3, [r0, #-1]!
                    362: #ifdef __APCS_26__
                    363:        ldmia   sp!, {r0, pc}^          /* pop saved dest into r0 and return */
                    364: #else
                    365:        ldmia   sp!, {r0, pc}           /* pop saved dest into r0 and return */
                    366: #endif
                    367:
                    368:        /* erg - unaligned destination; on entry r12 = dest end & 3 (1-3) */
                    369: Lmemcpy_bdestul:
                    370:        cmp     r12, #2                 /* r12 = bytes to copy to align dest end */
                    371:
                    372:        /* align destination with byte copies */
                    373:        ldrb    r3, [r1, #-1]!          /* always: at least one byte needed */
                    374:        strb    r3, [r0, #-1]!
                    375:        ldrgeb  r3, [r1, #-1]!          /* ge: two or three bytes needed */
                    376:        strgeb  r3, [r0, #-1]!
                    377:        ldrgtb  r3, [r1, #-1]!          /* gt: three bytes needed */
                    378:        strgtb  r3, [r0, #-1]!
                    379:        subs    r2, r2, r12             /* account for the bytes just copied */
                    380:        blt     Lmemcpy_bl4             /* less than 4 bytes to go */
                    381:        ands    r12, r1, #3             /* r12 = src end & 3 for the code below */
                    382:        beq     Lmemcpy_bt8             /* we have an aligned source */
                    383:
                    384:        /* erg - unaligned source; dest is word aligned, r12 = src end & 3 (1-3) */
                    385:        /* This is where it gets nasty ... */
                    386: Lmemcpy_bsrcul:
                    387:        bic     r1, r1, #3              /* round src end down to a word boundary */
                    388:        ldr     r3, [r1, #0]            /* r3 = word holding the last source bytes */
                    389:        cmp     r12, #2
                    390:        blt     Lmemcpy_bsrcul1         /* src end & 3 == 1 */
                    391:        beq     Lmemcpy_bsrcul2         /* src end & 3 == 2 */
                    392:        cmp     r2, #0x0c               /* src end & 3 == 3; r2 = remaining - 4 */
                    393:        blt     Lmemcpy_bsrcul3loop4    /* fewer than 16 bytes remain */
                    394:        sub     r2, r2, #0x0c           /* r2 = bytes remaining - 16 */
                    395:        stmdb   sp!, {r4, r5}           /* borrow r4 and r5 */
                    396:        /* NOTE(review): the lsl/lsr pairing looks little-endian only - confirm */
                    397: Lmemcpy_bsrcul3loop16:
                    398:        mov     lr, r3, lsl #8          /* low 24 bits of the higher word move up */
                    399:        ldmdb   r1!, {r3-r5, r12}
                    400:        orr     lr, lr, r12, lsr #24    /* top 8 bits of the next lower word */
                    401:        mov     r12, r12, lsl #8
                    402:        orr     r12, r12, r5, lsr #24
                    403:        mov     r5, r5, lsl #8
                    404:        orr     r5, r5, r4, lsr #24
                    405:        mov     r4, r4, lsl #8
                    406:        orr     r4, r4, r3, lsr #24     /* r3 is carried into the next pass */
                    407:        stmdb   r0!, {r4, r5, r12, lr}  /* store 16 realigned bytes */
                    408:        subs    r2, r2, #0x10
                    409:        bge     Lmemcpy_bsrcul3loop16
                    410:        ldmia   sp!, {r4, r5}           /* return r4 and r5 */
                    411:        adds    r2, r2, #0x0c           /* back to r2 = bytes remaining - 4 */
                    412:        blt     Lmemcpy_bsrcul3l4
                    413:
                    414: Lmemcpy_bsrcul3loop4:
                    415:        mov     r12, r3, lsl #8         /* same merge, one word at a time */
                    416:        ldr     r3, [r1, #-4]!
                    417:        orr     r12, r12, r3, lsr #24
                    418:        str     r12, [r0, #-4]!
                    419:        subs    r2, r2, #4
                    420:        bge     Lmemcpy_bsrcul3loop4
                    421:
                    422: Lmemcpy_bsrcul3l4:
                    423:        add     r1, r1, #3              /* 3 bytes of r3 are still uncopied */
                    424:        b       Lmemcpy_bl4
                    425:
                    426: Lmemcpy_bsrcul2:                /* src end & 3 == 2; r3 = last source word */
                    427:        cmp     r2, #0x0c               /* r2 = bytes remaining - 4 */
                    428:        blt     Lmemcpy_bsrcul2loop4    /* fewer than 16 bytes remain */
                    429:        sub     r2, r2, #0x0c           /* r2 = bytes remaining - 16 */
                    430:        stmdb   sp!, {r4, r5}           /* borrow r4 and r5 */
                    431:
                    432: Lmemcpy_bsrcul2loop16:
                    433:        mov     lr, r3, lsl #16         /* low 16 bits of the higher word move up */
                    434:        ldmdb   r1!, {r3-r5, r12}
                    435:        orr     lr, lr, r12, lsr #16    /* top 16 bits of the next lower word */
                    436:        mov     r12, r12, lsl #16
                    437:        orr     r12, r12, r5, lsr #16
                    438:        mov     r5, r5, lsl #16
                    439:        orr     r5, r5, r4, lsr #16
                    440:        mov     r4, r4, lsl #16
                    441:        orr     r4, r4, r3, lsr #16     /* r3 is carried into the next pass */
                    442:        stmdb   r0!, {r4, r5, r12, lr}  /* store 16 realigned bytes */
                    443:        subs    r2, r2, #0x10
                    444:        bge     Lmemcpy_bsrcul2loop16
                    445:        ldmia   sp!, {r4, r5}           /* return r4 and r5 */
                    446:        adds    r2, r2, #0x0c           /* back to r2 = bytes remaining - 4 */
                    447:        blt     Lmemcpy_bsrcul2l4
                    448:
                    449: Lmemcpy_bsrcul2loop4:
                    450:        mov     r12, r3, lsl #16        /* same merge, one word at a time */
                    451:        ldr     r3, [r1, #-4]!
                    452:        orr     r12, r12, r3, lsr #16
                    453:        str     r12, [r0, #-4]!
                    454:        subs    r2, r2, #4
                    455:        bge     Lmemcpy_bsrcul2loop4
                    456:
                    457: Lmemcpy_bsrcul2l4:
                    458:        add     r1, r1, #2              /* 2 bytes of r3 are still uncopied */
                    459:        b       Lmemcpy_bl4
                    460:
                    461: Lmemcpy_bsrcul1:                /* src end & 3 == 1; r3 = last source word */
                    462:        cmp     r2, #0x0c               /* r2 = bytes remaining - 4 */
                    463:        blt     Lmemcpy_bsrcul1loop4    /* fewer than 16 bytes remain */
                    464:        sub     r2, r2, #0x0c           /* r2 = bytes remaining - 16 */
                    465:        stmdb   sp!, {r4, r5}           /* borrow r4 and r5 */
                    466:
                    467: Lmemcpy_bsrcul1loop16:
                    468:        mov     lr, r3, lsl #24         /* low 8 bits of the higher word move up */
                    469:        ldmdb   r1!, {r3-r5, r12}
                    470:        orr     lr, lr, r12, lsr #8     /* top 24 bits of the next lower word */
                    471:        mov     r12, r12, lsl #24
                    472:        orr     r12, r12, r5, lsr #8
                    473:        mov     r5, r5, lsl #24
                    474:        orr     r5, r5, r4, lsr #8
                    475:        mov     r4, r4, lsl #24
                    476:        orr     r4, r4, r3, lsr #8      /* r3 is carried into the next pass */
                    477:        stmdb   r0!, {r4, r5, r12, lr}  /* store 16 realigned bytes */
                    478:        subs    r2, r2, #0x10
                    479:        bge     Lmemcpy_bsrcul1loop16
                    480:        ldmia   sp!, {r4, r5}           /* return r4 and r5 */
                    481:        adds    r2, r2, #0x0c           /* back to r2 = bytes remaining - 4 */
                    482:        blt     Lmemcpy_bsrcul1l4
                    483:
                    484: Lmemcpy_bsrcul1loop4:
                    485:        mov     r12, r3, lsl #24        /* same merge, one word at a time */
                    486:        ldr     r3, [r1, #-4]!
                    487:        orr     r12, r12, r3, lsr #8
                    488:        str     r12, [r0, #-4]!
                    489:        subs    r2, r2, #4
                    490:        bge     Lmemcpy_bsrcul1loop4
                    491:
                    492: Lmemcpy_bsrcul1l4:
                    493:        add     r1, r1, #1              /* 1 byte of r3 is still uncopied */
                    494:        b       Lmemcpy_bl4
                    495:
CVSweb