diff -uNrp glibc.old/sysdeps/arm/memcpy.S glibc-2.3.2/sysdeps/arm/memcpy.S
--- glibc.old/sysdeps/arm/memcpy.S	1969-12-31 19:00:00.000000000 -0500
+++ glibc-2.3.2/sysdeps/arm/memcpy.S	2004-03-09 11:11:52.000000000 -0500
@@ -0,0 +1,1715 @@
+/*
+ * from linux/arch/arm/lib/memcpy.S
+ *
+ * Copyright (C) 1999-2003 Intel Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * ASM optimised string functions
+ */
+#include	/* NOTE(review): header name missing in this copy, presumably <sysdep.h> (source of the ENTRY macro) - confirm */
+#include	/* NOTE(review): header name missing in this copy - restore from the original patch */
+
+#define PLD(code...)	code
+	.text
+/* memmove: overlap-safe copy that returns to; memcpy further down takes the same arguments.
+ * Prototype: void *memmove(void *to, const void *from, unsigned long n);
+ */
+ENTRY(memmove)
+
+@ A forward copy is safe unless dst lies inside [src, src+n): branch to 59 (memcpy) in that case
+	mov	r3, r0			@ r3 = dst, r0 is reused below
+	cmp	r1, r3
+	bcs	59f			@ src >= dst
+	add	r0, r2, r1		@ r0 = src + n (one past the source end)
+	cmp	r3, r0
+	bcs	59f			@ dst >= src + n
+
+	stmfd	sp!, {r3, r4-r11, r14}	@ r3 (dst) fills the slot that every exit pops into r0
+	add	r1, r2, r3		@ r1 = dst + n, the copy runs from the top down
+	cmp	r2, #16
+	bcc	10f			@ fewer than 16 bytes
+	orr	r4, r0, r1
+	ands	r3, r4, #3		@ is either end pointer not word aligned?
+	bne	10f
+@ Both end pointers are word aligned and at least 16 bytes remain: use
+@ unrolled word loops that mirror memcpy, but descending.  Short or
+@ unaligned requests were sent to label 10 above.  This should be rare.
+
+	cmp	r2, #64
+	bcc	2f
+@ Copy 16 words (64 bytes) per iteration while at least 64 bytes remain.
+1:
+	subs	r2, r2, #64
+	PLD(	pld	[r0, #-96]	)
+	PLD(	pld	[r0, #-128]	)
+	ldr	r5, [r0, #-4]		@ each group loads four words before storing them
+	ldr	r14, [r0, #-8]
+	ldr	r6, [r0, #-12]
+	ldr	r7, [r0, #-16]
+	str	r5, [r1, #-4]
+	str	r14, [r1, #-8]
+	str	r6, [r1, #-12]
+	str	r7, [r1, #-16]
+
+	ldr	r5, [r0, #-20]
+	ldr	r6, [r0, #-24]
+	ldr	r7, [r0, #-28]
+	ldr	r8, [r0, #-32]
+	str	r5, [r1, #-20]
+	str	r6, [r1, #-24]
+	str	r7, [r1, #-28]
+	str	r8, [r1, #-32]
+
+	ldr	r5, [r0, #-36]
+	ldr	r6, [r0, #-40]
+	ldr	r7, [r0, #-44]
+	ldr	r8, [r0, #-48]
+	str	r5, [r1, #-36]
+	str	r6, [r1, #-40]
+	str	r7, [r1, #-44]
+	str	r8, [r1, #-48]
+
+	ldr	r5, [r0, #-52]
+	ldr	r6, [r0, #-56]
+	ldr	r7, [r0, #-60]
+	ldr	r3, [r0, #-64]!		@ writeback: source pointer moves down 64 bytes
+	str	r5, [r1, #-52]
+	str	r6, [r1, #-56]
+	str	r7, [r1, #-60]
+	str	r3, [r1, #-64]!		@ writeback: destination pointer moves down 64 bytes
+ cmp r2, #64 + bge 1b +2: + cmp r2, #32 + bcc 4f +@ Copy 8X long words at a time if possible. +3: + PLD( pld [r0, #-64] ) + ldr r5, [r0, #-4] + ldr r6, [r0, #-8] + ldr r7, [r0, #-12] + ldr r8, [r0, #-16] + str r5, [r1, #-4] + str r6, [r1, #-8] + str r7, [r1, #-12] + str r8, [r1, #-16] + + ldr r5, [r0, #-20] + ldr r6, [r0, #-24] + ldr r14, [r0, #-28] + ldr r7, [r0, #-32] + str r5, [r1, #-20] + str r6, [r1, #-24] + str r14, [r1, #-28] + str r7, [r1, #-32] + + subs r2, r2, #32 + cmp r2, #32 + sub r0, r0, #32 + sub r1, r1, #32 + bge 3b +4: + cmp r2, #16 + bcc 6f +5: + ldr r5, [r0, #-4] + ldr r14, [r0, #-8] + ldr r6, [r0, #-12] + ldr r7, [r0, #-16] + + str r5, [r1, #-4] + str r14, [r1, #-8] + str r6, [r1, #-12] + str r7, [r1, #-16] + + sub r0, r0, #16 + sub r1, r1, #16 + subs r2, r2, #16 + cmp r2, #16 + bge 5b +6: + cmp r2, #4 + bcc 10f + sub r3, r2, #4 + cmp r3, #24 + bcc 8f +7: + ldr r5, [r0, #-4] + ldr r6, [r0, #-8] + ldr r7, [r0, #-12] + ldr r14, [r0, #-16] + ldr r8, [r0, #-20] + + str r5, [r1, #-4] + str r6, [r1, #-8] + str r7, [r1, #-12] + str r14, [r1, #-16] + str r8, [r1, #-20] + + subs r2, r2, #20 + sub r0, r0, #20 + sub r1, r1, #20 + cmp r2, #20 + bge 7b +8: + mov r4, r1 + mov r3, r0 +9: + sub r3, r3, #4 + ldr r5, [r3, #0] + sub r0, r0, #4 + sub r4, r4, #4 + sub r1, r1, #4 + str r5, [r4, #0] + subs r2, r2, #4 + cmp r2, #4 + bge 9b + + +@ If unaligned access do optimized unrolled byte copier +@ Pick up any residual with a byte copier. + +10: + sub r4, r2, #1 + cmp r2, #0 + beq 58f + cmp r4, #8 + blt 56f + + ands r5, r1, #3 + beq 11f + + ldrb r6, [r0, #-1]! + cmp r5, #2 + strb r6, [r1, #-1]! + ldrgeb r7, [r0, #-1]! @ >= 2 == at least 2 bytes + ldrgtb r8, [r0, #-1]! @ > 2 == 3 bytes unaligned + sub r4, r4, r5 + strgeb r7, [r1, #-1]! + strgtb r8, [r1, #-1]! 
+11: + tst r0, #1 + bne 32f + tst r0, #2 + bne 22f + sub r0, r0, #4 + sub r1, r1, #4 + cmp r4, #64 + blt 13f + add r0, r0, #4 + add r1, r1, #4 +12: + PLD( pld [r0, #-96] ) + PLD( pld [r0, #-128] ) + ldmdb r0!, {r5-r12} + stmdb r1!, {r5-r12} + subs r4, r4, #64 + ldmdb r0!, {r5-r12} + stmdb r1!, {r5-r12} + cmp r4, #64 + bge 12b + sub r0, r0, #4 + sub r1, r1, #4 + +13: + cmp r4, #32 + blt 15f + +14: + PLD( pld [r0, #-64] ) + subs r4, r4, #32 + ldr r6, [r0], #-4 + ldr r7, [r0], #-4 + ldr r8, [r0], #-4 + ldr r14, [r0], #-4 + + str r6, [r1], #-4 + str r7, [r1], #-4 + str r8, [r1], #-4 + str r14, [r1], #-4 + + ldr r6, [r0], #-4 + ldr r7, [r0], #-4 + ldr r8, [r0], #-4 + ldr r14, [r0], #-4 + + str r6, [r1], #-4 + str r7, [r1], #-4 + str r8, [r1], #-4 + str r14, [r1], #-4 + cmp r4, #32 + bge 14b +15: + cmp r4, #16 + blt 17f +16: + subs r4, r4, #16 + ldr r6, [r0], #-4 + ldr r7, [r0], #-4 + ldr r8, [r0], #-4 + ldr r14, [r0], #-4 + str r6, [r1], #-4 + str r7, [r1], #-4 + str r8, [r1], #-4 + str r14, [r1], #-4 + cmp r4, #16 + bge 16b +17: + cmp r4, #8 + blt 20f +18: + subs r4, r4, #8 + ldr r6, [r0], #-4 + ldr r7, [r0], #-4 + str r6, [r1], #-4 + str r7, [r1], #-4 + cmp r4, #8 + bge 18b + + cmp r4, #4 + blt 20f +19: + subs r4, r4, #4 + ldr r6, [r0], #-4 + str r6, [r1], #-4 + cmp r4, #4 + bge 19b +20: + add r0, r0, #3 + add r1, r1, #3 +21: + ldrb r5, [r0], #-1 + subs r4, r4, #1 + strb r5, [r1], #-1 + bpl 21b + + mov r0, r1 + ldmfd sp!, {r0, r4-r11, pc} + +22: + ldrh r14, [r0, #-2]! 
+ sub r4, r4, #2 + mov r14, r14, LSL #16 + cmp r4, #64 + blt 24f +23: + PLD( pld [r0, #-96] ) + PLD( pld [r0, #-128] ) + ldmdb r0!, {r5-r12} + orr r14, r14, r12, LSR #16 + mov r12, r12, LSL #16 + orr r12, r12, r11, LSR #16 + mov r11, r11, LSL #16 + orr r11, r11, r10, LSR #16 + mov r10, r10, LSL #16 + orr r10, r10, r9, LSR #16 + mov r9, r9, LSL #16 + orr r9, r9, r8, LSR #16 + mov r8, r8, LSL #16 + orr r8, r8, r7, LSR #16 + mov r7, r7, LSL #16 + orr r7, r7, r6, LSR #16 + mov r6, r6, LSL #16 + orr r6, r6, r5, LSR #16 + stmdb r1!, {r6-r12, r14} + mov r14, r5, LSL #16 + subs r4, r4, #64 + ldmdb r0!, {r5-r12} + orr r14, r14, r12, LSR #16 + mov r12, r12, LSL #16 + orr r12, r12, r11, LSR #16 + mov r11, r11, LSL #16 + orr r11, r11, r10, LSR #16 + mov r10, r10, LSL #16 + orr r10, r10, r9, LSR #16 + mov r9, r9, LSL #16 + orr r9, r9, r8, LSR #16 + mov r8, r8, LSL #16 + orr r8, r8, r7, LSR #16 + mov r7, r7, LSL #16 + orr r7, r7, r6, LSR #16 + mov r6, r6, LSL #16 + orr r6, r6, r5, LSR #16 + stmdb r1!, {r6-r12, r14} + mov r14, r5, LSL #16 + cmp r4, #64 + bge 23b +24: + mov r5, r14 + cmp r4, #32 + blt 26f +25: + PLD( pld [r0, #-64] ) + subs r4, r4, #32 + ldr r6, [r0, #-4] + ldr r7, [r0, #-8] + ldr r8, [r0, #-12] + ldr r14, [r0, #-16] + orr r5, r5, r6, LSR #16 + mov r6, r6, LSL #16 + orr r6, r6, r7, LSR #16 + mov r7, r7, LSL #16 + orr r7, r7, r8, LSR #16 + mov r8, r8, LSL #16 + orr r8, r8, r14, LSR #16 + str r5, [r1, #-4] + str r6, [r1, #-8] + str r7, [r1, #-12] + str r8, [r1, #-16] + mov r5, r14, LSL #16 + ldr r6, [r0, #-20] + ldr r7, [r0, #-24] + ldr r8, [r0, #-28] + ldr r14, [r0, #-32]! + orr r5, r5, r6, LSR #16 + mov r6, r6, LSL #16 + orr r6, r6, r7, LSR #16 + mov r7, r7, LSL #16 + orr r7, r7, r8, LSR #16 + mov r8, r8, LSL #16 + orr r8, r8, r14, LSR #16 + str r5, [r1, #-20] + str r6, [r1, #-24] + str r7, [r1, #-28] + str r8, [r1, #-32]! 
+ mov r5, r14, LSL #16 + cmp r4, #32 + bge 25b +26: + cmp r4, #16 + blt 30f +27: + PLD( pld [r0, #-128] ) + PLD( pld [r0, #-64] ) + subs r4, r4, #16 + ldr r6, [r0, #-4] + ldr r7, [r0, #-8] + ldr r8, [r0, #-12] + ldr r14, [r0, #-16]! + orr r5, r5, r6, LSR #16 + mov r6, r6, LSL #16 + orr r6, r6, r7, LSR #16 + mov r7, r7, LSL #16 + orr r7, r7, r8, LSR #16 + mov r8, r8, LSL #16 + orr r8, r8, r14, LSR #16 + str r5, [r1, #-4] + str r6, [r1, #-8] + str r7, [r1, #-12] + str r8, [r1, #-16]! + mov r5, r14, LSL #16 + cmp r4, #16 + bge 27b + cmp r4, #8 + blt 30f +28: + PLD( pld [r0, #-128] ) + PLD( pld [r0, #-64] ) + subs r4, r4, #8 + ldr r8, [r0, #-4] + ldr r14, [r0, #-8]! + orr r5, r5, r8, LSR #16 + mov r8, r8, LSL #16 + orr r8, r8, r14, LSR #16 + str r5, [r1, #-4] + str r8, [r1, #-8]! + mov r5, r14, LSL #16 + cmp r4, #8 + bge 28b + cmp r4, #4 + blt 30f +29: + PLD( pld [r0, #-128] ) + PLD( pld [r0, #-64] ) + subs r4, r4, #4 + ldr r8, [r0, #-4]! + orr r5, r5, r8, LSR #16 + str r5, [r1, #-4]! + mov r5, r8, LSL #16 + cmp r4, #4 + bge 29b +30: + mov r5, r5, LSR #16 + sub r1, r1, #2 + sub r0, r0, #1 + strh r5, [r1], #-1 +31: + ldrb r5, [r0], #-1 + strb r5, [r1], #-1 + subs r4, r4, #1 + bpl 31b + + mov r0, r1 + + ldmfd sp!, {r0, r4-r11, pc} + +32: + TST r0, #2 + BNE 44f + ldrb r14, [r0, #-1]! 
+ sub r0, r0, #4 + sub r1, r1, #4 + sub r4, r4, #1 + mov r14, r14, LSL #24 + cmp r4, #64 + blt 34f + add r0, r0, #4 + add r1, r1, #4 +33: + PLD( pld [r0, #-96] ) + PLD( pld [r0, #-128] ) + ldmdb r0!, {r5-r12} + orr r14, r14, r12, LSR #8 + mov r12, r12, LSL #24 + orr r12, r12, r11, LSR #8 + mov r11, r11, LSL #24 + orr r11, r11, r10, LSR #8 + mov r10, r10, LSL #24 + orr r10, r10, r9, LSR #8 + mov r9, r9, LSL #24 + orr r9, r9, r8, LSR #8 + mov r8, r8, LSL #24 + orr r8, r8, r7, LSR #8 + mov r7, r7, LSL #24 + orr r7, r7, r6, LSR #8 + mov r6, r6, LSL #24 + orr r6, r6, r5, LSR #8 + stmdb r1!, {r6-r12, r14} + mov r14, r5, LSL #24 + subs r4, r4, #64 + ldmdb r0!, {r5-r12} + orr r14, r14, r12, LSR #8 + mov r12, r12, LSL #24 + orr r12, r12, r11, LSR #8 + mov r11, r11, LSL #24 + orr r11, r11, r10, LSR #8 + mov r10, r10, LSL #24 + orr r10, r10, r9, LSR #8 + mov r9, r9, LSL #24 + orr r9, r9, r8, LSR #8 + mov r8, r8, LSL #24 + orr r8, r8, r7, LSR #8 + mov r7, r7, LSL #24 + orr r7, r7, r6, LSR #8 + mov r6, r6, LSL #24 + orr r6, r6, r5, LSR #8 + stmdb r1!, {r6-r12, r14} + mov r14, r5, LSL #24 + cmp r4, #64 + bge 33b + sub r0, r0, #4 + sub r1, r1, #4 +34: + mov r5, r14 + cmp r4, #32 + blt 36f +35: + PLD( pld [r0, #-64] ) + subs r4, r4, #32 + ldr r6, [r0], #-4 + ldr r7, [r0], #-4 + ldr r8, [r0], #-4 + ldr r14, [r0], #-4 + orr r5, r5, r6, LSR #8 + mov r6, r6, LSL #24 + orr r6, r6, r7, LSR #8 + mov r7, r7, LSL #24 + orr r7, r7, r8, LSR #8 + mov r8, r8, LSL #24 + orr r8, r8, r14, LSR #8 + + str r5, [r1], #-4 + str r6, [r1], #-4 + str r7, [r1], #-4 + str r8, [r1], #-4 + + mov r5, r14, LSL #24 + ldr r6, [r0], #-4 + ldr r7, [r0], #-4 + ldr r8, [r0], #-4 + ldr r14, [r0], #-4 + + orr r5, r5, r6, LSR #8 + mov r6, r6, LSL #24 + orr r6, r6, r7, LSR #8 + mov r7, r7, LSL #24 + orr r7, r7, r8, LSR #8 + mov r8, r8, LSL #24 + orr r8, r8, r14, LSR #8 + + str r5, [r1], #-4 + str r6, [r1], #-4 + str r7, [r1], #-4 + str r8, [r1], #-4 + mov r5, r14, LSL #24 + cmp r4, #32 + bge 35b +36: + cmp r4, #16 + blt 
38f +37: + PLD( pld [r0, #-128] ) + PLD( pld [r0, #-64] ) + subs r4, r4, #16 + ldr r6, [r0], #-4 + ldr r7, [r0], #-4 + ldr r8, [r0], #-4 + ldr r14, [r0], #-4 + + orr r5, r5, r6, LSR #8 + mov r6, r6, LSL #24 + orr r6, r6, r7, LSR #8 + mov r7, r7, LSL #24 + orr r7, r7, r8, LSR #8 + mov r8, r8, LSL #24 + orr r8, r8, r14, LSR #8 + + str r5, [r1], #-4 + str r6, [r1], #-4 + str r7, [r1], #-4 + str r8, [r1], #-4 + mov r5, r14, LSL #24 + cmp r4, #16 + bge 37b +38: + cmp r4, #8 + blt 40f +39: + subs r4, r4, #8 + ldr r6, [r0], #-4 + ldr r7, [r0], #-4 + orr r5, r5, r6, LSR #8 + mov r6, r6, LSL #24 + orr r6, r6, r7, LSR #8 + str r5, [r1], #-4 + str r6, [r1], #-4 + mov r5, r7, LSL #24 + cmp r4, #8 + bge 39b +40: + cmp r4, #4 + blt 42f +41: + subs r4, r4, #4 + ldr r6, [r0], #-4 + orr r5, r5, r6, LSR #8 + str r5, [r1], #-4 + mov r5, r6, LSL #24 + cmp r4, #4 + bge 41b +42: + mov r5, r5, LSR #24 + add r1, r1, #3 + add r0, r0, #3 + strb r5, [r1], #-1 +43: + ldrb r5, [r0], #-1 + subs r4, r4, #1 + strb r5, [r1], #-1 + bpl 43b + + mov r0, r1 + ldmfd sp!, {r0, r4-r11, pc} +44: + ldrb r5, [r0, #-1]! + ldrh r6, [r0, #-2]! 
+ sub r0, r0, #4 + sub r1, r1, #4 + sub r4, r4, #3 + orr r5, r6, r5, LSL #16 + mov r14, r5, LSL #8 + cmp r4, #64 + blt 46f + add r0, r0, #4 + add r1, r1, #4 +45: + PLD( pld [r0, #-96] ) + PLD( pld [r0, #-128] ) + ldmdb r0!, {r5-r12} + orr r14, r14, r12, LSR #24 + mov r12, r12, LSL #8 + orr r12, r12, r11, LSR #24 + mov r11, r11, LSL #8 + orr r11, r11, r10, LSR #24 + mov r10, r10, LSL #8 + orr r10, r10, r9, LSR #24 + mov r9, r9, LSL #8 + orr r9, r9, r8, LSR #24 + mov r8, r8, LSL #8 + orr r8, r8, r7, LSR #24 + mov r7, r7, LSL #8 + orr r7, r7, r6, LSR #24 + mov r6, r6, LSL #8 + orr r6, r6, r5, LSR #24 + stmdb r1!, {r6-r12, r14} + mov r14, r5, LSL #8 + subs r4, r4, #64 + ldmdb r0!, {r5-r12} + orr r14, r14, r12, LSR #24 + mov r12, r12, LSL #8 + orr r12, r12, r11, LSR #24 + mov r11, r11, LSL #8 + orr r11, r11, r10, LSR #24 + mov r10, r10, LSL #8 + orr r10, r10, r9, LSR #24 + mov r9, r9, LSL #8 + orr r9, r9, r8, LSR #24 + mov r8, r8, LSL #8 + orr r8, r8, r7, LSR #24 + mov r7, r7, LSL #8 + orr r7, r7, r6, LSR #24 + mov r6, r6, LSL #8 + orr r6, r6, r5, LSR #24 + stmdb r1!, {r6-r12, r14} + mov r14, r5, LSL #8 + cmp r4, #64 + bge 45b + sub r0, r0, #4 + sub r1, r1, #4 +46: + mov r5, r14 + cmp r4, #32 + blt 48f +47: + PLD( pld [r0, #-64] ) + subs r4, r4, #32 + ldr r6, [r0], #-4 + ldr r7, [r0], #-4 + ldr r8, [r0], #-4 + ldr r14, [r0], #-4 + orr r5, r5, r6, LSR #24 + mov r6, r6, LSL #8 + orr r6, r6, r7, LSR #24 + mov r7, r7, LSL #8 + orr r7, r7, r8, LSR #24 + mov r8, r8, LSL #8 + orr r8, r8, r14, LSR #24 + str r5, [r1], #-4 + str r6, [r1], #-4 + str r7, [r1], #-4 + str r8, [r1], #-4 + mov r5, r14, LSL #8 + ldr r6, [r0], #-4 + ldr r7, [r0], #-4 + ldr r8, [r0], #-4 + ldr r14, [r0], #-4 + orr r5, r5, r6, LSR #24 + mov r6, r6, LSL #8 + orr r6, r6, r7, LSR #24 + mov r7, r7, LSL #8 + orr r7, r7, r8, LSR #24 + mov r8, r8, LSL #8 + orr r8, r8, r14, LSR #24 + str r5, [r1], #-4 + str r6, [r1], #-4 + str r7, [r1], #-4 + str r8, [r1], #-4 + mov r5, r14, LSL #8 + cmp r4, #32 + bge 47b +48: + 
cmp r4, #16 + blt 50f +49: + PLD( pld [r0, #-128] ) + PLD( pld [r0, #-64] ) + subs r4, r4, #16 + ldr r6, [r0], #-4 + ldr r7, [r0], #-4 + ldr r8, [r0], #-4 + ldr r14, [r0], #-4 + orr r5, r5, r6, LSR #24 + mov r6, r6, LSL #8 + orr r6, r6, r7, LSR #24 + mov r7, r7, LSL #8 + orr r7, r7, r8, LSR #24 + mov r8, r8, LSL #8 + orr r8, r8, r14, LSR #24 + str r5, [r1], #-4 + str r6, [r1], #-4 + str r7, [r1], #-4 + str r8, [r1], #-4 + mov r5, r14, LSL #8 + cmp r4, #16 + bge 49b +50: + cmp r4, #8 + blt 52f +51: + subs r4, r4, #8 + ldr r6, [r0], #-4 + ldr r7, [r0], #-4 + orr r5, r5, r6, LSR #24 + mov r6, r6, LSL #8 + orr r6, r6, r7, LSR #24 + str r5, [r1], #-4 + str r6, [r1], #-4 + mov r5, r7, LSL #8 + cmp r4, #8 + bge 51b +52: + cmp r4, #4 + blt 54f +53: + subs r4, r4, #4 + ldr r6, [r0], #-4 + orr r5, r5, r6, LSR #24 + str r5, [r1], #-4 + mov r5, r6, LSL #8 + cmp r4, #4 + bge 53b +54: + mov r6, r5, LSR #8 + add r1, r1, #3 + add r0, r0, #3 + mov r7, r5, LSR #24 + strb r7, [r1], #-1 + mov r5, r5, LSR #16 + strb r5, [r1], #-1 + strb r6, [r1], #-1 +55: + ldrb r5, [r0], #-1 + subs r4, r4, #1 + strb r5, [r1], #-1 + bpl 55b + + mov r0, r1 + ldmfd sp!, {r0, r4-r11, pc} +56: + sub r0, r0, #1 + sub r1, r1, #1 +57: + ldrb r5, [r0], #-1 + subs r4, r4, #1 + strb r5, [r1], #-1 + bpl 57b +58: + mov r0, r1 + ldmfd sp!, {r0, r4-r11, pc} +59: + mov r0, r3 + +ENTRY(memcpy) + stmfd sp!, {r0,r3-r11, lr} +1: + mov r5, r1 + and r5, r5, #~0x3 + + PLD( pld [r5, #0] ) + PLD( pld [r5, #0x20] ) + PLD( pld [r5, #0x40] ) + + cmp r2, #4 + bls 42f + + rsb r4, r0, #0 + ands r4, r4, #0x2 + ldrneb r5, [r1], #1 + ldrneb r6, [r1], #1 + subne r2, r2, #2 + strneb r5, [r0], #1 + strneb r6, [r0], #1 + + ands r4, r0, #0x1 + ldrneb r5, [r1], #1 + subne r2, r2, #1 + strneb r5, [r0], #1 + + and r3, r1, #3 + cmp r3, #3 + beq 34f + cmp r3, #2 + beq 24f + cmp r3, #1 + beq 14f + +@The source and destination are word aligned. We get an easy job. 
+2: + + and r4, r0, #0x1C + rsb r4, r4, #32 + + and r5, r2, #0x1C + cmp r4, r2 + movhi r4, r5 + + cmp r4, #0 + beq 5f + + rsb r3, r4, #32 + and r3, r3, #0x1C + sub r2, r2, r4 + + adr r12, 3f + add pc, r12, r3 + +3: + ldr r4, [r1], #4 + ldr r5, [r1], #4 + ldr r6, [r1], #4 + ldr r7, [r1], #4 + ldr r8, [r1], #4 + ldr r9, [r1], #4 + ldr r10,[r1], #4 + ldr r11,[r1], #4 + +@Now jump into the store table + adr r12, 4f + add pc, r12, r3 + +4: + str r4, [r0], #4 + str r5, [r0], #4 + str r6, [r0], #4 + str r7, [r0], #4 + str r8, [r0], #4 + str r9, [r0], #4 + str r10,[r0], #4 + str r11,[r0], #4 + +@We are now cache line aligned. +5: + cmp r2, #(32*3 + 32) + bmi 6f + + PLD( pld [r1, #0x60] ) + PLD( pld [r1, #0x80] ) + +@Here is the main loop that handles pipelining the loads + + ldmia r1!, {r4-r11} + stmia r0!, {r4-r11} + + ldmia r1!, {r4-r11} + stmia r0!, {r4-r11} + + sub r2, r2, #64 + + b 5b + +6: + cmp r2, #32*4 + bls 7f + PLD( pld [r1, #0x80] ) + +7: + +@Now we finish up the copy without any preloads. The data should have already +@been loaded into the caches +8: + cmp r2, #32 + bmi 9f + + ldmia r1!, {r4-r11} + stmia r0!, {r4-r11} + + sub r2, r2, #32 + b 8b + +9: + ands r3, r2, #0x1C + beq 42f + sub r2, r2, r3 + rsb r3, r3, #32 + + adr r12, 10f + add pc, r12, r3 + +10: + ldr r4, [r1], #4 + ldr r5, [r1], #4 + ldr r6, [r1], #4 + ldr r7, [r1], #4 + ldr r8, [r1], #4 + ldr r9, [r1], #4 + ldr r10,[r1], #4 + ldr r11,[r1], #4 + + adr r12, 11f + add pc, r12, r3 + +11: + str r4, [r0], #4 + str r5, [r0], #4 + str r6, [r0], #4 + str r7, [r0], #4 + str r8, [r0], #4 + str r9, [r0], #4 + str r10,[r0], #4 + str r11,[r0], #4 + + rsb r2, r2, #4 + adr r12, 12f + add pc, r12, r2, LSL #2 + +12: + ldrb r3, [r1], #1 + ldrb r4, [r1], #1 + ldrb r5, [r1], #1 + ldrb r6, [r1], #1 + + adr r12, 13f + add pc, r12, r2, LSL #2 + +13: + strb r3, [r0], #1 + strb r4, [r0], #1 + strb r5, [r0], #1 + strb r6, [r0], #1 + + ldmfd sp!, {r0,r3-r11, pc} + + +@The source and destination are not aligned. 
We're going to have +@to load and shift data from a temporary buffer. Stuff needs to be +@shifted to the right by 8 bits to align properly +14: + + and r3, r1, #~0x3 + ldr lr, [r3], #4 + mov lr, lr, LSR #8 + + and r4, r0, #0x1C + rsb r4, r4, #32 + + and r5, r2, #0x1C + cmp r4, r2 + movhi r4, r5 + + cmp r4, #0 + beq 16f + rsb r6, r4, #32 + and r6, r6, #0x1C + + sub r2, r2, r4 + + adr r12, 15f + add pc, r12, r6, LSL #2 + +15: + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + +@We are now cache line aligned. 
+16: + cmp r2, #(32*4 + 32) + bmi 17f + + PLD( pld [r3, #0x60] ) + PLD( pld [r3, #0x80] ) + + ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11} + + orr r1,lr, r4, LSL #24 + mov lr, r4, LSR #8 + + orr r4, lr, r5, LSL #24 + mov lr, r5, LSR #8 + + orr r5, lr, r6, LSL #24 + mov lr, r6, LSR #8 + + orr r6, lr, r7, LSL #24 + mov lr, r7, LSR #8 + + orr r7, lr, r8, LSL #24 + mov lr, r8, LSR #8 + + orr r8, lr, r9, LSL #24 + mov lr, r9, LSR #8 + + orr r9, lr, r10, LSL #24 + mov lr, r10, LSR #8 + + orr r10, lr, r11, LSL #24 + mov lr, r11, LSR #8 + + stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10} + + ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11} + + orr r1,lr, r4, LSL #24 + mov lr, r4, LSR #8 + + orr r4, lr, r5, LSL #24 + mov lr, r5, LSR #8 + + orr r5, lr, r6, LSL #24 + mov lr, r6, LSR #8 + + orr r6, lr, r7, LSL #24 + mov lr, r7, LSR #8 + + orr r7, lr, r8, LSL #24 + mov lr, r8, LSR #8 + + orr r8, lr, r9, LSL #24 + mov lr, r9, LSR #8 + + orr r9, lr, r10, LSL #24 + mov lr, r10, LSR #8 + + orr r10, lr, r11, LSL #24 + mov lr, r11, LSR #8 + + stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10} + + sub r2, r2, #64 + + b 16b + +17: + cmp r2, #32*4 + bls 18f + PLD( pld [r3, #0x80] ) + +18: + +19: + cmp r2, #32 + bmi 20f + + ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11} + + orr r1,lr, r4, LSL #24 + mov lr, r4, LSR #8 + + orr r4, lr, r5, LSL #24 + mov lr, r5, LSR #8 + + orr r5, lr, r6, LSL #24 + mov lr, r6, LSR #8 + + orr r6, lr, r7, LSL #24 + mov lr, r7, LSR #8 + + orr r7, lr, r8, LSL #24 + mov lr, r8, LSR #8 + + orr r8, lr, r9, LSL #24 + mov lr, r9, LSR #8 + + orr r9, lr, r10, LSL #24 + mov lr, r10, LSR #8 + + orr r10, lr, r11, LSL #24 + mov lr, r11, LSR #8 + + stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10} + + sub r2, r2, #32 + b 19b + +20: + + ands r6, r2, #0x1C + subeq r1, r3, #3 + beq 42f + sub r2, r2, r6 + rsb r6, r6, #32 + + adr r12, 21f + add pc, r12, r6, LSL #2 + +21: + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + + ldr r4, [r3], #4 + orr r12,lr, r4, 
LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #24 + str r12,[r0], #4 + mov lr, r4, LSR #8 + + sub r1, r3, #3 + + rsb r2, r2, #4 + adr r12, 22f + add pc, r12, r2, LSL #2 + +22: + ldrb r3, [r1], #1 + ldrb r4, [r1], #1 + ldrb r5, [r1], #1 + ldrb r6, [r1], #1 + + adr r12, 23f + add pc, r12, r2, LSL #2 + +23: + strb r3, [r0], #1 + strb r4, [r0], #1 + strb r5, [r0], #1 + strb r6, [r0], #1 + + ldmfd sp!, {r0,r3-r11, pc} + +@The source and destination are not aligned. We're going to have to load +@and shift data from a temporary buffer. 
Stuff needs to be shifted to the +@right by 16 bits to align properly +24: + and r3, r1, #~0x3 + ldr lr, [r3], #4 + mov lr, lr, LSR #16 + + and r4, r0, #0x1C + rsb r4, r4, #32 + + and r5, r2, #0x1C + cmp r4, r2 + movhi r4, r5 + + cmp r4, #0 + beq 26f + + rsb r6, r4, #32 + and r6, r6, #0x1C + + sub r2, r2, r4 + + adr r12, 25f + add pc, r12, r6, LSL #2 + +25: + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + +26: + cmp r2, #(32*4 + 32) + bmi 27f + + PLD( pld [r3, #0x60] ) + PLD( pld [r3, #0x80] ) + + ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11} + + orr r1,lr, r4, LSL #16 + mov lr, r4, LSR #16 + + orr r4, lr, r5, LSL #16 + mov lr, r5, LSR #16 + + orr r5, lr, r6, LSL #16 + mov lr, r6, LSR #16 + + orr r6, lr, r7, LSL #16 + mov lr, r7, LSR #16 + + orr r7, lr, r8, LSL #16 + mov lr, r8, LSR #16 + + orr r8, lr, r9, LSL #16 + mov lr, r9, LSR #16 + + orr r9, lr, r10, LSL #16 + mov lr, r10, LSR #16 + + orr r10, lr, r11, LSL #16 + mov lr, r11, LSR #16 + + stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10} + + ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11} + + orr r1,lr, r4, LSL #16 + mov lr, r4, LSR #16 + + orr r4, lr, r5, LSL #16 + mov lr, r5, LSR #16 + + orr r5, lr, r6, LSL #16 + mov lr, r6, LSR #16 + + orr r6, lr, r7, LSL #16 + mov lr, r7, LSR #16 + + orr r7, lr, r8, LSL #16 + mov lr, r8, LSR #16 + + orr r8, lr, r9, LSL #16 + mov lr, r9, 
LSR #16 + + orr r9, lr, r10, LSL #16 + mov lr, r10, LSR #16 + + orr r10, lr, r11, LSL #16 + mov lr, r11, LSR #16 + + stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10} + + sub r2, r2, #64 + b 26b + +27: + cmp r2, #32*4 + bls 28f + PLD( pld [r3, #0x80] ) + +28: + +29: + cmp r2, #32 + bmi 30f + + ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11} + + orr r1,lr, r4, LSL #16 + mov lr, r4, LSR #16 + + orr r4, lr, r5, LSL #16 + mov lr, r5, LSR #16 + + orr r5, lr, r6, LSL #16 + mov lr, r6, LSR #16 + + orr r6, lr, r7, LSL #16 + mov lr, r7, LSR #16 + + orr r7, lr, r8, LSL #16 + mov lr, r8, LSR #16 + + orr r8, lr, r9, LSL #16 + mov lr, r9, LSR #16 + + orr r9, lr, r10, LSL #16 + mov lr, r10, LSR #16 + + orr r10, lr, r11, LSL #16 + mov lr, r11, LSR #16 + + stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10} + + sub r2, r2, #32 + b 29b + +30: + ands r6, r2, #0x1C + subeq r1, r3, #2 + beq 42f + sub r2, r2, r6 + rsb r6, r6, #32 + + adr r12, 31f + add pc, r12, r6, LSL #2 + +31: + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #16 + str r12,[r0], #4 + mov lr, r4, LSR #16 + + sub r1, r3, #2 + + rsb r2, r2, #4 + adr r12, 32f + add pc, r12, r2, LSL #2 + +32: + ldrb r3, [r1], #1 + ldrb r4, [r1], #1 + ldrb r5, [r1], #1 + ldrb r6, [r1], #1 + + adr r12, 33f + add pc, r12, r2, LSL #2 + +33: + strb r3, [r0], #1 + strb r4, [r0], #1 + strb r5, [r0], #1 + strb r6, [r0], #1 + + ldmfd sp!, {r0,r3-r11, pc} + +@The 
source and destination are not aligned. We're going to have to load +@and shift data from a temporary buffer. Stuff needs to be shifted to the +@right by 24 bits to align properly +34: + + and r3, r1, #~0x3 + ldr lr, [r3], #4 + mov lr, lr, LSR #24 + + and r4, r0, #0x1C + rsb r4, r4, #32 + and r5, r2, #0x1C + cmp r4, r2 + movhi r4, r5 + + cmp r4, #0 + beq 36f + + rsb r6, r4, #32 + and r6, r6, #0x1C + sub r2, r2, r4 + + adr r12, 35f + add pc, r12, r6, LSL #2 + +35: + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + +36: + cmp r2, #(32*4 + 32) + bmi 37f + + PLD( pld [r3, #0x60] ) + PLD( pld [r3, #0x80] ) + + ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11} + + orr r1,lr, r4, LSL #8 + mov lr, r4, LSR #24 + + orr r4, lr, r5, LSL #8 + mov lr, r5, LSR #24 + + orr r5, lr, r6, LSL #8 + mov lr, r6, LSR #24 + + orr r6, lr, r7, LSL #8 + mov lr, r7, LSR #24 + + orr r7, lr, r8, LSL #8 + mov lr, r8, LSR #24 + + orr r8, lr, r9, LSL #8 + mov lr, r9, LSR #24 + + orr r9, lr, r10, LSL #8 + mov lr, r10, LSR #24 + + orr r10, lr, r11, LSL #8 + mov lr, r11, LSR #24 + + stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10} + + ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11} + + orr r1,lr, r4, LSL #8 + mov lr, r4, LSR #24 + + orr r4, lr, r5, LSL #8 + mov lr, r5, LSR #24 + + orr r5, lr, r6, LSL #8 + mov lr, r6, LSR #24 + + orr r6, lr, r7, LSL #8 + mov lr, r7, LSR #24 + + 
orr r7, lr, r8, LSL #8 + mov lr, r8, LSR #24 + + orr r8, lr, r9, LSL #8 + mov lr, r9, LSR #24 + + orr r9, lr, r10, LSL #8 + mov lr, r10, LSR #24 + + orr r10, lr, r11, LSL #8 + mov lr, r11, LSR #24 + + stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10} + + sub r2, r2, #64 + b 36b + +37: + cmp r2, #32*4 + bls 38f + PLD( pld [r3, #0x80] ) + +38: + +39: + cmp r2, #32 + bmi 40f + + ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11} + + orr r1,lr, r4, LSL #8 + mov lr, r4, LSR #24 + + orr r4, lr, r5, LSL #8 + mov lr, r5, LSR #24 + + orr r5, lr, r6, LSL #8 + mov lr, r6, LSR #24 + + orr r6, lr, r7, LSL #8 + mov lr, r7, LSR #24 + + orr r7, lr, r8, LSL #8 + mov lr, r8, LSR #24 + + orr r8, lr, r9, LSL #8 + mov lr, r9, LSR #24 + + orr r9, lr, r10, LSL #8 + mov lr, r10, LSR #24 + + orr r10, lr, r11, LSL #8 + mov lr, r11, LSR #24 + + stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10} + + sub r2, r2, #32 + b 39b + +40: + + ands r6, r2, #0x1C + subeq r1, r3, #1 + beq 42f + sub r2, r2, r6 + rsb r6, r6, #32 + + adr r12, 41f + add pc, r12, r6, LSL #2 + +41: + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + + ldr r4, [r3], #4 + orr r12,lr, r4, LSL #8 + str r12,[r0], #4 + mov lr, r4, LSR #24 + sub r1, r3, #1 + +42: + rsb r2, r2, #4 + adr r12, 43f + add pc, r12, r2, LSL #2 + +43: + ldrb r3, [r1], #1 + ldrb r4, [r1], #1 + ldrb r5, [r1], #1 + ldrb r6, [r1], #1 + + adr r12, 44f + add pc, r12, r2, LSL #2 + +44: + strb r3, [r0], #1 + strb r4, [r0], #1 + strb 
r5, [r0], #1 + strb r6, [r0], #1 + + ldmfd sp!, {r0,r3-r11, pc} + diff -uNrp glibc.old/sysdeps/arm/memmove.S glibc-2.3.2/sysdeps/arm/memmove.S --- glibc.old/sysdeps/arm/memmove.S 1969-12-31 19:00:00.000000000 -0500 +++ glibc-2.3.2/sysdeps/arm/memmove.S 2004-03-09 14:37:53.000000000 -0500 @@ -0,0 +1 @@ +/* see memcpy.S */