From d65e214d666269d0bd20d88ba2bc83349810c668 Mon Sep 17 00:00:00 2001
From: Julius Werner
Date: Fri, 13 Dec 2013 12:59:57 -0800
Subject: [PATCH] arm: Update mem* functions to newer versions

The memcpy/memset/memmove assembly implementations have been taken
from U-Boot, which originally got them from Linux. It turns out that
they are actually not that bad, but they could use an update. This
patch pulls in the current Linux upstream versions of those files,
removing some old U-Boot cruft such as checking whether the two
pointers in a memcpy() are equal (really now?) or side-stepping the
R8 register because it was used for special purposes. It also returns
to the good old Linux ENTRY/ENDPROC macros since we have them now
anyway, and straightens out the W() macro in preparation for unified
thumb support.

Change-Id: I138af269b423bef0a237759ac29f1ee58ca206a0
Signed-off-by: Julius Werner
Reviewed-on: https://chromium-review.googlesource.com/182179
Reviewed-by: Vincent Palatin
(cherry picked from commit 777127997bde5785b21d422d0b6eb04c4328b478)
Signed-off-by: Isaac Christensen
Reviewed-on: http://review.coreboot.org/6918
Tested-by: build bot (Jenkins)
Reviewed-by: David Hendricks
---
 payloads/libpayload/arch/arm/Makefile.inc  |   2 +-
 .../arch/arm/{assembler.h => asmlib.h}     |  28 ++-
 payloads/libpayload/arch/arm/memcpy.S      |  13 +-
 payloads/libpayload/arch/arm/memmove.S     | 199 ++++++++++++++++++
 payloads/libpayload/arch/arm/memset.S      | 109 +++++-----
 payloads/libpayload/include/arm/arch/asm.h |   2 +
 .../arm/{include/assembler.h => asmlib.h}  |  28 ++-
 src/arch/arm/include/arch/asm.h            |   5 +
 src/arch/arm/memcpy.S                      |  13 +-
 src/arch/arm/memmove.S                     |  10 +-
 src/arch/arm/memset.S                      | 109 +++++-----
 11 files changed, 366 insertions(+), 152 deletions(-)
 rename payloads/libpayload/arch/arm/{assembler.h => asmlib.h} (60%)
 create mode 100644 payloads/libpayload/arch/arm/memmove.S
 rename src/arch/arm/{include/assembler.h => asmlib.h} (61%)

diff --git a/payloads/libpayload/arch/arm/Makefile.inc b/payloads/libpayload/arch/arm/Makefile.inc
index 8d7bb5e2f8..53a52df6aa 100644
--- a/payloads/libpayload/arch/arm/Makefile.inc
+++ b/payloads/libpayload/arch/arm/Makefile.inc
@@ -33,7 +33,7 @@ head.o-y += head.S
 libc-y += main.c sysinfo.c
 libc-y += timer.c coreboot.c util.S
 libc-y += virtual.c
-libc-y += memcpy.S memset.S
+libc-y += memcpy.S memset.S memmove.S
 libc-y += exception_asm.S exception.c
 libc-y += cache.c
 libcbfs-$(CONFIG_LP_CBFS) += dummy_media.c
diff --git a/payloads/libpayload/arch/arm/assembler.h b/payloads/libpayload/arch/arm/asmlib.h
similarity index 60%
rename from payloads/libpayload/arch/arm/assembler.h
rename to payloads/libpayload/arch/arm/asmlib.h
index 5e4789b145..8b3fa22cc2 100644
--- a/payloads/libpayload/arch/arm/assembler.h
+++ b/payloads/libpayload/arch/arm/asmlib.h
@@ -1,5 +1,7 @@
 /*
- * arch/arm/include/asm/assembler.h
+ * arch/arm/asmlib.h
+ *
+ * Adapted from Linux arch/arm/include/assembler.h
  *
  * Copyright (C) 1996-2000 Russell King
  *
@@ -14,6 +16,16 @@
  * assembler source.
  */
 
+/*
+ * WARNING: This file is *only* meant for memcpy.S and friends which were copied
+ * from Linux and require some weird macros. It does unspeakable things like
+ * redefining "push", so do *not* try to turn it into a general assembly macro
+ * file, and keep it out of global include directories.
+ */
+
+#ifndef __ARM_ASMLIB_H__
+#define __ARM_ASMLIB_H__
+
 /*
  * Endian independent macros for shifting bytes within registers.
*/ @@ -44,17 +56,17 @@ /* * Data preload for architectures that support it */ -#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ - defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ - defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || \ - defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_7A__) || \ - defined(__ARM_ARCH_7R__) +#if 1 /* TODO: differentiate once libpayload supports more ARM versions */ #define PLD(code...) code #else #define PLD(code...) #endif /* - * Cache alligned + * This can be used to enable code to cacheline align the destination + * pointer when bulk writing to memory. Linux doesn't enable this except + * for the "Feroceon" processor, so we better just leave it out. */ -#define CALGN(code...) code +#define CALGN(code...) + +#endif /* __ARM_ASMLIB_H */ diff --git a/payloads/libpayload/arch/arm/memcpy.S b/payloads/libpayload/arch/arm/memcpy.S index e68b28f542..b8f857bb56 100644 --- a/payloads/libpayload/arch/arm/memcpy.S +++ b/payloads/libpayload/arch/arm/memcpy.S @@ -10,9 +10,8 @@ * published by the Free Software Foundation. */ -#include "assembler.h" - -#define W(instr) instr +#include +#include "asmlib.h" #define LDR1W_SHIFT 0 #define STR1W_SHIFT 0 @@ -57,12 +56,7 @@ /* Prototype: void *memcpy(void *dest, const void *src, size_t n); */ -.type memcpy, function -.globl memcpy -memcpy: - - cmp r0, r1 - moveq pc, lr +ENTRY(memcpy) enter r4, lr @@ -242,3 +236,4 @@ memcpy: 17: forward_copy_shift pull=16 push=16 18: forward_copy_shift pull=24 push=8 +ENDPROC(memcpy) diff --git a/payloads/libpayload/arch/arm/memmove.S b/payloads/libpayload/arch/arm/memmove.S new file mode 100644 index 0000000000..dc29f7458c --- /dev/null +++ b/payloads/libpayload/arch/arm/memmove.S @@ -0,0 +1,199 @@ +/* + * linux/arch/arm/lib/memmove.S + * + * Author: Nicolas Pitre + * Created: Sep 28, 2005 + * Copyright: (C) MontaVista Software Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include "asmlib.h" + + .text + +/* + * Prototype: void *memmove(void *dest, const void *src, size_t n); + * + * Note: + * + * If the memory regions don't overlap, we simply branch to memcpy which is + * normally a bit faster. Otherwise the copy is done going downwards. This + * is a transposition of the code from copy_template.S but with the copy + * occurring in the opposite direction. + */ + +ENTRY(memmove) + + subs ip, r0, r1 + cmphi r2, ip + bls memcpy + + stmfd sp!, {r0, r4, lr} + add r1, r1, r2 + add r0, r0, r2 + subs r2, r2, #4 + blt 8f + ands ip, r0, #3 + PLD( pld [r1, #-4] ) + bne 9f + ands ip, r1, #3 + bne 10f + +1: subs r2, r2, #(28) + stmfd sp!, {r5 - r8} + blt 5f + + CALGN( ands ip, r0, #31 ) + CALGN( sbcnes r4, ip, r2 ) @ C is always set here + CALGN( bcs 2f ) + CALGN( adr r4, 6f ) + CALGN( subs r2, r2, ip ) @ C is set here + CALGN( rsb ip, ip, #32 ) + CALGN( add pc, r4, ip ) + + PLD( pld [r1, #-4] ) +2: PLD( subs r2, r2, #96 ) + PLD( pld [r1, #-32] ) + PLD( blt 4f ) + PLD( pld [r1, #-64] ) + PLD( pld [r1, #-96] ) + +3: PLD( pld [r1, #-128] ) +4: ldmdb r1!, {r3, r4, r5, r6, r7, r8, ip, lr} + subs r2, r2, #32 + stmdb r0!, {r3, r4, r5, r6, r7, r8, ip, lr} + bge 3b + PLD( cmn r2, #96 ) + PLD( bge 4b ) + +5: ands ip, r2, #28 + rsb ip, ip, #32 + addne pc, pc, ip @ C is always clear here + b 7f +6: W(nop) + W(ldr) r3, [r1, #-4]! + W(ldr) r4, [r1, #-4]! + W(ldr) r5, [r1, #-4]! + W(ldr) r6, [r1, #-4]! + W(ldr) r7, [r1, #-4]! 
+ W(ldr) r8, [r1, #-4]! + W(ldr) lr, [r1, #-4]! + + add pc, pc, ip + nop + W(nop) + W(str) r3, [r0, #-4]! + W(str) r4, [r0, #-4]! + W(str) r5, [r0, #-4]! + W(str) r6, [r0, #-4]! + W(str) r7, [r0, #-4]! + W(str) r8, [r0, #-4]! + W(str) lr, [r0, #-4]! + + CALGN( bcs 2b ) + +7: ldmfd sp!, {r5 - r8} + +8: movs r2, r2, lsl #31 + ldrneb r3, [r1, #-1]! + ldrcsb r4, [r1, #-1]! + ldrcsb ip, [r1, #-1] + strneb r3, [r0, #-1]! + strcsb r4, [r0, #-1]! + strcsb ip, [r0, #-1] + ldmfd sp!, {r0, r4, pc} + +9: cmp ip, #2 + ldrgtb r3, [r1, #-1]! + ldrgeb r4, [r1, #-1]! + ldrb lr, [r1, #-1]! + strgtb r3, [r0, #-1]! + strgeb r4, [r0, #-1]! + subs r2, r2, ip + strb lr, [r0, #-1]! + blt 8b + ands ip, r1, #3 + beq 1b + +10: bic r1, r1, #3 + cmp ip, #2 + ldr r3, [r1, #0] + beq 17f + blt 18f + + + .macro backward_copy_shift push pull + + subs r2, r2, #28 + blt 14f + + CALGN( ands ip, r0, #31 ) + CALGN( sbcnes r4, ip, r2 ) @ C is always set here + CALGN( subcc r2, r2, ip ) + CALGN( bcc 15f ) + +11: stmfd sp!, {r5 - r9} + + PLD( pld [r1, #-4] ) + PLD( subs r2, r2, #96 ) + PLD( pld [r1, #-32] ) + PLD( blt 13f ) + PLD( pld [r1, #-64] ) + PLD( pld [r1, #-96] ) + +12: PLD( pld [r1, #-128] ) +13: ldmdb r1!, {r7, r8, r9, ip} + mov lr, r3, push #\push + subs r2, r2, #32 + ldmdb r1!, {r3, r4, r5, r6} + orr lr, lr, ip, pull #\pull + mov ip, ip, push #\push + orr ip, ip, r9, pull #\pull + mov r9, r9, push #\push + orr r9, r9, r8, pull #\pull + mov r8, r8, push #\push + orr r8, r8, r7, pull #\pull + mov r7, r7, push #\push + orr r7, r7, r6, pull #\pull + mov r6, r6, push #\push + orr r6, r6, r5, pull #\pull + mov r5, r5, push #\push + orr r5, r5, r4, pull #\pull + mov r4, r4, push #\push + orr r4, r4, r3, pull #\pull + stmdb r0!, {r4 - r9, ip, lr} + bge 12b + PLD( cmn r2, #96 ) + PLD( bge 13b ) + + ldmfd sp!, {r5 - r9} + +14: ands ip, r2, #28 + beq 16f + +15: mov lr, r3, push #\push + ldr r3, [r1, #-4]! + subs ip, ip, #4 + orr lr, lr, r3, pull #\pull + str lr, [r0, #-4]! + bgt 15b + CALGN( cmp r2, #0 ) + CALGN( bge 11b ) + +16: add r1, r1, #(\pull / 8) + b 8b + + .endm + + + backward_copy_shift push=8 pull=24 + +17: backward_copy_shift push=16 pull=16 + +18: backward_copy_shift push=24 pull=8 + +ENDPROC(memmove) diff --git a/payloads/libpayload/arch/arm/memset.S b/payloads/libpayload/arch/arm/memset.S index aa4f57a993..945767c599 100644 --- a/payloads/libpayload/arch/arm/memset.S +++ b/payloads/libpayload/arch/arm/memset.S @@ -9,33 +9,21 @@ * * ASM optimised string functions */ -#include "assembler.h" + +#include +#include "asmlib.h" .text .align 5 - .word 0 -1: subs r2, r2, #4 @ 1 do we have enough - blt 5f @ 1 bytes to align with? - cmp r3, #2 @ 1 - strltb r1, [r0], #1 @ 1 - strleb r1, [r0], #1 @ 1 - strb r1, [r0], #1 @ 1 - add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) -/* - * The pointer is now aligned and the length is adjusted. Try doing the - * memset again. - */ - -.type memset, function -.globl memset -memset: +ENTRY(memset) ands r3, r0, #3 @ 1 unaligned? - bne 1b @ 1 + mov ip, r0 @ preserve r0 as return value + bne 6f @ 1 /* - * we know that the pointer in r0 is aligned to a word boundary. + * we know that the pointer in ip is aligned to a word boundary. */ - orr r1, r1, r1, lsl #8 +1: orr r1, r1, r1, lsl #8 orr r1, r1, r1, lsl #16 mov r3, r1 cmp r2, #16 @@ -44,29 +32,28 @@ memset: #if ! CALGN(1)+0 /* - * We need an extra register for this loop - save the return address and - * use the LR + * We need 2 extra registers for this loop - use r8 and the LR */ - str lr, [sp, #-4]! 
- mov ip, r1 + stmfd sp!, {r8, lr} + mov r8, r1 mov lr, r1 2: subs r2, r2, #64 - stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time. - stmgeia r0!, {r1, r3, ip, lr} - stmgeia r0!, {r1, r3, ip, lr} - stmgeia r0!, {r1, r3, ip, lr} + stmgeia ip!, {r1, r3, r8, lr} @ 64 bytes at a time. + stmgeia ip!, {r1, r3, r8, lr} + stmgeia ip!, {r1, r3, r8, lr} + stmgeia ip!, {r1, r3, r8, lr} bgt 2b - ldmeqfd sp!, {pc} @ Now <64 bytes to go. + ldmeqfd sp!, {r8, pc} @ Now <64 bytes to go. /* * No need to correct the count; we're only testing bits from now on */ tst r2, #32 - stmneia r0!, {r1, r3, ip, lr} - stmneia r0!, {r1, r3, ip, lr} + stmneia ip!, {r1, r3, r8, lr} + stmneia ip!, {r1, r3, r8, lr} tst r2, #16 - stmneia r0!, {r1, r3, ip, lr} - ldr lr, [sp], #4 + stmneia ip!, {r1, r3, r8, lr} + ldmfd sp!, {r8, lr} #else @@ -75,53 +62,63 @@ memset: * whole cache lines at once. */ - stmfd sp!, {r4-r7, lr} + stmfd sp!, {r4-r8, lr} mov r4, r1 mov r5, r1 mov r6, r1 mov r7, r1 - mov ip, r1 + mov r8, r1 mov lr, r1 cmp r2, #96 - tstgt r0, #31 + tstgt ip, #31 ble 3f - and ip, r0, #31 - rsb ip, ip, #32 - sub r2, r2, ip - movs ip, ip, lsl #(32 - 4) - stmcsia r0!, {r4, r5, r6, r7} - stmmiia r0!, {r4, r5} - tst ip, #(1 << 30) - mov ip, r1 - strne r1, [r0], #4 + and r8, ip, #31 + rsb r8, r8, #32 + sub r2, r2, r8 + movs r8, r8, lsl #(32 - 4) + stmcsia ip!, {r4, r5, r6, r7} + stmmiia ip!, {r4, r5} + tst r8, #(1 << 30) + mov r8, r1 + strne r1, [ip], #4 3: subs r2, r2, #64 - stmgeia r0!, {r1, r3-r7, ip, lr} - stmgeia r0!, {r1, r3-r7, ip, lr} + stmgeia ip!, {r1, r3-r8, lr} + stmgeia ip!, {r1, r3-r8, lr} bgt 3b - ldmeqfd sp!, {r4-r7, pc} + ldmeqfd sp!, {r4-r8, pc} tst r2, #32 - stmneia r0!, {r1, r3-r7, ip, lr} + stmneia ip!, {r1, r3-r8, lr} tst r2, #16 - stmneia r0!, {r4-r7} - ldmfd sp!, {r4-r7, lr} + stmneia ip!, {r4-r7} + ldmfd sp!, {r4-r8, lr} #endif 4: tst r2, #8 - stmneia r0!, {r1, r3} + stmneia ip!, {r1, r3} tst r2, #4 - strne r1, [r0], #4 + strne r1, [ip], #4 /* * When we get here, we've got less than 4 bytes to zero. We * may have an unaligned pointer as well. */ 5: tst r2, #2 - strneb r1, [r0], #1 - strneb r1, [r0], #1 + strneb r1, [ip], #1 + strneb r1, [ip], #1 tst r2, #1 - strneb r1, [r0], #1 + strneb r1, [ip], #1 mov pc, lr + +6: subs r2, r2, #4 @ 1 do we have enough + blt 5b @ 1 bytes to align with? + cmp r3, #2 @ 1 + strltb r1, [ip], #1 @ 1 + strleb r1, [ip], #1 @ 1 + strb r1, [ip], #1 @ 1 + add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) + b 1b +ENDPROC(memset) diff --git a/payloads/libpayload/include/arm/arch/asm.h b/payloads/libpayload/include/arm/arch/asm.h index 1c3f7a21c3..2f8859939c 100644 --- a/payloads/libpayload/include/arm/arch/asm.h +++ b/payloads/libpayload/include/arm/arch/asm.h @@ -23,9 +23,11 @@ #if defined __arm__ # define ARM(x...) x # define THUMB(x...) +# define W(instr) instr #elif defined __thumb__ # define ARM(x...) # define THUMB(x...) x +# define W(instr) instr.w #else # error Not in ARM or thumb mode! #endif diff --git a/src/arch/arm/include/assembler.h b/src/arch/arm/asmlib.h similarity index 61% rename from src/arch/arm/include/assembler.h rename to src/arch/arm/asmlib.h index 10363c4e4e..cef0e7ea1a 100644 --- a/src/arch/arm/include/assembler.h +++ b/src/arch/arm/asmlib.h @@ -1,5 +1,7 @@ /* - * arch/arm/include/asm/assembler.h + * arch/arm/asmlib.h + * + * Adapted from Linux arch/arm/include/assembler.h * * Copyright (C) 1996-2000 Russell King * @@ -14,6 +16,16 @@ * assembler source. 
*/ +/* + * WARNING: This file is *only* meant for memcpy.S and friends which were copied + * from Linux and require some weird macros. It does unspeakable things like + * redefining "push", so do *not* try to turn it into a general assembly macro + * file, and keep it out of global include directories. + */ + +#ifndef __ARM_ASMLIB_H__ +#define __ARM_ASMLIB_H__ + /* * Endian independent macros for shifting bytes within registers. */ @@ -44,19 +56,17 @@ /* * Data preload for architectures that support it */ -#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ - defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ - defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || \ - defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_7A__) || \ - defined(__ARM_ARCH_7R__) +#if __COREBOOT_ARM_ARCH__ >= 5 #define PLD(code...) code #else #define PLD(code...) #endif /* - * Cache aligned + * This can be used to enable code to cacheline align the destination + * pointer when bulk writing to memory. Linux doesn't enable this except + * for the "Feroceon" processor, so we better just leave it out. */ -#define CALGN(code...) code +#define CALGN(code...) -#define W(instr) instr +#endif /* __ARM_ASMLIB_H */ diff --git a/src/arch/arm/include/arch/asm.h b/src/arch/arm/include/arch/asm.h index 1c3f7a21c3..5f3e55f135 100644 --- a/src/arch/arm/include/arch/asm.h +++ b/src/arch/arm/include/arch/asm.h @@ -23,9 +23,14 @@ #if defined __arm__ # define ARM(x...) x # define THUMB(x...) +# define W(instr) instr #elif defined __thumb__ # define ARM(x...) # define THUMB(x...) x +# define W(instr) instr.w +# if __COREBOOT_ARM_ARCH__ < 7 +# error thumb mode has not been tested with ARM < v7! +# endif #else # error Not in ARM or thumb mode! #endif diff --git a/src/arch/arm/memcpy.S b/src/arch/arm/memcpy.S index 921fc2a6bc..b8f857bb56 100644 --- a/src/arch/arm/memcpy.S +++ b/src/arch/arm/memcpy.S @@ -10,9 +10,8 @@ * published by the Free Software Foundation. */ -#include - -#define W(instr) instr +#include +#include "asmlib.h" #define LDR1W_SHIFT 0 #define STR1W_SHIFT 0 @@ -57,12 +56,7 @@ /* Prototype: void *memcpy(void *dest, const void *src, size_t n); */ -.type memcpy, function -.globl memcpy -memcpy: - - cmp r0, r1 - moveq pc, lr +ENTRY(memcpy) enter r4, lr @@ -242,3 +236,4 @@ memcpy: 17: forward_copy_shift pull=16 push=16 18: forward_copy_shift pull=24 push=8 +ENDPROC(memcpy) diff --git a/src/arch/arm/memmove.S b/src/arch/arm/memmove.S index a2f9ea18ae..dc29f7458c 100644 --- a/src/arch/arm/memmove.S +++ b/src/arch/arm/memmove.S @@ -10,7 +10,8 @@ * published by the Free Software Foundation. */ -#include +#include +#include "asmlib.h" .text @@ -25,9 +26,8 @@ * occurring in the opposite direction. */ -.type memmove, function -.globl memmove -memmove: +ENTRY(memmove) + subs ip, r0, r1 cmphi r2, ip bls memcpy @@ -195,3 +195,5 @@ memmove: 17: backward_copy_shift push=16 pull=16 18: backward_copy_shift push=24 pull=8 + +ENDPROC(memmove) diff --git a/src/arch/arm/memset.S b/src/arch/arm/memset.S index a3cc9477f8..945767c599 100644 --- a/src/arch/arm/memset.S +++ b/src/arch/arm/memset.S @@ -9,33 +9,21 @@ * * ASM optimised string functions */ -#include + +#include +#include "asmlib.h" .text .align 5 - .word 0 -1: subs r2, r2, #4 @ 1 do we have enough - blt 5f @ 1 bytes to align with? - cmp r3, #2 @ 1 - strltb r1, [r0], #1 @ 1 - strleb r1, [r0], #1 @ 1 - strb r1, [r0], #1 @ 1 - add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) -/* - * The pointer is now aligned and the length is adjusted. Try doing the - * memset again. 
- */ - -.type memset, function -.globl memset -memset: +ENTRY(memset) ands r3, r0, #3 @ 1 unaligned? - bne 1b @ 1 + mov ip, r0 @ preserve r0 as return value + bne 6f @ 1 /* - * we know that the pointer in r0 is aligned to a word boundary. + * we know that the pointer in ip is aligned to a word boundary. */ - orr r1, r1, r1, lsl #8 +1: orr r1, r1, r1, lsl #8 orr r1, r1, r1, lsl #16 mov r3, r1 cmp r2, #16 @@ -44,29 +32,28 @@ memset: #if ! CALGN(1)+0 /* - * We need an extra register for this loop - save the return address and - * use the LR + * We need 2 extra registers for this loop - use r8 and the LR */ - str lr, [sp, #-4]! - mov ip, r1 + stmfd sp!, {r8, lr} + mov r8, r1 mov lr, r1 2: subs r2, r2, #64 - stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time. - stmgeia r0!, {r1, r3, ip, lr} - stmgeia r0!, {r1, r3, ip, lr} - stmgeia r0!, {r1, r3, ip, lr} + stmgeia ip!, {r1, r3, r8, lr} @ 64 bytes at a time. + stmgeia ip!, {r1, r3, r8, lr} + stmgeia ip!, {r1, r3, r8, lr} + stmgeia ip!, {r1, r3, r8, lr} bgt 2b - ldmeqfd sp!, {pc} @ Now <64 bytes to go. + ldmeqfd sp!, {r8, pc} @ Now <64 bytes to go. /* * No need to correct the count; we're only testing bits from now on */ tst r2, #32 - stmneia r0!, {r1, r3, ip, lr} - stmneia r0!, {r1, r3, ip, lr} + stmneia ip!, {r1, r3, r8, lr} + stmneia ip!, {r1, r3, r8, lr} tst r2, #16 - stmneia r0!, {r1, r3, ip, lr} - ldr lr, [sp], #4 + stmneia ip!, {r1, r3, r8, lr} + ldmfd sp!, {r8, lr} #else @@ -75,53 +62,63 @@ memset: * whole cache lines at once. */ - stmfd sp!, {r4-r7, lr} + stmfd sp!, {r4-r8, lr} mov r4, r1 mov r5, r1 mov r6, r1 mov r7, r1 - mov ip, r1 + mov r8, r1 mov lr, r1 cmp r2, #96 - tstgt r0, #31 + tstgt ip, #31 ble 3f - and ip, r0, #31 - rsb ip, ip, #32 - sub r2, r2, ip - movs ip, ip, lsl #(32 - 4) - stmcsia r0!, {r4, r5, r6, r7} - stmmiia r0!, {r4, r5} - tst ip, #(1 << 30) - mov ip, r1 - strne r1, [r0], #4 + and r8, ip, #31 + rsb r8, r8, #32 + sub r2, r2, r8 + movs r8, r8, lsl #(32 - 4) + stmcsia ip!, {r4, r5, r6, r7} + stmmiia ip!, {r4, r5} + tst r8, #(1 << 30) + mov r8, r1 + strne r1, [ip], #4 3: subs r2, r2, #64 - stmgeia r0!, {r1, r3-r7, ip, lr} - stmgeia r0!, {r1, r3-r7, ip, lr} + stmgeia ip!, {r1, r3-r8, lr} + stmgeia ip!, {r1, r3-r8, lr} bgt 3b - ldmeqfd sp!, {r4-r7, pc} + ldmeqfd sp!, {r4-r8, pc} tst r2, #32 - stmneia r0!, {r1, r3-r7, ip, lr} + stmneia ip!, {r1, r3-r8, lr} tst r2, #16 - stmneia r0!, {r4-r7} - ldmfd sp!, {r4-r7, lr} + stmneia ip!, {r4-r7} + ldmfd sp!, {r4-r8, lr} #endif 4: tst r2, #8 - stmneia r0!, {r1, r3} + stmneia ip!, {r1, r3} tst r2, #4 - strne r1, [r0], #4 + strne r1, [ip], #4 /* * When we get here, we've got less than 4 bytes to zero. We * may have an unaligned pointer as well. */ 5: tst r2, #2 - strneb r1, [r0], #1 - strneb r1, [r0], #1 + strneb r1, [ip], #1 + strneb r1, [ip], #1 tst r2, #1 - strneb r1, [r0], #1 + strneb r1, [ip], #1 mov pc, lr + +6: subs r2, r2, #4 @ 1 do we have enough + blt 5b @ 1 bytes to align with? + cmp r3, #2 @ 1 + strltb r1, [ip], #1 @ 1 + strleb r1, [ip], #1 @ 1 + strb r1, [ip], #1 @ 1 + add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) + b 1b +ENDPROC(memset)
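For readers following the assembly: the memmove() entry sequence above
("subs ip, r0, r1 / cmphi r2, ip / bls memcpy") decides between delegating
to the faster memcpy() and copying downwards through the buffers. Below is
a minimal C sketch of that dispatch logic, assuming only the standard
memmove()/memcpy() semantics described in the file's own comment; the name
ref_memmove() and the small self-test are illustrative and are not part of
the patch.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Rough C equivalent (not part of the patch) of the dispatch done by
 * "subs ip, r0, r1; cmphi r2, ip; bls memcpy" at the top of memmove.S:
 * when dest - src, taken as an unsigned value, is at least n, a plain
 * forward copy cannot clobber source bytes before they are read, so the
 * memcpy() path is safe; otherwise the copy runs from the end downwards.
 */
static void *ref_memmove(void *dest, const void *src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

	if ((uintptr_t)d - (uintptr_t)s >= n)	/* no harmful overlap */
		return memcpy(dest, src, n);

	d += n;			/* overlapping with dest above src: */
	s += n;			/* copy from the end downwards      */
	while (n--)
		*--d = *--s;
	return dest;
}

int main(void)
{
	char buf[16] = "abcdefghijklmno";

	/* Overlapping move with dest > src must preserve the source bytes. */
	ref_memmove(buf + 2, buf, 8);
	assert(memcmp(buf + 2, "abcdefgh", 8) == 0);
	printf("%s\n", buf);	/* prints "ababcdefghklmno" */
	return 0;
}

Copying downwards is what makes the overlapping case safe: by the time a
destination word is written, the source bytes that shared those addresses
have already been read.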