Eliminate do_div().

This eliminates the use of do_div() in favor of using libgcc
functions.

This was tested by building and booting on Google Snow (ARMv7)
and Qemu (x86). printk()s which use division in vtxprintf() look good.

Change-Id: Icad001d84a3c05bfbf77098f3d644816280b4a4d
Signed-off-by: Gabe Black <gabeblack@chromium.org>
Signed-off-by: David Hendricks <dhendrix@chromium.org>
Reviewed-on: http://review.coreboot.org/2606
Tested-by: build bot (Jenkins)
Reviewed-by: Paul Menzel <paulepanter@users.sourceforge.net>
Reviewed-by: Ronald G. Minnich <rminnich@gmail.com>
This commit is contained in:
David Hendricks 2013-03-06 20:43:55 -08:00 committed by Ronald G. Minnich
parent 31c5e07a04
commit ae0e8d3613
9 changed files with 11 additions and 495 deletions

View File

@ -124,7 +124,7 @@ endif
$(objgenerated)/coreboot_ram.o: $(stages_o) $$(ramstage-objs) $(LIBGCC_FILE_NAME)
@printf " CC $(subst $(obj)/,,$(@))\n"
ifeq ($(CONFIG_COMPILER_LLVM_CLANG),y)
$(LD) -m -m armelf_linux_eabi -r -o $@ --wrap __divdi3 --wrap __udivdi3 --wrap __moddi3 --wrap __umoddi3 --wrap __uidiv --wrap __do_div64 --start-group $(ramstage-objs) $(LIBGCC_FILE_NAME) --end-group
$(LD) -m -m armelf_linux_eabi -r -o $@ --wrap __divdi3 --wrap __udivdi3 --wrap __moddi3 --wrap __umoddi3 --wrap __uidiv --start-group $(ramstage-objs) $(LIBGCC_FILE_NAME) --end-group
else
$(CC) $(CFLAGS) -nostdlib -r -o $@ -Wl,--start-group $(stages_o) $(ramstage-objs) $(LIBGCC_FILE_NAME) -Wl,--end-group
endif

View File

@ -1,233 +0,0 @@
/* taken from linux 2.6.31.14 */
#ifndef __ASM_ARM_DIV64
#define __ASM_ARM_DIV64
//#include <asm/system.h>
//#include <linux/types.h>
// FIXME
#define __asmeq(x, y) ".ifnc " x "," y " ; .err ; .endif\n\t"
#define __LINUX_ARM_ARCH__ 7
/*
* The semantics of do_div() are:
*
* uint32_t do_div(uint64_t *n, uint32_t base)
* {
* uint32_t remainder = *n % base;
* *n = *n / base;
* return remainder;
* }
*
* In other words, a 64-bit dividend with a 32-bit divisor producing
* a 64-bit result and a 32-bit remainder. To accomplish this optimally
* we call a special __do_div64 helper with completely non standard
* calling convention for arguments and results (beware).
*/
#ifdef __ARMEB__
#define __xh "r0"
#define __xl "r1"
#else
#define __xl "r0"
#define __xh "r1"
#endif
#define __do_div_asm(n, base) \
({ \
register unsigned int __base asm("r4") = base; \
register unsigned long long __n asm("r0") = n; \
register unsigned long long __res asm("r2"); \
register unsigned int __rem asm(__xh); \
asm( __asmeq("%0", __xh) \
__asmeq("%1", "r2") \
__asmeq("%2", "r0") \
__asmeq("%3", "r4") \
"bl __do_div64" \
: "=r" (__rem), "=r" (__res) \
: "r" (__n), "r" (__base) \
: "ip", "lr", "cc"); \
n = __res; \
__rem; \
})
#if __GNUC__ < 4
/*
* gcc versions earlier than 4.0 are simply too problematic for the
* optimized implementation below. First there is gcc PR 15089 that
* tend to trig on more complex constructs, spurious .global __udivsi3
* are inserted even if none of those symbols are referenced in the
* generated code, and those gcc versions are not able to do constant
* propagation on long long values anyway.
*/
#define do_div(n, base) __do_div_asm(n, base)
#elif __GNUC__ >= 4
//#include <asm/bug.h>
/*
* If the divisor happens to be constant, we determine the appropriate
* inverse at compile time to turn the division into a few inline
* multiplications instead which is much faster. And yet only if compiling
* for ARMv4 or higher (we need umull/umlal) and if the gcc version is
* sufficiently recent to perform proper long long constant propagation.
* (It is unfortunate that gcc doesn't perform all this internally.)
*/
#define do_div(n, base) \
({ \
unsigned int __r, __b = (base); \
if (!__builtin_constant_p(__b) || __b == 0 || \
(__LINUX_ARM_ARCH__ < 4 && (__b & (__b - 1)) != 0)) { \
/* non-constant divisor (or zero): slow path */ \
__r = __do_div_asm(n, __b); \
} else if ((__b & (__b - 1)) == 0) { \
/* Trivial: __b is constant and a power of 2 */ \
/* gcc does the right thing with this code. */ \
__r = n; \
__r &= (__b - 1); \
n /= __b; \
} else { \
/* Multiply by inverse of __b: n/b = n*(p/b)/p */ \
/* We rely on the fact that most of this code gets */ \
/* optimized away at compile time due to constant */ \
/* propagation and only a couple inline assembly */ \
/* instructions should remain. Better avoid any */ \
/* code construct that might prevent that. */ \
unsigned long long __res, __x, __t, __m, __n = n; \
unsigned int __c, __p, __z = 0; \
/* preserve low part of n for reminder computation */ \
__r = __n; \
/* determine number of bits to represent __b */ \
__p = 1 << __div64_fls(__b); \
/* compute __m = ((__p << 64) + __b - 1) / __b */ \
__m = (~0ULL / __b) * __p; \
__m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b; \
/* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */ \
__x = ~0ULL / __b * __b - 1; \
__res = (__m & 0xffffffff) * (__x & 0xffffffff); \
__res >>= 32; \
__res += (__m & 0xffffffff) * (__x >> 32); \
__t = __res; \
__res += (__x & 0xffffffff) * (__m >> 32); \
__t = (__res < __t) ? (1ULL << 32) : 0; \
__res = (__res >> 32) + __t; \
__res += (__m >> 32) * (__x >> 32); \
__res /= __p; \
/* Now sanitize and optimize what we've got. */ \
if (~0ULL % (__b / (__b & -__b)) == 0) { \
/* those cases can be simplified with: */ \
__n /= (__b & -__b); \
__m = ~0ULL / (__b / (__b & -__b)); \
__p = 1; \
__c = 1; \
} else if (__res != __x / __b) { \
/* We can't get away without a correction */ \
/* to compensate for bit truncation errors. */ \
/* To avoid it we'd need an additional bit */ \
/* to represent __m which would overflow it. */ \
/* Instead we do m=p/b and n/b=(n*m+m)/p. */ \
__c = 1; \
/* Compute __m = (__p << 64) / __b */ \
__m = (~0ULL / __b) * __p; \
__m += ((~0ULL % __b + 1) * __p) / __b; \
} else { \
/* Reduce __m/__p, and try to clear bit 31 */ \
/* of __m when possible otherwise that'll */ \
/* need extra overflow handling later. */ \
unsigned int __bits = -(__m & -__m); \
__bits |= __m >> 32; \
__bits = (~__bits) << 1; \
/* If __bits == 0 then setting bit 31 is */ \
/* unavoidable. Simply apply the maximum */ \
/* possible reduction in that case. */ \
/* Otherwise the MSB of __bits indicates the */ \
/* best reduction we should apply. */ \
if (!__bits) { \
__p /= (__m & -__m); \
__m /= (__m & -__m); \
} else { \
__p >>= __div64_fls(__bits); \
__m >>= __div64_fls(__bits); \
} \
/* No correction needed. */ \
__c = 0; \
} \
/* Now we have a combination of 2 conditions: */ \
/* 1) whether or not we need a correction (__c), and */ \
/* 2) whether or not there might be an overflow in */ \
/* the cross product (__m & ((1<<63) | (1<<31))) */ \
/* Select the best insn combination to perform the */ \
/* actual __m * __n / (__p << 64) operation. */ \
if (!__c) { \
asm ( "umull %Q0, %R0, %1, %Q2\n\t" \
"mov %Q0, #0" \
: "=&r" (__res) \
: "r" (__m), "r" (__n) \
: "cc" ); \
} else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
__res = __m; \
asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t" \
"mov %Q0, #0" \
: "+&r" (__res) \
: "r" (__m), "r" (__n) \
: "cc" ); \
} else { \
asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" \
"cmn %Q0, %Q1\n\t" \
"adcs %R0, %R0, %R1\n\t" \
"adc %Q0, %3, #0" \
: "=&r" (__res) \
: "r" (__m), "r" (__n), "r" (__z) \
: "cc" ); \
} \
if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
asm ( "umlal %R0, %Q0, %R1, %Q2\n\t" \
"umlal %R0, %Q0, %Q1, %R2\n\t" \
"mov %R0, #0\n\t" \
"umlal %Q0, %R0, %R1, %R2" \
: "+&r" (__res) \
: "r" (__m), "r" (__n) \
: "cc" ); \
} else { \
asm ( "umlal %R0, %Q0, %R2, %Q3\n\t" \
"umlal %R0, %1, %Q2, %R3\n\t" \
"mov %R0, #0\n\t" \
"adds %Q0, %1, %Q0\n\t" \
"adc %R0, %R0, #0\n\t" \
"umlal %Q0, %R0, %R2, %R3" \
: "+&r" (__res), "+&r" (__z) \
: "r" (__m), "r" (__n) \
: "cc" ); \
} \
__res /= __p; \
/* The reminder can be computed with 32-bit regs */ \
/* only, and gcc is good at that. */ \
{ \
unsigned int __res0 = __res; \
unsigned int __b0 = __b; \
__r -= __res0 * __b0; \
} \
/* BUG_ON(__r >= __b || __res * __b + __r != n); */ \
n = __res; \
} \
__r; \
})
/* our own fls implementation to make sure constant propagation is fine */
#define __div64_fls(bits) \
({ \
unsigned int __left = (bits), __nr = 0; \
if (__left & 0xffff0000) __nr += 16, __left >>= 16; \
if (__left & 0x0000ff00) __nr += 8, __left >>= 8; \
if (__left & 0x000000f0) __nr += 4, __left >>= 4; \
if (__left & 0x0000000c) __nr += 2, __left >>= 2; \
if (__left & 0x00000002) __nr += 1; \
__nr; \
})
#endif
#endif

View File

@ -6,12 +6,10 @@ bootblock-y += cache-cp15.c
romstage-y += cache_v7.c
romstage-y += cache-cp15.c
romstage-y += div0.c
romstage-y += div64.S
romstage-y += syslib.c
romstage-$(CONFIG_EARLY_CONSOLE) += early_console.c
ramstage-y += div0.c
ramstage-y += div64.S
#ramstage-y += interrupts.c
#ramstage-y += memcpy.S
#ramstage-y += memset.S

View File

@ -1,208 +0,0 @@
/*
* linux/arch/arm/lib/div64.S
*
* Optimized computation of 64-bit dividend / 32-bit divisor
*
* Author: Nicolas Pitre
* Created: Oct 5, 2003
* Copyright: Monta Vista Software, Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
// FIXME
//#include <linux/linkage.h>
#define __LINUX_ARM_ARCH__ 7
#ifdef __ARMEB__
#define xh r0
#define xl r1
#define yh r2
#define yl r3
#else
#define xl r0
#define xh r1
#define yl r2
#define yh r3
#endif
/*
* __do_div64: perform a division with 64-bit dividend and 32-bit divisor.
*
* Note: Calling convention is totally non standard for optimal code.
* This is meant to be used by do_div() from include/asm/div64.h only.
*
* Input parameters:
* xh-xl = dividend (clobbered)
* r4 = divisor (preserved)
*
* Output values:
* yh-yl = result
* xh = remainder
*
* Clobbered regs: xl, ip
*/
.globl __do_div64;
.align 4,0x90
__do_div64:
@ Test for easy paths first.
subs ip, r4, #1
bls 9f @ divisor is 0 or 1
tst ip, r4
beq 8f @ divisor is power of 2
@ See if we need to handle upper 32-bit result.
cmp xh, r4
mov yh, #0
blo 3f
@ Align divisor with upper part of dividend.
@ The aligned divisor is stored in yl preserving the original.
@ The bit position is stored in ip.
#if __LINUX_ARM_ARCH__ >= 5
clz yl, r4
clz ip, xh
sub yl, yl, ip
mov ip, #1
mov ip, ip, lsl yl
mov yl, r4, lsl yl
#else
mov yl, r4
mov ip, #1
1: cmp yl, #0x80000000
cmpcc yl, xh
movcc yl, yl, lsl #1
movcc ip, ip, lsl #1
bcc 1b
#endif
@ The division loop for needed upper bit positions.
@ Break out early if dividend reaches 0.
2: cmp xh, yl
orrcs yh, yh, ip
subcss xh, xh, yl
movnes ip, ip, lsr #1
mov yl, yl, lsr #1
bne 2b
@ See if we need to handle lower 32-bit result.
3: cmp xh, #0
mov yl, #0
cmpeq xl, r4
movlo xh, xl
movlo pc, lr
@ The division loop for lower bit positions.
@ Here we shift remainer bits leftwards rather than moving the
@ divisor for comparisons, considering the carry-out bit as well.
mov ip, #0x80000000
4: movs xl, xl, lsl #1
adcs xh, xh, xh
beq 6f
cmpcc xh, r4
5: orrcs yl, yl, ip
subcs xh, xh, r4
movs ip, ip, lsr #1
bne 4b
mov pc, lr
@ The top part of remainder became zero. If carry is set
@ (the 33th bit) this is a false positive so resume the loop.
@ Otherwise, if lower part is also null then we are done.
6: bcs 5b
cmp xl, #0
moveq pc, lr
@ We still have remainer bits in the low part. Bring them up.
#if __LINUX_ARM_ARCH__ >= 5
clz xh, xl @ we know xh is zero here so...
add xh, xh, #1
mov xl, xl, lsl xh
mov ip, ip, lsr xh
#else
7: movs xl, xl, lsl #1
mov ip, ip, lsr #1
bcc 7b
#endif
@ Current remainder is now 1. It is worthless to compare with
@ divisor at this point since divisor can not be smaller than 3 here.
@ If possible, branch for another shift in the division loop.
@ If no bit position left then we are done.
movs ip, ip, lsr #1
mov xh, #1
bne 4b
mov pc, lr
8: @ Division by a power of 2: determine what that divisor order is
@ then simply shift values around
#if __LINUX_ARM_ARCH__ >= 5
clz ip, r4
rsb ip, ip, #31
#else
mov yl, r4
cmp r4, #(1 << 16)
mov ip, #0
movhs yl, yl, lsr #16
movhs ip, #16
cmp yl, #(1 << 8)
movhs yl, yl, lsr #8
addhs ip, ip, #8
cmp yl, #(1 << 4)
movhs yl, yl, lsr #4
addhs ip, ip, #4
cmp yl, #(1 << 2)
addhi ip, ip, #3
addls ip, ip, yl, lsr #1
#endif
mov yh, xh, lsr ip
mov yl, xl, lsr ip
rsb ip, ip, #32
orr yl, yl, xh, lsl ip
mov xh, xl, lsl ip
mov xh, xh, lsr ip
mov pc, lr
@ eq -> division by 1: obvious enough...
9: moveq yl, xl
moveq yh, xh
moveq xh, #0
moveq pc, lr
@ Division by 0:
str lr, [sp, #-8]!
bl __div0
@ as wrong as it could be...
mov yl, #0
mov yh, #0
mov xh, #0
ldr pc, [sp], #8
@.type __do_div64, @function;
@.size __do_div64, .-__do_div64,

View File

@ -354,9 +354,9 @@ endif
$(objcbfs)/romstage_null.debug: $$(romstage-objs) $(objgenerated)/romstage_null.ld
@printf " LINK $(subst $(obj)/,,$(@))\n"
ifeq ($(CONFIG_COMPILER_LLVM_CLANG),y)
$(LD) -nostdlib -nostartfiles -static -o $@ -L$(obj) $(romstage-objs) -T $(objgenerated)/romstage_null.ld
$(LD) -nostdlib -nostartfiles -static -o $@ -L$(obj) --wrap __divdi3 --wrap __udivdi3 --wrap __moddi3 --wrap __umoddi3 --start-group $(romstage-objs) $(LIBGCC_FILE_NAME) --end-group -T $(objgenerated)/romstage_null.ld
else
$(CC) -nostdlib -nostartfiles -static -o $@ -L$(obj) -T $(objgenerated)/romstage_null.ld $(romstage-objs)
$(CC) -nostdlib -nostartfiles -static -o $@ -L$(obj) -T $(objgenerated)/romstage_null.ld -Wl,--wrap,__divdi3 -Wl,--wrap,__udivdi3 -Wl,--wrap,__moddi3 -Wl,--wrap,__umoddi3 -Wl,--start-group $(romstage-objs) $(LIBGCC_FILE_NAME) -Wl,--end-group
endif
$(NM) $@ | grep -q " [DdBb] "; if [ $$? -eq 0 ]; then \
echo "Forbidden global variables in romstage:"; \
@ -366,9 +366,9 @@ endif
$(objcbfs)/romstage_xip.debug: $$(romstage-objs) $(objgenerated)/romstage_xip.ld
@printf " LINK $(subst $(obj)/,,$(@))\n"
ifeq ($(CONFIG_COMPILER_LLVM_CLANG),y)
$(LD) -nostdlib -nostartfiles -static -o $@ -L$(obj) $(romstage-objs) -T $(objgenerated)/romstage_xip.ld
$(LD) -nostdlib -nostartfiles -static -o $@ -L$(obj) --wrap __divdi3 --wrap __udivdi3 --wrap __moddi3 --wrap __umoddi3 --start-group $(romstage-objs) $(LIBGCC_FILE_NAME) --end-group -T $(objgenerated)/romstage_xip.ld
else
$(CC) -nostdlib -nostartfiles -static -o $@ -L$(obj) -T $(objgenerated)/romstage_xip.ld $(romstage-objs)
$(CC) -nostdlib -nostartfiles -static -o $@ -L$(obj) -T $(objgenerated)/romstage_xip.ld -Wl,--wrap,__divdi3 -Wl,--wrap,__udivdi3 -Wl,--wrap,__moddi3 -Wl,--wrap,__umoddi3 -Wl,--start-group $(romstage-objs) $(LIBGCC_FILE_NAME) -Wl,--end-group
endif
$(objgenerated)/romstage_null.ld: $$(ldscripts) $(obj)/ldoptions

View File

@ -1,30 +0,0 @@
#ifndef __I386_DIV64
#define __I386_DIV64
/*
* do_div() is NOT a C function. It wants to return
* two values (the quotient and the remainder), but
* since that doesn't work very well in C, what it
* does is:
*
* - modifies the 64-bit dividend _in_place_
* - returns the 32-bit remainder
*
* This ends up being the most efficient "calling
* convention" on x86.
*/
#define do_div(n,base) ({ \
unsigned long __upper, __low, __high, __mod, __base; \
__base = (base); \
asm("":"=a" (__low), "=d" (__high):"A" (n)); \
__upper = __high; \
if (__high) { \
__upper = __high % (__base); \
__high = __high / (__base); \
} \
asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
asm("":"=A" (n):"a" (__low),"d" (__high)); \
__mod; \
})
#endif

View File

@ -5,7 +5,6 @@
*/
#include <string.h>
#include <div64.h>
#include <console/console.h>
#include <console/vtxprintf.h>
@ -70,20 +69,8 @@ static int number(void (*tx_byte)(unsigned char byte),
if (num == 0)
tmp[i++]='0';
else while (num != 0){
/* there are some nice optimizations in the
* Macros-From-Hell that form the div64 code
* *IF* you call it with a constant.
* We're firmware, we only do bases
* 8, 10, and 16. Let's be smart.
* This greatly helps ARM, reduces the
* code footprint at compile time, and does not hurt x86.
*/
if (base == 10)
tmp[i++] = digits[do_div(num,10)];
else if (base == 8)
tmp[i++] = digits[do_div(num,8)];
else /* sorry, you're out of choices */
tmp[i++] = digits[do_div(num,16)];
tmp[i++] = digits[num % base];
num /= base;
}
if (i > precision)
precision = i;

View File

@ -36,8 +36,8 @@ endif
smm-y += smihandler.c
smm-y += smiutil.c
$(obj)/cpu/x86/smm/smm.o: $$(smm-objs)
$(CC) $(LDFLAGS) -nostdlib -r -o $@ $^
$(obj)/cpu/x86/smm/smm.o: $$(smm-objs) $(LIBGCC_FILE_NAME)
$(CC) $(LDFLAGS) -nostdlib -r -o $@ -Wl,--wrap,__divdi3 -Wl,--wrap,__udivdi3 -Wl,--wrap,__moddi3 -Wl,--wrap,__umoddi3 -Wl,--start-group $(smm-objs) $(LIBGCC_FILE_NAME) -Wl,--end-group
$(obj)/cpu/x86/smm/smm_wrap: $(obj)/cpu/x86/smm/smm.o $(src)/cpu/x86/smm/$(SMM_LDSCRIPT) $(obj)/ldoptions
$(CC) $(SMM_LDFLAGS) -nostdlib -nostartfiles -static -o $(obj)/cpu/x86/smm/smm.elf -T $(src)/cpu/x86/smm/$(SMM_LDSCRIPT) $(obj)/cpu/x86/smm/smm.o

View File

@ -48,6 +48,7 @@ romstage-$(CONFIG_USBDEBUG) += usbdebug.c
romstage-$(CONFIG_COLLECT_TIMESTAMPS) += timestamp.c cbmem.c
romstage-y += compute_ip_checksum.c
romstage-y += memmove.c
romstage-$(CONFIG_ARCH_X86) += gcc.c
ramstage-y += hardwaremain.c
ramstage-y += selfboot.c
@ -94,6 +95,7 @@ smm-y += cbfs.c memcmp.c
smm-$(CONFIG_CONSOLE_SERIAL8250) += uart8250.c
smm-$(CONFIG_CONSOLE_SERIAL8250MEM) += uart8250mem.c
smm-$(CONFIG_USBDEBUG) += usbdebug.c
smm-y += gcc.c
$(obj)/lib/version.ramstage.o : $(obj)/build.h