Since there already two users of full 64 bit division in the kernel,
and other places maybe hiding out as well. Add a full 64/64 bit divide.
Yes this expensive, but there are places where it is necessary.
It is not clear if doing the scaling buys any advantage on 64 bit platforms,
so for them a full divide is done.
---
include/asm-arm/div64.h | 2 ++
include/asm-generic/div64.h | 8 ++++++++
include/asm-m68k/div64.h | 2 ++
include/asm-mips/div64.h | 8 ++++++++
include/asm-um/div64.h | 1 +
include/asm-xtensa/div64.h | 1 +
lib/div64.c | 22 ++++++++++++++++++++++
net/ipv4/tcp_cubic.c | 22 ----------------------
net/netfilter/xt_connbytes.c | 16 ----------------
9 files changed, 44 insertions(+), 38 deletions(-)
--- linux-2.6.21-rc1.orig/include/asm-arm/div64.h 2007-02-23 16:44:41.000000000 -0800
+++ linux-2.6.21-rc1/include/asm-arm/div64.h 2007-02-23 16:57:36.000000000 -0800
@@ -221,6 +221,8 @@
__nr; \
})
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
+
#endif
#endif
--- linux-2.6.21-rc1.orig/include/asm-generic/div64.h 2007-02-23 16:35:27.000000000 -0800
+++ linux-2.6.21-rc1/include/asm-generic/div64.h 2007-02-23 16:56:40.000000000 -0800
@@ -30,9 +30,17 @@
__rem; \
})
+/*
+ * div64_64 - Divide two 64 bit numbers
+ */
+static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
+{
+ return dividend / divisor;
+}
#elif BITS_PER_LONG == 32
extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
+extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
/* The unnecessary pointer compare is there
* to check for type safety (n must be 64bit)
--- linux-2.6.21-rc1.orig/include/asm-m68k/div64.h 2007-02-23 16:45:20.000000000 -0800
+++ linux-2.6.21-rc1/include/asm-m68k/div64.h 2007-02-23 16:56:27.000000000 -0800
@@ -23,4 +23,6 @@
__rem; \
})
+extern uint64_t div64_64(uint64_t dividend, uint64_t ...Still does not work after these fixes... how came? WARNING: "div64_64" [net/netfilter/xt_connbytes.ko] undefined! WARNING: "div64_64" [net/ipv4/tcp_cubic.ko] undefined! --- linux-2.6.19/include/asm-i386/div64.h.bak 2006-11-29 23:57:37.000000000 +0200 +++ linux-2.6.19/include/asm-i386/div64.h 2007-02-24 16:24:55.822529880 +0200 @@ -45,4 +45,7 @@ div_ll_X_l_rem(long long divs, long div, return dum2; } + +extern uint64_t div64_64(uint64_t dividend, uint64_t divisor); + #endif --- linux-2.6.19/lib/div64.c.bak 2007-02-24 16:10:03.686084000 +0200 +++ linux-2.6.19/lib/div64.c 2007-02-24 17:01:11.224517353 +0200 @@ -80,4 +80,6 @@ uint64_t div64_64(uint64_t dividend, uin return dividend; } +EXPORT_SYMBOL(div64_64); + #endif /* BITS_PER_LONG == 32 */ -- -
Actually, there is udivdi3 support in the kernel ./arch/arm26/lib/udivdi3.c ./arch/sh/lib/udivdi3.c ./arch/sparc/lib/udivdi3.S should not this be consolidated too? Jan -- ft: http://freshmeat.net/p/chaostables/ -
On Mon, 26 Feb 2007 21:09:26 +0100 (MET) Hmm. Those are the GCC internal versions, that are picked up but doing divide in place. Do we want to allow general 64 bit in kernel to be easily used? It could cause sloppy slow code, but it would look cleaner. -- Stephen Hemminger <shemminger@linux-foundation.org> -
... and it would handle datatypes which may be architecture-dependent a lot cleaner. I thought the motivation for div64() was that a 64:32->32 divide could be done a lot faster on a number of platforms (including the important x86) than a generic 64:64->64 divide, but gcc doesn't handle the devolution automatically -- there is no such libgcc function. -hpa -
That there's no such function in libgcc doesn't mean GCC cannot handle this; libgcc is a bunch of library functions that are really needed for generated code (because you really don't want to expand those functions inline everywhere) -- you won't find an addsi3 in libgcc either. There does exist a divmoddisi4, sort of. It used to be defined in three GCC targets, but commented out in all three. The NS32k is long gone. For Vax, a comment says the machine insn for this isn't used because it is just too slow. And the i386 version is disabled because it returns the wrong result on overflow (not the truncated 64-bit result, required by the implicit cast to 32-bit, but the i386 arch traps to the overflow handler). The only way to express the semantics you want in (GNU) C is to use asm() -- and that's exactly what div64() does :-) Blame it on the C language, but not on GCC. Not this time. Segher -
Here is another way to handle the 64 bit divide case.
It allows full 64 bit divide by adding the support routine
GCC needs.
---
arch/alpha/Kconfig | 4 ++++
arch/arm/Kconfig | 4 ++++
arch/arm26/Kconfig | 4 ++++
arch/avr32/Kconfig | 4 ++++
arch/cris/Kconfig | 4 ++++
arch/frv/Kconfig | 4 ++++
arch/h8300/Kconfig | 4 ++++
arch/i386/Kconfig | 4 ++++
arch/m32r/Kconfig | 4 ++++
arch/m68k/Kconfig | 4 ++++
arch/m68knommu/Kconfig | 4 ++++
arch/mips/Kconfig | 4 ++++
arch/parisc/Kconfig | 4 ++++
arch/powerpc/Kconfig | 4 ++++
arch/ppc/Kconfig | 4 ++++
arch/s390/Kconfig | 4 ++++
arch/sh64/Kconfig | 4 ++++
arch/v850/Kconfig | 3 +++
arch/xtensa/Kconfig | 4 ++++
lib/Makefile | 1 +
lib/udivdi3.c | 37 +++++++++++++++++++++++++++++++++++++
net/ipv4/tcp_cubic.c | 26 ++------------------------
net/netfilter/xt_connbytes.c | 19 +------------------
23 files changed, 116 insertions(+), 42 deletions(-)
--- pktgen.orig/net/ipv4/tcp_cubic.c 2007-02-26 13:40:08.000000000 -0800
+++ pktgen/net/ipv4/tcp_cubic.c 2007-02-26 14:30:00.000000000 -0800
@@ -51,7 +51,6 @@
module_param(tcp_friendliness, int, 0644);
MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
-#include <asm/div64.h>
/* BIC TCP Parameters */
struct bictcp {
@@ -93,27 +92,6 @@
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
-/* 64bit divisor, dividend and result. dynamic precision */
-static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
-{
- u_int32_t d = divisor;
-
- if (divisor > 0xffffffffULL) {
- unsigned int shift = fls(divisor >> 32);
-
- d = divisor >> shift;
- dividend >>= shift;
- }
-
- /* avoid 64 bit division if possible */
- if (dividend >> ...Then our reviewers should catch it, and if not, the janitors will Eye see a bug. Previously there was div64_64(a, x*x) which is equivalent to (a)/(x*x), or just: a/(x^2). But now you do a/x*x, which is equivalent to a*x/x (in the domain of real numbers). Furthermore, a/x*x is a-(a%x), which does not even remotely match a/(x^2). Please keep the math intact, thank you ;-) Jan -- -
On Tue, 27 Feb 2007 00:02:50 +0100 (MET) Been there, done that, don't want to repeat it... -- Stephen Hemminger <shemminger@linux-foundation.org> -
On Tue, 27 Feb 2007 01:05:26 +0100 (MET) Once before a missed paren's caused a TCP congestion window bug that took 6 months before it was found... -- Stephen Hemminger <shemminger@linux-foundation.org> -
Hah, just as I expected. |On Tue, 27 Feb 2007 00:02:50 +0100 (MET), Jan Engelhardt wrote: |>Then our reviewers should catch it, and if not, the janitors will. Jan -- -
<snip> I know ARM already went through the process of removing __udivdi3 support: http://www.arm.linux.org.uk/developer/patches/viewpatch.php?id=2723/2 Copying Russell and Nicolas as a heads up. -- Dan -
Not supplying that was intentional by Linus so that people think twice (or more often) before they using such expensive operations. A plain / looks too innocent. Is it really needed by CUBIC anyways? It uses it for getting the cubic root, but the algorithm recommended by Hacker's Delight (great book) doesn't use any divisions at all. Probably better to use a better algorithm without divisions. -Andi -
On 03 Mar 2007 03:31:52 +0100 I tried the code from Hacker's Delight. It is cool, but performance is CPU (and data) dependent: Average # of usecs per operation: Hacker Newton Pentium 3 68.6 < 90.4 T2050 98.6 > 92.0 U1400 450 > 415 Xeon 70 < 90 Xeon (newer) 71 < 78 EM64T 21.8 < 24.6 AMD64 23.4 < 32.0 It might be worth the change for code size reduction though. -- Stephen Hemminger <shemminger@linux-foundation.org> -
From: Stephen Hemminger <shemminger@linux-foundation.org> Interesting results. The problem with these algorithms that tradoff one or more multiplies in order to avoid a divide is that they don't give anything and often lose when both multiplies and divides are emulated in software. This is particularly true in this cube-root case from Hacker's Delight, because it's using 3 multiplies per iteration in place of one divide per iteration. Actually, sorry, there is only one real multiply in there since the other two can be computed using addition and shifts. Another thing is that the non-Hacker's Delight version iterates differently for different input values, so the input value space is very important to consider when comparing these two pieces of code. -
I did some stochastic testing on my version. It gave 1 off for < 1% of the arguments. Probably not an issue. Besides it actually works for >2^43 @) -Andi -
Actually on rereading this: is there really any Linux port that emulates multiplies in software? I thought that was only done on really small microcontrollers or smart cards; but anything 32bit+ that runs Linux should have hardware multiply, shouldn't it? -Andi -
SPARCv7 (sun4/sun4c) doesn't have hardware mul/div. This includes SparcStation 1, 1+, 2, SLC, ELC, IPC and IPX. -- ilmari "A disappointingly low fraction of the human race is, at any given time, on fire." - Stig Sandbeck Mathisen -
From: ilmari@ilmari.org (Dagfinn Ilmari Mannsåker) Right and the cypress sun4m parts don't do multiply/divide in hardware either. I believe the Alpha does divide in software too, see arch/alpha/lib/divide.S -
I did too. My experiences were mixed: on 32bit it was slower,
on 64bit faster on average. Strangely the 32bit version ran
faster again without -fomit-frame-pointer, so it's likely
some funny interaction with 32bit long long code generation.
The difference is never more than 100 cycles so it shouldn't
be a big issue either way.
For some input arguments (<1% in my testing)
it also gave an answer 1 off from the existing code,
but I don't think that's a problem.
But more importantly during testing I found that the cubic
code gives a division by zero for input arguments >2^43. If you
have a system with >16TB of memory this could actually be a remotely
exploitable bug :)
I still think it's a good idea to switch to the new function,
especially since it's shorter code.
Here's the patch. Note I didn't verify it with real large window
TCP operations; only unit testing.
-Andi
Use Hacker's delight cube root algorithm in cubic TCP
Shorter code and fixes a theoretically remote exploitable bug.
Signed-off-by: Andi Kleen <ak@suse.de>
Index: linux-2.6.21-rc1-net/net/ipv4/tcp_cubic.c
===================================================================
--- linux-2.6.21-rc1-net.orig/net/ipv4/tcp_cubic.c
+++ linux-2.6.21-rc1-net/net/ipv4/tcp_cubic.c
@@ -93,51 +93,24 @@ static void bictcp_init(struct sock *sk)
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
-/* 64bit divisor, dividend and result. dynamic precision */
-static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
+static u32 cubic_root(u64 x)
{
- u_int32_t d = divisor;
-
- if (divisor > 0xffffffffULL) {
- unsigned int shift = fls(divisor >> 32);
-
- d = divisor >> shift;
- dividend >>= shift;
- }
-
- /* avoid 64 bit division if possible */
- if (dividend >> 32)
- do_div(dividend, d);
- else
- dividend = (uint32_t) dividend / d;
-
- return dividend;
-}
-
-/*
- * calculate the cubic root of x using Newton-Raphson
- */
-static u32 cubic_root(u64 a)
-{
- u32 x, ...Andi <rant> Let me see... You throw code like that and expect someone to actually understand it in one year, and be able to correct a bug ? </rant> Please add something, an URL or even better a nice explanation, per favor... Thank you -
--Apple-Mail-33--534403500
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
charset=US-ASCII;
delsp=yes;
format=flowed
Hi Andi!
And it's pretty neat, too. Hint: (y+1)**3 = y**3 + 3*y**2 + 3*y + 1.
The algorithm is exactly the same as for calculating the cubic root
on paper, digit by digit. I found that algo in the school notebook of
my grandpa (late 1920ies), a pity that it's not taught anymore...
pocket calculators _do_ have downsides ;-)
Ciao,
Roland
--
TU Muenchen, Physik-Department E18, James-Franck-Str., 85748 Garching
Telefon 089/289-12575; Telefax 089/289-12570
--
CERN office: 892-1-D23 phone: +41 22 7676540 mobile: +41 76 487 4482
--
Any society that would give up a little liberty to gain a little
security will deserve neither and lose both. - Benjamin Franklin
-----BEGIN GEEK CODE BLOCK-----
Version: 3.12
GS/CS/M/MU d-(++) s:+ a-> C+++ UL++++ P+++ L+++ E(+) W+ !N K- w--- M
+ !V Y+
PGP++ t+(++) 5 R+ tv-- b+ DI++ e+++>++++ h---- y+++
------END GEEK CODE BLOCK------
--Apple-Mail-33--534403500
Content-Transfer-Encoding: base64
Content-Type: application/pkcs7-signature;
name=smime.p7s
Content-Disposition: ...Don't count the existing Newton-Raphson out. It turns out that to get enough
precision for 32 bits, only 4 iterations are needed. By unrolling those, it
gets much better timing.
Slightly gross test program (with original cubic wraparound bug fixed).
---
/* Test and measure perf of cube root algorithms. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#ifdef __x86_64
#define rdtscll(val) do { \
unsigned int __a,__d; \
asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
(val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)
# define do_div(n,base) ({ \
uint32_t __base = (base); \
uint32_t __rem; \
__rem = ((uint64_t)(n)) % __base; \
(n) = ((uint64_t)(n)) / __base; \
__rem; \
})
/**
* __ffs - find first bit in word.
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
static __inline__ unsigned long __ffs(unsigned long word)
{
__asm__("bsfq %1,%0"
:"=r" (word)
:"rm" (word));
return word;
}
/*
* __fls: find last bit set.
* @word: The word to search
*
* Undefined if no zero exists, so code should check against ~0UL first.
*/
static inline unsigned long __fls(unsigned long word)
{
__asm__("bsrq %1,%0"
:"=r" (word)
:"rm" (word));
return word;
}
/**
* ffs - find first bit set
* @x: the word to search
*
* This is defined the same way as
* the libc and compiler builtin ffs routines, therefore
* differs in spirit from the above ffz (man ffs).
*/
static __inline__ int ffs(int x)
{
int r;
__asm__("bsfl %1,%0\n\t"
"cmovzl %2,%0"
: "=r" (r) : "rm" (x), "r" (-1));
return r+1;
}
/**
* fls - find last bit set
* @x: the word to search
*
* This is defined the same way as ffs.
*/
static inline int fls(int x)
{
int r;
__asm__("bsrl %1,%0\n\t"
"cmovzl %2,%0"
: "=&r" (r) : "rm" (x), "rm" (-1));
return r+1;
}
/**
* fls64 - find last bit set in ...But did you fix the >2^43 bug too? SGI has already shipped 10TB Altixen, so it's not entirely theoretical. -Andi -
^^^^^^^ this should be 2642245. Without serializing instruction before rdtsc and with one loop I do not get very accurate results (104 for ncubic, > 1000 for others). #define rdtscll_serialize(val) \ __asm__ __volatile__("movl $0, %%eax\n\tcpuid\n\trdtsc\n" : "=A" (val) : : "ebx", "ecx") Here Pentium D timings for 1000 loops. ~0, 2097151 Function clocks mean(us) max(us) std(us) total error ocubic 912 0.306 20.317 0.730 545101 ncubic 777 0.261 14.799 0.486 576263 acbrt 1168 0.392 21.681 0.547 547562 hcbrt 827 0.278 15.244 0.387 2410 ~0, 2642245 Function clocks mean(us) max(us) std(us) total error ocubic 908 0.305 20.210 0.656 7 ncubic 775 0.260 14.792 0.550 31169 acbrt 1176 0.395 22.017 0.970 2468 hcbrt 826 0.278 15.326 0.670 547504 And I found bug in gcc-4.1.2, it gave 0 for ncubic results when doing 1000 loops test... gcc-4.0.3 works. -- -
On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote: Found it. --- cbrt-test.c~ 2007-03-07 00:20:54.735248105 +0200 +++ cbrt-test.c 2007-03-07 00:21:03.964864343 +0200 @@ -209,7 +209,7 @@ __asm__("bsrl %1,%0\n\t" "cmovzl %2,%0" - : "=&r" (r) : "rm" (x), "rm" (-1)); + : "=&r" (r) : "rm" (x), "rm" (-1) : "memory"); return r+1; } Now Linux 2.6 does not have "memory" in fls, maybe it causes some gcc funnies some people are seeing. -- -
Fun.. looks when not using "memory" gcc does not even bother calling ncubic() 666 times. So it gets better timings ( 42/666=0 ) =) --- cbrt-test-no_memory.s 2007-03-07 20:22:27.838466385 +0200 +++ cbrt-test-using_memory.s 2007-03-07 20:22:38.237013197 +0200 ... main: leal 4(%esp), %ecx andl $-16, %esp pushl -4(%ecx) pushl %ebp pushl %edi pushl %esi pushl %ebx pushl %ecx - subl $136, %esp + subl $152, %esp movl $.LC0, (%esp) call puts xorl %edx, %edx movl $27, %eax call ncubic cmpl $3, %eax - je .L83 + je .L87 movl $.LC1, (%esp) call puts -.L83: - xorl %eax, %eax - xorl %edi, %edi - movl %eax, 88(%esp) +.L87: xorl %eax, %eax - xorl %esi, %esi + xorl %ebp, %ebp movl %eax, 92(%esp) xorl %eax, %eax - xorl %ebp, %ebp + xorl %edi, %edi movl %eax, 96(%esp) xorl %eax, %eax + xorl %esi, %esi movl %eax, 100(%esp) xorl %eax, %eax movl %eax, 104(%esp) xorl %eax, %eax movl %eax, 108(%esp) - movl %edi, 112(%esp) - movl %esi, 116(%esp) - .p2align 4,,15 -.L84: + xorl %eax, %eax + movl %eax, 112(%esp) + movl %ebp, 116(%esp) + movl %edi, 120(%esp) + movl %esi, 124(%esp) +.L88: #APP movl $0, %eax cpuid rdtsc #NO_APP movl %eax, 56(%esp) movl %edx, 60(%esp) #APP movl $0, %eax cpuid rdtsc #NO_APP movl %eax, %esi movl %edx, %edi subl 56(%esp), %esi sbbl 60(%esp), %edi cmpl $0, %edi ja .L66 cmpl $999, %esi - jbe .L84 + jbe .L88 .L66: + movl 92(%esp), %edx + leal (%edx,%edx,2), %eax + movl cases+4(,%eax,4), %edi + movl cases(,%eax,4), %esi + movl %edi, %edx + movl %esi, %eax + call ncubic #APP movl $0, %eax cpuid rdtsc #NO_APP - movl %eax, %esi - movl %edx, %edi + movl $666, %ebx + movl %eax, 128(%esp) + movl %edx, 132(%esp) + .p2align 4,,15 +.L67: + movl %esi, %eax + movl %edi, %edx + call ncubic + decl %ebx + movl %eax, %ebp + jne .L67 #APP movl $0, %eax cpuid rdtsc #NO_APP - subl %esi, ...
It also works without "memory" if I do "__asm__ volatile". Why some functions have volatile and some have not in include/asm-*/*.h ? -- -
From: Sami Farin <7atbggg02@sneakemail.com> "volatile" is really only needed if there is some side effect that cannot be expressed to gcc which makes ordering over the asm wrt. other pieces of code important. But in these case it should absolutely not be needed. It's simply computing an interger result from some inputs and some values in memory. GCC should see perfectly fine what is memory is read by the asm and therefore what ordering constraints there are wrt. writes to the same memory location. -
From: Stephen Hemminger <shemminger@linux-foundation.org> Indeed that will be the fastest variant for cpus with hw integer division. I did a quick sparc64 port, here is what I got: Function clocks mean(us) max(us) std(us) total error ocubic 529 0.35 15.16 0.66 545101 ncubic 498 0.33 12.83 0.36 576263 acbrt 427 0.28 11.04 0.33 547562 hcbrt 393 0.26 10.18 0.47 2410 -
The Newton-Raphson method is quadratically convergent so
only a small fixed number of steps are necessary.
Therefore it is faster to unroll the loop. Since div64_64 is no longer
inline it won't cause code explosion.
Also fixes a bug that can occur if x^2 was bigger than 32 bits.
Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
---
net/ipv4/tcp_cubic.c | 16 +++++-----------
1 file changed, 5 insertions(+), 11 deletions(-)
--- net-2.6.22.orig/net/ipv4/tcp_cubic.c 2007-03-06 12:24:34.000000000 -0800
+++ net-2.6.22/net/ipv4/tcp_cubic.c 2007-03-06 14:43:37.000000000 -0800
@@ -96,23 +96,17 @@
*/
static u32 cubic_root(u64 a)
{
- u32 x, x1;
+ u64 x;
/* Initial estimate is based on:
* cbrt(x) = exp(log(x) / 3)
*/
x = 1u << (fls64(a)/3);
- /*
- * Iteration based on:
- * 2
- * x = ( 2 * x + a / x ) / 3
- * k+1 k k
- */
- do {
- x1 = x;
- x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
- } while (abs(x1 - x) > 1);
+ /* converges to 32 bits in 3 iterations */
+ x = (2 * x + div64_64(a, x*x)) / 3;
+ x = (2 * x + div64_64(a, x*x)) / 3;
+ x = (2 * x + div64_64(a, x*x)) / 3;
return x;
}
-
Here is a better version of the benchmark code.
It has the original code used in 2.4 version of Cubic for comparison
-----------------------------------------------------------
/* Test and measure perf of cube root algorithms. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <unistd.h>
#ifdef __x86_64
#define rdtscll(val) do { \
unsigned int __a,__d; \
asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
(val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)
# define do_div(n,base) ({ \
uint32_t __base = (base); \
uint32_t __rem; \
__rem = ((uint64_t)(n)) % __base; \
(n) = ((uint64_t)(n)) / __base; \
__rem; \
})
/**
* __ffs - find first bit in word.
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
static __inline__ unsigned long __ffs(unsigned long word)
{
__asm__("bsfq %1,%0"
:"=r" (word)
:"rm" (word));
return word;
}
/*
* __fls: find last bit set.
* @word: The word to search
*
* Undefined if no zero exists, so code should check against ~0UL first.
*/
static inline unsigned long __fls(unsigned long word)
{
__asm__("bsrq %1,%0"
:"=r" (word)
:"rm" (word));
return word;
}
/**
* ffs - find first bit set
* @x: the word to search
*
* This is defined the same way as
* the libc and compiler builtin ffs routines, therefore
* differs in spirit from the above ffz (man ffs).
*/
static __inline__ int ffs(int x)
{
int r;
__asm__("bsfl %1,%0\n\t"
"cmovzl %2,%0"
: "=r" (r) : "rm" (x), "r" (-1));
return r+1;
}
/**
* fls - find last bit set
* @x: the word to search
*
* This is defined the same way as ffs.
*/
static inline int fls(int x)
{
int r;
__asm__("bsrl %1,%0\n\t"
"cmovzl %2,%0"
: "=&r" (r) : "rm" (x), "rm" (-1));
return r+1;
}
/**
* fls64 - find last bit set in 64 bit word
* @x: the word to search
*
* This is ...Hi Stephen,
Thanks for this code, it's easy to experiment with it.
Let me propose this simple update with a variation on your ncubic() function.
I noticed that all intermediate results were far below 32 bits, so I did a
new version which is 30% faster on my athlon with the same results. This is
because we only use x and a/x^2 in the function, with x very close to cbrt(a).
So a/x^2 is very close to cbrt(a) which is at most 22 bits. So we only use
the 32 lower bits of the result of div64_64(), and all intermediate
computations can be done on 32 bits (including multiplies and divides).
willy@pcw:~$ ./bictcp
Calibrating
Function clocks mean(us) max(us) std(us) Avg error
bictcp 1085 0.70 28.19 2.30 0.172%
ocubic 869 0.56 22.76 1.23 0.274%
ncubic 637 0.41 16.29 1.41 0.247%
ncubic32 435 0.28 11.18 1.03 0.247%
acbrt 824 0.53 21.03 0.85 0.275%
hcbrt 547 0.35 13.96 0.42 1.580%
I also tried to improve a bit by checking for early convergence and
returning before last divide, but it is worthless because it almost
never happens so it does not make the code any faster.
Here's the code. I think that it would be fine if we merged this
version since it's supposed to behave better on most 32 bits machines.
Best regards,
Willy
/*
Here is a better version of the benchmark code.
It has the original code used in 2.4 version of Cubic for comparison
-----------------------------------------------------------
*/
/* Test and measure perf of cube root algorithms. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <unistd.h>
#ifdef __x86_64
#define rdtscll(val) do { \
unsigned int __a,__d; \
asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
(val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)
# define do_div(n,base) ({ \
uint32_t __base = (base); \
uint32_t ...The basic calculation has to be done in 32 bits to avoid
doing 64 bit divide by 3. The value x is only 22bits max
so only need full 64 bits only for x^2.
Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
---
net/ipv4/tcp_cubic.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
--- net-2.6.22.orig/net/ipv4/tcp_cubic.c 2007-03-07 15:51:37.000000000 -0800
+++ net-2.6.22/net/ipv4/tcp_cubic.c 2007-03-07 17:06:02.000000000 -0800
@@ -96,7 +96,7 @@
*/
static u32 cubic_root(u64 a)
{
- u64 x;
+ u32 x;
/* Initial estimate is based on:
* cbrt(x) = exp(log(x) / 3)
@@ -104,9 +104,9 @@
x = 1u << (fls64(a)/3);
/* converges to 32 bits in 3 iterations */
- x = (2 * x + div64_64(a, x*x)) / 3;
- x = (2 * x + div64_64(a, x*x)) / 3;
- x = (2 * x + div64_64(a, x*x)) / 3;
+ x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
+ x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
+ x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
return x;
}
-
From: Stephen Hemminger <shemminger@linux-foundation.org> Applied, thanks Stephen. What about Willy Tarreau's supposedly even faster variant? Or does this incorporate that set of improvements? -
That's what this is:
x = (2 * x + (uint32_t)div64_64(a, (uint64_t)x*(uint64_t)x)) / 3;
-
From: Stephen Hemminger <shemminger@linux-foundation.org> Great, thanks for the clarification. -
Oh BTW, I have a newer version with a first approximation of the cbrt() before the div64_64, which allows us to reduce from 3 div64 to only 2 div64. This results in a version which is twice as fast as the initial one (ncubic), but with slightly less accuracy (0.286% compared to 0.247). But I see that other functions such as hcbrt() had a 1.5% avg error, so I think this is not dramatic. Also, I managed to remove all other divides, to be kind with CPUs having a slow divide instruction or no divide at all. Since we compute on limited range (22 bits), we can multiply then shift right. It shows me even slightly better time on pentium-m and athlon, with a slightly higher avg error (0.297% compared to 0.286%), and slightly smaller code. I just have to clean experiments from my code to provide a patch. David, Stephen, are you interested ? $ ./bictcp fls(0)=0, fls(1)=1, fls(256)=9 Calibrating Function clocks mean(us) max(us) std(us) Avg error bictcp 936 0.61 24.28 1.99 0.172% ocubic 886 0.57 23.51 3.18 0.274% ncubic 644 0.42 16.59 2.18 0.247% ncubic32 444 0.29 11.47 1.50 0.247% ncubic32_1 444 0.29 11.56 1.88 0.238% ncubic32b3 337 0.22 8.67 0.88 0.286% ncubic_ndiv3 329 0.21 8.46 0.69 0.297% acbrt 707 0.46 18.05 0.80 0.275% hcbrt 644 0.42 16.44 0.51 1.580% Regards, Willy -
On Sat, 10 Mar 2007 12:48:26 +0100 -- Stephen Hemminger <shemminger@linux-foundation.org> -
Hi Stephen,
Well, I have cleaned it a little bit, there were more comments and ifdefs
than code ! I've appended it to the end of this mail.
I have changed it a bit, because I noticed that integer divide precision
was so coarse that there were other possibilities to play with the bits.
I have experimented with combinations of several methods :
- replace integer divides with multiplies/shifts where possible.
- compensation for divide imprecisions by adding/removing small
values bofore/after them. Often, the integer result of 1/(x*(x-1))
is closer to (float)1/(float)x^2 than 1/(x*x). This is because
the divide always truncates the result.
- use direct result lookup for small values. Small inputs give small
outputs which have very few moving bits. Many different values fit
in a 32bit integer, so we use a shift offset to lookup the value.
I used this in an fls function I wrote a while ago, that I should
also post because it is up to twice as fast as the kernel's.
Sometimes it seems faster to lookup in from memory, sometimes it
is faster from an immediate value. Maybe more visible differences
would show up on RISC CPUs where loading 32 bits immediate needs
two instructions. I don't know yet, I've not tested on my sparc
yet.
- use small lookup tables (64 bytes) with 6 bits inputs and at least
as many on output. We only lookup the 6 MSB and return the 2-3 MSB
of the result.
- iterative search and manual refinment of the lookup tables for best
accuracy. The avg error rate can easily be halved this way.
I have duplicated tried several functions with 0, 1, 2 and 3 divides.
Several of them offer better accuracy over what we currently have, in
less cycles. Others offer faster results (up to 5 times) with slightly
less accuracy.
There is one function which is not to be used, but is just here for
comparison (ncubic_0div). It does no divide but has awful avg error.
But one which is interesting is the ...On Tue, 13 Mar 2007 21:50:20 +0100 The following version of div64_64 is faster because do_div already optimized for the 32 bit case.. I get the following results on ULV Core Solo (ie slow current processor) and the following on 64bit Core Duo. ncubic_tab1 seems like the best (no additional error and about as fast) ULV Core Solo Function clocks mean(us) max(us) std(us) Avg err size ncubic_tab0 192 11.24 45.10 15.28 0.450% -2262 ncubic_0div 201 11.77 47.23 27.40 3.357% -2404 ncubic_1div 324 19.02 76.32 25.82 0.189% -2567 ncubic_tab1 326 19.13 76.73 23.71 0.043% -2059 ncubic_2div 456 26.72 108.92 493.16 0.028% -2790 ncubic_ndiv3 463 27.15 133.37 1889.39 0.104% -3344 ncubic32 549 32.18 130.59 508.97 0.041% -3794 ncubic32_1 574 33.66 138.32 548.48 0.029% -3604 ncubic_3div 581 34.04 140.24 608.55 0.018% -3050 ncubic 733 42.92 173.35 523.19 0.041% 299 ocubic 1046 61.25 283.68 3305.65 0.027% -2232 acbrt 1149 67.32 284.91 1941.55 0.029% 168 bictcp 1663 97.41 394.29 604.86 0.017% 628 Core 2 Duo Function clocks mean(us) max(us) std(us) Avg err size ncubic_0div 74 0.03 1.60 0.07 3.357% -2101 ncubic_tab0 74 0.03 1.60 0.04 0.450% -2029 ncubic_1div 142 0.07 3.11 1.05 0.189% -2195 ncubic_tab1 144 0.07 3.18 1.02 0.043% -1638 ncubic_2div 216 0.10 4.74 1.07 0.028% -2326 ncubic_ndiv3 219 0.10 4.76 1.04 0.104% -2709 ncubic32 269 0.13 5.87 1.13 0.041% -1500 ncubic32_1 272 0.13 5.92 1.10 0.029% -2881 ncubic 273 0.13 5.96 1.13 0.041% -1763 ncubic_3div 290 0.14 6.32 1.01 0.018% -2499 acbrt ...
Hi Stephen, Cool, this is interesting because I first wanted to optimize it but did not find how to start with this. You seem to get very good results. BTW, you did not append your changes. However, one thing I do not understand is why your avg error is about 1/3 below the original one. Was there a precision bug in the original div_64_64 or did you extend the values used in the test ? Or perhaps you used -fast-math to build and the original cbrt() is less Thanks, Willy -
On Wed, 21 Mar 2007 20:15:37 +0100
/* 64bit divisor, dividend and result. dynamic precision */
static uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
uint32_t high, d;
high = divisor >> 32;
if (high) {
unsigned int shift = fls(high);
d = divisor >> shift;
dividend >>= shift;
} else
d = divisor;
do_div(dividend, d);
return dividend;
No, but I did use -mtune=pentiumm on the ULV
--
Stephen Hemminger <shemminger@linux-foundation.org>
-
Minor optimization of div64_64. do_div() already does optimization
for the case of 32 by 32 divide, so no need to do it here.
Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
--- net-2.6.22.orig/lib/div64.c 2007-03-21 12:03:59.000000000 -0700
+++ net-2.6.22/lib/div64.c 2007-03-21 12:04:46.000000000 -0700
@@ -61,20 +61,18 @@
/* 64bit divisor, dividend and result. dynamic precision */
uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
- uint32_t d = divisor;
+ uint32_t high, d;
- if (divisor > 0xffffffffULL) {
- unsigned int shift = fls(divisor >> 32);
+ high = divisor >> 32;
+ if (high) {
+ unsigned int shift = fls(high);
d = divisor >> shift;
dividend >>= shift;
- }
+ } else
+ d = divisor;
- /* avoid 64 bit division if possible */
- if (dividend >> 32)
- do_div(dividend, d);
- else
- dividend = (uint32_t) dividend / d;
+ do_div(dividend, d);
return dividend;
}
-
Use willy's work in optimizing cube root by having table for small values.
Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
--- net-2.6.22.orig/net/ipv4/tcp_cubic.c 2007-03-21 12:57:11.000000000 -0700
+++ net-2.6.22/net/ipv4/tcp_cubic.c 2007-03-21 13:04:59.000000000 -0700
@@ -91,23 +91,51 @@
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
-/*
- * calculate the cubic root of x using Newton-Raphson
+/* calculate the cubic root of x using a table lookup followed by one
+ * Newton-Raphson iteration.
+ * Avg err ~= 0.195%
*/
static u32 cubic_root(u64 a)
{
- u32 x;
-
- /* Initial estimate is based on:
- * cbrt(x) = exp(log(x) / 3)
+ u32 x, b, shift;
+ /*
+ * cbrt(x) MSB values for x MSB values in [0..63].
+ * Precomputed then refined by hand - Willy Tarreau
+ *
+ * For x in [0..63],
+ * v = cbrt(x << 18) - 1
+ * cbrt(x) = (v[x] + 10) >> 6
*/
- x = 1u << (fls64(a)/3);
+ static const u8 v[] = {
+ /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118,
+ /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156,
+ /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179,
+ /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199,
+ /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215,
+ /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229,
+ /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242,
+ /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254,
+ };
+
+ b = fls64(a);
+ if (b < 7) {
+ /* a in [0..63] */
+ return ((u32)v[(u32)a] + 35) >> 6;
+ }
+
+ b = ((b * 84) >> 8) - 1;
+ shift = (a >> (b * 3));
- /* converges to 32 bits in 3 iterations */
- x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
- x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
- x = (2 * x + (u32)div64_64(a, (u64)x*(u64)x)) / 3;
+ x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;
+ /*
+ * Newton-Raphson iteration
+ * 2
+ * x = ( 2 * x + a / x ) / 3
+ * k+1 k ...From: Stephen Hemminger <shemminger@linux-foundation.org> Also applied to net-2.6.22, thanks Stephen. -
From: Stephen Hemminger <shemminger@linux-foundation.org> Applied to net-2.6.22 -
Confirmed, it's the same. BTW, has someone tested on a 64bit system if it brings any difference ? Willy -
From: Stephen Hemminger <shemminger@linux-foundation.org> Applied, thanks Stephen. -
Well that still needs the ugly div64_64 function. At least my goal was to eliminate that, not make it faster (I don't see any evidence this function is particularly performance critical). You prefer to keep div64_64? -Andi -
From: Andi Kleen <andi@firstfloor.org> I really have no issues with it. -
