extern int ipow(int b, unsigned int e);
int ipow(int b, unsigned int e) {
return (e == 0 ? 1 : b * ipow(b, e - 1));
}
(reference implementation,
speed-up factor
1.0;
all speed-up factors are relative to this;
larger values are better)
The speed-up factors were obtained by measuring 10^9 or 10^8 iterations of calculating ipow(3,19) (3^19; 3 being the first (lowest) multi-bit base, 19 the largest exponent that does not overflow a signed 32-bit int with base 3), on x86-32 (ia32) and armv6 (arm-32) systems (the number of cores doesn't matter here, as everything stays in the single-thread domain).
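A minimal harness along these lines (a sketch only; the iteration count and the clock()-based timing are assumptions, not the original measurement setup):

#include <stdio.h>
#include <time.h>

extern int ipow(int b, unsigned int e);  /* compiled in a separate translation unit,
                                            so the call cannot be folded away */

int main(void) {
    volatile int sink = 0;               /* volatile: keep the result from being discarded */
    clock_t t0 = clock();
    for (long i = 0; i < 100000000L; i++) {  /* 10^8 iterations */
        sink = ipow(3, 19);
    }
    double s = (double)(clock() - t0) / CLOCKS_PER_SEC;
    printf("ipow(3,19) = %d, %.2f s\n", sink, s);
    return 0;
}

The first improvement replaces the recursion with a simple loop: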
int ipow(int b, unsigned int e) {
int p = 1;
while (e > 0) {
p *= b;
e--;
}
return p;
}
speed-up factor
4.4 (x86-32)
3.4 (armv6)
Still linear in run-time behavior (O(n)), but avoiding the function-call overhead and stack operations.
int ipow(int b, unsigned int e) {
return
(e == 0 ? 1 :
(e % 2 == 0 ? ipow(b * b, e / 2) :
b * ipow(b, e - 1)));
}
speed-up factor
3.7 (x86-32)
2.5 (armv6)
Although only O(log n) multiplications are needed now, there is still quite an overhead from function calls and stack operations, which leaves this version slower than the simple loop above on the small test exponent.
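For the chosen test case the recursion expands as follows, taking 7 multiplications (4 squarings, 3 multiplications into the result) instead of 19:

ipow(3,19) = 3 * ipow(3,18)              (odd exponent: multiply out one factor)
           = 3 * ipow(9,9)               (even exponent: square the base, halve e)
           = 3 * 9 * ipow(9,8)
           = 3 * 9 * ipow(81,4)
           = 3 * 9 * ipow(6561,2)
           = 3 * 9 * ipow(43046721,1)
           = 3 * 9 * 43046721 * ipow(43046721,0)
           = 1162261467

Moving the same square-and-multiply scheme into a loop removes the call overhead: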
int ipow(int b, unsigned int e) {
int p = 1;
while (e > 0) {
if (e % 2 == 0) {
b *= b;
e /= 2;
} else {
p *= b;
e--;
}
}
return p;
}
speed-up factor
6.4 (x86-32)
3.5 (armv6)
.L14: testl %edx, %edx # e == 0 ?
je .L3
.L16: movl %edx, %ebx # temporary e
movl %ecx, %edi # temporary b
shrl %ebx # e / 2
movl %ebx, -16(%ebp) # temporary e
movl %eax, %ebx # temporary p
imull %ecx, %edi # b *= b
imull %ecx, %ebx # p *= b
leal -1(%edx), %esi # e - 1
andl $1, %edx # e % 2
jne .L6
movl -16(%ebp), %edx
movl %edi, %ecx
testl %edx, %edx
jne .L16
.L3: ...
.L6: movl %esi, %edx
movl %ebx, %eax
jmp .L14
The produced x86 code however uses stack-based variables (besides the CPU registers edx/ebx/ecx/edi/esi, as the optimizer ran out of usable i386 registers) and contains lots of movs, both of which make it relatively slow.
.L9: and r2, r1, #1 @ e % 2 [e&1]
cmp r2, #0 @ (e%2) == 0 ?
mul ip, r3, r3 @ b * b
mul r2, r3, r0 @ p * b
sub r4, r1, #1 @ e - 1
movne r1, r4 @ if((e%2)!=0) e = e-1
moveq r1, r1, lsr #1 @ if((e%2)==0) e /= 2 [>>=1]
moveq r3, ip @ if((e%2)==0) b = (b*b)
movne r0, r2 @ if((e%2)!=0) p = (p*b)
cmp r1, #0 @ e == 0 ?
bne .L9
The produced armv6 code performs both multiplications in every iteration, using conditional moves (and discarding some of the results). Both problems can be countered by simplifying the C code (possibly introducing overhead such as unneeded additional operations in the last iteration).
.L9: and r2, r1, #1
cmp r2, #0
muleq r3, r3, r3
mulne r0, r3, r0
subne r1, r1, #1
moveq r1, r1, lsr #1
cmp r1, #0
bne .L9
int ipow(int b, unsigned int e) {
int p = 1;
for (;;) {
if (e % 2 != 0) {
p *= b;
e--;
}
if (e == 0) {
break;
}
b *= b;
e /= 2;
}
return p;
}
speed-up factor
14.7 (x86-32)
6.7 (armv6)
.L7: imull %ecx, %ecx # b *= b
shrl %edx # e >>= 1
.L4: testb $1, %dl # (e & 1) == 0 ?
je .L2
imull %ecx, %eax # p *= b
subl $1, %edx # e--
.L2: testl %edx, %edx # e == 0 ?
jne .L7
The compiler (optimizer) has anticipated the use of binary operations (formulated explicitly in C in the iterative implementation with binary operators further below: right shift instead of division by 2, testing the least significant bit instead of taking the remainder modulo 2), so this version already runs close to full speed.
.L4: tst r1, #1
subne r1, r1, #1
mulne r0, r3, r0
cmp r1, #0
mul r3, r3, r3
mov r1, r1, lsr #1
bne .L4
In the armv6 code, the unnecessary multiplications are now avoided as well.
int ipowx(int b, unsigned int e, int two) {
int p = 1;
for (;;) {
if (e % two != 0) {
p *= b;
e--;
}
if (e == 0) {
break;
}
b *= b;
e /= two;
}
return p;
}
int ipow(int b, unsigned int e) {
return ipowx(b, e, 2);
}
speed-up factor
1.1 (x86-32)
1.1 (armv6)
.L7: movl %ecx, %eax # e
xorl %edx, %edx # preparation for division/remainder operation
divl %esi # e / 2 , e % 2
imull %ebx, %ebx # b *= b
movl %eax, %ecx # e = e/2
.L4: xorl %edx, %edx # preparation for division/remainder operation
movl %ecx, %eax # e
divl %esi # e / 2 , e % 2
testl %edx, %edx # e % 2 == 0 ?
je .L2
imull %ebx, %edi # p *= b
subl $1, %ecx # e--
.L2: testl %ecx, %ecx # e != 0 ?
jne .L7
The use of division operations (instead of the cheaper shift and AND) is a killer: very slow, and requiring additional registers and preparation. The expensive division/remainder operation is even performed twice per iteration in this case. More register moves and register-clearing operations are needed (the div instruction is especially inflexible on x86-32).
.L6: bl __aeabi_uidiv
mov r4, r0
mov r0, r4 @ e
mov r1, r6 @ 2
bl __aeabi_uidivmod
cmp r1, #0 @ e % 2 == 0 ?
subne r4, r4, #1 @ if(e%2!=0) e--
mulne r7, r5, r7 @ if(e%2!=0) p *= b
cmp r4, #0 @ e == 0 ?
mov r1, r6 @ restore arg0, arg1
mov r0, r4
mul r5, r5, r5 @ b *= b
bne .L6
The armv6 code shows the same double div[mod] calculation per iteration, as well as additional movs and registers in use; and since armv6 lacks a hardware divide instruction, each division is an even more expensive library call (__aeabi_uidiv, __aeabi_uidivmod).
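The effect can be isolated in a minimal sketch (the helper functions are hypothetical, for illustration only): with a constant divisor the compiler strength-reduces the unsigned division to a shift, while a variable divisor forces a genuine division.

unsigned int half_const(unsigned int e) {
    return e / 2;        /* strength-reduced to a single shift (shrl on x86-32) */
}

unsigned int half_var(unsigned int e, unsigned int d) {
    return e / d;        /* requires divl on x86-32, or a __aeabi_uidiv call on armv6 */
}

The next variant keeps the parametrized divisor but restructures the loop into a do/while form: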
int ipowx(int b, unsigned int e, int two) {
int p = 1;
do {
if (e % two != 0) {
p *= b;
}
e /= two;
b *= b;
} while (e != 0);
return p;
}
int ipow(int b, unsigned int e) {
return ipowx(b, e, 2);
}
speed-up factor
1.0 (x86-32)
1.1 (armv6)
.L7: imull %ebx, %ebx
.L4: xorl %edx, %edx
movl %ecx, %eax
divl %esi
testl %edx, %edx
je .L2
imull %ebx, %edi
.L2: movl %ecx, %eax
xorl %edx, %edx
divl %esi
testl %eax, %eax
movl %eax, %ecx
jne .L7
This is even slower, due to one additional multiplication in the last iteration. This version however serves as the base for an optimization that eliminates half of the division operations: a single div yields both the quotient and the remainder, so e / two and e % two can share one division.
.L7: imull %ebx, %ebx
.L4: xorl %edx, %edx
movl %ecx, %eax # e
divl %esi # e / 2, e % 2
movl %eax, %ecx # e = e / 2
testl %edx, %edx # e % 2 == 0 ?
je .L2
imull %ebx, %edi # p *= b
.L2: testl %ecx, %ecx # e == 0 ?
jne .L7
Faster when the redundant division is eliminated, but still slow.
int ipow(int b, unsigned int e) {
int p = 1;
for (;;) {
if (e & 1) {
p *= b;
e &= ~1;
}
if (!e) {
return p;
}
b *= b;
e >>= 1;
}
}
speed-up factor
14.8 (x86-32)
6.7 (armv6)
.L3: imull %ecx, %ecx # b *= b
shrl %edx # e >>= 1
.L4: testb $1, %dl # (e & 1) == 0 ?
je .L2
imull %ecx, %eax # p *= b
andl $-2, %edx # e &= ~1
.L2: testl %edx, %edx # e == 0 ?
jne .L3
This version uses binary operations explicitly in the C code. Assembly output and performance are almost identical to the coalesced implementation seen earlier (the only differences: subl $1,%edx vs. andl $-2,%edx / subne r1,r1,#1 vs. bicne r1,r1,#1).
.L4: tst r1, #1 @ (e & 1) == 0 ?
bicne r1, r1, #1 @ if((e&1)!=0) e &= ~1
mulne r0, r3, r0 @ if((e&1)!=0) p *= b
cmp r1, #0 @ e == 0 ?
mul r3, r3, r3 @ b *= b
mov r1, r1, lsr #1 @ e >>= 1
bne .L4
The armv6 version is easier to read, as the conditional operations impose fewer local jumps.
int ipowx(int b, unsigned int e, int one, int notone) {
int p = 1;
for (;;) {
if (e & one) {
p *= b;
e &= notone;
}
if (!e) {
return p;
}
b *= b;
e >>= 1;
}
}
int ipow(int b, unsigned int e) {
return ipowx(b, e, 1, ~1);
}
speed-up factor
9.3 (x86-32)
6.3 (armv6)
.L3: imull %ecx, %ecx
shrl %edx
.L4: testl %ebx, %edx
je .L2
imull %ecx, %eax
andl %esi, %edx
.L2: testl %edx, %edx
jne .L3
The parametrized x86-32 version is slower here (by a factor of 1.6) than the version using constants, even though the constants take more space in the code and in the CPU caches.
.L4: tst r1, r2
andne r1, r1, r3
mulne ip, r0, ip
cmp r1, #0
mul r0, r0, r0
mov r1, r1, lsr #1
bne .L4
The parametrized armv6 version is only marginally slower here (factor 1.1). Dropping the e-- or e &= ~1 operation would lead to an even tighter loop, but bears the overhead of an additional (discarded) multiplication and a shift in the last iteration, turning out to be less effective on the chosen test case.
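For completeness, a sketch of that tighter-loop variant (an illustration of the trade-off just described, not one of the measured versions):

int ipow(int b, unsigned int e) {
    int p = 1;
    while (e != 0) {
        if (e & 1) {
            p *= b;    /* no e-- / e &= ~1 needed: the shift below drops the bit */
        }
        b *= b;        /* one extra (discarded) squaring in the last iteration */
        e >>= 1;       /* one extra shift in the last iteration */
    }
    return p;
}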