Parts of the optimizations analyzed here arise from the optimizing compiler (optimizing option -O3 vs. unoptimizing/debugging options -O0 and/or -g), part from the algorithms used (runtime behavior O(lg(n)) vs. O(n)), part from wise knowledge of number of available work registers (not that many e.g. on the x86-32 architecture), parts from avoiding expensive and/or overhead-imposing operations (e.g. div).

The sample implementations are compiled for a
x86-32 (ia32, Intel x86 32 bit) system,
and an armv6 (arm-32, ARM 32 bit) system,
with gcc (GNU C compiler),
advanced optimization level -O3
(so the compiler already does a good job;
our optimizations are additions to that;
assembly level optimizations take the route over
adapted C code here and therefore are helped by the compiler as well).
All sources, x86-32 assembly output, makefiles, etc, can be found as
*ipow-optimizations.tar.gz*
while snippets of it are spread into this article
*(C language in red,
assembly in green (loop code only))*.

The (C language) method's signature shall be:

```
extern int ipow(int b, unsigned int e);
```

int ipow(int b, unsigned int e) {
return (e == 0 ? 1 : b * ipow(b, e - 1));
}

The speed factors are taken from measuring
10^{9} or
10^{8}
iterations
of calculating ipow(3,19)
(3^{19};
3 being the first (lowest) multi-bit number,
19 the last number not resulting in overflow with the chosen base number 3 on (signed) 32 bit systems),
on a x86-32 (ia32) and armv6 (arm-32) systems
(number of cores doesn't matter here, as being in the single-thread domain).

int ipow(int b, unsigned int e) {
int p = 1;
while (e > 0) {
p *= b;
e--;
}
return p;
}

Still being linear in run time behavior (O(n)), but avoiding the function call overheads and stack operations.

int ipow(int b, unsigned int e) {
return
(e == 0 ? 1 :
(e % 2 == 0 ? ipow(b * b, e / 2) :
b * ipow(b, e - 1)));
}

There is still quite an overhead from function calls and stack operations.

int ipow(int b, unsigned int e) {
int p = 1;
while (e > 0) {
if (e % 2 == 0) {
b *= b;
e /= 2;
} else {
p *= b;
e--;
}
}
return p;
}

.L14: testl %edx, %edx # e == 0 ?
je .L3
.L16: movl %edx, %ebx # temporary e
movl %ecx, %edi # temporary b
shrl %ebx # e / 2
movl %ebx, -16(%ebp) # temporary e
movl %eax, %ebx # temporary p
imull %ecx, %edi # b *= b
imull %ecx, %ebx # p *= b
leal -1(%edx), %esi # e - 1
andl $1, %edx # e % 2
jne .L6
movl -16(%ebp), %edx
movl %edi, %ecx
testl %edx, %edx
jne .L16
.L3: ...
.L6: movl %esi, %edx
movl %ebx, %eax
jmp .L14

The produced x86 code however uses stack based variables
(besides the cpu registers edx/ebx/ecx/edi/esi,
as the optimizer run out of usable i386 registers)
and has lots of movs,
both making it relatively slow.
.L9: and r2, r1, #1 @ e % 2 [e&1]
cmp r2, #0 @ (e%2) == 0 ?
mul ip, r3, r3 @ b * b
mul r2, r3, r0 @ p * b
sub r4, r1, #1 @ e - 1
movne r1, r4 @ if((e%2)!=0) e = e-1
moveq r1, r1, lsr #1 @ if((e%2)==0) e /= 2 [>>=1]
moveq r3, ip @ if((e%2)==0) b = (b*b)
movne r0, r2 @ if((e%2)!=0) p = (p*b)
cmp r1, #0 @ e == 0 ?
bne .L9

The produced armv6 code does both multiplications per iteration,
using conditional moves (and discards some of the calculations).
Both problems can be countered by simplifying the C code (possibly introducing overhead such as unneeded additional operations in the last iteration).

*speed-up factor
4.3 (armv6)
*

```
.L9: and r2, r1, #1
cmp r2, #0
muleq r3, r3, r3
mulne r0, r3, r0
subne r1, r1, #1
moveq r1, r1, lsr #1
cmp r1, #0
bne .L9
```

int ipow(int b, unsigned int e) {
int p = 1;
for (;;) {
if (e % 2 != 0) {
p *= b;
e--;
}
if (e == 0) {
break;
}
b *= b;
e /= 2;
}
return p;
}

.L7: imull %ecx, %ecx # b *= b
shrl %edx # e >>= 1
.L4: testb $1, %dl # (e & 1) == 0 ?
je .L2
imull %ecx, %eax # p *= b
subl $1, %edx # e--
.L2: testl %edx, %edx # e == 0 ?
jne .L7

The compiler (optimizer) has anticipated the use of binary operations
(formulated explicitly in C in the .L4: tst r1, #1
subne r1, r1, #1
mulne r0, r3, r0
cmp r1, #0
mul r3, r3, r3
mov r1, r1, lsr #1
bne .L4

In the armv6 code, unnecessary multiplications are now too avoided.
int ipowx(int b, unsigned int e, int two) {
int p = 1;
for (;;) {
if (e % two != 0) {
p *= b;
e--;
}
if (e == 0) {
break;
}
b *= b;
e /= two;
}
return p;
}
int ipow(int b, unsigned int e) {
return ipowx(b, e, 2);
}

.L7: movl %ecx, %eax # e
xorl %edx, %edx # preparation for division/remainder operation
divl %esi # e / 2 , e % 2
imull %ebx, %ebx # b *= b
movl %eax, %ecx # e = e/2
.L4: xorl %edx, %edx # preparation for division/remainder operation
movl %ecx, %eax # e
divl %esi # e / 2 , e % 2
testl %edx, %edx # e % 2 == 0 ?
je .L2
imull %ebx, %edi # p *= b
subl $1, %ecx # e--
.L2: testl %ecx, %ecx # e != 0 ?
jne .L7

The use of division operation(s) (instead of cheaper shift and and) is a killer:
very slow, and taking additional registers and preparation.
The expensive division/remainder operation is even done twice per iteration in this case.
More register moves, and register clear operations
(the div operation is especially inflexible on x86-32) are needed.
.L6: bl __aeabi_uidiv
mov r4, r0
mov r0, r4 @ e
mov r1, r6 @ 2
bl __aeabi_uidivmod
cmp r1, #0 @ e % 2 == 0 ?
subne r4, r4, #1 @ if(e%2!=0) e--
mulne r7, r5, r7 @ if(e%2!=0) p *= b
cmp r4, #0 @ e == 0 ?
mov r1, r6 @ restore arg0, arg1
mov r0, r4
mul r5, r5, r5 @ b *= b
bne .L6

Similar, double div[mod] calculation in the armv6 code,
as well as additional movs and registers used,
and the more expensive div operation(s).
int ipowx(int b, unsigned int e, int two) {
int p = 1;
do {
if (e % two != 0) {
p *= b;
}
e /= two;
b *= b;
} while (e != 0);
return p;
}
int ipow(int b, unsigned int e) {
return ipowx(b, e, 2);
}

.L7: imull %ebx, %ebx
.L4: xorl %edx, %edx
movl %ecx, %eax
divl %esi
testl %edx, %edx
je .L2
imull %ebx, %edi
.L2: movl %ecx, %eax
xorl %edx, %edx
divl %esi
testl %eax, %eax
movl %eax, %ecx
jne .L7

This is even slower due to one additional multiplication in the last iteration.
This version however serves as the base for an optimization eliminating half of the division operations.
*speed-up factor
1.5 (x86-32)
*

```
.L7: imull %ebx, %ebx
.L4: xorl %edx, %edx
movl %ecx, %eax # e
divl %esi # e / 2, e % 2
movl %eax, %ecx # e = e / 2
testl %edx, %edx # e % 2 == 0 ?
je .L2
imull %ebx, %edi # p *= b
.L2: testl %ecx, %ecx # e == 0 ?
jne .L7
```

faster when eliminating the redundant division,
but still slow
int ipow(int b, unsigned int e) {
int p = 1;
for (;;) {
if (e & 1) {
p *= b;
e &= ~1;
}
if (!e) {
return p;
}
b *= b;
e >>= 1;
}
}

.L3: imull %ecx, %ecx # b *= b
shrl %edx # e >>= 1
.L4: testb $1, %dl # (e & 1) == 0 ?
je .L2
imull %ecx, %eax # p *= b
andl $-2, %edx # e &= ~1
.L2: testl %edx, %edx # e == 0 ?
jne .L3

This version explicitly uses binary operations in the C code.
Assembly outputs and performances are almost identical to the coalesced implementation seen earlier
(only differences:
subl $1,%edx vs. andl $-2,%edx
/
subne r1,r1,#1 vs. bicne r1,r1,#1
).
.L4: tst r1, #1 @ (e & 1) == 0 ?
bicne r1, r1, #1 @ if((e&1)!=0) e &= ~1
mulne r0, r3, r0 @ if((e&1)!=0) p *= b
cmp r1, #0 @ e == 0 ?
mul r3, r3, r3 @ b *= b
mov r1, r1, lsr #1 @ e >>= 1
bne .L4

The armv6 version is easier to read due to the conditional operations imposing less local jumps.
int ipowx(int b, unsigned int e, int one, int noteone) {
int p = 1;
for (;;) {
if (e & one) {
p *= b;
e &= noteone;
}
if (!e) {
return p;
}
b *= b;
e >>= 1;
}
}
int ipow(int b, unsigned int e) {
return ipowx(b, e, 1, ~1);
}

.L3: imull %ecx, %ecx
shrl %edx
.L4: testl %ebx, %edx
je .L2
imull %ecx, %eax
andl %esi, %edx
.L2: testl %edx, %edx
jne .L3

The parametrized x86-32 version is
slower here (factor 1.6),
than using constants,
despite they take more space in code and cpu caches.
.L4: tst r1, r2
andne r1, r1, r3
mulne ip, r0, ip
cmp r1, #0
mul r0, r0, r0
mov r1, r1, lsr #1
bne .L4

The parametrized armv6 version is
only marginally slower here (factor 1.1).
Dropping the
`e--`

or `e&=~1`

operation would lead to an even tighter loop,
but bears the overhead of an additional (discarded) multiplication
(and a shift),
turning out to be less effective on the chosen test case.

Eric Laroche / Mon Jan 19 2015