

/* 32 bit implementation. */


/* Reverse the bits in an unsigned integer.
* The algorithm has a run time of O(lb(bitsize)) [where lb names the
* binary logarithm, i.e. 5 with 32 bit, 6 with 64 bit, etc].
*/
unsigned int bitreverse(unsigned int n)
{

/* Swap two sets of bit-groups.
* One set is calculated by applying 'lowmask' to the input ['n'], the
* other by applying 'highmask'; lowmask<highmask; they are complements
* (highmask=~lowmask) and highmask=lowmask<<shift, i.e. they are
* alternating patterns of m bits [0101.., 00110011.., etc].
* So first we swap bits [e.g. abcdefgh -> badcfehg], then bit pairs
* [badcfehg -> dcbahgfe], then quadruples [dcbahgfe -> hgfedcba], etc.
* This part of the algorithm uses the fact that the algorithm 'a^=b,
* b^=a, a^=b' swaps bit patterns [and hence bit groups]; that algorithm
* goes back to the fact that in the second step b=b^(a^b) -> b=b^(b^a)
* -> b=(b^b)^a -> b=0^a -> b=a [the exclusive-or (xor) operation is
* commutative and associative and x^x==0 and x^0==x; third step works
* analogous].
* 'n' is evaluated 6 times, 'lowmask' 2 times, 'highmask' 1 time,
* 'shift' 3 times.
*/
#define bitgroupswap3(n, lowmask, highmask, shift) \
    ((n) ^= ((n) & (lowmask)) << (shift), \
    (n) ^= ((n) & (highmask)) >> (shift), \
    (n) ^= ((n) & (lowmask)) << (shift))

/* Wrapper to calculate the high mask from the low mask and the shift.
* These wrappers make the over-all algorithm less redundant and terser.
* 'n' is evaluated once, 'lowmask' twice, 'shift' twice.
*/
#define bitgroupswap2(n, lowmask, shift) \
    (bitgroupswap3((n), (lowmask), (lowmask) << (shift), (shift)))

/* Wrapper to calculate the shift from the order of the bit mask.
*/
#define bitgroupswap(n, lowmask, order) \
    (bitgroupswap2((n), (lowmask), 1 << (order)))

    /* Swap groups of 1, 2, 4, 8 and 16 bits.
    * Note that the order of these swaps does not matter, they are
    * orthogonal.
    * [Applies the patterns 0x55555555 / 0xaaaaaaaa, 0x33333333 /
    * 0xcccccccc, 0x0f0f0f0f / 0xf0f0f0f0, 0x00ff00ff / 0xff00ff00,
    * 0x0000ffff / 0xffff0000
    * (55555555 / aaaaaaaa, 33333333 / cccccccc, 0f0f0f0f / f0f0f0f0,
    * 00ff00ff / ff00ff00, 0000ffff / ffff0000).]
    */
    bitgroupswap(n, 0x55555555u, 0);
    bitgroupswap(n, 0x33333333u, 1);
    bitgroupswap(n, 0x0f0f0f0fu, 2);
    bitgroupswap(n, 0x00ff00ffu, 3);
    bitgroupswap(n, 0x0000ffffu, 4);

    return n;
}


/* Sample 32 bit x86 assembly.
* This sample does not reuse 'immediate' number literals and fills the
* higer significant number literal bits with ones.
*
* _bitreverse:
*
*     mov ecx,dword ptr [esp+4]   ; ecx: 'n'
*
*     mov eax,ecx                 ; round order 0
*     and eax,0D5555555h          ; [highest bit will fall out]
*     shl eax,1
*     xor ecx,eax
*
*     mov edx,ecx                 ; second step
*     shr edx,1                   ; [shift/and order reversed]
*     and edx,55555555h
*     xor ecx,edx
*
*     mov eax,ecx                 ; third step [same as first]
*     and eax,0D5555555h
*     shl eax,1
*     xor ecx,eax
*
*     mov edx,ecx                 ; round order 1
*     and edx,0F3333333h          ; [highest two bits will fall out]
*     shl edx,2
*     xor ecx,edx
*
*     mov eax,ecx
*     shr eax,2
*     and eax,33333333h
*     xor ecx,eax
*
*     mov edx,ecx
*     and edx,0F3333333h
*     shl edx,2
*     xor ecx,edx
*
*     mov eax,ecx                 ; round order 2
*     and eax,0FF0F0F0Fh
*     shl eax,4
*     xor ecx,eax
*
*     mov edx,ecx
*     shr edx,4
*     and edx,0F0F0F0Fh
*     xor ecx,edx
*
*     mov eax,ecx
*     and eax,0FF0F0F0Fh
*     shl eax,4
*     xor ecx,eax
*
*     mov edx,ecx                 ; round order 3
*     and dh,0                    ; [&0xffff00ff]
*     shl edx,8
*     xor ecx,edx
*
*     mov eax,ecx
*     shr eax,8
*     and eax,0FF00FFh
*     xor ecx,eax
*
*     mov edx,ecx
*     and dh,0
*     shl edx,8
*     xor ecx,edx
*
*     mov eax,ecx                 ; [final] round order 4
*     shl eax,10h                 ; [no initial masking necessary]
*     xor ecx,eax
*
*     mov edx,ecx
*     shr edx,10h                 ; [masking done by logical shift]
*     xor ecx,edx
*
*     mov eax,ecx
*     shl eax,10h
*     xor eax,ecx                 ; final result to return register eax
*
*     ret
*/


/* Reference implementation to 'bitreverse'.
* The algorithm has a run time of O(bitsize).
*/
unsigned int bitreversereference(unsigned int n)
{
    unsigned int m;
    int i;

    m = 0;
    /* Go over all [32] bits. */
    for (i = 0; i < 32; i++) {
        /* Shift result vector to higher significance. */
        m <<= 1;
        /* Get least significant input bit. */
        m |= n & 1;
        /* Shift input vector to lower significance. */
        n >>= 1;
    }

    return m;
}


