--- byteswap.c ---
// inline
unsigned short swap16(unsigned short argument) {
    return (argument >> 8) | (argument << 8);
}

// inline
unsigned int swap32(unsigned int argument) {
    return (unsigned int) swap16((unsigned short) argument) << 16
         | (unsigned int) swap16((unsigned short) (argument >> 16));
}

unsigned long swap64(unsigned long argument) {
    return (unsigned long) swap32((unsigned int) argument) << 32
         | (unsigned long) swap32((unsigned int) (argument >> 32));
}
--- EOF ---

Compiled with "-O3", this generates the following UNOPTIMISED code for the
swap32() and swap64() functions (see <https://godbolt.org/z/DwnG-X>):

swap16:                                 # @swap16
        rol     di, 8
        mov     eax, edi
        ret
swap32:                                 # @swap32
        mov     ecx, edi
        rol     cx, 8
        shl     ecx, 16
        shr     edi, 16
        rol     di, 8
        movzx   eax, di
        or      eax, ecx
        ret
swap64:                                 # @swap64
        mov     eax, edi
        rol     ax, 8
        mov     ecx, edi
        shr     ecx, 16
        rol     cx, 8
        shl     eax, 16
        movzx   ecx, cx
        or      ecx, eax
        shl     rcx, 32
        mov     rax, rdi
        shr     rax, 32
        rol     ax, 8
        movzx   edx, ax
        shr     rdi, 48
        rol     di, 8
        shl     rdx, 16
        movzx   eax, di
        or      rax, rdx
        or      rax, rcx
        ret

Now look at what GCC 8.2 generates (see <https://godbolt.org/z/2_XhQN>):

swap16:
        mov     eax, edi
        rol     ax, 8
        ret
swap32:
        mov     edx, edi
        shr     edi, 16
        rol     dx, 8
        rol     di, 8
        sal     edx, 16
        movzx   eax, di
        or      eax, edx
        ret
swap64:
        mov     rax, rdi
        bswap   rax
        ret

While GCC, too, fails to optimise swap32() to a BSWAP, it does so for swap64()!

regards
Stefan Kanthak