Stefan Kanthak via llvm-dev
2020-Sep-06 11:29 UTC
[llvm-dev] Disastrous performance of 128/128-bit integer division
<https://compiler-rt.llvm.org/index.html> boasts:

| The builtins library provides optimized implementations of this
| and other low-level routines, either in target-independent C form,
| or as a heavily-optimized assembly.

Compile the following source with both GCC -m64 -O3 and clang -m64 -O3,
run both programs at least on an AMD Ryzen and on different Intel Core
processors, compare the measured execution times ... then remove every
occurrence of the word "optimized" on the above web page.

--- 128-bit.c ---

// Copyright © 2018-2020, Stefan Kanthak <stefan.kanthak at nexgo.de>

#include <stdint.h>
#include <stdio.h>
#include <time.h>

extern __uint128_t __udivmodti4(__uint128_t dividend,
                                __uint128_t divisor,
                                __uint128_t *remainder);

__attribute__((noinline))
__uint128_t __unopti4(__uint128_t dividend,
                      __uint128_t divisor,
                      __uint128_t *remainder)
{
    if (remainder != NULL)
        *remainder = divisor;

    return dividend;
}

__attribute__((always_inline))
__uint128_t lfsr128(void)
{
    // 128-bit linear feedback shift register (Galois form) using
    // primitive polynomial 0x5DB2B62B0C5F8E1B:D8CCE715FCB2726D,
    // initialised with 2**128 / golden ratio

    const __uint128_t poly = (__uint128_t) 0x5DB2B62B0C5F8E1B << 64 | 0xD8CCE715FCB2726D;
    static __uint128_t lfsr = (__uint128_t) 0x9E3779B97F4A7C15 << 64 | 0xF39CC0605CEDC834;
    const __uint128_t sign = (__int128_t) lfsr >> 127;

    return lfsr = (lfsr << 1) ^ (poly & sign);
}

__attribute__((always_inline))
uint64_t lfsr64(void)
{
    // 64-bit linear feedback shift register (Galois form) using
    // primitive polynomial 0xAD93D23594C935A9 (CRC-64 "Jones"),
    // initialised with 2**64 / golden ratio

    static uint64_t lfsr = 0x9E3779B97F4A7C15;
    const uint64_t sign = (int64_t) lfsr >> 63;

    return lfsr = (lfsr << 1) ^ (0xAD93D23594C935A9 & sign);
}

__attribute__((always_inline))
uint32_t lfsr32(void)
{
    // 32-bit linear feedback shift register (Galois form) using
    // primitive polynomial 0xDB710641 (CRC-32 IEEE),
    // initialised with 2**32 / golden ratio

    static uint32_t lfsr = 0x9E3779B9;
    const uint32_t sign = (int32_t) lfsr >> 31;

    return lfsr = (lfsr << 1) ^ (0xDB710641 & sign);
}

int main(void)
{
    clock_t t0, t1, t2, tt;
    uint32_t n;
    __uint128_t dividend, divisor = ~0, remainder;
    volatile __uint128_t quotient;

    t0 = clock();

    for (n = 500000000u; n > 0u; --n)
    {
        dividend = lfsr128();
        quotient = __unopti4(dividend, divisor, NULL);
        divisor = lfsr64();
        quotient = __unopti4(dividend, divisor, &remainder);
    }

    t1 = clock();

    for (n = 500000000u; n > 0u; --n)
    {
        dividend = lfsr128();
        quotient = __udivmodti4(dividend, divisor, NULL);
        divisor = lfsr64();
        quotient = __udivmodti4(dividend, divisor, &remainder);
    }

    t2 = clock();
    tt = t2 - t0;
    t2 -= t1;
    t1 -= t0;
    t0 = t2 - t1;
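    // Each timed loop performs 2 * 500000000 = 10**9 calls, so the elapsed
    // CPU seconds reported below are numerically the average cost of one
    // call in nanoseconds -- hence the "nano-seconds" label.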
    printf("\n"
           "__unopti4()    %4lu.%06lu       0\n"
           "__udivmodti4() %4lu.%06lu    %4lu.%06lu\n"
           "               %4lu.%06lu nano-seconds\n",
           t1 / CLOCKS_PER_SEC, (t1 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t2 / CLOCKS_PER_SEC, (t2 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t0 / CLOCKS_PER_SEC, (t0 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           tt / CLOCKS_PER_SEC, (tt % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC);

    t0 = clock();

    for (n = 500000000u; n > 0u; --n)
    {
        dividend = lfsr128();
        quotient = __unopti4(dividend, divisor, NULL);
        divisor = lfsr32();
        quotient = __unopti4(dividend, divisor, &remainder);
    }

    t1 = clock();

    for (n = 500000000u; n > 0u; --n)
    {
        dividend = lfsr128();
        quotient = __udivmodti4(dividend, divisor, NULL);
        divisor = lfsr32();
        quotient = __udivmodti4(dividend, divisor, &remainder);
    }

    t2 = clock();
    tt = t2 - t0;
    t2 -= t1;
    t1 -= t0;
    t0 = t2 - t1;

    printf("\n"
           "__unopti4()    %4lu.%06lu       0\n"
           "__udivmodti4() %4lu.%06lu    %4lu.%06lu\n"
           "               %4lu.%06lu nano-seconds\n",
           t1 / CLOCKS_PER_SEC, (t1 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t2 / CLOCKS_PER_SEC, (t2 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t0 / CLOCKS_PER_SEC, (t0 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           tt / CLOCKS_PER_SEC, (tt % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC);

    t0 = clock();

    for (n = 500000000u; n > 0u; --n)
    {
        dividend = lfsr64();
        quotient = __unopti4(dividend, divisor, NULL);
        divisor = lfsr32();
        quotient = __unopti4(dividend, divisor, &remainder);
    }

    t1 = clock();

    for (n = 500000000u; n > 0u; --n)
    {
        dividend = lfsr64();
        quotient = __udivmodti4(dividend, divisor, NULL);
        divisor = lfsr32();
        quotient = __udivmodti4(dividend, divisor, &remainder);
    }

    t2 = clock();
    tt = t2 - t0;
    t2 -= t1;
    t1 -= t0;
    t0 = t2 - t1;

    printf("\n"
           "__unopti4()    %4lu.%06lu       0\n"
           "__udivmodti4() %4lu.%06lu    %4lu.%06lu\n"
           "               %4lu.%06lu nano-seconds\n",
           t1 / CLOCKS_PER_SEC, (t1 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t2 / CLOCKS_PER_SEC, (t2 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t0 / CLOCKS_PER_SEC, (t0 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           tt / CLOCKS_PER_SEC, (tt % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC);
}
--- EOF ---

AMD Ryzen 5 3600 6-Core Processor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                   GCC/libgcc                 LLVM/clang/compiler-rt

__unopti4()        1.668976      0              1.695685      0
__udivmodti4()    12.617999     10.949023     167.891651    166.195966
                  14.286975 nano-seconds      169.587336 nano-seconds

__unopti4()        1.760362      0              1.708451      0
__udivmodti4()    18.046460     16.286098     246.065291    244.356840
                  19.806822 nano-seconds      247.773742 nano-seconds

__unopti4()        1.248846      0              1.463868      0
__udivmodti4()     7.155582      5.906736      10.833658      9.369790
                   8.404428 nano-seconds       12.297526 nano-seconds

The factor 15 demonstrated here can be bumped up^Wdown to 30 if need be!
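The gap is widest in the first two runs, where the dividend is a full
128-bit value while the divisor (drawn from lfsr64() resp. lfsr32() after
the first call) fits into 64 bits. As an illustration of what a tuned
routine can do on exactly those inputs, here is a minimal sketch of a
64-bit-divisor fast path, assuming x86-64 and GNU-style inline assembly;
the helper div128by64 and the overall structure are illustrative only,
not the actual libgcc or compiler-rt source:

#include <stddef.h>
#include <stdint.h>

// Illustrative helper: divide the 128-bit value u_hi:u_lo by v with the
// x86-64 DIV instruction. Precondition: u_hi < v, so that the quotient
// fits into 64 bits (DIV raises #DE otherwise).
static inline uint64_t div128by64(uint64_t u_hi, uint64_t u_lo,
                                  uint64_t v, uint64_t *rem)
{
    uint64_t q, r;

    __asm__("divq %[v]"
            : "=a" (q), "=d" (r)
            : "a" (u_lo), "d" (u_hi), [v] "rm" (v));

    if (rem != NULL)
        *rem = r;

    return q;
}

// Sketch of a 64-bit-divisor fast path for a __udivmodti4-style routine:
// two hardware divisions instead of a long-running generic loop.
static __uint128_t udivmod128_sketch(__uint128_t u, __uint128_t v,
                                     __uint128_t *rem)
{
    if ((uint64_t) (v >> 64) == 0)      // divisor fits into 64 bits
    {
        const uint64_t d = (uint64_t) v;
        const uint64_t u_hi = (uint64_t) (u >> 64), u_lo = (uint64_t) u;
        const uint64_t q_hi = u_hi / d; // high quotient word
        uint64_t r;
        const uint64_t q_lo = div128by64(u_hi % d, u_lo, d, &r);

        if (rem != NULL)
            *rem = r;

        return (__uint128_t) q_hi << 64 | q_lo;
    }

    // Full 128-bit divisor: the generic algorithm belongs here; the
    // compiler's built-in operators merely stand in for it in this sketch.
    if (rem != NULL)
        *rem = u % v;

    return u / v;
}

Two hardware DIV instructions cover every division the benchmark performs
after its first call; a generic shift-subtract loop needs on the order of
128 iterations for the same operands.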