Stefan Kanthak via llvm-dev
2020-Sep-06 11:33 UTC
[llvm-dev] Abysmal performance of 64/64-bit integer division
<https://compiler-rt.llvm.org/index.html> boasts: | The builtins library provides optimized implementations of this | and other low-level routines, either in target-independent C form, | or as a heavily-optimized assembly. Compile the following source with both GCC -m32 -O3 and clang -m32 -O3, run both programs at least on an AMD Ryzen and on different Intel Core processors, and compare the measured execution times ... then remove every occurance of the word "optimized" on the above web page. --- 64-bit.c --- // Copyright © 2018-2020, Stefan Kanthak <stefan.kanthak at nexgo.de> #include <stdint.h> #include <stdio.h> #include <time.h> extern uint64_t __udivmoddi4(uint64_t dividend, uint64_t divisor, uint64_t *remainder); __attribute__((noinline)) uint64_t __unopdi4(uint64_t dividend, uint64_t divisor, uint64_t *remainder) { if (remainder != NULL) *remainder = divisor; return dividend; } __attribute__((always_inline)) uint64_t lfsr64(void) { // 64-bit linear feedback shift register (Galois form) using // primitive polynomial 0xAD93D23594C935A9 (CRC-64 "Jones"), // initialised with 2**64 / golden ratio static uint64_t lfsr = 0x9E3779B97F4A7C15; const uint64_t sign = (int64_t) lfsr >> 63; return lfsr = (lfsr << 1) ^ (0xAD93D23594C935A9 & sign); } __attribute__((always_inline)) uint32_t lfsr32(void) { // 32-bit linear feedback shift register (Galois form) using // primitive polynomial 0xDB710641 (CRC-32 IEEE), // initialised with 2**32 / golden ratio static uint32_t lfsr = 0x9E3779B9; const uint32_t sign = (int32_t) lfsr >> 31; return lfsr = (lfsr << 1) ^ (0xDB710641 & sign); } int main(void) { clock_t t0, t1, t2, tt; uint32_t n; uint64_t dividend, divisor = ~0, remainder; volatile uint64_t quotient; t0 = clock(); for (n = 500000000u; n > 0u; --n) { dividend = lfsr64(); quotient = __unopdi4(dividend, divisor, NULL); divisor = lfsr32(); quotient = __unopdi4(dividend, divisor, &remainder); } t1 = clock(); for (n = 500000000u; n > 0u; --n) { dividend = lfsr64(); quotient = __udivmoddi4(dividend, divisor, NULL); divisor = lfsr32(); quotient = __udivmoddi4(dividend, divisor, &remainder); } t2 = clock(); tt = t2 - t0; t2 -= t1; t1 -= t0; t0 = t2 - t1; printf("\n" "__unopdi4() %4lu.%06lu 0\n" "__udivmoddi4() %4lu.%06lu %4lu.%06lu\n" " %4lu.%06lu nano-seconds\n", t1 / CLOCKS_PER_SEC, (t1 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC, t2 / CLOCKS_PER_SEC, (t2 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC, t0 / CLOCKS_PER_SEC, (t0 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC, tt / CLOCKS_PER_SEC, (tt % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC); } --- EOF --- On an AMD Ryzen 5 3600 this demonstrates a factor 11, which but can be "raised" to 15 if need be.