I was able to get the loop to increment from -999 to 0 using IR
directly. That got rid of the cmpq.

The carry I was after can be obtained using the intrinsic
@llvm.uadd.with.overflow.i64; however, there is no way to add with
carry and have it realise that the resulting *carry out* cannot exceed
1. It actually writes the carry to a byte and then uses logical
operations on it, which slows things down much more.

I guess what is needed is an intrinsic @llvm.uadc.with.overflow.i64,
which would take three arguments: the two i64's being added and an i1,
the carry from before.

This and a similar usbb might be the only things missing to make IR
efficient for developing low-level routines for a bignum library!

Bill.

On 12 June 2010 19:37, Bill Hart <goodwillhart at googlemail.com> wrote:
> On 12 June 2010 00:51, Eli Friedman <eli.friedman at gmail.com> wrote:
>> On Fri, Jun 11, 2010 at 3:28 PM, Bill Hart <goodwillhart at googlemail.com> wrote:
>>> Hi Eli,
>>>
>>> On 11 June 2010 22:44, Eli Friedman <eli.friedman at gmail.com> wrote:
>>>> On Fri, Jun 11, 2010 at 10:37 AM, Bill Hart <goodwillhart at googlemail.com> wrote:
>>>>> a) What plans are there to support addition, subtraction,
>>>>> multiplication, division, shifting, logical not and other logic ops
>>>>> for >= 64/128 bits (machine word size, whatever)? The documentation
>>>>> mentions integers of millions of bits....
>>>>
>>>> Pretty much everything besides multiplication and division should work
>>>> at any width; the result is generally not especially efficient, though.
>>>
>>> OK, I only tried multiplication last night when I first had a decent
>>> fiddle (I've been marking exams, so haven't had much time to really get
>>> my hands dirty, sorry).
>>>
>>> Of course efficiency is imperative for bignum stuff, as it underpins
>>> absolutely behemoth projects like Sage (http://www.sagemath.org).
>>> Every cycle counts, so to speak. A one-cycle-per-limb difference in
>>> multiplication makes nearly a 40% difference in absolutely everything
>>> for literally many thousands of developers!
>>
>> Right; improvements are definitely welcome here; you might want to
>> take a look at what we currently generate here, though, to see what
>> improvements are most necessary.
>>
>
> I think I have an example of why it is important to be able to
> retrieve the carry and do an add with carry. Consider this short C
> program:
>
> #include <stdlib.h>
> #include <stdio.h>
>
> #define BITS 64
>
> /****************************************
>
>    Types
>
> ****************************************/
>
> typedef unsigned long ul;
> typedef __uint128_t ull;
> typedef ul mp_size;
> typedef const ul * mp_src;
> typedef ul * mp_dst;
> typedef ul * mp_ptr;
>
> /****************************************
>
>    Random routines
>
> ****************************************/
>
> ull __randval = (ull) 13993185049168412078UL;
> const ull __randprime = (ull) 9223372036854775814UL * 2 + 1;
> const ull __randmult = 18148508189596611927UL;
>
> ul ul_randlimb(void)
> {
>    __randval = (__randval * __randmult) % __randprime;
>    return (ul) __randval;
> }
>
> /****************************************
>
>    Unsigned multiple precision routines
>
> ****************************************/
>
> mp_ptr mp_init(mp_size n)
> {
>    return malloc(n*sizeof(ul));
> }
>
> static inline
> ul mp_add_nc(mp_dst r, mp_src a, mp_src b, mp_size n)
> {
>    long i;
>    ul cy;
>
>    const __uint128_t v = (__uint128_t) a[0] + b[0];
>    r[0] = (ul) v;
>    cy = v >> BITS;
>
>    for (i = 1; i < n; i++) {
>       __uint128_t u = (__uint128_t) a[i] + b[i] + cy;
>       r[i] = (ul) u;
>       cy = u >> BITS;
>    }
>
>    return cy;
> }
>
> void mp_rand_n(mp_dst r, mp_size n)
> {
>    mp_size i;
>
>    for (i = 0; i < n; i++)
>       r[i] = ul_randlimb();
> }
>
> int main(void)
> {
>    mp_ptr a, b, c;
>    ul i;
>
>    a = mp_init(1000);
>    b = mp_init(1000);
>    c = mp_init(1000);
>
>    mp_rand_n(a, 1000);
>    mp_rand_n(b, 1000);
>
>    for (i = 0; i < 2400000; i++)
>       mp_add_nc(c, a, b, 1000);
>
>    return 0;
> }
>
> Ignore all of it except the mp_add_nc function. Now this runs at 4
> cycles per 64-bit addition on AMD K10. If I fiddle a bit and unroll
> the loop, I get 2.5 cycles. But optimal is actually 1.6 cycles.
>
> The part of the loop in question becomes:
>
>  %tmp.i = add i64 %indvar.i, 1                   ; <i64> [#uses=2]
>  %22 = load i64* %scevgep.i, align 8             ; <i64> [#uses=1]
>  %23 = zext i64 %22 to i128                      ; <i128> [#uses=1]
>  %24 = load i64* %scevgep3.i, align 8            ; <i64> [#uses=1]
>  %25 = zext i64 %24 to i128                      ; <i128> [#uses=1]
>  %26 = zext i64 %cy.02.i to i128                 ; <i128> [#uses=1]
>  %27 = add i128 %23, %26                         ; <i128> [#uses=1]
>  %28 = add i128 %27, %25                         ; <i128> [#uses=2]
>  %29 = trunc i128 %28 to i64                     ; <i64> [#uses=1]
>  store i64 %29, i64* %scevgep4.i, align 8
>  %30 = lshr i128 %28, 64                         ; <i128> [#uses=1]
>  %31 = trunc i128 %30 to i64                     ; <i64> [#uses=1]
>  %exitcond = icmp eq i64 %tmp.i, 999             ; <i1> [#uses=1]
>
> In other words, it just extends everything to an i128 and adds.
> There's no way to tell it that it can add a[i], b[i] and cy with a
> single adc. (Well, it could, if the loop iteration weren't messing with
> the carry flag.)
>
> Indeed, this compiles to:
>
> .LBB1_7:                               # %bb.i
>                                        #   Parent Loop BB1_6 Depth=1
>                                        # =>  This Inner Loop Header: Depth=2
>        addq    (%rbx,%rsi,8), %rdi
>        movl    $0, %r8d
>        adcq    $0, %r8
>        addq    (%r14,%rsi,8), %rdi
>        adcq    $0, %r8
>        movq    %rdi, (%r15,%rsi,8)
>        incq    %rsi
>        cmpq    $1000, %rsi             # imm = 0x3E8
>        movq    %r8, %rdi
>        jne     .LBB1_7
>
> So it basically tries to keep track of the carry in %r8 instead of in
> the carry flag.
>
> As hinted, the other optimisation missed here is that instead of
> comparing with $1000 it could start %rsi at $-1000, increment it each
> iteration, and simply do a jnz .LBB1_7, doing away with the cmpq.
>
> I've tried tricking it into doing this in every way I can think of,
> but without success so far.
>
> Bill.
>
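For concreteness, the proposal above amounts to an intrinsic of roughly
the shape sketched in the comment below (it does not exist in LLVM; the
name and argument order are taken from the message above). A single limb
of a + b + carry can at least be expressed today by chaining two calls to
the existing @llvm.uadd.with.overflow.i64 and or-ing the overflow bits,
since at most one of the two additions can wrap when the carry in is 0 or
1. The wrapper name @adc64 and the overall layout are illustrative only,
not code from this thread:

  ; hypothetical, as proposed above -- not an existing LLVM intrinsic:
  ;   declare { i64, i1 } @llvm.uadc.with.overflow.i64(i64, i64, i1)

  ; what can be written with the intrinsic that does exist:
  declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)

  define { i64, i1 } @adc64(i64 %a, i64 %b, i1 %cin) nounwind {
    ; a + b, capturing the first carry
    %s1    = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
    %lo1   = extractvalue { i64, i1 } %s1, 0
    %c1    = extractvalue { i64, i1 } %s1, 1
    ; add the incoming carry, capturing the second carry
    %cin64 = zext i1 %cin to i64
    %s2    = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %lo1, i64 %cin64)
    %lo2   = extractvalue { i64, i1 } %s2, 0
    %c2    = extractvalue { i64, i1 } %s2, 1
    ; at most one of the two additions can overflow, so or gives the carry out
    %cout  = or i1 %c1, %c2
    %r0    = insertvalue { i64, i1 } undef, i64 %lo2, 0
    %r1    = insertvalue { i64, i1 } %r0, i1 %cout, 1
    ret { i64, i1 } %r1
  }

Nothing in this form tells the code generator that the or of the two
overflow bits still fits in the carry flag, which is the problem described
above: the carry ends up materialised in a register rather than carried in
the flags.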
Hi Bill-

I think, ideally, the backend would be able to match arbitrary-precision
arithmetic to add-with-carry or subtract-with-borrow through i65/i33.
That would remove the need for the overflow intrinsics entirely.

Alistair

On 13 Jun 2010, at 02:27, Bill Hart wrote:

> I was able to get the loop to increment from -999 to 0 using IR
> directly. That got rid of the cmpq.
>
> The carry I was after can be obtained using the intrinsic
> @llvm.uadd.with.overflow.i64; however, there is no way to add with
> carry and have it realise that the resulting *carry out* cannot exceed
> 1. It actually writes the carry to a byte and then uses logical
> operations on it, which slows things down much more.
>
> I guess what is needed is an intrinsic @llvm.uadc.with.overflow.i64,
> which would take three arguments: the two i64's being added and an i1,
> the carry from before.
>
> This and a similar usbb might be the only things missing to make IR
> efficient for developing low-level routines for a bignum library!
>
> Bill.
>
> [...]
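To illustrate the i65 idea, a single 64-bit limb step written that way
would look something like the sketch below: everything is widened by one
bit so the carry out is simply bit 64 of the sum, and the hope is that the
backend could match the zext/add/lshr/trunc pattern to an add/adc pair
instead of legalising i65 arithmetic generically. The function and value
names are made up for the sketch, not code from this thread:

  define { i64, i1 } @adc_via_i65(i64 %a, i64 %b, i1 %cin) nounwind {
    ; widen operands and carry in by one bit
    %a65  = zext i64 %a to i65
    %b65  = zext i64 %b to i65
    %c65  = zext i1 %cin to i65
    %t    = add i65 %a65, %b65
    %s    = add i65 %t, %c65
    ; low 64 bits are the result limb, bit 64 is the carry out
    %lo   = trunc i65 %s to i64
    %hi   = lshr i65 %s, 64
    %cout = trunc i65 %hi to i1
    %r0   = insertvalue { i64, i1 } undef, i64 %lo, 0
    %r1   = insertvalue { i64, i1 } %r0, i1 %cout, 1
    ret { i64, i1 } %r1
  }

This is the pattern the backend would be expected to recognise; whether it
does so is exactly the code-quality question raised in this thread.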
Yeah, I had a think about it, and I think intrinsics are the wrong way to
do it. So I'd say you are likely right.

Bill.

On 13 June 2010 04:33, Alistair Lynn <arplynn at gmail.com> wrote:
> Hi Bill-
>
> I think, ideally, the backend would be able to match arbitrary-precision
> arithmetic to add-with-carry or subtract-with-borrow through i65/i33.
> That would remove the need for the overflow intrinsics entirely.
>
> Alistair
>
> On 13 Jun 2010, at 02:27, Bill Hart wrote:
>
> [...]
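Putting the two points from this thread together -- a carry obtained from
the overflow intrinsic, and an induction variable that counts up from a
negative value so the loop can branch on the flags set by the increment
rather than on a separate cmpq -- a 1000-limb add written directly in IR
might look roughly like the following. The pointer arguments are assumed
to point one past the end of each array so that a negative index walks it
forwards; the function name and structure are illustrative, not Bill's
actual code, and the load/getelementptr syntax matches the 2010-era dump
quoted above:

  declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)

  define i64 @mp_add_1000(i64* %rend, i64* %aend, i64* %bend) nounwind {
  entry:
    br label %loop

  loop:
    %i  = phi i64 [ -1000, %entry ], [ %i.next, %loop ]
    %cy = phi i64 [ 0, %entry ], [ %cy.next, %loop ]
    %ap = getelementptr i64* %aend, i64 %i
    %bp = getelementptr i64* %bend, i64 %i
    %rp = getelementptr i64* %rend, i64 %i
    %ai = load i64* %ap, align 8
    %bi = load i64* %bp, align 8
    ; a[i] + b[i], then + carry; the carry out is the or of the two
    ; overflow bits, which can never exceed 1
    %s1  = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %ai, i64 %bi)
    %lo1 = extractvalue { i64, i1 } %s1, 0
    %c1  = extractvalue { i64, i1 } %s1, 1
    %s2  = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %lo1, i64 %cy)
    %lo2 = extractvalue { i64, i1 } %s2, 0
    %c2  = extractvalue { i64, i1 } %s2, 1
    %c   = or i1 %c1, %c2
    %cy.next = zext i1 %c to i64
    store i64 %lo2, i64* %rp, align 8
    ; counting up from -1000 lets the backend branch on the result of the
    ; increment itself, with no separate compare against 1000
    %i.next = add i64 %i, 1
    %done = icmp eq i64 %i.next, 0
    br i1 %done, label %exit, label %loop

  exit:
    ret i64 %cy.next
  }

Whether a backend can turn this into a tight add/adc loop without a
dedicated adc-style intrinsic is the open question the thread leaves off
on.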