On May 29, 2010, at 01:08, Bill Wendling wrote:

> Hi Stéphane,
>
> The SSE support in the LLVM backend is fine. What is the code that's generated? Do you have some short examples of where LLVM doesn't do as well as the equivalent scalar code?
>
> -bw
>
> On May 28, 2010, at 12:13 PM, Stéphane Letz wrote:

We are actually testing LLVM for the Faust language (http://faust.grame.fr/).

Currently Faust generates a C++ class from its .dsp Faust source file. So take the following simple Faust example:

process = (+,+):*;

which can be displayed as the following processor (it takes 4 streams of float samples, does a "+" and then a "*" on the streams to produce a single output):

[Attachment: plus.png, block diagram of the processor — http://lists.llvm.org/pipermail/llvm-dev/attachments/20100529/b44926fc/attachment.png]

The scalar C++ code is:

virtual void compute (int count, FAUSTFLOAT** input, FAUSTFLOAT** output) {
    FAUSTFLOAT* input0 = input[0];
    FAUSTFLOAT* input1 = input[1];
    FAUSTFLOAT* input2 = input[2];
    FAUSTFLOAT* input3 = input[3];
    FAUSTFLOAT* output0 = output[0];
    for (int i = 0; i < count; i++) {
        output0[i] = (FAUSTFLOAT)(((float)input2[i] + (float)input3[i]) * ((float)input0[i] + (float)input1[i]));
    }
}

The "vectorized" C++ code is:

virtual void compute (int fullcount, FAUSTFLOAT** input, FAUSTFLOAT** output) {
    for (int index = 0; index < fullcount; index += 32) {
        int count = min(32, fullcount - index);
        FAUSTFLOAT* input0 = &input[0][index];
        FAUSTFLOAT* input1 = &input[1][index];
        FAUSTFLOAT* input2 = &input[2][index];
        FAUSTFLOAT* input3 = &input[3][index];
        FAUSTFLOAT* output0 = &output[0][index];
        // SECTION : 1
        for (int i = 0; i < count; i++) {
            output0[i] = (FAUSTFLOAT)(((float)input2[i] + (float)input3[i]) * ((float)input0[i] + (float)input1[i]));
        }
    }
}

(So basically the C++ code is split into "vectors" [here 32 samples] that are computed in separate loops which can be auto-vectorized by some compilers such as Intel ICC; this works quite well...)
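To make the calling convention concrete, here is a minimal, self-contained driver for the scalar compute() above. It is only a sketch: the mydsp class name, the 512-frame buffer size, and the std::vector-based allocation are illustrative assumptions, not part of the Faust-generated code quoted in this thread.

#include <vector>

typedef float FAUSTFLOAT;

// Hypothetical stand-in for the Faust-generated class; only compute()
// mirrors the scalar version quoted above.
struct mydsp {
    virtual void compute(int count, FAUSTFLOAT** input, FAUSTFLOAT** output) {
        FAUSTFLOAT* input0 = input[0];
        FAUSTFLOAT* input1 = input[1];
        FAUSTFLOAT* input2 = input[2];
        FAUSTFLOAT* input3 = input[3];
        FAUSTFLOAT* output0 = output[0];
        for (int i = 0; i < count; i++) {
            output0[i] = (input2[i] + input3[i]) * (input0[i] + input1[i]);
        }
    }
    virtual ~mydsp() {}
};

int main() {
    const int kFrames = 512;  // assumed block size, purely illustrative
    std::vector<std::vector<float> > in(4, std::vector<float>(kFrames, 1.0f));
    std::vector<float> out(kFrames, 0.0f);

    float* inputs[4]  = { &in[0][0], &in[1][0], &in[2][0], &in[3][0] };
    float* outputs[1] = { &out[0] };

    mydsp dsp;
    dsp.compute(kFrames, inputs, outputs);  // every out[i] becomes (1+1)*(1+1) = 4
    return 0;
}

The point it illustrates is the non-interleaved layout: each of the 4 input channels is a separate contiguous float buffer, and the output is a single buffer of the same length.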
The scalar LLVM code is:

define void @llvm_compute(%struct.llvm_dsp* %obj, i32 %count, float** noalias %inputs, float** noalias %outputs) nounwind readnone ssp {
entry:
  %input_array_ptr0 = getelementptr inbounds float** %inputs, i64 0
  %input0 = load float** %input_array_ptr0, align 8
  %input_array_ptr1 = getelementptr inbounds float** %inputs, i64 1
  %input1 = load float** %input_array_ptr1, align 8
  %input_array_ptr2 = getelementptr inbounds float** %inputs, i64 2
  %input2 = load float** %input_array_ptr2, align 8
  %input_array_ptr3 = getelementptr inbounds float** %inputs, i64 3
  %input3 = load float** %input_array_ptr3, align 8
  %output_array_ptr0 = getelementptr inbounds float** %outputs, i64 0
  %output0 = load float** %output_array_ptr0, align 8
  %out = icmp sgt i32 %count, 0
  br i1 %out, label %convert, label %return
convert:
  %count_64 = zext i32 %count to i64
  br label %loop
loop:
  %indvar = phi i64 [ 0, %convert ], [ %indvar.next, %loop ]
  %output_ptr0 = getelementptr float* %output0, i64 %indvar
  %input_ptr1 = getelementptr float* %input1, i64 %indvar
  %fTemp0 = load float* %input_ptr1, align 4
  %input_ptr0 = getelementptr float* %input0, i64 %indvar
  %fTemp1 = load float* %input_ptr0, align 4
  %fTemp2 = fadd float %fTemp1, %fTemp0
  %input_ptr3 = getelementptr float* %input3, i64 %indvar
  %fTemp3 = load float* %input_ptr3, align 4
  %input_ptr2 = getelementptr float* %input2, i64 %indvar
  %fTemp4 = load float* %input_ptr2, align 4
  %fTemp5 = fadd float %fTemp4, %fTemp3
  %fTemp6 = fmul float %fTemp5, %fTemp2
  store float %fTemp6, float* %output_ptr0, align 4
  %indvar.next = add i64 %indvar, 1
  %exitcond = icmp eq i64 %indvar.next, %count_64
  br i1 %exitcond, label %return, label %loop
return:
  ret void
}

And the vectorized LLVM code is:

define void @llvm_compute(%struct.llvm_dsp* noalias %obj, i32 %count, <32 x float>** noalias %inputs, <32 x float>** noalias %outputs) nounwind readnone ssp {
entry:
  %input_array_ptr0 = getelementptr inbounds <32 x float>** %inputs, i64 0
  %input0 = load <32 x float>** %input_array_ptr0
  %input_array_ptr1 = getelementptr inbounds <32 x float>** %inputs, i64 1
  %input1 = load <32 x float>** %input_array_ptr1
  %input_array_ptr2 = getelementptr inbounds <32 x float>** %inputs, i64 2
  %input2 = load <32 x float>** %input_array_ptr2
  %input_array_ptr3 = getelementptr inbounds <32 x float>** %inputs, i64 3
  %input3 = load <32 x float>** %input_array_ptr3
  %output_array_ptr0 = getelementptr inbounds <32 x float>** %outputs, i64 0
  %output0 = load <32 x float>** %output_array_ptr0
  %out = icmp sgt i32 %count, 0
  br i1 %out, label %convert, label %return
convert:
  %count_64 = zext i32 %count to i64
  br label %loop0
loop0:
  %indvar = phi i64 [ 0, %convert ], [ %indvar.next, %loop0 ]
  %output_ptr0 = getelementptr <32 x float>* %output0, i64 %indvar
  %input_ptr1 = getelementptr <32 x float>* %input1, i64 %indvar
  %fVector0 = load <32 x float>* %input_ptr1, align 16
  %input_ptr0 = getelementptr <32 x float>* %input0, i64 %indvar
  %fVector1 = load <32 x float>* %input_ptr0, align 16
  %fVector2 = fadd <32 x float> %fVector1, %fVector0
  %input_ptr3 = getelementptr <32 x float>* %input3, i64 %indvar
  %fVector3 = load <32 x float>* %input_ptr3, align 16
  %input_ptr2 = getelementptr <32 x float>* %input2, i64 %indvar
  %fVector4 = load <32 x float>* %input_ptr2, align 16
  %fVector5 = fadd <32 x float> %fVector4, %fVector3
  %fVector6 = fmul <32 x float> %fVector5, %fVector2
  store <32 x float> %fVector6, <32 x float>* %output_ptr0, align 16
  %indvar.next = add i64 %indvar, 1
  %exitcond = icmp eq i64 %indvar.next, %count_64
  br i1 %exitcond, label %return, label %loop0
return:
  ret void
}

We tried to play with the "align" on the loads/stores and with "noalias" on the compute function parameters, without any real change.

Do you see anything clearly incorrect in the generated vectorized LLVM code? Or is memory bandwidth perhaps the limiting factor in this simple example, which does so little computation on the samples?

Thanks.

Stéphane Letz
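Regarding the memory-bandwidth question, the loop body itself gives a quick back-of-the-envelope answer: each output sample costs 4 input loads and 1 store (5 floats, 20 bytes of traffic) for only 3 floating-point operations (2 adds, 1 multiply), i.e. roughly 0.15 flop per byte. The short sketch below just spells out that arithmetic; the conclusion in the comment is an interpretation, not a measurement from this thread.

#include <cstdio>

int main() {
    // Per output sample in the compute() loop quoted above:
    const double loads_per_sample  = 4.0;  // input0..input3
    const double stores_per_sample = 1.0;  // output0
    const double bytes_per_sample  = (loads_per_sample + stores_per_sample) * sizeof(float);
    const double flops_per_sample  = 3.0;  // 2 fadd + 1 fmul

    const double intensity = flops_per_sample / bytes_per_sample;  // ~0.15 flop/byte
    std::printf("arithmetic intensity: %.2f flop/byte\n", intensity);

    // With so little arithmetic per byte moved, the loop is likely limited by
    // memory traffic once the buffers fall out of cache, in which case
    // vectorizing the math alone cannot buy much.
    return 0;
}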
On Sat, May 29, 2010 at 12:42 AM, Stéphane Letz <letz at grame.fr> wrote:
> [...]
>
> We tried to play with the "align" on the loads/stores and with "noalias" on the compute function parameters, without any real change.
>
> Do you see anything clearly incorrect in the generated vectorized LLVM code? Or is memory bandwidth perhaps the limiting factor in this simple example, which does so little computation on the samples?

<32 x float> takes up 8 SSE registers; you're likely running into issues with register pressure. Does it work better if you use something smaller like <4 x float>?

Besides that, I don't see any obvious issues.

-Eli
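To make the <4 x float> suggestion concrete on the C++ side, the same kernel can be written 4 samples at a time with SSE intrinsics, so each live vector occupies a single xmm register instead of 8. This is only a hand-written sketch (it assumes count is a multiple of 4 and uses unaligned loads/stores so no alignment guarantee is needed); it is not code generated by Faust or LLVM.

#include <xmmintrin.h>  // SSE: __m128, _mm_loadu_ps, _mm_add_ps, _mm_mul_ps, _mm_storeu_ps

// Same computation as the scalar loop, processed 4 floats per iteration.
// Assumes count % 4 == 0; a scalar tail loop would handle any remainder.
void compute_sse(int count, float** input, float** output) {
    float* in0  = input[0];
    float* in1  = input[1];
    float* in2  = input[2];
    float* in3  = input[3];
    float* out0 = output[0];
    for (int i = 0; i < count; i += 4) {
        __m128 a = _mm_add_ps(_mm_loadu_ps(in0 + i), _mm_loadu_ps(in1 + i));
        __m128 b = _mm_add_ps(_mm_loadu_ps(in2 + i), _mm_loadu_ps(in3 + i));
        _mm_storeu_ps(out0 + i, _mm_mul_ps(b, a));
    }
}

The corresponding LLVM IR would simply use <4 x float> loads, fadd, fmul and a store in place of the <32 x float> ones shown above.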
On Sat, May 29, 2010 at 1:18 AM, Eli Friedman <eli.friedman at gmail.com> wrote:
> On Sat, May 29, 2010 at 12:42 AM, Stéphane Letz <letz at grame.fr> wrote:
>> [...]
>
> <32 x float> takes up 8 SSE registers; you're likely running into issues with register pressure. Does it work better if you use something smaller like <4 x float>?
>
> Besides that, I don't see any obvious issues.
>
> -Eli

Oh, and you might also want to check that you're actually getting SSE code; if you accidentally disable SSE somehow, you'll end up with x87 code, which will completely expand the vectors into scalars.

-Eli
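One inexpensive way to catch the x87 fallback on the C++ side is a compile-time guard: GCC and Clang define __SSE__/__SSE2__ only when SSE code generation is enabled (on x86-64, SSE2 is on by default). For the LLVM IR path, the equivalent check is to look at the llc output for packed instructions such as addps/mulps rather than x87 arithmetic. A minimal, illustrative guard:

// Illustrative compile-time check, assuming GCC or Clang:
// the build fails if SSE2 code generation has been disabled,
// which would force float vectors to be scalarized to x87.
#if !defined(__SSE2__)
#error "SSE2 code generation is disabled; vectors will be expanded to x87 scalars"
#endif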
> <32 x float> takes up 8 SSE registers; you're likely running into issues with register pressure. Does it work better if you use something smaller like <4 x float>?
>
> Besides that, I don't see any obvious issues.
>
> -Eli

You are right, yes. The code runs faster with <4 x float> types, although it is still a bit slower than the scalar version.

Stéphane Letz