On May 29, 2010, at 01:08, Bill Wendling wrote:
> Hi Stéphane,
>
> The SSE support in the LLVM backend is fine. What is the code that's
> generated? Do you have some short examples of where LLVM doesn't do as well
> as the equivalent scalar code?
>
> -bw
>
> On May 28, 2010, at 12:13 PM, Stéphane Letz wrote:
We are currently testing LLVM for the Faust language (http://faust.grame.fr/).
Faust generates a C++ class from its .dsp Faust source file. So for the
following simple Faust example:
process = (+,+):*;
which can be displayed as the following processor (it takes 4 streams of float
samples, applies a "+" to each pair of streams and then a "*" on the two sums
to produce a single output stream):
[Attachment: plus.png (block diagram of the processor):
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20100529/b44926fc/attachment.png>]
The scalar C++ code is:
virtual void compute (int count, FAUSTFLOAT** input, FAUSTFLOAT** output) {
    FAUSTFLOAT* input0 = input[0];
    FAUSTFLOAT* input1 = input[1];
    FAUSTFLOAT* input2 = input[2];
    FAUSTFLOAT* input3 = input[3];
    FAUSTFLOAT* output0 = output[0];
    for (int i = 0; i < count; i++) {
        output0[i] = (FAUSTFLOAT)(((float)input2[i] + (float)input3[i]) *
                                  ((float)input0[i] + (float)input1[i]));
    }
}
The "vectorized" C++ code is :
virtual void compute (int fullcount, FAUSTFLOAT** input, FAUSTFLOAT** output) {
for (int index = 0; index < fullcount; index += 32) {
int count = min(32, fullcount-index);
FAUSTFLOAT* input0 = &input[0][index];
FAUSTFLOAT* input1 = &input[1][index];
FAUSTFLOAT* input2 = &input[2][index];
FAUSTFLOAT* input3 = &input[3][index];
FAUSTFLOAT* output0 = &output[0][index];
// SECTION : 1
for (int i=0; i<count; i++) {
output0[i] = (FAUSTFLOAT)(((float)input2[i] + (float)input3[i]) *
((float)input0[i] + (float)input1[i]));
}
}
}
(So basically the C++ code is split into "vectors" [here 32 samples each] that
are computed in separate loops, which can be auto-vectorized by some compilers
such as Intel ICC; this works quite well...)
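As a side note, here is a rough sketch (not Faust-generated code; the
__restrict qualifiers and the compute_block name are only illustrative
assumptions) of a vectorizer-friendly form of the inner loop, where the
restrict qualifiers tell the compiler that the streams do not alias:

static void compute_block(int count,
                          const float* __restrict input0,
                          const float* __restrict input1,
                          const float* __restrict input2,
                          const float* __restrict input3,
                          float* __restrict output0) {
    // Same per-sample computation as above; with non-aliasing pointers an
    // auto-vectorizer is free to turn this loop into SSE code.
    for (int i = 0; i < count; i++) {
        output0[i] = (input2[i] + input3[i]) * (input0[i] + input1[i]);
    }
}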
The scalar LLVM code is:

define void @llvm_compute(%struct.llvm_dsp* %obj, i32 %count, float** noalias %inputs, float** noalias %outputs) nounwind readnone ssp {
entry:
  %input_array_ptr0 = getelementptr inbounds float** %inputs, i64 0
  %input0 = load float** %input_array_ptr0, align 8
  %input_array_ptr1 = getelementptr inbounds float** %inputs, i64 1
  %input1 = load float** %input_array_ptr1, align 8
  %input_array_ptr2 = getelementptr inbounds float** %inputs, i64 2
  %input2 = load float** %input_array_ptr2, align 8
  %input_array_ptr3 = getelementptr inbounds float** %inputs, i64 3
  %input3 = load float** %input_array_ptr3, align 8
  %output_array_ptr0 = getelementptr inbounds float** %outputs, i64 0
  %output0 = load float** %output_array_ptr0, align 8
  %out = icmp sgt i32 %count, 0
  br i1 %out, label %convert, label %return

convert:
  %count_64 = zext i32 %count to i64
  br label %loop

loop:
  %indvar = phi i64 [ 0, %convert ], [ %indvar.next, %loop ]
  %output_ptr0 = getelementptr float* %output0, i64 %indvar
  %input_ptr1 = getelementptr float* %input1, i64 %indvar
  %fTemp0 = load float* %input_ptr1, align 4
  %input_ptr0 = getelementptr float* %input0, i64 %indvar
  %fTemp1 = load float* %input_ptr0, align 4
  %fTemp2 = fadd float %fTemp1, %fTemp0
  %input_ptr3 = getelementptr float* %input3, i64 %indvar
  %fTemp3 = load float* %input_ptr3, align 4
  %input_ptr2 = getelementptr float* %input2, i64 %indvar
  %fTemp4 = load float* %input_ptr2, align 4
  %fTemp5 = fadd float %fTemp4, %fTemp3
  %fTemp6 = fmul float %fTemp5, %fTemp2
  store float %fTemp6, float* %output_ptr0, align 4
  %indvar.next = add i64 %indvar, 1
  %exitcond = icmp eq i64 %indvar.next, %count_64
  br i1 %exitcond, label %return, label %loop

return:
  ret void
}
And the vectorized LLVM code is:

define void @llvm_compute(%struct.llvm_dsp* noalias %obj, i32 %count, <32 x float>** noalias %inputs, <32 x float>** noalias %outputs) nounwind readnone ssp {
entry:
  %input_array_ptr0 = getelementptr inbounds <32 x float>** %inputs, i64 0
  %input0 = load <32 x float>** %input_array_ptr0
  %input_array_ptr1 = getelementptr inbounds <32 x float>** %inputs, i64 1
  %input1 = load <32 x float>** %input_array_ptr1
  %input_array_ptr2 = getelementptr inbounds <32 x float>** %inputs, i64 2
  %input2 = load <32 x float>** %input_array_ptr2
  %input_array_ptr3 = getelementptr inbounds <32 x float>** %inputs, i64 3
  %input3 = load <32 x float>** %input_array_ptr3
  %output_array_ptr0 = getelementptr inbounds <32 x float>** %outputs, i64 0
  %output0 = load <32 x float>** %output_array_ptr0
  %out = icmp sgt i32 %count, 0
  br i1 %out, label %convert, label %return

convert:
  %count_64 = zext i32 %count to i64
  br label %loop0

loop0:
  %indvar = phi i64 [ 0, %convert ], [ %indvar.next, %loop0 ]
  %output_ptr0 = getelementptr <32 x float>* %output0, i64 %indvar
  %input_ptr1 = getelementptr <32 x float>* %input1, i64 %indvar
  %fVector0 = load <32 x float>* %input_ptr1, align 16
  %input_ptr0 = getelementptr <32 x float>* %input0, i64 %indvar
  %fVector1 = load <32 x float>* %input_ptr0, align 16
  %fVector2 = fadd <32 x float> %fVector1, %fVector0
  %input_ptr3 = getelementptr <32 x float>* %input3, i64 %indvar
  %fVector3 = load <32 x float>* %input_ptr3, align 16
  %input_ptr2 = getelementptr <32 x float>* %input2, i64 %indvar
  %fVector4 = load <32 x float>* %input_ptr2, align 16
  %fVector5 = fadd <32 x float> %fVector4, %fVector3
  %fVector6 = fmul <32 x float> %fVector5, %fVector2
  store <32 x float> %fVector6, <32 x float>* %output_ptr0, align 16
  %indvar.next = add i64 %indvar, 1
  %exitcond = icmp eq i64 %indvar.next, %count_64
  br i1 %exitcond, label %return, label %loop0

return:
  ret void
}
We tried playing with the "align" values on the loads/stores and with "noalias"
on the compute function parameters, without any real change. Do you see anything
clearly incorrect in the generated vectorized LLVM code? Or maybe memory
bandwidth is the limiting factor in this simple example, since there is not much
computation per sample?
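If memory bandwidth is the suspect, one rough way to check it (just a sketch,
with an assumed buffer size and timing harness) would be to time a pure copy
pass over the same buffers against the add/mul pass; if the two timings are
close, the loop is bound by memory traffic rather than by arithmetic:

#include <chrono>
#include <cstdio>
#include <vector>

int main() {
    const int count = 1 << 20;   // assumed buffer size, illustrative only
    std::vector<float> in0(count, 1.0f), in1(count, 2.0f),
                       in2(count, 3.0f), in3(count, 4.0f), out(count);
    volatile float sink = 0.0f;  // keep the loops from being optimized away

    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < count; i++)   // pure copy: memory traffic only
        out[i] = in0[i];
    sink = sink + out[count / 2];

    auto t1 = std::chrono::steady_clock::now();
    for (int i = 0; i < count; i++)   // the actual add/mul computation
        out[i] = (in2[i] + in3[i]) * (in0[i] + in1[i]);
    sink = sink + out[count / 2];

    auto t2 = std::chrono::steady_clock::now();
    std::printf("copy: %lld us, compute: %lld us (%f)\n",
        (long long)std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count(),
        (long long)std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count(),
        (double)sink);
    return 0;
}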
Thanks.
Stéphane Letz