Hi all,
Is there any optimization pass that can move vector operations (extractelement/insertelement) out of loops?
For example:
typedef short short2 __attribute__((ext_vector_type(2)));
short2 a[50],b[50],c;
void test() {
for (i=0; i<50; i++) {
c.y += a[i].x * b[i].y;
}
}
clang at -O3 gives me the following IR:
@i = common global i32 0, align 4
@a = common global [50 x <2 x i16>] zeroinitializer, align 4
@b = common global [50 x <2 x i16>] zeroinitializer, align 4
@c = common global <2 x i16> zeroinitializer, align 4
define void @test() nounwind {
entry:
store i32 0, i32* @i, align 4
%c.promoted = load <2 x i16>* @c, align 4
br label %for.body
for.body: ; preds = %entry, %for.body
%inc7 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%0 = phi <2 x i16> [ %c.promoted, %entry ], [ %6, %for.body ]
%storemerge6.off0 = phi i16 [ 0, %entry ], [ %extract.t, %for.body ]
%arrayidx = getelementptr inbounds [50 x <2 x i16>]* @a, i16 0, i16 %storemerge6.off0
%1 = load <2 x i16>* %arrayidx, align 4
%2 = extractelement <2 x i16> %1, i32 0
%arrayidx1 = getelementptr inbounds [50 x <2 x i16>]* @b, i16 0, i16 %storemerge6.off0
%3 = load <2 x i16>* %arrayidx1, align 4
%4 = extractelement <2 x i16> %3, i32 1
%mul = mul i16 %4, %2
%5 = extractelement <2 x i16> %0, i32 1
%add = add i16 %5, %mul
%6 = insertelement <2 x i16> %0, i16 %add, i32 1
%inc = add nsw i32 %inc7, 1
%cmp = icmp slt i32 %inc, 50
%extract.t = trunc i32 %inc to i16
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
store i32 50, i32* @i, align 4
store <2 x i16> %6, <2 x i16>* @c, align 4
ret void
}
The store to "c" is efficiently moved out of the loop, but the
insertelement and extractelement instructions are not.
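Since the loop only ever accesses the second element of vector c, wouldn't
it be more efficient to move the extractelement before the loop (into the
entry block) and the insertelement after it (into for.end), so that the
loop carries only a scalar i16 accumulator?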
Ivan