Hi,
For a long time we've been having a problem we've been working around in
OpenCL where loads are moving across an intrinsic used for a barrier.
Attached is the testcase, and the result of opt -S -basicaa -gvn on it.
This example is essentially this:
void foo(global float2* result, local float2* restrict data0, ...)
{
int id = get_local_id(0);
// ...
data0[id] = ...;
barrier();
if (id < N)
{
float2 x = data0[idx];
int other_index = ...;
data0[other_index] = x;
}
barrier();
result[id] = data0[id];
}
This is transformed so that the load from data0 after the second barrier
never occurs, but it is necessary. The final value written is replaced
with a phi, so the value isn't reloaded for the threads that don't take
the id < N branch. The threads that did take the branch did write to the
same index, so the load needs to occur after the barrier. This transform
does not occur if noalias is removed from the %data0 parameter. The
basic question ends up being if this is the intended behavior of noalias
/ restrict or just a bug.
Here are 2 previous threads about attempts at fixing this problem:
http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-June/062895.html
http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-August/064594.html
Both of these I think sort of went in the wrong direction and talked
specifically about the semantics of the atomic instructions (fence in
particular), which isn't the real question. Is noalias supposed to mean
that no other thread can also have a copy of the pointer it also
modifies? My guess at what was happening is that since the parameter is
noalias, the assumption is there is no possible way for the
side-effecting function to modify the pointer. The second thread brings
up an ambiguity in the C spec about how restrict is supposed to be
interpreted in the presense of multiple threads. OpenCL still has
restrict, but unless this is supposed to work, it is pretty close to
useless.
Right now we are working around this with a custom alias analysis pass
inserted that reports calls to the barrier intrinsics modify anything
with the right address spaces. Is a new intrinsic necessary to get the
right semantics for this case? Even if this worked correctly as it is
now, I don't believe there is any way to truly specify the fence per
address space so that other memory accesses to non-fenced ones could
move across it.
Thanks for any help
-------------- next part --------------
; ModuleID = '/home/marsenau/before_gvn.ll'
target datalayout = "e-p:32:32:32"
; Function Attrs: noduplicate nounwind
declare void @__amdil_barrier_local() #0
; Function Attrs: nounwind readnone
declare <4 x i32> @__amdil_get_local_id_int() #1
; Function Attrs: nounwind
define void @__OpenCL_execFFT_reduced_kernel(<2 x float> addrspace(1)*
noalias nocapture %in, <2 x float> addrspace(1)* noalias nocapture %out,
<2 x float> addrspace(3)* nocapture %data0) #2 {
entry:
%0 = tail call <4 x i32> @__amdil_get_local_id_int() #2
%1 = extractelement <4 x i32> %0, i32 0
%arrayidx = getelementptr <2 x float> addrspace(3)* %data0, i32 %1
%arrayidx3 = getelementptr <2 x float> addrspace(1)* %in, i32 %1
%tmp4 = load <2 x float> addrspace(1)* %arrayidx3, align 8
store <2 x float> %tmp4, <2 x float> addrspace(3)* %arrayidx,
align 8
tail call void @__amdil_barrier_local() #0
%cmp = icmp ult i32 %1, 5
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
%tmp10 = load <2 x float> addrspace(3)* %arrayidx, align 8
%tmp16 = mul i32 %1, 2
%arrayidx17 = getelementptr <2 x float> addrspace(3)* %data0, i32 %tmp16
store <2 x float> %tmp10, <2 x float> addrspace(3)* %arrayidx17,
align 8
br label %if.end
if.end: ; preds = %if.then, %entry
tail call void @__amdil_barrier_local() #0
%arrayidx21 = getelementptr <2 x float> addrspace(1)* %out, i32 %1
%tmp25 = load <2 x float> addrspace(3)* %arrayidx, align 8
store <2 x float> %tmp25, <2 x float> addrspace(1)* %arrayidx21,
align 8
ret void
}
attributes #0 = { noduplicate nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
-------------- next part --------------
; RUN: opt -S -basicaa -gvn < %s | FileCheck %s
target datalayout = "e-p:32:32:32"
; Function Attrs: noduplicate nounwind
declare void @__amdil_barrier_local() #0
; Function Attrs: nounwind readnone
declare <4 x i32> @__amdil_get_local_id_int() #1
; CHECK-LABEL: @__OpenCL_execFFT_reduced_kernel(
; CHECK: load <2 x float> addrspace(1)*
; CHECK: store <2 x float> %{{.*}}, <2 x float> addrspace(3)*
; CHECK: call void @__amdil_barrier_local()
; CHECK: br
; CHECK: if.end:
; CHECK: call void @__amdil_barrier_local()
; CHECK: load <2 x float> addrspace(3)*
; CHECK: store <2 x float> %{{.*}}, <2 x float> addrspace(1)*
define void @__OpenCL_execFFT_reduced_kernel(<2 x float> addrspace(1)*
noalias nocapture %in, <2 x float> addrspace(1)* noalias nocapture %out,
<2 x float> addrspace(3)* noalias nocapture %data0) #2 {
entry:
%0 = tail call <4 x i32> @__amdil_get_local_id_int() #2
%1 = extractelement <4 x i32> %0, i32 0
%arrayidx = getelementptr <2 x float> addrspace(3)* %data0, i32 %1
%arrayidx3 = getelementptr <2 x float> addrspace(1)* %in, i32 %1
%tmp4 = load <2 x float> addrspace(1)* %arrayidx3, align 8
store <2 x float> %tmp4, <2 x float> addrspace(3)* %arrayidx,
align 8
tail call void @__amdil_barrier_local() #0
%cmp = icmp ult i32 %1, 5
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
%tmp10 = load <2 x float> addrspace(3)* %arrayidx, align 8
%tmp16 = mul i32 %1, 2
%arrayidx17 = getelementptr <2 x float> addrspace(3)* %data0, i32 %tmp16
store <2 x float> %tmp10, <2 x float> addrspace(3)* %arrayidx17,
align 8
br label %if.end
if.end: ; preds = %if.then, %entry
tail call void @__amdil_barrier_local() #0
%arrayidx21 = getelementptr <2 x float> addrspace(1)* %out, i32 %1
%tmp25 = load <2 x float> addrspace(3)* %arrayidx, align 8
store <2 x float> %tmp25, <2 x float> addrspace(1)* %arrayidx21,
align 8
ret void
}
attributes #0 = { noduplicate nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }