Hi all, redux sync intrinsics are not working as expected. clang/test/redux-builtins.cu has usage as out = __nvvm_redux_sync_add(val, 0xFF); out is the write location for the warp, val is the thread's contributed value, and 0xFF is the mask for a fully active warp. So far all usage of this builtin has resulted in an Illegal instruction. This is an nvcc application using the nvcc builtin to reduce across a warp: #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <cuda_profiler_api.h> #include <stdio.h> #include <stdlib.h> #include <cuda_runtime.h> __inline__ __device__ unsigned warpReduceRedux(unsigned val) { return __reduce_add_sync(0xFF, val); } __global__ void reduceKernel(unsigned *in, unsigned* out, int N) { unsigned sum = in[threadIdx.x]; sum = warpReduceRedux(sum); if (threadIdx.x == 0) out[0] = sum; } int main() { const int SIZE = 32; const int ARRAY_BYTES = SIZE * sizeof(unsigned); // generate the input array on the host unsigned h_in[SIZE]; unsigned sum = 0.0f; for (int i = 0; i < SIZE; i++) { h_in[i] = i; sum += h_in[i]; } // declare GPU memory pointers unsigned * d_in, *d_out; // allocate GPU memory cudaMalloc((void **)&d_in, ARRAY_BYTES); cudaMalloc((void **)&d_out, sizeof(unsigned)); // transfer the input array to the GPU cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); // offload to device reduceKernel<<<1, SIZE>>>(d_in, d_out, SIZE); // copy back the sum from GPU unsigned h_out; cudaMemcpy(&h_out, d_out, sizeof(unsigned), cudaMemcpyDeviceToHost); printf("%u\n", h_out); } cuda-memcheck is clear and has verifiable output. The same application, substituting the nvcc builtin for the clang one then building with clang: __inline__ __device__ unsigned warpReduceRedux(unsigned val) { return __nvvm_redux_sync_add(val, 0xFF); } compiles but does not pass cuda-memcheck and does not provide the correct output: ========= CUDA-MEMCHECK ========= Illegal Instruction ========= at 0x00000cf0 in reduceKernel(unsigned int*, unsigned int*, int) ========= by thread (0,0,0) in block (0,0,0) What is the usage for these? I've also attached the PTX emitted by these apps in case there's a backend issue to be found. -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20210806/bc8b118c/attachment.html> -------------- next part -------------- A non-text attachment was scrubbed... Name: clang.ptx Type: image/ptx Size: 7038 bytes Desc: not available URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20210806/bc8b118c/attachment.bin> -------------- next part -------------- A non-text attachment was scrubbed... Name: nvcc.ptx Type: image/ptx Size: 944 bytes Desc: not available URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20210806/bc8b118c/attachment-0001.bin>