Dmitry N. Mikushin
2012-Jul-11 01:46 UTC
[LLVMdev] [NVPTX] llc -march=nvptx64 -mcpu=sm_20 generates invalid zero align for device function params
Hello, FYI, this is a bug http://llvm.org/bugs/show_bug.cgi?id=13324 When compiling the following code for sm_20, func params are by some reason given with .align 0, which is invalid. Problem does not occur if compiled for sm_10.> cat test.ll; ModuleID = '__kernelgen_main_module' target datalayout = "e-p:64:64-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx64-unknown-unknown" %struct.float2 = type { float, float } define ptx_device void @__internal_dsmul(%struct.float2* noalias nocapture sret %agg.result, %struct.float2* nocapture byval %x, %struct.float2* nocapture byval %y) nounwind inlinehint alwaysinline { entry: %y1 = getelementptr inbounds %struct.float2* %x, i64 0, i32 1 %0 = load float* %y1, align 4 %sub = fsub float -0.000000e+00, %0 %1 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %sub, float 4.097000e+03, float %0) nounwind %2 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %0, float 4.097000e+03, float %1) nounwind %y5 = getelementptr inbounds %struct.float2* %y, i64 0, i32 1 %3 = load float* %y5, align 4 %sub7 = fsub float -0.000000e+00, %3 %4 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %sub7, float 4.097000e+03, float %3) nounwind %5 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %3, float 4.097000e+03, float %4) nounwind %sub12 = fsub float %0, %2 %sub14 = fsub float %3, %5 %6 = tail call float @llvm.nvvm.mul.rn.f(float %0, float %3) nounwind %sub18 = fsub float -0.000000e+00, %6 %7 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %2, float %5, float %sub18) nounwind %8 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %2, float %sub14, float %7) nounwind %9 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %5, float %sub12, float %8) nounwind %10 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %sub12, float %sub14, float %9) nounwind %x24 = getelementptr inbounds %struct.float2* %y, i64 0, i32 0 
%11 = load float* %x24, align 4 %12 = tail call float @llvm.nvvm.mul.rn.f(float %0, float %11) nounwind %x26 = getelementptr inbounds %struct.float2* %x, i64 0, i32 0 %13 = load float* %x26, align 4 %14 = tail call float @llvm.nvvm.mul.rn.f(float %13, float %3) nounwind %add = fadd float %12, %14 %add29 = fadd float %10, %add %15 = tail call float @llvm.nvvm.add.rn.f(float %6, float %add29) nounwind %sub32 = fsub float %6, %15 %16 = tail call float @llvm.nvvm.add.rn.f(float %sub32, float %add29) nounwind %agg.result.0 = getelementptr inbounds %struct.float2* %agg.result, i64 0, i32 0 store float %16, float* %agg.result.0, align 8 %agg.result.1 = getelementptr inbounds %struct.float2* %agg.result, i64 0, i32 1 store float %15, float* %agg.result.1, align 4 ret void } declare ptx_device float @llvm.nvvm.add.rn.f(float, float) nounwind readnone declare ptx_device float @llvm.nvvm.mul.rn.f(float, float) nounwind readnone> llc -march=nvptx64 -mcpu=sm_20 test.ll -o test.ptx > cat test.ptx// // Generated by LLVM NVPTX Back-End // .version 3.0 .target sm_20, texmode_independent .address_size 64 // .globl __internal_dsmul .func __internal_dsmul( .param .b64 __internal_dsmul_param_0, .param .align 0 .b8 __internal_dsmul_param_1[8], .param .align 0 .b8 __internal_dsmul_param_2[8] ) // @__internal_dsmul { .reg .pred %p<396>; .reg .s16 %rc<396>; .reg .s16 %rs<396>; .reg .s32 %r<396>; .reg .s64 %rl<396>; .reg .f32 %f<396>; .reg .f64 %fl<396>; // BB#0: // %entry mov.b64 %rl0, __internal_dsmul_param_1; cvta.local.u64 %rl0, %rl0; ld.f32 %f0, [%rl0+4]; neg.f32 %f1, %f0; mov.b64 %rl1, __internal_dsmul_param_2; mov.f32 %f2, 0f45800800; // inline asm mad.f32 %f1, %f1, %f2, %f0; // inline asm // inline asm mad.f32 %f3, %f0, %f2, %f1; // inline asm cvta.local.u64 %rl1, %rl1; ld.f32 %f1, [%rl1+4]; neg.f32 %f4, %f1; // inline asm mad.f32 %f4, %f4, %f2, %f1; // inline asm // inline asm mad.f32 %f4, %f1, %f2, %f4; // inline asm sub.f32 %f5, %f0, %f3; sub.f32 %f6, %f1, %f4; mul.rn.f32 %f2, 
%f0, %f1; neg.f32 %f7, %f2; // inline asm mad.f32 %f7, %f3, %f4, %f7; // inline asm // inline asm mad.f32 %f3, %f3, %f6, %f7; // inline asm // inline asm mad.f32 %f3, %f4, %f5, %f3; // inline asm // inline asm mad.f32 %f3, %f5, %f6, %f3; // inline asm ld.f32 %f4, [%rl1]; mul.rn.f32 %f0, %f0, %f4; ld.f32 %f4, [%rl0]; mul.rn.f32 %f1, %f4, %f1; add.f32 %f0, %f0, %f1; add.f32 %f1, %f3, %f0; add.rn.f32 %f0, %f2, %f1; sub.f32 %f2, %f2, %f0; add.rn.f32 %f1, %f2, %f1; ld.param.u64 %rl0, [__internal_dsmul_param_0]; st.f32 [%rl0], %f1; st.f32 [%rl0+4], %f0; ret; } > ptxas -arch=sm_20 test.ptx -o ptx.cubin ptxas test.ptx, line 13; error : Alignment must be a power of two ptxas test.ptx, line 14; error : Alignment must be a power of two ptxas fatal : Ptx assembly aborted due to errors -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20120711/e55c1cd9/attachment.html>
Dmitry N. Mikushin
2012-Nov-09 12:16 UTC
[LLVMdev] [NVPTX] llc -march=nvptx64 -mcpu=sm_20 generates invalid zero align for device function params
Dear all, I'm attaching a patch that should fix the issue mentioned above. It simply makes the same check seen in the same file for global variables: emitPTXAddressSpace(PTy->getAddressSpace(), O); if (GVar->getAlignment() == 0) O << " .align " << (int) TD->getPrefTypeAlignment(ETy); else O << " .align " << GVar->getAlignment(); Could you please review and commit? Do you think it needs a test case? Thanks, - D. dmikushin at hp2:~/forge/align0> llc -march=nvptx64 -mcpu=sm_20 align0.ll -o - // // Generated by LLVM NVPTX Back-End // .version 3.1 .target sm_20 .address_size 64 // .globl __internal_dsmul .visible .func __internal_dsmul( .param .b64 __internal_dsmul_param_0, .param .align 4 .b8 __internal_dsmul_param_1[8], .param .align 4 .b8 __internal_dsmul_param_2[8] ) // @__internal_dsmul { .reg .pred %p<396>; .reg .s16 %rc<396>; .reg .s16 %rs<396>; .reg .s32 %r<396>; .reg .s64 %rl<396>; .reg .f32 %f<396>; .reg .f64 %fl<396>; // BB#0: // %entry mov.b64 %rl0, __internal_dsmul_param_1; cvta.local.u64 %rl0, %rl0; ld.f32 %f0, [%rl0+4]; neg.f32 %f1, %f0; mov.b64 %rl1, __internal_dsmul_param_2; mov.f32 %f2, 0f45800800; // inline asm mad.f32 %f1, %f1, %f2, %f0; // inline asm // inline asm mad.f32 %f3, %f0, %f2, %f1; // inline asm cvta.local.u64 %rl1, %rl1; ld.f32 %f1, [%rl1+4]; neg.f32 %f4, %f1; // inline asm mad.f32 %f4, %f4, %f2, %f1; // inline asm // inline asm mad.f32 %f4, %f1, %f2, %f4; // inline asm sub.f32 %f5, %f0, %f3; sub.f32 %f6, %f1, %f4; mul.rn.f32 %f2, %f0, %f1; neg.f32 %f7, %f2; // inline asm mad.f32 %f7, %f3, %f4, %f7; // inline asm // inline asm mad.f32 %f3, %f3, %f6, %f7; // inline asm // inline asm mad.f32 %f3, %f4, %f5, %f3; // inline asm // inline asm mad.f32 %f3, %f5, %f6, %f3; // inline asm ld.f32 %f4, [%rl1]; mul.rn.f32 %f0, %f0, %f4; ld.f32 %f4, [%rl0]; mul.rn.f32 %f1, %f4, %f1; add.f32 %f0, %f0, %f1; add.f32 %f1, %f3, %f0; add.rn.f32 %f0, %f2, %f1; sub.f32 %f2, %f2, %f0; add.rn.f32 %f1, %f2, %f1; ld.param.u64 %rl0, [__internal_dsmul_param_0]; 
st.f32 [%rl0], %f1; st.f32 [%rl0+4], %f0; ret; } 2012/7/11 Dmitry N. Mikushin <maemarcus at gmail.com>:> Hello, > > FYI, this is a bug http://llvm.org/bugs/show_bug.cgi?id=13324 > > When compiling the following code for sm_20, func params are by some reason > given with .align 0, which is invalid. Problem does not occur if compiled > for sm_10. > >> cat test.ll > ; ModuleID = '__kernelgen_main_module' > target datalayout = "e-p:64:64-i64:64:64-f64:64:64-n1:8:16:32:64" > target triple = "ptx64-unknown-unknown" > > %struct.float2 = type { float, float } > > define ptx_device void @__internal_dsmul(%struct.float2* noalias nocapture > sret %agg.result, %struct.float2* nocapture byval %x, %struct.float2* > nocapture byval %y) nounwind inlinehint alwaysinline { > entry: > %y1 = getelementptr inbounds %struct.float2* %x, i64 0, i32 1 > %0 = load float* %y1, align 4 > %sub = fsub float -0.000000e+00, %0 > %1 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %sub, > float 4.097000e+03, float %0) nounwind > %2 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %0, > float 4.097000e+03, float %1) nounwind > %y5 = getelementptr inbounds %struct.float2* %y, i64 0, i32 1 > %3 = load float* %y5, align 4 > %sub7 = fsub float -0.000000e+00, %3 > %4 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float > %sub7, float 4.097000e+03, float %3) nounwind > %5 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %3, > float 4.097000e+03, float %4) nounwind > %sub12 = fsub float %0, %2 > %sub14 = fsub float %3, %5 > %6 = tail call float @llvm.nvvm.mul.rn.f(float %0, float %3) nounwind > %sub18 = fsub float -0.000000e+00, %6 > %7 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %2, > float %5, float %sub18) nounwind > %8 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %2, > float %sub14, float %7) nounwind > %9 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %5, > float %sub12, float 
%8) nounwind > %10 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float > %sub12, float %sub14, float %9) nounwind > %x24 = getelementptr inbounds %struct.float2* %y, i64 0, i32 0 > %11 = load float* %x24, align 4 > %12 = tail call float @llvm.nvvm.mul.rn.f(float %0, float %11) nounwind > %x26 = getelementptr inbounds %struct.float2* %x, i64 0, i32 0 > %13 = load float* %x26, align 4 > %14 = tail call float @llvm.nvvm.mul.rn.f(float %13, float %3) nounwind > %add = fadd float %12, %14 > %add29 = fadd float %10, %add > %15 = tail call float @llvm.nvvm.add.rn.f(float %6, float %add29) nounwind > %sub32 = fsub float %6, %15 > %16 = tail call float @llvm.nvvm.add.rn.f(float %sub32, float %add29) > nounwind > %agg.result.0 = getelementptr inbounds %struct.float2* %agg.result, i64 0, > i32 0 > store float %16, float* %agg.result.0, align 8 > %agg.result.1 = getelementptr inbounds %struct.float2* %agg.result, i64 0, > i32 1 > store float %15, float* %agg.result.1, align 4 > ret void > } > > declare ptx_device float @llvm.nvvm.add.rn.f(float, float) nounwind readnone > > declare ptx_device float @llvm.nvvm.mul.rn.f(float, float) nounwind readnone > >> llc -march=nvptx64 -mcpu=sm_20 test.ll -o test.ptx >> cat test.ptx > // > // Generated by LLVM NVPTX Back-End > // > > .version 3.0 > .target sm_20, texmode_independent > .address_size 64 > > > // .globl __internal_dsmul > .func __internal_dsmul( > .param .b64 __internal_dsmul_param_0, > .param .align 0 .b8 __internal_dsmul_param_1[8], > .param .align 0 .b8 __internal_dsmul_param_2[8] > ) // @__internal_dsmul > { > .reg .pred %p<396>; > .reg .s16 %rc<396>; > .reg .s16 %rs<396>; > .reg .s32 %r<396>; > .reg .s64 %rl<396>; > .reg .f32 %f<396>; > .reg .f64 %fl<396>; > > // BB#0: // %entry > mov.b64 %rl0, __internal_dsmul_param_1; > cvta.local.u64 %rl0, %rl0; > ld.f32 %f0, [%rl0+4]; > neg.f32 %f1, %f0; > mov.b64 %rl1, __internal_dsmul_param_2; > mov.f32 %f2, 0f45800800; > // inline asm > mad.f32 %f1, %f1, %f2, %f0; > 
// inline asm > // inline asm > mad.f32 %f3, %f0, %f2, %f1; > // inline asm > cvta.local.u64 %rl1, %rl1; > ld.f32 %f1, [%rl1+4]; > neg.f32 %f4, %f1; > // inline asm > mad.f32 %f4, %f4, %f2, %f1; > // inline asm > // inline asm > mad.f32 %f4, %f1, %f2, %f4; > // inline asm > sub.f32 %f5, %f0, %f3; > sub.f32 %f6, %f1, %f4; > mul.rn.f32 %f2, %f0, %f1; > neg.f32 %f7, %f2; > // inline asm > mad.f32 %f7, %f3, %f4, %f7; > // inline asm > // inline asm > mad.f32 %f3, %f3, %f6, %f7; > // inline asm > // inline asm > mad.f32 %f3, %f4, %f5, %f3; > // inline asm > // inline asm > mad.f32 %f3, %f5, %f6, %f3; > // inline asm > ld.f32 %f4, [%rl1]; > mul.rn.f32 %f0, %f0, %f4; > ld.f32 %f4, [%rl0]; > mul.rn.f32 %f1, %f4, %f1; > add.f32 %f0, %f0, %f1; > add.f32 %f1, %f3, %f0; > add.rn.f32 %f0, %f2, %f1; > sub.f32 %f2, %f2, %f0; > add.rn.f32 %f1, %f2, %f1; > ld.param.u64 %rl0, [__internal_dsmul_param_0]; > st.f32 [%rl0], %f1; > st.f32 [%rl0+4], %f0; > ret; > } > >> ptxas -arch=sm_20 test.ptx -o ptx.cubin > ptxas test.ptx, line 13; error : Alignment must be a power of two > ptxas test.ptx, line 14; error : Alignment must be a power of two > ptxas fatal : Ptx assembly aborted due to errors >-------------- next part -------------- A non-text attachment was scrubbed... Name: align0.patch Type: application/octet-stream Size: 517 bytes Desc: not available URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20121109/d1c3438d/attachment.obj>
Duncan Sands
2012-Nov-09 12:55 UTC
[LLVMdev] [NVPTX] llc -march=nvptx64 -mcpu=sm_20 generates invalid zero align for device function params
Hi Dmitry, > I'm attaching a patch that should fix the issue mentioned above. It > simply makes the same check seen in the same file for global > variables: > > emitPTXAddressSpace(PTy->getAddressSpace(), O); > if (GVar->getAlignment() == 0) > O << " .align " << (int) TD->getPrefTypeAlignment(ETy); > else > O << " .align " << GVar->getAlignment(); It's not quite the same, because your patch uses the ABI alignment, while in this snippet it is the preferred alignment (which is usually the same as the ABI alignment, but may be bigger). > Could you please review and commit? Do you think it needs a test case? Yes, it needs a testcase. Ciao, Duncan.
Reasonably Related Threads
- [LLVMdev] [NVPTX] llc -march=nvptx64 -mcpu=sm_20 generates invalid zero align for device function params
- [LLVMdev] NVPTX CUDA_ERROR_NO_BINARY_FOR_GPU
- [LLVMdev] NVPTX CUDA_ERROR_NO_BINARY_FOR_GPU
- [LLVMdev] NVPTX CUDA_ERROR_NO_BINARY_FOR_GPU
- [LLVMdev] NVPTX CUDA_ERROR_NO_BINARY_FOR_GPU