I've written a compiler that outputs PTX code, the result seems fairly reasonable, but I'm not sure the intrinsics are getting compiled correctly. In addition, when I try load the module using CUDA, I get an error: CUDA_ERROR_NO_BINARY_FOR_GPU. I'm running this on a 2012 MBP with a 640M GPU. PTX Code (for a mandelbrot calculation): // // Generated by LLVM NVPTX Back-End // .version 3.1 .target sm_10, texmode_independent .address_size 64 .func (.reg .b32 func_retval0) INT_PTX_SREG_CTAID_X ( ) ; .func (.reg .b32 func_retval0) INT_PTX_SREG_CTAID_Y ( ) ; .func (.reg .b32 func_retval0) INT_PTX_SREG_TID_X ( ) ; .func (.reg .b32 func_retval0) INT_PTX_SREG_NTID_X ( ) ; .func (.reg .b32 func_retval0) INT_PTX_SREG_NTID_Y ( ) ; // .globl examples_2E_mandelbrot_2F_square .func (.reg .b64 func_retval0) examples_2E_mandelbrot_2F_square( .reg .b64 examples_2E_mandelbrot_2F_square_param_0 ) { .reg .pred %p<396>; .reg .s16 %rc<396>; .reg .s16 %rs<396>; .reg .s32 %r<396>; .reg .s64 %rl<396>; .reg .f32 %f<396>; .reg .f64 %fl<396>; mov.f64 %fl0, examples_2E_mandelbrot_2F_square_param_0; mul.f64 %fl0, %fl0, %fl0; mov.f64 func_retval0, %fl0; ret; } // .globl examples_2E_mandelbrot_2F_calc_2D_iteration .func (.reg .b64 func_retval0) examples_2E_mandelbrot_2F_calc_2D_iteration( .reg .b64 examples_2E_mandelbrot_2F_calc_2D_iteration_param_0, .reg .b64 examples_2E_mandelbrot_2F_calc_2D_iteration_param_1, .reg .b64 examples_2E_mandelbrot_2F_calc_2D_iteration_param_2, .reg .b64 examples_2E_mandelbrot_2F_calc_2D_iteration_param_3, .reg .b64 examples_2E_mandelbrot_2F_calc_2D_iteration_param_4 ) { .reg .pred %p<396>; .reg .s16 %rc<396>; .reg .s16 %rs<396>; .reg .s32 %r<396>; .reg .s64 %rl<396>; .reg .f32 %f<396>; .reg .f64 %fl<396>; mov.f64 %fl0, examples_2E_mandelbrot_2F_calc_2D_iteration_param_0; mov.f64 %fl1, examples_2E_mandelbrot_2F_calc_2D_iteration_param_3; div.rn.f64 %fl0, %fl0, %fl1; mov.f64 %fl2, examples_2E_mandelbrot_2F_calc_2D_iteration_param_1; mul.f64 %fl1, %fl0, 
0d400C000000000000; mov.f64 %fl0, examples_2E_mandelbrot_2F_calc_2D_iteration_param_2; add.f64 %fl1, %fl1, 0dC004000000000000; mov.f64 %fl3, examples_2E_mandelbrot_2F_calc_2D_iteration_param_4; div.rn.f64 %fl2, %fl2, %fl3; add.f64 %fl2, %fl2, %fl2; add.f64 %fl3, %fl2, 0dBFF0000000000000; mov.f64 %fl2, 0d0000000000000000; mov.f64 %fl5, %fl2; mov.f64 %fl4, %fl2; bra.uni BB1_1; BB1_2: add.f64 %fl2, %fl2, 0d3FF0000000000000; sub.f64 %fl6, %fl6, %fl7; add.f64 %fl6, %fl6, %fl1; add.f64 %fl5, %fl5, %fl5; mul.f64 %fl4, %fl5, %fl4; add.f64 %fl4, %fl4, %fl3; mov.f64 %fl5, %fl6; BB1_1: mul.f64 %fl6, %fl5, %fl5; mul.f64 %fl7, %fl4, %fl4; add.f64 %fl8, %fl6, %fl7; setp.lt.f64 %p0, %fl8, 0d4010000000000000; setp.lt.f64 %p1, %fl2, %fl0; and.pred %p0, %p0, %p1; @!%p0 bra BB1_3; bra.uni BB1_2; BB1_3: mov.f64 func_retval0, %fl2; ret; } // .globl examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx .func (.reg .b64 func_retval0) examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx( .reg .b64 examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_0, .reg .b64 examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_1, .reg .b64 examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_2, .reg .b64 examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_3 ) { .reg .pred %p<396>; .reg .s16 %rc<396>; .reg .s16 %rs<396>; .reg .s32 %r<396>; .reg .s64 %rl<396>; .reg .f32 %f<396>; .reg .f64 %fl<396>; mov.b64 %rl0, examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_0; mov.f64 %fl2, examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_1; mov.f64 %fl1, examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_2; mov.f64 %fl0, examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_3; // Callseq Start 0 { .reg .b32 temp_param_reg; // <end>} .reg .b32 retval0; call.uni (retval0), INT_PTX_SREG_CTAID_X, ( ); mov.b32 %r0, retval0; //{ }// Callseq End 0 // Callseq Start 1 { .reg .b32 temp_param_reg; // <end>} .reg .b32 retval0; call.uni (retval0), INT_PTX_SREG_NTID_X, ( ); mov.b32 
%r1, retval0; //{ }// Callseq End 1 // Callseq Start 2 { .reg .b32 temp_param_reg; // <end>} .reg .b32 retval0; call.uni (retval0), INT_PTX_SREG_TID_X, ( ); mov.b32 %r2, retval0; //{ }// Callseq End 2 mad.lo.s32 %r0, %r0, %r1, %r2; cvt.rn.f64.s32 %fl3, %r0; // Callseq Start 3 { .reg .b32 temp_param_reg; // <end>} .reg .b32 retval0; call.uni (retval0), INT_PTX_SREG_CTAID_Y, ( ); mov.b32 %r0, retval0; //{ }// Callseq End 3 // Callseq Start 4 { .reg .b32 temp_param_reg; // <end>} .reg .b32 retval0; call.uni (retval0), INT_PTX_SREG_NTID_Y, ( ); mov.b32 %r1, retval0; //{ }// Callseq End 4 // Callseq Start 5 { .reg .b32 temp_param_reg; // <end>} .reg .b32 retval0; call.uni (retval0), INT_PTX_SREG_TID_X, ( ); mov.b32 %r2, retval0; //{ }// Callseq End 5 mad.lo.s32 %r0, %r0, %r1, %r2; cvt.rn.f64.s32 %fl4, %r0; mul.f64 %fl5, %fl4, %fl2; add.f64 %fl5, %fl5, %fl3; cvt.rzi.s64.f64 %rl1, %fl5; shl.b64 %rl1, %rl1, 3; add.s64 %rl1, %rl0, %rl1; div.rn.f64 %fl2, %fl3, %fl2; mul.f64 %fl2, %fl2, 0d400C000000000000; add.f64 %fl2, %fl2, 0dC004000000000000; div.rn.f64 %fl1, %fl4, %fl1; add.f64 %fl1, %fl1, %fl1; add.f64 %fl3, %fl1, 0dBFF0000000000000; mov.f64 %fl1, 0d0000000000000000; mov.f64 %fl5, %fl1; mov.f64 %fl4, %fl1; bra.uni BB2_1; BB2_2: add.f64 %fl1, %fl1, 0d3FF0000000000000; sub.f64 %fl6, %fl6, %fl7; add.f64 %fl6, %fl6, %fl2; add.f64 %fl5, %fl5, %fl5; mul.f64 %fl4, %fl5, %fl4; add.f64 %fl4, %fl4, %fl3; mov.f64 %fl5, %fl6; BB2_1: mul.f64 %fl6, %fl5, %fl5; mul.f64 %fl7, %fl4, %fl4; add.f64 %fl8, %fl6, %fl7; setp.lt.f64 %p0, %fl8, 0d4010000000000000; setp.lt.f64 %p1, %fl1, %fl0; and.pred %p0, %p0, %p1; @!%p0 bra BB2_3; bra.uni BB2_2; BB2_3: div.rn.f64 %fl0, %fl1, %fl0; st.global.f64 [%rl1], %fl0; mov.b64 func_retval0, %rl0; ret; } -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20130301/2c80c840/attachment.html>
Timothy, Those calls to compute grid intrinsics are definitely wrong. In PTX code they should end up as reads of special registers, rather than function calls. Try to take some working example and figure out the LLVM IR differences between it and the result of your compiler. - D. ----- Original message ----- > I've written a compiler that outputs PTX code, the result seems fairly > reasonable, but I'm not sure the intrinsics are getting compiled > correctly. > > In addition, when I try load the module using CUDA, I get an > error: CUDA_ERROR_NO_BINARY_FOR_GPU. I'm running this on a 2012 MBP > with a 640M GPU. > > PTX Code (for a mandelbrot calculation): > > // > // Generated by LLVM NVPTX Back-End > // > > .version 3.1 > .target sm_10, texmode_independent > .address_size 64 > > .func (.reg .b32 func_retval0) INT_PTX_SREG_CTAID_X > ( > > ) > ; > .func (.reg .b32 func_retval0) INT_PTX_SREG_CTAID_Y > ( > > ) > ; > .func (.reg .b32 func_retval0) INT_PTX_SREG_TID_X > ( > > ) > ; > .func (.reg .b32 func_retval0) INT_PTX_SREG_NTID_X > ( > > ) > ; > .func (.reg .b32 func_retval0) INT_PTX_SREG_NTID_Y > ( > > ) > ; > > // .globl examples_2E_mandelbrot_2F_square > .func (.reg .b64 func_retval0) examples_2E_mandelbrot_2F_square( > .reg .b64 examples_2E_mandelbrot_2F_square_param_0 > ) > { > .reg .pred %p<396>; > .reg .s16 %rc<396>; > .reg .s16 %rs<396>; > .reg .s32 %r<396>; > .reg .s64 %rl<396>; > .reg .f32 %f<396>; > .reg .f64 %fl<396>; > > mov.f64 %fl0, examples_2E_mandelbrot_2F_square_param_0; > mul.f64 %fl0, %fl0, %fl0; > mov.f64 func_retval0, %fl0; > ret; > } > > // .globl examples_2E_mandelbrot_2F_calc_2D_iteration > .func (.reg .b64 func_retval0) > examples_2E_mandelbrot_2F_calc_2D_iteration( .reg .b64 >
examples_2E_mandelbrot_2F_calc_2D_iteration_param_0, .reg .b64 > examples_2E_mandelbrot_2F_calc_2D_iteration_param_1, .reg .b64 > examples_2E_mandelbrot_2F_calc_2D_iteration_param_2, .reg .b64 > examples_2E_mandelbrot_2F_calc_2D_iteration_param_3, .reg .b64 > examples_2E_mandelbrot_2F_calc_2D_iteration_param_4 ) > { > .reg .pred %p<396>; > .reg .s16 %rc<396>; > .reg .s16 %rs<396>; > .reg .s32 %r<396>; > .reg .s64 %rl<396>; > .reg .f32 %f<396>; > .reg .f64 %fl<396>; > > mov.f64 %fl0, > examples_2E_mandelbrot_2F_calc_2D_iteration_param_0; mov.f64 > %fl1, examples_2E_mandelbrot_2F_calc_2D_iteration_param_3; > div.rn.f64 %fl0, %fl0, %fl1; mov.f64 %fl2, > examples_2E_mandelbrot_2F_calc_2D_iteration_param_1; mul.f64 > %fl1, %fl0, 0d400C000000000000; mov.f64 %fl0, > examples_2E_mandelbrot_2F_calc_2D_iteration_param_2; add.f64 > %fl1, %fl1, 0dC004000000000000; mov.f64 %fl3, > examples_2E_mandelbrot_2F_calc_2D_iteration_param_4; div.rn.f64 > %fl2, %fl2, %fl3; add.f64 %fl2, %fl2, %fl2; > add.f64 %fl3, %fl2, 0dBFF0000000000000; > mov.f64 %fl2, 0d0000000000000000; > mov.f64 %fl5, %fl2; > mov.f64 %fl4, %fl2; > bra.uni BB1_1; > BB1_2: > add.f64 %fl2, %fl2, 0d3FF0000000000000; > sub.f64 %fl6, %fl6, %fl7; > add.f64 %fl6, %fl6, %fl1; > add.f64 %fl5, %fl5, %fl5; > mul.f64 %fl4, %fl5, %fl4; > add.f64 %fl4, %fl4, %fl3; >
mov.f64 %fl5, %fl6; > BB1_1: > mul.f64 %fl6, %fl5, %fl5; > mul.f64 %fl7, %fl4, %fl4; > add.f64 %fl8, %fl6, %fl7; > setp.lt.f64 %p0, %fl8, 0d4010000000000000; > setp.lt.f64 %p1, %fl2, %fl0; > and.pred %p0, %p0, %p1; > @!%p0 bra BB1_3; > bra.uni BB1_2; > BB1_3: > mov.f64 func_retval0, %fl2; > ret; > } > > // .globl > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx .func (.reg .b64 > func_retval0) examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx( > .reg .b64 > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_0, > .reg .b64 > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_1, > .reg .b64 > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_2, > .reg .b64 > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_3 > ) > { > .reg .pred %p<396>; > .reg .s16 %rc<396>; > .reg .s16 %rs<396>; > .reg .s32 %r<396>; > .reg .s64 %rl<396>; > .reg .f32 %f<396>; > .reg .f64 %fl<396>; > > mov.b64 %rl0, > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_0; > mov.f64 %fl2, > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_1; > mov.f64 %fl1, > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_2; > mov.f64 %fl0, > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_3; > // Callseq Start 0 > { > .reg .b32 temp_param_reg; > // <end>} > .reg .b32 retval0; > call.uni (retval0), > INT_PTX_SREG_CTAID_X, >
( > ); > mov.b32 %r0, retval0; > > //{ > }// Callseq End 0 > // Callseq Start 1 > { > .reg .b32 temp_param_reg; > // <end>} > .reg .b32 retval0; > call.uni (retval0), > INT_PTX_SREG_NTID_X, > ( > ); > mov.b32 %r1, retval0; > > //{ > }// Callseq End 1 > // Callseq Start 2 > { > .reg .b32 temp_param_reg; > // <end>} > .reg .b32 retval0; > call.uni (retval0), > INT_PTX_SREG_TID_X, > ( > ); > mov.b32 %r2, retval0; > > //{ > }// Callseq End 2 > mad.lo.s32 %r0, %r0, %r1, %r2; > cvt.rn.f64.s32 %fl3, %r0; > // Callseq Start 3 > { > .reg .b32 temp_param_reg; > // <end>} > .reg .b32 retval0; > call.uni (retval0), > INT_PTX_SREG_CTAID_Y, > ( > ); > mov.b32 %r0, retval0; > > //{ > }// Callseq End 3 > // Callseq Start 4 > { > .reg .b32 temp_param_reg; > // <end>} > .reg .b32 retval0; > call.uni (retval0), > INT_PTX_SREG_NTID_Y, > ( > ); > mov.b32 %r1, retval0; > > //{ > }// Callseq End 4 > // Callseq Start 5 > { > .reg .b32 temp_param_reg; > // <end>} > .reg .b32 retval0; > call.uni (retval0), > INT_PTX_SREG_TID_X, > ( > ); >
mov.b32 %r2, retval0; > > //{ > }// Callseq End 5 > mad.lo.s32 %r0, %r0, %r1, %r2; > cvt.rn.f64.s32 %fl4, %r0; > mul.f64 %fl5, %fl4, %fl2; > add.f64 %fl5, %fl5, %fl3; > cvt.rzi.s64.f64 %rl1, %fl5; > shl.b64 %rl1, %rl1, 3; > add.s64 %rl1, %rl0, %rl1; > div.rn.f64 %fl2, %fl3, %fl2; > mul.f64 %fl2, %fl2, 0d400C000000000000; > add.f64 %fl2, %fl2, 0dC004000000000000; > div.rn.f64 %fl1, %fl4, %fl1; > add.f64 %fl1, %fl1, %fl1; > add.f64 %fl3, %fl1, 0dBFF0000000000000; > mov.f64 %fl1, 0d0000000000000000; > mov.f64 %fl5, %fl1; > mov.f64 %fl4, %fl1; > bra.uni BB2_1; > BB2_2: > add.f64 %fl1, %fl1, 0d3FF0000000000000; > sub.f64 %fl6, %fl6, %fl7; > add.f64 %fl6, %fl6, %fl2; > add.f64 %fl5, %fl5, %fl5; > mul.f64 %fl4, %fl5, %fl4; > add.f64 %fl4, %fl4, %fl3; > mov.f64 %fl5, %fl6; > BB2_1: > mul.f64 %fl6, %fl5, %fl5; > mul.f64 %fl7, %fl4, %fl4; > add.f64 %fl8, %fl6, %fl7; > setp.lt.f64 %p0, %fl8, 0d4010000000000000; > setp.lt.f64 %p1, %fl1, %fl0; > and.pred %p0, %p0, %p1; > @!%p0 bra BB2_3; > bra.uni BB2_2; > BB2_3: > div.rn.f64
%fl0, %fl1, %fl0; > st.global.f64 [%rl1], %fl0; > mov.b64 func_retval0, %rl0; > ret; > }
I'm building this with llvm-c, and accessing these intrinsics by calling the intrinsic as if it were a function. class F_SREG<string OpStr, NVPTXRegClass regclassOut, Intrinsic IntOp> : NVPTXInst<(outs regclassOut:$dst), (ins), OpStr, [(set regclassOut:$dst, (IntOp))]>; def INT_PTX_SREG_TID_X : F_SREG<"mov.u32 \t$dst, %tid.x;", Int32Regs, int_nvvm_read_ptx_sreg_tid_x>; This method of accessing intrinsics works just fine for other intrinsics (for instance, sqrt). Should I be declaring these as extern global variables? Thanks, Timothy On Fri, Mar 1, 2013 at 12:44 PM, Dmitry Mikushin <dmitry at kernelgen.org> wrote: > Timothy, > > Those calls to compute grid intrinsics are definitely wrong. In ptx code > they should end up into reading special registers, rather than function > calls. Try to take some working example and figure out the LLVM IR > differences between it and the result of your compiler. > > - D. > > ----- Original message ----- > > I've written a compiler that outputs PTX code, the result seems fairly > > reasonable, but I'm not sure the intrinsics are getting compiled > > correctly. > > > > In addition, when I try load the module using CUDA, I get an > > error: CUDA_ERROR_NO_BINARY_FOR_GPU. I'm running this on a 2012 MBP > > with a 640M GPU. 
> > > > PTX Code (for a mandelbrot calculation): > > > > // > > // Generated by LLVM NVPTX Back-End > > // > > > > .version 3.1 > > .target sm_10, texmode_independent > > .address_size 64 > > > > .func (.reg .b32 func_retval0) INT_PTX_SREG_CTAID_X > > ( > > > > ) > > ; > > .func (.reg .b32 func_retval0) INT_PTX_SREG_CTAID_Y > > ( > > > > ) > > ; > > .func (.reg .b32 func_retval0) INT_PTX_SREG_TID_X > > ( > > > > ) > > ; > > .func (.reg .b32 func_retval0) INT_PTX_SREG_NTID_X > > ( > > > > ) > > ; > > .func (.reg .b32 func_retval0) INT_PTX_SREG_NTID_Y > > ( > > > > ) > > ; > > > > // .globl examples_2E_mandelbrot_2F_square > > .func (.reg .b64 func_retval0) examples_2E_mandelbrot_2F_square( > > .reg .b64 examples_2E_mandelbrot_2F_square_param_0 > > ) > > { > > .reg .pred %p<396>; > > .reg .s16 %rc<396>; > > .reg .s16 %rs<396>; > > .reg .s32 %r<396>; > > .reg .s64 %rl<396>; > > .reg .f32 %f<396>; > > .reg .f64 %fl<396>; > > > > mov.f64 %fl0, examples_2E_mandelbrot_2F_square_param_0; > > mul.f64 %fl0, %fl0, %fl0; > > mov.f64 func_retval0, %fl0; > > ret; > > } > > > > // .globl > examples_2E_mandelbrot_2F_calc_2D_iteration > > .func (.reg .b64 func_retval0) > > examples_2E_mandelbrot_2F_calc_2D_iteration( .reg .b64 > > examples_2E_mandelbrot_2F_calc_2D_iteration_param_0, .reg > .b64 > > examples_2E_mandelbrot_2F_calc_2D_iteration_param_1, .reg > .b64 > > examples_2E_mandelbrot_2F_calc_2D_iteration_param_2, .reg > .b64 > > examples_2E_mandelbrot_2F_calc_2D_iteration_param_3, .reg > .b64 > > examples_2E_mandelbrot_2F_calc_2D_iteration_param_4 ) > > { > > .reg .pred %p<396>; > > .reg .s16 %rc<396>; > > .reg .s16 %rs<396>; > > .reg .s32 %r<396>; > > .reg .s64 %rl<396>; > > .reg .f32 %f<396>; > > .reg .f64 %fl<396>; > > > > mov.f64 %fl0, > > examples_2E_mandelbrot_2F_calc_2D_iteration_param_0; > mov.f64 > > %fl1, examples_2E_mandelbrot_2F_calc_2D_iteration_param_3; > > div.rn.f64 %fl0, %fl0, %fl1; mov.f64 %fl2, > > examples_2E_mandelbrot_2F_calc_2D_iteration_param_1; > 
mul.f64 > > %fl1, %fl0, 0d400C000000000000; mov.f64 %fl0, > > examples_2E_mandelbrot_2F_calc_2D_iteration_param_2; > add.f64 > > %fl1, %fl1, 0dC004000000000000; mov.f64 %fl3, > > examples_2E_mandelbrot_2F_calc_2D_iteration_param_4; > div.rn.f64 > > %fl2, %fl2, %fl3; add.f64 %fl2, > %fl2, %fl2; > > add.f64 %fl3, %fl2, 0dBFF0000000000000; > > mov.f64 %fl2, 0d0000000000000000; > > mov.f64 %fl5, %fl2; > > mov.f64 %fl4, %fl2; > > bra.uni BB1_1; > > BB1_2: > > add.f64 %fl2, %fl2, 0d3FF0000000000000; > > sub.f64 %fl6, %fl6, %fl7; > > add.f64 %fl6, %fl6, %fl1; > > add.f64 %fl5, %fl5, %fl5; > > mul.f64 %fl4, %fl5, %fl4; > > add.f64 %fl4, %fl4, %fl3; > > mov.f64 %fl5, %fl6; > > BB1_1: > > mul.f64 %fl6, %fl5, %fl5; > > mul.f64 %fl7, %fl4, %fl4; > > add.f64 %fl8, %fl6, %fl7; > > setp.lt.f64 %p0, %fl8, 0d4010000000000000; > > setp.lt.f64 %p1, %fl2, %fl0; > > and.pred %p0, %p0, %p1; > > @!%p0 bra BB1_3; > > bra.uni BB1_2; > > BB1_3: > > mov.f64 func_retval0, %fl2; > > ret; > > } > > > > // .globl > > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx .func (.reg .b64 > > func_retval0) examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx( > > .reg .b64 > > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_0, > > .reg .b64 > > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_1, > > .reg .b64 > > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_2, > > .reg .b64 > > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_3 > > ) > > { > > .reg .pred %p<396>; > > .reg .s16 %rc<396>; > > .reg .s16 %rs<396>; > > .reg .s32 %r<396>; > > .reg .s64 %rl<396>; > > .reg .f32 %f<396>; > > .reg .f64 %fl<396>; > > > > mov.b64 %rl0, > > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_0; > > mov.f64 %fl2, > > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_1; > > mov.f64 %fl1, > > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_2; > > mov.f64 %fl0, > > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_3; > > // Callseq Start 0 > > { 
> > .reg .b32 temp_param_reg; > > // <end>} > > .reg .b32 retval0; > > call.uni (retval0), > > INT_PTX_SREG_CTAID_X, > > ( > > ); > > mov.b32 %r0, retval0; > > > > //{ > > }// Callseq End 0 > > // Callseq Start 1 > > { > > .reg .b32 temp_param_reg; > > // <end>} > > .reg .b32 retval0; > > call.uni (retval0), > > INT_PTX_SREG_NTID_X, > > ( > > ); > > mov.b32 %r1, retval0; > > > > //{ > > }// Callseq End 1 > > // Callseq Start 2 > > { > > .reg .b32 temp_param_reg; > > // <end>} > > .reg .b32 retval0; > > call.uni (retval0), > > INT_PTX_SREG_TID_X, > > ( > > ); > > mov.b32 %r2, retval0; > > > > //{ > > }// Callseq End 2 > > mad.lo.s32 %r0, %r0, %r1, %r2; > > cvt.rn.f64.s32 %fl3, %r0; > > // Callseq Start 3 > > { > > .reg .b32 temp_param_reg; > > // <end>} > > .reg .b32 retval0; > > call.uni (retval0), > > INT_PTX_SREG_CTAID_Y, > > ( > > ); > > mov.b32 %r0, retval0; > > > > //{ > > }// Callseq End 3 > > // Callseq Start 4 > > { > > .reg .b32 temp_param_reg; > > // <end>} > > .reg .b32 retval0; > > call.uni (retval0), > > INT_PTX_SREG_NTID_Y, > > ( > > ); > > mov.b32 %r1, retval0; > > > > //{ > > }// Callseq End 4 > > // Callseq Start 5 > > { > > .reg .b32 temp_param_reg; > > // <end>} > > .reg .b32 retval0; > > call.uni (retval0), > > INT_PTX_SREG_TID_X, > > ( > > ); > > mov.b32 %r2, retval0; > > > > //{ > > }// Callseq End 5 > > mad.lo.s32 %r0, %r0, %r1, %r2; > > cvt.rn.f64.s32 %fl4, %r0; > > mul.f64 %fl5, %fl4, %fl2; > > add.f64 %fl5, %fl5, %fl3; > > cvt.rzi.s64.f64 %rl1, %fl5; > > shl.b64 %rl1, %rl1, 3; > > add.s64 %rl1, %rl0, %rl1; > > div.rn.f64 %fl2, %fl3, %fl2; > > mul.f64 %fl2, %fl2, 0d400C000000000000; > > add.f64 %fl2, %fl2, 0dC004000000000000; > > div.rn.f64 %fl1, %fl4, %fl1; > > add.f64 %fl1, %fl1, %fl1; > > add.f64 %fl3, %fl1, 0dBFF0000000000000; > > mov.f64 %fl1, 0d0000000000000000; > > mov.f64 %fl5, %fl1; > > mov.f64 %fl4, %fl1; > > bra.uni BB2_1; > > BB2_2: > > add.f64 %fl1, %fl1, 0d3FF0000000000000; > > sub.f64 %fl6, %fl6, %fl7; > > add.f64 %fl6, %fl6, 
%fl2; > > add.f64 %fl5, %fl5, %fl5; > > mul.f64 %fl4, %fl5, %fl4; > > add.f64 %fl4, %fl4, %fl3; > > mov.f64 %fl5, %fl6; > > BB2_1: > > mul.f64 %fl6, %fl5, %fl5; > > mul.f64 %fl7, %fl4, %fl4; > > add.f64 %fl8, %fl6, %fl7; > > setp.lt.f64 %p0, %fl8, 0d4010000000000000; > > setp.lt.f64 %p1, %fl1, %fl0; > > and.pred %p0, %p0, %p1; > > @!%p0 bra BB2_3; > > bra.uni BB2_2; > > BB2_3: > > div.rn.f64 %fl0, %fl1, %fl0; > > st.global.f64 [%rl1], %fl0; > > mov.b64 func_retval0, %rl0; > > ret; > > } > > -- “One of the main causes of the fall of the Roman Empire was that–lacking zero–they had no way to indicate successful termination of their C programs.” (Robert Firth) -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20130301/a395f70e/attachment.html>