When I use clang on an x86-64 to spit out the LLVM, like this clang -O -S -emit-llvm varargstest.c where varargstest.c looks like this int add_em_up(int count, ...) { va_list ap; int i, sum; va_start(ap, count); sum = 0; for (i = 0; i < count; i++) sum += va_arg(ap, int); va_end(ap); return sum; } I see LLVM that looks like it's been customized for the x86-64, versus the varargs stuff I was led to expect from the LLVM IR documentation. define i32 @add_em_up(i32 %count, ...) #0 { entry: %ap = alloca [1 x %struct.__va_list_tag], align 16 %arraydecay1 = bitcast [1 x %struct.__va_list_tag]* %ap to i8* call void @llvm.va_start(i8* %arraydecay1) %cmp7 = icmp sgt i32 %count, 0 br i1 %cmp7, label %for.body.lr.ph, label %for.end for.body.lr.ph: ; preds = %entry %gp_offset_p = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i64 0, i64 0, i32 0 %0 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i64 0, i64 0, i32 3 %overflow_arg_area_p = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i64 0, i64 0, i32 2 %gp_offset.pre = load i32* %gp_offset_p, align 16 br label %for.body for.body: ; preds = %vaarg.end, % for.body.lr.ph %gp_offset = phi i32 [ %gp_offset.pre, %for.body.lr.ph ], [ %gp_offset10, %vaarg.end ] %sum.09 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %vaarg.end ] %i.08 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %vaarg.end ] %fits_in_gp = icmp ult i32 %gp_offset, 41 br i1 %fits_in_gp, label %vaarg.in_reg, label %vaarg.in_mem vaarg.in_reg: ; preds = %for.body %reg_save_area = load i8** %0, align 16 %1 = sext i32 %gp_offset to i64 %2 = getelementptr i8* %reg_save_area, i64 %1 %3 = add i32 %gp_offset, 8 store i32 %3, i32* %gp_offset_p, align 16 br label %vaarg.end vaarg.in_mem: ; preds = %for.body %overflow_arg_area = load i8** %overflow_arg_area_p, align 8 %overflow_arg_area.next = getelementptr i8* %overflow_arg_area, i64 8 store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8 br label %vaarg.end vaarg.end: ; preds = %vaarg.in_mem, 
%vaarg.in_reg %gp_offset10 = phi i32 [ %3, %vaarg.in_reg ], [ %gp_offset, %vaarg.in_mem ] %vaarg.addr.in = phi i8* [ %2, %vaarg.in_reg ], [ %overflow_arg_area, %vaarg.in_mem ] %vaarg.addr = bitcast i8* %vaarg.addr.in to i32* %4 = load i32* %vaarg.addr, align 4 %add = add nsw i32 %4, %sum.09 %inc = add nsw i32 %i.08, 1 %exitcond = icmp eq i32 %inc, %count br i1 %exitcond, label %for.end, label %for.body for.end: ; preds = %vaarg.end, %entry %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %vaarg.end ] call void @llvm.va_end(i8* %arraydecay1) ret i32 %sum.0.lcssa } Notice at the bottom of the block labeled "for.body" that there's a test that determines whether to look for an argument on the stack or in a register. I see something similar w/ or w/o the -O flag. This isn't what I was led to expect by the LLVM IR documentation. Is there a way to avoid this "premature" optimization? I tried things like -arch mips and -march=mips but get complaints about unrecognized flags (-arch) or unknown target architecture CPU 'mips'. Why's that? The man page suggests both should work. Thanks, Preston -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20150928/fd6d3b15/attachment.html>
What is it you are actually trying to do? Varargs are architecture dependent, so will need to take into account the calling convention of the architecture it is on. Your errors about 'mips' is probably based on the build of clang on your machine? -- Mats On 29 September 2015 at 00:07, Preston Briggs via llvm-dev < llvm-dev at lists.llvm.org> wrote:> When I use clang on an x86-64 to spit out the LLVM, like this > > clang -O -S -emit-llvm varargstest.c > > > where varargstest.c looks like this > > int add_em_up(int count, ...) { > va_list ap; > int i, sum; > va_start(ap, count); > sum = 0; > for (i = 0; i < count; i++) > sum += va_arg(ap, int); > va_end(ap); > return sum; > } > > > I see LLVM that looks like it's been customized for the x86-64, > versus the varargs stuff I was led to expect from the LLVM IR > documentation. > > define i32 @add_em_up(i32 %count, ...) #0 { > entry: > %ap = alloca [1 x %struct.__va_list_tag], align 16 > %arraydecay1 = bitcast [1 x %struct.__va_list_tag]* %ap to i8* > call void @llvm.va_start(i8* %arraydecay1) > %cmp7 = icmp sgt i32 %count, 0 > br i1 %cmp7, label %for.body.lr.ph, label %for.end > > for.body.lr.ph: ; preds = %entry > %gp_offset_p = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, > i64 0, i64 0, i32 0 > %0 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i64 0, i64 > 0, i32 3 > %overflow_arg_area_p = getelementptr inbounds [1 x > %struct.__va_list_tag]* %ap, i64 0, i64 0, i32 2 > %gp_offset.pre = load i32* %gp_offset_p, align 16 > br label %for.body > > for.body: ; preds = %vaarg.end, % > for.body.lr.ph > %gp_offset = phi i32 [ %gp_offset.pre, %for.body.lr.ph ], [ > %gp_offset10, %vaarg.end ] > %sum.09 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %vaarg.end ] > %i.08 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %vaarg.end ] > %fits_in_gp = icmp ult i32 %gp_offset, 41 > br i1 %fits_in_gp, label %vaarg.in_reg, label %vaarg.in_mem > > vaarg.in_reg: ; preds = %for.body > %reg_save_area = load i8** %0, align 16 > 
%1 = sext i32 %gp_offset to i64 > %2 = getelementptr i8* %reg_save_area, i64 %1 > %3 = add i32 %gp_offset, 8 > store i32 %3, i32* %gp_offset_p, align 16 > br label %vaarg.end > > vaarg.in_mem: ; preds = %for.body > %overflow_arg_area = load i8** %overflow_arg_area_p, align 8 > %overflow_arg_area.next = getelementptr i8* %overflow_arg_area, i64 8 > store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8 > br label %vaarg.end > > vaarg.end: ; preds = %vaarg.in_mem, > %vaarg.in_reg > %gp_offset10 = phi i32 [ %3, %vaarg.in_reg ], [ %gp_offset, > %vaarg.in_mem ] > %vaarg.addr.in = phi i8* [ %2, %vaarg.in_reg ], [ %overflow_arg_area, > %vaarg.in_mem ] > %vaarg.addr = bitcast i8* %vaarg.addr.in to i32* > %4 = load i32* %vaarg.addr, align 4 > %add = add nsw i32 %4, %sum.09 > %inc = add nsw i32 %i.08, 1 > %exitcond = icmp eq i32 %inc, %count > br i1 %exitcond, label %for.end, label %for.body > > for.end: ; preds = %vaarg.end, > %entry > %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %vaarg.end ] > call void @llvm.va_end(i8* %arraydecay1) > ret i32 %sum.0.lcssa > } > > > Notice at the bottom of the block labeled "for.body" that there's a test > that determines > whether to look for an argument on the stack or in a register. I see > something similar w/ or w/o the -O flag. > > This isn't what I was led to expect by the LLVM IR documentation. > Is there a way to avoid this "premature" optimization? > I tried things like -arch mips and -march=mips but get complaints > about unrecognized flags (-arch) or unknown target architecture CPU 'mips'. > Why's that? The man page suggests both should work. > > Thanks, > Preston > > > > > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org > http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev > >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20150929/467b0b3a/attachment.html>
Joerg Sonnenberger via llvm-dev
2015-Sep-29 11:53 UTC
[llvm-dev] varargs, the x86, and clang
On Mon, Sep 28, 2015 at 04:07:42PM -0700, Preston Briggs via llvm-dev wrote: > I see LLVM that looks like it's been customized for the x86-64, > versus the varargs stuff I was led to expect from the LLVM IR documentation. This is not an optimisation; it is a consequence of the complexity of the ABI and the limited functionality of the IR varargs support. While that support tends to work for the simple cases, passing aggregates and unions is very difficult to get right in many ABIs. Consider it the 80% solution -- it makes it easy to interact with simple variadic functions or get the common cases working for a new backend, but it often fails to handle all the tricky cases. Joerg
Eric Christopher via llvm-dev
2015-Sep-29 14:41 UTC
[llvm-dev] varargs, the x86, and clang
On Mon, Sep 28, 2015, 4:07 PM Preston Briggs via llvm-dev < llvm-dev at lists.llvm.org> wrote:> When I use clang on an x86-64 to spit out the LLVM, like this > > clang -O -S -emit-llvm varargstest.c > > > where varargstest.c looks like this > > int add_em_up(int count, ...) { > va_list ap; > int i, sum; > va_start(ap, count); > sum = 0; > for (i = 0; i < count; i++) > sum += va_arg(ap, int); > va_end(ap); > return sum; > } > > > I see LLVM that looks like it's been customized for the x86-64, > versus the varargs stuff I was led to expect from the LLVM IR > documentation. > > define i32 @add_em_up(i32 %count, ...) #0 { > entry: > %ap = alloca [1 x %struct.__va_list_tag], align 16 > %arraydecay1 = bitcast [1 x %struct.__va_list_tag]* %ap to i8* > call void @llvm.va_start(i8* %arraydecay1) > %cmp7 = icmp sgt i32 %count, 0 > br i1 %cmp7, label %for.body.lr.ph, label %for.end > > for.body.lr.ph: ; preds = %entry > %gp_offset_p = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, > i64 0, i64 0, i32 0 > %0 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i64 0, i64 > 0, i32 3 > %overflow_arg_area_p = getelementptr inbounds [1 x > %struct.__va_list_tag]* %ap, i64 0, i64 0, i32 2 > %gp_offset.pre = load i32* %gp_offset_p, align 16 > br label %for.body > > for.body: ; preds = %vaarg.end, % > for.body.lr.ph > %gp_offset = phi i32 [ %gp_offset.pre, %for.body.lr.ph ], [ > %gp_offset10, %vaarg.end ] > %sum.09 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %vaarg.end ] > %i.08 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %vaarg.end ] > %fits_in_gp = icmp ult i32 %gp_offset, 41 > br i1 %fits_in_gp, label %vaarg.in_reg, label %vaarg.in_mem > > vaarg.in_reg: ; preds = %for.body > %reg_save_area = load i8** %0, align 16 > %1 = sext i32 %gp_offset to i64 > %2 = getelementptr i8* %reg_save_area, i64 %1 > %3 = add i32 %gp_offset, 8 > store i32 %3, i32* %gp_offset_p, align 16 > br label %vaarg.end > > vaarg.in_mem: ; preds = %for.body > %overflow_arg_area = load i8** 
%overflow_arg_area_p, align 8 > %overflow_arg_area.next = getelementptr i8* %overflow_arg_area, i64 8 > store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8 > br label %vaarg.end > > vaarg.end: ; preds = %vaarg.in_mem, > %vaarg.in_reg > %gp_offset10 = phi i32 [ %3, %vaarg.in_reg ], [ %gp_offset, > %vaarg.in_mem ] > %vaarg.addr.in = phi i8* [ %2, %vaarg.in_reg ], [ %overflow_arg_area, > %vaarg.in_mem ] > %vaarg.addr = bitcast i8* %vaarg.addr.in to i32* > %4 = load i32* %vaarg.addr, align 4 > %add = add nsw i32 %4, %sum.09 > %inc = add nsw i32 %i.08, 1 > %exitcond = icmp eq i32 %inc, %count > br i1 %exitcond, label %for.end, label %for.body > > for.end: ; preds = %vaarg.end, > %entry > %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %vaarg.end ] > call void @llvm.va_end(i8* %arraydecay1) > ret i32 %sum.0.lcssa > } > > > Notice at the bottom of the block labeled "for.body" that there's a test > that determines > whether to look for an argument on the stack or in a register. I see > something similar w/ or w/o the -O flag. > > This isn't what I was led to expect by the LLVM IR documentation. > Is there a way to avoid this "premature" optimization? > I tried things like -arch mips and -march=mips but get complaints > about unrecognized flags (-arch) or unknown target architecture CPU 'mips'. > Why's that? The man page suggests both should work. > -arch really only works for Darwin. -march works in a way that you aren't expecting; basically, clang is a cross compiler by default, but you do need to specify a triple on the command line using -target. -eric > Thanks, > Preston > > > > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org > http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20150929/cbffe04e/attachment.html>
When I read the LLVM IR manual, the description of the Variable Argument Handling Intrinsics and the va_arg instruction led me to expect IR that uses va_arg instead of loads and register references. I was trying to get hold of the IR before it becomes too machine dependent, i.e., before code generation, but it seems as though va_arg is replaced/expanded very early, before most optimization. Is there a way I can suppress that expansion? I'd really like to get hold of the IR with the va_arg instructions intact. My other question regarding -arch and -march I think involves a misleading (imo) or out-of-date man page. Using -target is an effective way to get code for other machines. Preston On Tue, Sep 29, 2015 at 1:06 AM, mats petersson <mats at planetcatfish.com> wrote:> What is it you are actually trying to do? > > Varargs are architecture dependent, so will need to take into account the > calling convention of the architecture it is on. > > Your errors about 'mips' is probably based on the build of clang on your > machine? > > -- > Mats > > On 29 September 2015 at 00:07, Preston Briggs via llvm-dev < > llvm-dev at lists.llvm.org> wrote: > >> When I use clang on an x86-64 to spit out the LLVM, like this >> >> clang -O -S -emit-llvm varargstest.c >> >> >> where varargstest.c looks like this >> >> int add_em_up(int count, ...) { >> va_list ap; >> int i, sum; >> va_start(ap, count); >> sum = 0; >> for (i = 0; i < count; i++) >> sum += va_arg(ap, int); >> va_end(ap); >> return sum; >> } >> >> >> I see LLVM that looks like it's been customized for the x86-64, >> versus the varargs stuff I was led to expect from the LLVM IR >> documentation. >> >> define i32 @add_em_up(i32 %count, ...) 
#0 { >> entry: >> %ap = alloca [1 x %struct.__va_list_tag], align 16 >> %arraydecay1 = bitcast [1 x %struct.__va_list_tag]* %ap to i8* >> call void @llvm.va_start(i8* %arraydecay1) >> %cmp7 = icmp sgt i32 %count, 0 >> br i1 %cmp7, label %for.body.lr.ph, label %for.end >> >> for.body.lr.ph: ; preds = %entry >> %gp_offset_p = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, >> i64 0, i64 0, i32 0 >> %0 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i64 0, >> i64 0, i32 3 >> %overflow_arg_area_p = getelementptr inbounds [1 x >> %struct.__va_list_tag]* %ap, i64 0, i64 0, i32 2 >> %gp_offset.pre = load i32* %gp_offset_p, align 16 >> br label %for.body >> >> for.body: ; preds = %vaarg.end, % >> for.body.lr.ph >> %gp_offset = phi i32 [ %gp_offset.pre, %for.body.lr.ph ], [ >> %gp_offset10, %vaarg.end ] >> %sum.09 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %vaarg.end ] >> %i.08 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %vaarg.end ] >> %fits_in_gp = icmp ult i32 %gp_offset, 41 >> br i1 %fits_in_gp, label %vaarg.in_reg, label %vaarg.in_mem >> >> vaarg.in_reg: ; preds = %for.body >> %reg_save_area = load i8** %0, align 16 >> %1 = sext i32 %gp_offset to i64 >> %2 = getelementptr i8* %reg_save_area, i64 %1 >> %3 = add i32 %gp_offset, 8 >> store i32 %3, i32* %gp_offset_p, align 16 >> br label %vaarg.end >> >> vaarg.in_mem: ; preds = %for.body >> %overflow_arg_area = load i8** %overflow_arg_area_p, align 8 >> %overflow_arg_area.next = getelementptr i8* %overflow_arg_area, i64 8 >> store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8 >> br label %vaarg.end >> >> vaarg.end: ; preds >> %vaarg.in_mem, %vaarg.in_reg >> %gp_offset10 = phi i32 [ %3, %vaarg.in_reg ], [ %gp_offset, >> %vaarg.in_mem ] >> %vaarg.addr.in = phi i8* [ %2, %vaarg.in_reg ], [ %overflow_arg_area, >> %vaarg.in_mem ] >> %vaarg.addr = bitcast i8* %vaarg.addr.in to i32* >> %4 = load i32* %vaarg.addr, align 4 >> %add = add nsw i32 %4, %sum.09 >> %inc = add nsw i32 %i.08, 1 >> 
%exitcond = icmp eq i32 %inc, %count >> br i1 %exitcond, label %for.end, label %for.body >> >> for.end: ; preds = %vaarg.end, >> %entry >> %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %vaarg.end ] >> call void @llvm.va_end(i8* %arraydecay1) >> ret i32 %sum.0.lcssa >> } >> >> >> Notice at the bottom of the block labeled "for.body" that there's a test >> that determines >> whether to look for an argument on the stack or in a register. I see >> something similar w/ or w/o the -O flag. >> >> This isn't what I was led to expect by the LLVM IR documentation. >> Is there a way to avoid this "premature" optimization? >> I tried things like -arch mips and -march=mips but get complaints >> about unrecognized flags (-arch) or unknown target architecture CPU >> 'mips'. >> Why's that? The man page suggests both should work. >> >> Thanks, >> Preston >> >> >> >> >> _______________________________________________ >> LLVM Developers mailing list >> llvm-dev at lists.llvm.org >> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >> >> >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20150929/423b404c/attachment.html>