Tim Northover via llvm-dev
2019-May-13 18:09 UTC
[llvm-dev] How to change CLang struct alignment behaviour?
Hi Joan, On Mon, 13 May 2019 at 18:01, Joan Lluch <joan.lluch at icloud.com> wrote:> After looking at it a bit further, I think this is a Clang thing. Clang issues “align 2” if the struct has at least one int (2 bytes), but also if the entire struct size is multiple of 2. For example a struct with 4 char members. In these cases the LLVM backend correctly creates word sized load/stores (2 bytes).I'm slightly surprised that it happens based purely on size, but either way LLVM should be able to cope.> The LLVM backend just follows what’s dictated by Clang regarding alignment and thus it creates 2 byte or 1 byte load/stores instructions accordingly. I have not found a way to override this in LLVM. Any suggestions are appreciated.That sounds right, but I don't think it explains the shifts you described before. It should work out a lot better than what you're seeing. Specifically, a 3 byte struct (for example) ought to either lower to: load i16, load i8 + stores if your target can do misaligned i16 operations. or load i8, load i8, load i8 + stores if not. Neither of those involve shifting operations. I'd suggest breaking just after getMemcpyLoadsAndStores and using SelectionDAG::dump to see exactly what it's created. Then try to work out where that gets pessimized to shifts, because it's not normal. Cheers. Tim.
Joan Lluch via llvm-dev
2019-May-13 18:58 UTC
[llvm-dev] How to change CLang struct alignment behaviour?
Hi Tim,
I agree this should be “solved” in LLVM
Just as a matter of information, this is the test source code and the generated
assembly for my target architecture (more comments below):
struct AA
{
char n;
char m;
char j;
};
struct BB
{
char n;
char m;
char j;
char k;
};
extern void convertA( struct AA *a);
extern void convertB( struct BB *b);
void callConvertA()
{
struct AA a = {3, 4};
convertA( &a );
}
void callConvertB()
{
struct BB b = {3, 4};
convertB( &b );
}
callConvertA: ; @callConvertA
; %bb.0: ; %entry
sub SP, #4, SP
mov &.LcallConvertA.a, r0
ld.sb [r0, #2], r1
st.b r1, [SP, #2]
ld.sb [r0, #0], r1
zext r1, r1
ld.sb [r0, #1], r0
zext r0, r0
bswap r0, r0
or r0, r1, r0
st.w r0, [SP, #0]
mov SP, r0
call &convertA
add SP, #4, SP
ret
.Lfunc_end0:
.size callConvertA, .Lfunc_end0-callConvertA
; -- End function
.globl callConvertB ; -- Begin function callConvertB
.p2align 1
.type callConvertB, at function
callConvertB: ; @callConvertB
; %bb.0: ; %entry
sub SP, #4, SP
mov #0, r0
st.w r0, [SP, #2]
mov #1027, r0
st.w r0, [SP, #0]
mov SP, r0
call &convertB
add SP, #4, SP
ret
.Lfunc_end1:
.size callConvertB, .Lfunc_end1-callConvertB
; -- End function
.type .LcallConvertA.a, at object ; @callConvertA.a
.section .rodata,"a", at progbits
.LcallConvertA.a:
.byte 3 ; 0x3
.byte 4 ; 0x4
.byte 0 ; 0x0
.size .LcallConvertA.a, 3
Please note that for this architecture the destination operand is on the RIGHT
HAND SIDE, this is important to know to read the assembly correctly.
“ld.sb" are signextend 8 bit loads
“st.b” are 8 bit trunc stores
“ld.w” are 16 bit word loads
“st.w” are 16 bit word stores
The generated code is functionally correct, but:
- For callConvertA i8, i8, i8 loads + i8, i16 stores are generated. To get the
value of two i8, i8 loads into a i16 store the “zext”+”bswap” (equivalent to
shift) + “or” trick is performed
- For callConvertB i16, i16 loads + i16, i16 stores are generated.
The desired behaviour would be to have i16, i8 loads + i16, i8 stores for the
callConvertA.
The only difference that I can spot by debugging the LLVM source code is that
getMemcpyLoadsAndStores is called with align = 1 for callConvertA, but it is NOT
called for callConvertB.
The Clang generated IR is this:
; ModuleID = 'add.c'
source_filename = "add.c"
target datalayout =
"e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"
target triple = "cpu74"
%struct.AA = type { i8, i8, i8 }
%struct.BB = type { i8, i8, i8, i8 }
@callConvertA.a = private unnamed_addr constant %struct.AA { i8 3, i8 4, i8 0 },
align 1
; Function Attrs: minsize nounwind optsize
define dso_local void @callConvertA() local_unnamed_addr #0 {
entry:
%a = alloca %struct.AA, align 1
%0 = getelementptr inbounds %struct.AA, %struct.AA* %a, i16 0, i32 0
call void @llvm.lifetime.start.p0i8(i64 3, i8* nonnull %0) #3
call void @llvm.memcpy.p0i8.p0i8.i16(i8* nonnull align 1 %0, i8* align 1
getelementptr inbounds (%struct.AA, %struct.AA* @callConvertA.a, i16 0, i32 0),
i16 3, i1 false)
call void @convertA(%struct.AA* nonnull %a) #4
call void @llvm.lifetime.end.p0i8(i64 3, i8* nonnull %0) #3
ret void
}
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
; Function Attrs: argmemonly nounwind
declare void @llvm.memcpy.p0i8.p0i8.i16(i8* nocapture writeonly, i8* nocapture
readonly, i16, i1) #1
; Function Attrs: minsize optsize
declare dso_local void @convertA(%struct.AA*) local_unnamed_addr #2
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
; Function Attrs: minsize nounwind optsize
define dso_local void @callConvertB() local_unnamed_addr #0 {
entry:
%b = alloca i32, align 2
%tmpcast = bitcast i32* %b to %struct.BB*
%0 = bitcast i32* %b to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #3
store i32 1027, i32* %b, align 2
call void @convertB(%struct.BB* nonnull %tmpcast) #4
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #3
ret void
}
; Function Attrs: minsize optsize
declare dso_local void @convertB(%struct.BB*) local_unnamed_addr #2
attributes #0 = { minsize nounwind optsize
"correctly-rounded-divide-sqrt-fp-math"="false"
"disable-tail-calls"="false"
"less-precise-fpmad"="false"
"no-frame-pointer-elim"="true"
"no-frame-pointer-elim-non-leaf"
"no-infs-fp-math"="false"
"no-jump-tables"="false"
"no-nans-fp-math"="false"
"no-signed-zeros-fp-math"="false"
"no-trapping-math"="false"
"stack-protector-buffer-size"="8"
"unsafe-fp-math"="false"
"use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
attributes #2 = { minsize optsize
"correctly-rounded-divide-sqrt-fp-math"="false"
"disable-tail-calls"="false"
"less-precise-fpmad"="false"
"no-frame-pointer-elim"="true"
"no-frame-pointer-elim-non-leaf"
"no-infs-fp-math"="false"
"no-nans-fp-math"="false"
"no-signed-zeros-fp-math"="false"
"no-trapping-math"="false"
"stack-protector-buffer-size"="8"
"unsafe-fp-math"="false"
"use-soft-float"="false" }
attributes #3 = { nounwind }
attributes #4 = { minsize nounwind optsize }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 2}
!1 = !{!"clang version 7.0.1 (tags/RELEASE_701/final)"}
John
Tel: 620 28 45 13
> On 13 May 2019, at 20:09, Tim Northover <t.p.northover at gmail.com>
wrote:
>
> Hi Joan,
>
> On Mon, 13 May 2019 at 18:01, Joan Lluch <joan.lluch at icloud.com>
wrote:
>> After looking at it a bit further, I think this is a Clang thing.
Clang issues “align 2” if the struct has at least one int (2 bytes), but also if
the entire struct size is multiple of 2. For example a struct with 4 char
members. In these cases the LLVM backend correctly creates word sized
load/stores (2 bytes).
>
> I'm slightly surprised that it happens based purely on size, but
> either way LLVM should be able to cope.
>
>> The LLVM backend just follows what’s dictated by Clang regarding
alignment and thus it creates 2 byte or 1 byte load/stores instructions
accordingly. I have not found a way to override this in LLVM. Any suggestions
are appreciated.
>
> That sounds right, but I don't think it explains the shifts you
> described before. It should work out a lot better than what you're
> seeing. Specifically, a 3 byte struct (for example) ought to either
> lower to:
>
> load i16, load i8 + stores if your target can do misaligned i16
operations.
>
> or
>
> load i8, load i8, load i8 + stores if not.
>
> Neither of those involve shifting operations. I'd suggest breaking
> just after getMemcpyLoadsAndStores and using SelectionDAG::dump to see
> exactly what it's created. Then try to work out where that gets
> pessimized to shifts, because it's not normal.
>
> Cheers.
>
> Tim.
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20190513/2c20ac23/attachment.html>
Joan Lluch via llvm-dev
2019-May-14 16:51 UTC
[llvm-dev] How to change CLang struct alignment behaviour?
Hi Tim,
I tracked down the issue to a LLVM omission or bug, or that’s what I think. The
issue can be easily reproduced for targets that do not support misaligned memory
accesses and do not implement allowsMisalignedMemoryAccesses. Let me try to
explain what happens:
In getMemcpyLoadsAndStores, the function FindOptimalMemOpLowering is called to
obtain the list of Store operations that should be generated. If the memcpy
destination is the non-fixed section of the stack, a value of Zero is passed as
the destination alignment (DstAlign) to signal that it can be “changed”.
The FindOptimalMemOpLowering starts by calling the target getOptimalMemOpType.
Most targets implement the later by returning the largest possible EVT that
shall be used to create the load/store pairs. This is generally ok for targets
that can handle non-aligned load/stores. If this function is not implemented,
then the FindOptimalMemOpLowering does that by finding the largest legal integer
type. Again, this is ok if the target would support misaligned accesses. Upon
return, the FindOptimalMemOpLowering has found the minimal list of memory
operations required to complete the memcpy.
Next, the getMemcpyLoadsAndStores continues by generating the required
load/store pairs, which will result into too large loads/stores for targets that
do not support misaligned accesses.
Later on, during the legalizeDAG phase, the function LegalizeLoadOps is invoked
which eventually calls allowsMemoryAccess function, which only now calls
allowsMisalignedMemoryAccesses to determine that it must replace the too long
loads by the odd Shifts and OR aggregates of smaller loads, the latter happens
inside expandUnalignedLoad
I would have expected, that for targets not supporting misaligned accesses, the
list of load/stores generated around the getMemcpyLoadsAndStores function, would
have respected such target limitation and would have not produced misaligned
load/stores to begin with. Instead, the default implementation defers that for
later and ends replacing such incorrect load/stores by odd expanded aggregates.
The workaround that I found was implementing getOptimalMemOpType in a way that
rather than finding the highest possible EVT to minimise the number of
load/stores, it would provide one that will never get greater than the SrcAlign.
So this is what I have now:
EVT CPU74TargetLowering::getOptimalMemOpType(uint64_t Size,
unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset,
bool MemcpyStrSrc, MachineFunction &MF) const
{
if ( DstAlign == 0 && !IsMemset )
{
// Return the EVT of the source
DstAlign = SrcAlign;
EVT VT = MVT::i64;
while (DstAlign && DstAlign < VT.getSizeInBits() / 8)
VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
return VT;
}
return MVT::Other;
}
Maybe, this is what we are expected to do after all, but it still looks somewhat
weird to me that the default LLVM implementation would not take care of
misaligned memory access targets in a more appropriate way. Solving this issue
is just a matter of including the code above in FindOptimalMemOpLowering for
such targets. The current implementation does just the opposite, and will even
use MVT::i64 as the destination align even if the source align is only MVT::i8,
which I regard as totally wrong.
This problem is also shared by the MSP430 target, and it’s very easy to
reproduce by just compiling the code that I posted before.
Thanks
John Lluch
> On 13 May 2019, at 20:09, Tim Northover <t.p.northover at gmail.com>
wrote:
>
> Hi Joan,
>
> On Mon, 13 May 2019 at 18:01, Joan Lluch <joan.lluch at icloud.com>
wrote:
>> After looking at it a bit further, I think this is a Clang thing.
Clang issues “align 2” if the struct has at least one int (2 bytes), but also if
the entire struct size is multiple of 2. For example a struct with 4 char
members. In these cases the LLVM backend correctly creates word sized
load/stores (2 bytes).
>
> I'm slightly surprised that it happens based purely on size, but
> either way LLVM should be able to cope.
>
>> The LLVM backend just follows what’s dictated by Clang regarding
alignment and thus it creates 2 byte or 1 byte load/stores instructions
accordingly. I have not found a way to override this in LLVM. Any suggestions
are appreciated.
>
> That sounds right, but I don't think it explains the shifts you
> described before. It should work out a lot better than what you're
> seeing. Specifically, a 3 byte struct (for example) ought to either
> lower to:
>
> load i16, load i8 + stores if your target can do misaligned i16
operations.
>
> or
>
> load i8, load i8, load i8 + stores if not.
>
> Neither of those involve shifting operations. I'd suggest breaking
> just after getMemcpyLoadsAndStores and using SelectionDAG::dump to see
> exactly what it's created. Then try to work out where that gets
> pessimized to shifts, because it's not normal.
>
> Cheers.
>
> Tim.
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20190514/f16d1252/attachment.html>
Tim Northover via llvm-dev
2019-May-14 17:55 UTC
[llvm-dev] How to change CLang struct alignment behaviour?
Hi John, On Tue, 14 May 2019 at 17:51, Joan Lluch <joan.lluch at icloud.com> wrote:> This problem is also shared by the MSP430 target, and it’s very easy to reproduce by just compiling the code that I posted before.That's some good detective work; it definitely explains what you're seeing. Since MSP430 is affected it would probably be pretty easy to upstream an alignment-aware version of the function and test it, if you're keen. I'd promise to do a review promptly! Cheers. Tim.