thr3ads.net - llvm dev - [llvm-dev] KNL Assembly Code for Matrix Multiplication [Jun 2017]

If this information is useful, please help other people find it:
Share via:

hameeza ahmed via llvm-dev

2017-Jun-30 12:28 UTC

[llvm-dev] KNL Assembly Code for Matrix Multiplication

Hello, I want some help in understanding knl intel assembly of matrix
multiplication code. some of the things are not clear;

here .c file:

#include <stdio.h>
#define N 1000

// This function multiplies A[][] and B[][], and stores
// the result in C[][]
void multiply(int A[][N], int B[][N], int C[][N])
{
    int i, j, k, r;
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            r = 0;
            for (k = 0; k < N; k++) {
                r += A[i][k]*B[k][j];}
                C[i][j] = r;

        }

    }
}

here .s file: * the code that i want to ask is in red color.*

.text
.intel_syntax noprefix
.file "matn_o3.ll"
.section .rodata,"a",@progbits
.p2align 6
.LCPI0_0:
.quad 8                       # 0x8
.quad 9                       # 0x9
.quad 10                      # 0xa
.quad 11                      # 0xb
.quad 12                      # 0xc
.quad 13                      # 0xd
.quad 14                      # 0xe
.quad 15                      # 0xf
.LCPI0_1:
.quad 0                       # 0x0
.quad 1                       # 0x1
.quad 2                       # 0x2
.quad 3                       # 0x3
.quad 4                       # 0x4
.quad 5                       # 0x5
.quad 6                       # 0x6
.quad 7                       # 0x7
.section .rodata.cst8,"aM",@progbits,8
.p2align 3
.LCPI0_2:
.quad 4000                    # 0xfa0
.LCPI0_3:
.quad 64000                   # 0xfa00
.LCPI0_4:
.quad 128000                  # 0x1f400
.LCPI0_5:
.quad 192000                  # 0x2ee00
.LCPI0_6:
.quad 64                      # 0x40
.text
.globl multiply
.p2align 4, 0x90
.type multiply,@function
multiply:                               # @multiply
.cfi_startproc
# BB#0:
push rbp
.Lcfi0:
.cfi_def_cfa_offset 16
push r15
.Lcfi1:
.cfi_def_cfa_offset 24
push r14
.Lcfi2:
.cfi_def_cfa_offset 32
push r12
.Lcfi3:
.cfi_def_cfa_offset 40
push rbx
.Lcfi4:
.cfi_def_cfa_offset 48
.Lcfi5:
.cfi_offset rbx, -48
.Lcfi6:
.cfi_offset r12, -40
.Lcfi7:
.cfi_offset r14, -32
.Lcfi8:
.cfi_offset r15, -24
.Lcfi9:
.cfi_offset rbp, -16
lea r8, [rdi + 3856]
xor r9d, r9d
vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 = [8,9,10,11,12,13,14,15]
vmovdqa64 zmm23, zmmword ptr [rip + .LCPI0_1] # zmm23 = [0,1,2,3,4,5,6,7]
vpbroadcastq zmm2, qword ptr [rip + .LCPI0_2]
vpbroadcastq zmm3, rsi
add rsi, 3856000
vpbroadcastq zmm4, qword ptr [rip + .LCPI0_3]
vpbroadcastq zmm5, qword ptr [rip + .LCPI0_4]
vpbroadcastq zmm6, qword ptr [rip + .LCPI0_5]
kxnorw k1, k0, k0
kshiftrw k1, k1, 8
vpbroadcastq zmm7, qword ptr [rip + .LCPI0_6]
.p2align 4, 0x90
.LBB0_1:                                # %.preheader26
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_2 Depth 2
                                        #       Child Loop BB0_3 Depth 3
                                        #       Child Loop BB0_5 Depth 3
xor r11d, r11d
.p2align 4, 0x90
.LBB0_2:                                # %.preheader
                                        #   Parent Loop BB0_1 Depth=1
                                        # =>  This Loop Header: Depth=2
                                        #       Child Loop BB0_3 Depth 3
                                        #       Child Loop BB0_5 Depth 3
vpxord zmm8, zmm8, zmm8
mov ecx, 960
vmovdqa64 zmm9, zmm23
vmovdqa64 zmm10, zmm22
vpxord zmm11, zmm11, zmm11
vpxord zmm12, zmm12, zmm12
vpxord zmm13, zmm13, zmm13
.p2align 4, 0x90
.LBB0_3:                                # %vector.body
                                        #   Parent Loop BB0_1 Depth=1
                                        #     Parent Loop BB0_2 Depth=2
                                        # =>    This Inner Loop Header:
Depth=3
                                        # this bb will run 15 times
vmovq rax, xmm9
imul r10, r9, 4000
lea rbx, [rdi + r10]
*vpmuludq zmm14, zmm10, zmm2       ; this is BB for vector here we have to
do gather for B due to arbitrary addresses so here
zmm10=[8,9,10,11,12,13,14,15]. it means zmm10 contains 8 values present in
these indexes? and zmm2=[4000, 4000,.....4000]. these are the indexes for B
we need to multiple indexes with stride=4000. i know here these indexes are
64 bit but the values stored in these locations are 32 bits then  the load
using zmm10 index will give 8 elements of 32 bits present in these
locations, so do the registers contain 8 elements of 32 bits present at
specified indexes?? so after multiplication we get indexes for higher 8
elements of B i.e [3200,3600,40000,.......54000].*

* vpsrlq zmm15, zmm10, 32              ; i dont understand the need for
this step, please explain the purpose of all these steps. here vpsrlq will
shift right zmm10 values by 256 bits (32*8)....zmmm10
initially=**[8,9,10,11,12,13,14,15].
it will now become [0,0,0,0,8,9,10,11]...Am I correct? Please explain me
the purpose of this step.*
* vpmuludq zmm15, zmm15, zmm2  ;    similarly **dont understand the need
for this step.*
* vpsllq zmm15, zmm15, 32    ; **dont understand the need for this step*
* vpaddq zmm14, zmm14, zmm3  ; *
* vpaddq zmm14, zmm15, zmm14 ; **dont understand the need for this step*
* vpbroadcastq zmm15, r11 ; **r11 changes when loop variable j changes
whats the need of this step?*
* vpsllq zmm15, zmm15, 2   ; **dont understand the need for this step*
* vpaddq zmm14, zmm14, zmm15 ; **dont understand the need for this step*
* vpmuludq zmm16, zmm9, zmm2 ; **here same as before the lower 8 elements
of B indexes are computed as Zmm16=[0,4000,8000,.......28000]*
* vpsrlq zmm17, zmm9, 32   **; **dont understand the need for this step*
* vpmuludq zmm17, zmm17, zmm2  **; **dont understand the need for this step*
* vpsllq zmm17, zmm17, 32  **; **dont understand the need for this step*
* vpaddq zmm16, zmm16, zmm3  *
* vpaddq zmm16, zmm17, zmm16  **; **dont understand the need for this step*
* vpaddq zmm15, zmm16, zmm15  **; **dont understand the need for this step*
* vpaddq zmm16, zmm15, zmm4*
* vpaddq zmm17, zmm14, zmm4*
* vpaddq zmm18, zmm15, zmm5*
* vpaddq zmm19, zmm14, zmm5*
* vpaddq zmm20, zmm15, zmm6*
* vpaddq zmm21, zmm14, zmm6*
* kmovw k2, k1  **; **dont understand the need for this step*
* vpgatherqd ymm0 {k2}, zmmword ptr [zmm14] ; since zmm14 contains 8
indexes ( or values at these 8 indexes???) so it will load 8 elements not
16. here it should be zmm14**=[3200,3600,40000,.......54000]. but by the
above computation these indexes are changes??*
* kxnorw k2, k0, k0  **; **dont understand the need for this step*
* vpgatherqd ymm14 {k2}, zmmword ptr [zmm15]   **; **here again issues with
index zmm15. it should be **[0,4000,8000,.......28000] but its different
due to above computation.*
* vinserti64x4 zmm0, zmm14, ymm0, 1*
* kmovw k2, k1*
* vpgatherqd ymm14 {k2}, zmmword ptr [zmm17]*
* kxnorw k2, k0, k0*
* vpgatherqd ymm15 {k2}, zmmword ptr [zmm16]*
* vinserti64x4 zmm14, zmm15, ymm14, 1*
* kmovw k2, k1*
* vpgatherqd ymm15 {k2}, zmmword ptr [zmm19]*
* kxnorw k2, k0, k0*
* vpgatherqd ymm16 {k2}, zmmword ptr [zmm18]*
* vinserti64x4 zmm15, zmm16, ymm15, 1*
* kmovw k2, k1*
* vpgatherqd ymm1 {k2}, zmmword ptr [zmm21]*
* kxnorw k2, k0, k0*
* vpgatherqd ymm16 {k2}, zmmword ptr [zmm20]*
* vinserti64x4 zmm1, zmm16, ymm1, 1*
* vpmulld zmm0, zmm0, zmmword ptr [rbx + 4*rax]*
vpmulld zmm14, zmm14, zmmword ptr [rbx + 4*rax + 64]
vpmulld zmm15, zmm15, zmmword ptr [rbx + 4*rax + 128]
vpmulld zmm1, zmm1, zmmword ptr [rbx + 4*rax + 192]
vpaddd zmm8, zmm0, zmm8
vpaddd zmm11, zmm14, zmm11
vpaddd zmm12, zmm15, zmm12
vpaddd zmm13, zmm1, zmm13
vpaddq zmm9, zmm9, zmm7        #zmm7=64
vpaddq zmm10, zmm10, zmm7
add rcx, -64     #decrement counter by 64
jne .LBB0_3       # if rcx not equal to zero goto .lbbo_3
# BB#4:                                 # %middle.block
                                        #   in Loop: Header=BB0_2 Depth=2
vpaddd zmm0, zmm11, zmm8
vpaddd zmm0, zmm12, zmm0
vpaddd zmm0, zmm13, zmm0
*vshufi64x2 zmm1, zmm0, zmm0, 14 # zmm1 = zmm0[4,5,6,7,0,1,0,1]   ; please
explain how shuffle instructions work here. i know of llvm ir shuffle, but
these assembly ones are difficult for me to understand*
* vpaddd zmm0, zmm0, zmm1*
* vshufi64x2 zmm1, zmm0, zmm0, 1 # zmm1 = zmm0[2,3,0,1,0,1,0,1]*
* vpaddd zmm0, zmm0, zmm1*
* vpshufd zmm1, zmm0, 238         # zmm1 =
zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]*
* vpaddd zmm0, zmm0, zmm1*
* vpshufd zmm1, zmm0, 229         # zmm1 =
zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]*
vpaddd zmm0, zmm0, zmm1
vmovd ebx, xmm0
mov rax, r8
xor r14d, r14d
.p2align 4, 0x90
.LBB0_5:                                #   Parent Loop BB0_1 Depth=1
                                        #     Parent Loop BB0_2 Depth=2
                                        # =>    This Inner Loop Header:
Depth=3
lea r15, [rsi + r14]
mov r12d, dword ptr [r15 + 4*r11 - 16000]
imul r12d, dword ptr [rax - 16]
mov ecx, dword ptr [r15 + 4*r11 - 12000]
imul ecx, dword ptr [rax - 12]
mov ebp, dword ptr [r15 + 4*r11 - 8000]
imul ebp, dword ptr [rax - 8]
add r12d, ebx
add ecx, r12d
add ebp, ecx
mov ecx, dword ptr [r15 + 4*r11 - 4000]
imul ecx, dword ptr [rax - 4]
add ecx, ebp
mov ebx, dword ptr [r15 + 4*r11]
imul ebx, dword ptr [rax]
add ebx, ecx
add r14, 20000
add rax, 20
cmp r14, 160000
jne .LBB0_5
# BB#6:                                 # %.loopexit
                                        #   in Loop: Header=BB0_2 Depth=2
add r10, rdx                #rdx is c[][]
mov dword ptr [r10 + 4*r11], ebx
inc r11
cmp r11, 1000
jne .LBB0_2
# BB#7:                                 #   in Loop: Header=BB0_1 Depth=1
inc r9
add r8, 4000
cmp r9, 1000
jne .LBB0_1
# BB#8:
pop rbx
pop r12
pop r14
pop r15
pop rbp
ret


Looking forward to your reply

Thank You
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20170630/8c57090e/attachment.html>

Craig Topper via llvm-dev

2017-Jun-30 15:59 UTC

head link

[llvm-dev] KNL Assembly Code for Matrix Multiplication

Some comments inline, I'll need to look more later.

~Craig

On Fri, Jun 30, 2017 at 5:28 AM, hameeza ahmed via llvm-dev <
llvm-dev at lists.llvm.org> wrote:
> Hello, I want some help in understanding knl intel assembly of matrix
> multiplication code. some of the things are not clear;
>
> here .c file:
>
> #include <stdio.h>
> #define N 1000
>
> // This function multiplies A[][] and B[][], and stores
> // the result in C[][]
> void multiply(int A[][N], int B[][N], int C[][N])
> {
>     int i, j, k, r;
>     for (i = 0; i < N; i++)
>     {
>         for (j = 0; j < N; j++)
>         {
>             r = 0;
>             for (k = 0; k < N; k++) {
>                 r += A[i][k]*B[k][j];}
>                 C[i][j] = r;
>
>         }
>
>     }
> }
>
> here .s file: * the code that i want to ask is in red color.*
>
> .text
> .intel_syntax noprefix
> .file "matn_o3.ll"
> .section .rodata,"a", at progbits
> .p2align 6
> .LCPI0_0:
> .quad 8                       # 0x8
> .quad 9                       # 0x9
> .quad 10                      # 0xa
> .quad 11                      # 0xb
> .quad 12                      # 0xc
> .quad 13                      # 0xd
> .quad 14                      # 0xe
> .quad 15                      # 0xf
> .LCPI0_1:
> .quad 0                       # 0x0
> .quad 1                       # 0x1
> .quad 2                       # 0x2
> .quad 3                       # 0x3
> .quad 4                       # 0x4
> .quad 5                       # 0x5
> .quad 6                       # 0x6
> .quad 7                       # 0x7
> .section .rodata.cst8,"aM", at progbits,8
> .p2align 3
> .LCPI0_2:
> .quad 4000                    # 0xfa0
> .LCPI0_3:
> .quad 64000                   # 0xfa00
> .LCPI0_4:
> .quad 128000                  # 0x1f400
> .LCPI0_5:
> .quad 192000                  # 0x2ee00
> .LCPI0_6:
> .quad 64                      # 0x40
> .text
> .globl multiply
> .p2align 4, 0x90
> .type multiply, at function
> multiply:                               # @multiply
> .cfi_startproc
> # BB#0:
> push rbp
> .Lcfi0:
> .cfi_def_cfa_offset 16
> push r15
> .Lcfi1:
> .cfi_def_cfa_offset 24
> push r14
> .Lcfi2:
> .cfi_def_cfa_offset 32
> push r12
> .Lcfi3:
> .cfi_def_cfa_offset 40
> push rbx
> .Lcfi4:
> .cfi_def_cfa_offset 48
> .Lcfi5:
> .cfi_offset rbx, -48
> .Lcfi6:
> .cfi_offset r12, -40
> .Lcfi7:
> .cfi_offset r14, -32
> .Lcfi8:
> .cfi_offset r15, -24
> .Lcfi9:
> .cfi_offset rbp, -16
> lea r8, [rdi + 3856]
> xor r9d, r9d
> vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 >
[8,9,10,11,12,13,14,15]
> vmovdqa64 zmm23, zmmword ptr [rip + .LCPI0_1] # zmm23 = [0,1,2,3,4,5,6,7]
> vpbroadcastq zmm2, qword ptr [rip + .LCPI0_2]
> vpbroadcastq zmm3, rsi
> add rsi, 3856000
> vpbroadcastq zmm4, qword ptr [rip + .LCPI0_3]
> vpbroadcastq zmm5, qword ptr [rip + .LCPI0_4]
> vpbroadcastq zmm6, qword ptr [rip + .LCPI0_5]
> kxnorw k1, k0, k0
> kshiftrw k1, k1, 8
> vpbroadcastq zmm7, qword ptr [rip + .LCPI0_6]
> .p2align 4, 0x90
> .LBB0_1:                                # %.preheader26
>                                         # =>This Loop Header: Depth=1
>                                         #     Child Loop BB0_2 Depth 2
>                                         #       Child Loop BB0_3 Depth 3
>                                         #       Child Loop BB0_5 Depth 3
> xor r11d, r11d
> .p2align 4, 0x90
> .LBB0_2:                                # %.preheader
>                                         #   Parent Loop BB0_1 Depth=1
>                                         # =>  This Loop Header: Depth=2
>                                         #       Child Loop BB0_3 Depth 3
>                                         #       Child Loop BB0_5 Depth 3
> vpxord zmm8, zmm8, zmm8
> mov ecx, 960
> vmovdqa64 zmm9, zmm23
> vmovdqa64 zmm10, zmm22
> vpxord zmm11, zmm11, zmm11
> vpxord zmm12, zmm12, zmm12
> vpxord zmm13, zmm13, zmm13
> .p2align 4, 0x90
> .LBB0_3:                                # %vector.body
>                                         #   Parent Loop BB0_1 Depth=1
>                                         #     Parent Loop BB0_2 Depth=2
>                                         # =>    This Inner Loop Header:
> Depth=3
>                                         # this bb will run 15 times
> vmovq rax, xmm9
> imul r10, r9, 4000
> lea rbx, [rdi + r10]
> *vpmuludq zmm14, zmm10, zmm2       ; this is BB for vector here we have to
> do gather for B due to arbitrary addresses so here
> zmm10=[8,9,10,11,12,13,14,15]. it means zmm10 contains 8 values present in
> these indexes? and zmm2=[4000, 4000,.....4000]. these are the indexes for B
> we need to multiple indexes with stride=4000. i know here these indexes are
> 64 bit but the values stored in these locations are 32 bits then  the load
> using zmm10 index will give 8 elements of 32 bits present in these
> locations, so do the registers contain 8 elements of 32 bits present at
> specified indexes?? so after multiplication we get indexes for higher 8
> elements of B i.e [3200,3600,40000,.......54000].*
>
> * vpsrlq zmm15, zmm10, 32              ; i dont understand the need for
> this step, please explain the purpose of all these steps. here vpsrlq will
> shift right zmm10 values by 256 bits (32*8)....zmmm10
initially=**[8,9,10,11,12,13,14,15].
> it will now become [0,0,0,0,8,9,10,11]...Am I correct? Please explain me
> the purpose of this step.*
> * vpmuludq zmm15, zmm15, zmm2  ;    similarly **dont understand the need
> for this step.*
> * vpsllq zmm15, zmm15, 32    ; **dont understand the need for this step*
> * vpaddq zmm14, zmm14, zmm3  ; *
> * vpaddq zmm14, zmm15, zmm14 ; **dont understand the need for this step*
>
vpsrlq zmm15, zmm10, 32 shifts every 64-bit element in zmm10 right by 32
bits. I believe this is effectively taking every odd-numbered 32-bit element
and moving it to the next lowest even-numbered 32-bit element.

vpmuludq multiplies all even-numbered 32-bit elements and creates 64-bit
results.

The combination of the shifts, vpmuludq, and vpaddq is to multiply 64-bit
elements and create a 64-bit element result. We don't have an instruction
for this, so we have to multiply the low 32 bits of each element and the
high 32 bits of each element separately and add the results together (the
high-half product is shifted left by 32 before the add). Looks like we
determined that the high 32 bits of one of the inputs are all zeros (here
zmm2, the broadcast constant 4000), so we skipped one of the multiplies and
adds that would normally be required for this operation.


> * vpbroadcastq zmm15, r11 ; **r11 changes when loop variable j changes
> whats the need of this step?*
> * vpsllq zmm15, zmm15, 2   ; **dont understand the need for this step*
> * vpaddq zmm14, zmm14, zmm15 ; **dont understand the need for this step*
> * vpmuludq zmm16, zmm9, zmm2 ; **here same as before the lower 8 elements
> of B indexes are computed as Zmm16=[0,4000,8000,.......28000]*
> * vpsrlq zmm17, zmm9, 32   **; **dont understand the need for this step*
> * vpmuludq zmm17, zmm17, zmm2  **; **dont understand the need for this
> step*
> * vpsllq zmm17, zmm17, 32  **; **dont understand the need for this step*
> * vpaddq zmm16, zmm16, zmm3  *
> * vpaddq zmm16, zmm17, zmm16  **; **dont understand the need for this
> step*
> * vpaddq zmm15, zmm16, zmm15  **; **dont understand the need for this
> step*
> * vpaddq zmm16, zmm15, zmm4*
> * vpaddq zmm17, zmm14, zmm4*
> * vpaddq zmm18, zmm15, zmm5*
> * vpaddq zmm19, zmm14, zmm5*
> * vpaddq zmm20, zmm15, zmm6*
> * vpaddq zmm21, zmm14, zmm6*
> * kmovw k2, k1  **; **dont understand the need for this step*
>
The gather instruction requires a mask of which elements to read. When the
gather completes, if there are no faults, it will have written the mask
register to 0. So the mask needs to be reloaded before each gather.

> * vpgatherqd ymm0 {k2}, zmmword ptr [zmm14] ; since zmm14 contains 8
> indexes ( or values at these 8 indexes???) so it will load 8 elements not
> 16. here it should be zmm14**=[3200,3600,40000,.......54000]. but by the
> above computation these indexes are changes??*
> * kxnorw k2, k0, k0  **; **dont understand the need for this step*
>* vpgatherqd ymm14 {k2}, zmmword ptr [zmm15]   **; **here again issues
with> index zmm15. it should be **[0,4000,8000,.......28000] but its different
> due to above computation.*
> * vinserti64x4 zmm0, zmm14, ymm0, 1*
> * kmovw k2, k1*
> * vpgatherqd ymm14 {k2}, zmmword ptr [zmm17]*
> * kxnorw k2, k0, k0*
> * vpgatherqd ymm15 {k2}, zmmword ptr [zmm16]*
> * vinserti64x4 zmm14, zmm15, ymm14, 1*
> * kmovw k2, k1*
> * vpgatherqd ymm15 {k2}, zmmword ptr [zmm19]*
> * kxnorw k2, k0, k0*
> * vpgatherqd ymm16 {k2}, zmmword ptr [zmm18]*
> * vinserti64x4 zmm15, zmm16, ymm15, 1*
> * kmovw k2, k1*
> * vpgatherqd ymm1 {k2}, zmmword ptr [zmm21]*
> * kxnorw k2, k0, k0*
> * vpgatherqd ymm16 {k2}, zmmword ptr [zmm20]*
> * vinserti64x4 zmm1, zmm16, ymm1, 1*
> * vpmulld zmm0, zmm0, zmmword ptr [rbx + 4*rax]*
> vpmulld zmm14, zmm14, zmmword ptr [rbx + 4*rax + 64]
> vpmulld zmm15, zmm15, zmmword ptr [rbx + 4*rax + 128]
> vpmulld zmm1, zmm1, zmmword ptr [rbx + 4*rax + 192]
> vpaddd zmm8, zmm0, zmm8
> vpaddd zmm11, zmm14, zmm11
> vpaddd zmm12, zmm15, zmm12
> vpaddd zmm13, zmm1, zmm13
> vpaddq zmm9, zmm9, zmm7        #zmm7=64
> vpaddq zmm10, zmm10, zmm7
> add rcx, -64     #decrement counter by 64
> jne .LBB0_3       # if rcx not equal to zero goto .lbbo_3
> # BB#4:                                 # %middle.block
>                                         #   in Loop: Header=BB0_2 Depth=2
> vpaddd zmm0, zmm11, zmm8
> vpaddd zmm0, zmm12, zmm0
> vpaddd zmm0, zmm13, zmm0
> *vshufi64x2 zmm1, zmm0, zmm0, 14 # zmm1 = zmm0[4,5,6,7,0,1,0,1]   ; please
> explain how shuffle instructions work here. i know of llvm ir shuffle, but
> these assembly ones are difficult for me to understand*
>
You have to look at the size of the register being mentioned and the number
of elements in brackets. In this case the register is 512 bits and the
number of elements is 8. 512/8 is 64. So it's a shuffle of a v8i64 vector.
Then we read the element numbers from left to right, just like the shuffle
IR instruction.

So element 0 of zmm1 gets the value of element 4 of zmm0. Element 1 of zmm1
gets the value of element 5 of zmm0, etc.

> * vpaddd zmm0, zmm0, zmm1*
> * vshufi64x2 zmm1, zmm0, zmm0, 1 # zmm1 = zmm0[2,3,0,1,0,1,0,1]*
> * vpaddd zmm0, zmm0, zmm1*
> * vpshufd zmm1, zmm0, 238         # zmm1 >
zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]*
> * vpaddd zmm0, zmm0, zmm1*
> * vpshufd zmm1, zmm0, 229         # zmm1 >
zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]*
> vpaddd zmm0, zmm0, zmm1
> vmovd ebx, xmm0
> mov rax, r8
> xor r14d, r14d
> .p2align 4, 0x90
> .LBB0_5:                                #   Parent Loop BB0_1 Depth=1
>                                         #     Parent Loop BB0_2 Depth=2
>                                         # =>    This Inner Loop Header:
> Depth=3
> lea r15, [rsi + r14]
> mov r12d, dword ptr [r15 + 4*r11 - 16000]
> imul r12d, dword ptr [rax - 16]
> mov ecx, dword ptr [r15 + 4*r11 - 12000]
> imul ecx, dword ptr [rax - 12]
> mov ebp, dword ptr [r15 + 4*r11 - 8000]
> imul ebp, dword ptr [rax - 8]
> add r12d, ebx
> add ecx, r12d
> add ebp, ecx
> mov ecx, dword ptr [r15 + 4*r11 - 4000]
> imul ecx, dword ptr [rax - 4]
> add ecx, ebp
> mov ebx, dword ptr [r15 + 4*r11]
> imul ebx, dword ptr [rax]
> add ebx, ecx
> add r14, 20000
> add rax, 20
> cmp r14, 160000
> jne .LBB0_5
> # BB#6:                                 # %.loopexit
>                                         #   in Loop: Header=BB0_2 Depth=2
> add r10, rdx                #rdx is c[][]
> mov dword ptr [r10 + 4*r11], ebx
> inc r11
> cmp r11, 1000
> jne .LBB0_2
> # BB#7:                                 #   in Loop: Header=BB0_1 Depth=1
> inc r9
> add r8, 4000
> cmp r9, 1000
> jne .LBB0_1
> # BB#8:
> pop rbx
> pop r12
> pop r14
> pop r15
> pop rbp
> ret
>
>
> Looking forward to your reply
>
> Thank You
>
>
>
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>
>-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20170630/26dc30e7/attachment-0001.html>

hameeza ahmed via llvm-dev

2017-Jun-30 22:47 UTC

head link

[llvm-dev] Fwd: KNL Assembly Code for Matrix Multiplication

---------- Forwarded message ----------
From: hameeza ahmed <hahmed2305 at gmail.com>
Date: Sat, Jul 1, 2017 at 3:46 AM
Subject: Re: [llvm-dev] KNL Assembly Code for Matrix Multiplication
To: Craig Topper <craig.topper at gmail.com>


Thank You.

In this step:
vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 = [8,9,10,11,12,13,14,15]
the indexes are 64 bit, but the elements stored at these positions are 32 bit,
since we are dealing with integers and the IR also shows this.
Here we are loading 32-bit values from those 64-bit indexes, which means
zmm22 will hold 32-bit values from these 64-bit positions, so there is
capacity for 16 32-bit elements. Then why all this??

this is mentioned in IR as

  %5 = getelementptr inbounds [1000 x i32], [1000 x i32]* %0, i64
%indvars.iv34, i64 %4
  %6 = bitcast i32* %5 to <16 x i32>*
  %wide.load = load <16 x i32>, <16 x i32>* %6, align 4, !tbaa !1


Here the indvars are 64-bit values, but the values loaded from these indexes
(step 3) are 32 bit???

Please correct me.





On Fri, Jun 30, 2017 at 8:59 PM, Craig Topper <craig.topper at gmail.com>
wrote:
> Some comments inline, I'll need to look more later.
>
> ~Craig
>
> On Fri, Jun 30, 2017 at 5:28 AM, hameeza ahmed via llvm-dev <
> llvm-dev at lists.llvm.org> wrote:
>
>> Hello, I want some help in understanding knl intel assembly of matrix
>> multiplication code. some of the things are not clear;
>>
>> here .c file:
>>
>> #include <stdio.h>
>> #define N 1000
>>
>> // This function multiplies A[][] and B[][], and stores
>> // the result in C[][]
>> void multiply(int A[][N], int B[][N], int C[][N])
>> {
>>     int i, j, k, r;
>>     for (i = 0; i < N; i++)
>>     {
>>         for (j = 0; j < N; j++)
>>         {
>>             r = 0;
>>             for (k = 0; k < N; k++) {
>>                 r += A[i][k]*B[k][j];}
>>                 C[i][j] = r;
>>
>>         }
>>
>>     }
>> }
>>
>> here .s file: * the code that i want to ask is in red color.*
>>
>> .text
>> .intel_syntax noprefix
>> .file "matn_o3.ll"
>> .section .rodata,"a", at progbits
>> .p2align 6
>> .LCPI0_0:
>> .quad 8                       # 0x8
>> .quad 9                       # 0x9
>> .quad 10                      # 0xa
>> .quad 11                      # 0xb
>> .quad 12                      # 0xc
>> .quad 13                      # 0xd
>> .quad 14                      # 0xe
>> .quad 15                      # 0xf
>> .LCPI0_1:
>> .quad 0                       # 0x0
>> .quad 1                       # 0x1
>> .quad 2                       # 0x2
>> .quad 3                       # 0x3
>> .quad 4                       # 0x4
>> .quad 5                       # 0x5
>> .quad 6                       # 0x6
>> .quad 7                       # 0x7
>> .section .rodata.cst8,"aM", at progbits,8
>> .p2align 3
>> .LCPI0_2:
>> .quad 4000                    # 0xfa0
>> .LCPI0_3:
>> .quad 64000                   # 0xfa00
>> .LCPI0_4:
>> .quad 128000                  # 0x1f400
>> .LCPI0_5:
>> .quad 192000                  # 0x2ee00
>> .LCPI0_6:
>> .quad 64                      # 0x40
>> .text
>> .globl multiply
>> .p2align 4, 0x90
>> .type multiply, at function
>> multiply:                               # @multiply
>> .cfi_startproc
>> # BB#0:
>> push rbp
>> .Lcfi0:
>> .cfi_def_cfa_offset 16
>> push r15
>> .Lcfi1:
>> .cfi_def_cfa_offset 24
>> push r14
>> .Lcfi2:
>> .cfi_def_cfa_offset 32
>> push r12
>> .Lcfi3:
>> .cfi_def_cfa_offset 40
>> push rbx
>> .Lcfi4:
>> .cfi_def_cfa_offset 48
>> .Lcfi5:
>> .cfi_offset rbx, -48
>> .Lcfi6:
>> .cfi_offset r12, -40
>> .Lcfi7:
>> .cfi_offset r14, -32
>> .Lcfi8:
>> .cfi_offset r15, -24
>> .Lcfi9:
>> .cfi_offset rbp, -16
>> lea r8, [rdi + 3856]
>> xor r9d, r9d
>> vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 >>
[8,9,10,11,12,13,14,15]
>> vmovdqa64 zmm23, zmmword ptr [rip + .LCPI0_1] # zmm23 =
[0,1,2,3,4,5,6,7]
>> vpbroadcastq zmm2, qword ptr [rip + .LCPI0_2]
>> vpbroadcastq zmm3, rsi
>> add rsi, 3856000
>> vpbroadcastq zmm4, qword ptr [rip + .LCPI0_3]
>> vpbroadcastq zmm5, qword ptr [rip + .LCPI0_4]
>> vpbroadcastq zmm6, qword ptr [rip + .LCPI0_5]
>> kxnorw k1, k0, k0
>> kshiftrw k1, k1, 8
>> vpbroadcastq zmm7, qword ptr [rip + .LCPI0_6]
>> .p2align 4, 0x90
>> .LBB0_1:                                # %.preheader26
>>                                         # =>This Loop Header:
Depth=1
>>                                         #     Child Loop BB0_2 Depth 2
>>                                         #       Child Loop BB0_3 Depth
3
>>                                         #       Child Loop BB0_5 Depth
3
>> xor r11d, r11d
>> .p2align 4, 0x90
>> .LBB0_2:                                # %.preheader
>>                                         #   Parent Loop BB0_1 Depth=1
>>                                         # =>  This Loop Header:
Depth=2
>>                                         #       Child Loop BB0_3 Depth
3
>>                                         #       Child Loop BB0_5 Depth
3
>> vpxord zmm8, zmm8, zmm8
>> mov ecx, 960
>> vmovdqa64 zmm9, zmm23
>> vmovdqa64 zmm10, zmm22
>> vpxord zmm11, zmm11, zmm11
>> vpxord zmm12, zmm12, zmm12
>> vpxord zmm13, zmm13, zmm13
>> .p2align 4, 0x90
>> .LBB0_3:                                # %vector.body
>>                                         #   Parent Loop BB0_1 Depth=1
>>                                         #     Parent Loop BB0_2 Depth=2
>>                                         # =>    This Inner Loop
Header:
>> Depth=3
>>                                         # this bb will run 15 times
>> vmovq rax, xmm9
>> imul r10, r9, 4000
>> lea rbx, [rdi + r10]
>> *vpmuludq zmm14, zmm10, zmm2       ; this is BB for vector here we have
>> to do gather for B due to arbitrary addresses so here
>> zmm10=[8,9,10,11,12,13,14,15]. it means zmm10 contains 8 values present
in
>> these indexes? and zmm2=[4000, 4000,.....4000]. these are the indexes
for B
>> we need to multiple indexes with stride=4000. i know here these indexes
are
>> 64 bit but the values stored in these locations are 32 bits then  the
load
>> using zmm10 index will give 8 elements of 32 bits present in these
>> locations, so do the registers contain 8 elements of 32 bits present at
>> specified indexes?? so after multiplication we get indexes for higher 8
>> elements of B i.e [3200,3600,40000,.......54000].*
>>
>> * vpsrlq zmm15, zmm10, 32              ; i dont understand the need for
>> this step, please explain the purpose of all these steps. here vpsrlq
will
>> shift right zmm10 values by 256 bits (32*8)....zmmm10
initially=**[8,9,10,11,12,13,14,15].
>> it will now become [0,0,0,0,8,9,10,11]...Am I correct? Please explain
me
>> the purpose of this step.*
>> * vpmuludq zmm15, zmm15, zmm2  ;    similarly **dont understand the
need
>> for this step.*
>> * vpsllq zmm15, zmm15, 32    ; **dont understand the need for this
step*
>> * vpaddq zmm14, zmm14, zmm3  ; *
>> * vpaddq zmm14, zmm15, zmm14 ; **dont understand the need for this
step*
>>
>
> vpsrlq zmm15, zmm10, 32 shifts every 64-bit element in zmm10 right by 32
> bits. I believe this is effectively taking every odd-numbered 32-bit element
> and moving it to the next lowest even-numbered 32-bit element.
>
> vpmuludq multiplies all even-numbered 32-bit elements and creates 64-bit
> results.
>
> The combination of the shifts, vpmuludq, and vpaddq is to multiply 64-bit
> elements and create a 64-bit element result. We don't have an instruction
> for this, so we have to multiply the low 32 bits of each element and the
> high 32 bits of each element separately and add the results together. Looks
> like we determined that the high 32 bits of one of the inputs are all zeros,
> so we skipped one of the multiplies and adds that would normally be required
> for this operation.
>
>
>
>> * vpbroadcastq zmm15, r11 ; **r11 changes when loop variable j changes
>> whats the need of this step?*
>> * vpsllq zmm15, zmm15, 2   ; **dont understand the need for this step*
>> * vpaddq zmm14, zmm14, zmm15 ; **dont understand the need for this
step*
>> * vpmuludq zmm16, zmm9, zmm2 ; **here same as before the lower 8
>> elements of B indexes are computed as Zmm16=[0,4000,8000,.......28000]*
>> * vpsrlq zmm17, zmm9, 32   **; **dont understand the need for this
step*
>> * vpmuludq zmm17, zmm17, zmm2  **; **dont understand the need for this
>> step*
>> * vpsllq zmm17, zmm17, 32  **; **dont understand the need for this
step*
>> * vpaddq zmm16, zmm16, zmm3  *
>> * vpaddq zmm16, zmm17, zmm16  **; **dont understand the need for this
>> step*
>> * vpaddq zmm15, zmm16, zmm15  **; **dont understand the need for this
>> step*
>> * vpaddq zmm16, zmm15, zmm4*
>> * vpaddq zmm17, zmm14, zmm4*
>> * vpaddq zmm18, zmm15, zmm5*
>> * vpaddq zmm19, zmm14, zmm5*
>> * vpaddq zmm20, zmm15, zmm6*
>> * vpaddq zmm21, zmm14, zmm6*
>> * kmovw k2, k1  **; **dont understand the need for this step*
>>
>
> The gather instruction requires a mask of which elements to read. When the
> gather completes, if there are no faults, it will have written the mask
> register to 0. So the mask needs to be reloaded before each gather.
>
>
>> * vpgatherqd ymm0 {k2}, zmmword ptr [zmm14] ; since zmm14 contains 8
>> indexes ( or values at these 8 indexes???) so it will load 8 elements
not
>> 16. here it should be zmm14**=[3200,3600,40000,.......54000]. but by
the
>> above computation these indexes are changes??*
>> * kxnorw k2, k0, k0  **; **dont understand the need for this step*
>>
> * vpgatherqd ymm14 {k2}, zmmword ptr [zmm15]   **; **here again issues
>> with index zmm15. it should be **[0,4000,8000,.......28000] but its
>> different due to above computation.*
>> * vinserti64x4 zmm0, zmm14, ymm0, 1*
>> * kmovw k2, k1*
>> * vpgatherqd ymm14 {k2}, zmmword ptr [zmm17]*
>> * kxnorw k2, k0, k0*
>> * vpgatherqd ymm15 {k2}, zmmword ptr [zmm16]*
>> * vinserti64x4 zmm14, zmm15, ymm14, 1*
>> * kmovw k2, k1*
>> * vpgatherqd ymm15 {k2}, zmmword ptr [zmm19]*
>> * kxnorw k2, k0, k0*
>> * vpgatherqd ymm16 {k2}, zmmword ptr [zmm18]*
>> * vinserti64x4 zmm15, zmm16, ymm15, 1*
>> * kmovw k2, k1*
>> * vpgatherqd ymm1 {k2}, zmmword ptr [zmm21]*
>> * kxnorw k2, k0, k0*
>> * vpgatherqd ymm16 {k2}, zmmword ptr [zmm20]*
>> * vinserti64x4 zmm1, zmm16, ymm1, 1*
>> * vpmulld zmm0, zmm0, zmmword ptr [rbx + 4*rax]*
>> vpmulld zmm14, zmm14, zmmword ptr [rbx + 4*rax + 64]
>> vpmulld zmm15, zmm15, zmmword ptr [rbx + 4*rax + 128]
>> vpmulld zmm1, zmm1, zmmword ptr [rbx + 4*rax + 192]
>> vpaddd zmm8, zmm0, zmm8
>> vpaddd zmm11, zmm14, zmm11
>> vpaddd zmm12, zmm15, zmm12
>> vpaddd zmm13, zmm1, zmm13
>> vpaddq zmm9, zmm9, zmm7        #zmm7=64
>> vpaddq zmm10, zmm10, zmm7
>> add rcx, -64     #decrement counter by 64
>> jne .LBB0_3       # if rcx not equal to zero goto .lbbo_3
>> # BB#4:                                 # %middle.block
>>                                         #   in Loop: Header=BB0_2
Depth=2
>> vpaddd zmm0, zmm11, zmm8
>> vpaddd zmm0, zmm12, zmm0
>> vpaddd zmm0, zmm13, zmm0
>> *vshufi64x2 zmm1, zmm0, zmm0, 14 # zmm1 = zmm0[4,5,6,7,0,1,0,1]   ;
>> please explain how shuffle instructions work here. i know of llvm ir
>> shuffle, but these assembly ones are difficult for me to understand*
>>
>
> You have to look at the size of the register being mentioned and the
> number of elements in brackets. In this case the register is 512 bits and
> the number of elements is 8. 512/8 is 64, so it's a shuffle of a v8i64
> vector. Then we read the element numbers from left to right, just like the
> shuffle IR instruction.
>
> So element 0 of zmm1 gets the value of element 4 of zmm0. Element 1 of
> zmm1 gets the value of element 5 of zmm0, etc.
>
>
>> * vpaddd zmm0, zmm0, zmm1*
>> * vshufi64x2 zmm1, zmm0, zmm0, 1 # zmm1 = zmm0[2,3,0,1,0,1,0,1]*
>> * vpaddd zmm0, zmm0, zmm1*
>> * vpshufd zmm1, zmm0, 238         # zmm1 =
>> zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]*
>> * vpaddd zmm0, zmm0, zmm1*
>> * vpshufd zmm1, zmm0, 229         # zmm1 =
>> zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]*
>> vpaddd zmm0, zmm0, zmm1
>> vmovd ebx, xmm0
>> mov rax, r8
>> xor r14d, r14d
>> .p2align 4, 0x90
>> .LBB0_5:                                #   Parent Loop BB0_1 Depth=1
>>                                         #     Parent Loop BB0_2 Depth=2
>>                                         # =>    This Inner Loop
Header:
>> Depth=3
>> lea r15, [rsi + r14]
>> mov r12d, dword ptr [r15 + 4*r11 - 16000]
>> imul r12d, dword ptr [rax - 16]
>> mov ecx, dword ptr [r15 + 4*r11 - 12000]
>> imul ecx, dword ptr [rax - 12]
>> mov ebp, dword ptr [r15 + 4*r11 - 8000]
>> imul ebp, dword ptr [rax - 8]
>> add r12d, ebx
>> add ecx, r12d
>> add ebp, ecx
>> mov ecx, dword ptr [r15 + 4*r11 - 4000]
>> imul ecx, dword ptr [rax - 4]
>> add ecx, ebp
>> mov ebx, dword ptr [r15 + 4*r11]
>> imul ebx, dword ptr [rax]
>> add ebx, ecx
>> add r14, 20000
>> add rax, 20
>> cmp r14, 160000
>> jne .LBB0_5
>> # BB#6:                                 # %.loopexit
>>                                         #   in Loop: Header=BB0_2
Depth=2
>> add r10, rdx                #rdx is c[][]
>> mov dword ptr [r10 + 4*r11], ebx
>> inc r11
>> cmp r11, 1000
>> jne .LBB0_2
>> # BB#7:                                 #   in Loop: Header=BB0_1
Depth=1
>> inc r9
>> add r8, 4000
>> cmp r9, 1000
>> jne .LBB0_1
>> # BB#8:
>> pop rbx
>> pop r12
>> pop r14
>> pop r15
>> pop rbp
>> ret
>>
>>
>> Looking forward to your reply
>>
>> Thank You
>>
>>
>>
>> _______________________________________________
>> LLVM Developers mailing list
>> llvm-dev at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>
>>
>-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20170701/954b2788/attachment.html>

llvm dev - Jun 2017 - KNL Assembly Code for Matrix Multiplication

[llvm-dev] KNL Assembly Code for Matrix Multiplication

[llvm-dev] KNL Assembly Code for Matrix Multiplication

[llvm-dev] Fwd: KNL Assembly Code for Matrix Multiplication