hameeza ahmed via llvm-dev
2017-Jun-30 23:11 UTC
[llvm-dev] KNL Assembly Code for Matrix Multiplication
Further, I need to understand it with putting actual values since it is very confusing... vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] ; i am supposing this will move 64 bit values from mentioned indexes though i still believe each value is required to be 32 bit. Now the indexes are [8, 9, 10, 11, 12, 13, 14, 15]. now when these indexes are added with rip it points to the value actually present at these locations so zmm22 will contain values not indexes. suppose [8]={1}, [9]={5}, [10]={4}...... so zmm22 will become zmm22={1, 5, 4, 3, 8, 7, 6, 2}......these are those 64 bit values loaded from memory indexes. vpbroadcastq zmm2, qword ptr [rip + .LCPI0_2]; here .LCPI0_2=4000 means broadcast value at this index for eg this location contains 2 so zmm2={2,2,2,2.....2}. vpmuludq zmm14, zmm10, zmm2 ; this step is value multiplication not index, there seems no point in multiplying these values here since we havent used A and B yet??? Please clarify my understanding about these initial steps; if these get cleared then only i will be able to move forward..... Thank You On Sat, Jul 1, 2017 at 3:47 AM, hameeza ahmed <hahmed2305 at gmail.com> wrote:> > ---------- Forwarded message ---------- > From: hameeza ahmed <hahmed2305 at gmail.com> > Date: Sat, Jul 1, 2017 at 3:46 AM > Subject: Re: [llvm-dev] KNL Assembly Code for Matrix Multiplication > To: Craig Topper <craig.topper at gmail.com> > > > Thank You. > > in this step; > vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 > [8,9,10,11,12,13,14,15] > the indexes are 64 bit but the element stored at these position is 32 bit > since we are dealing with integers and ir also shows this. > here we are loading 32 bit value from those 64 bit indexes which means > zmm22 will hold values 32 bit from these 64 bit position so there is > capacity of 16 32 bit elements then why all this?? 
> > this is mentioned in IR as > > %5 = getelementptr inbounds [1000 x i32], [1000 x i32]* %0, i64 > %indvars.iv34, i64 %4 > %6 = bitcast i32* %5 to <16 x i32>* > %wide.load = load <16 x i32>, <16 x i32>* %6, align 4, !tbaa !1 > > > here indvars are 64 bit values but the values loaded from these indexes > (step 3) is 32 bit??? > > Please correct me. > > > > > > On Fri, Jun 30, 2017 at 8:59 PM, Craig Topper <craig.topper at gmail.com> > wrote: > >> Some comments inline, I'll need to look more later. >> >> ~Craig >> >> On Fri, Jun 30, 2017 at 5:28 AM, hameeza ahmed via llvm-dev < >> llvm-dev at lists.llvm.org> wrote: >> >>> Hello, I want some help in understanding knl intel assembly of matrix >>> multiplication code. some of the things are not clear; >>> >>> here .c file: >>> >>> #include <stdio.h> >>> #define N 1000 >>> >>> // This function multiplies A[][] and B[][], and stores >>> // the result in C[][] >>> void multiply(int A[][N], int B[][N], int C[][N]) >>> { >>> int i, j, k, r; >>> for (i = 0; i < N; i++) >>> { >>> for (j = 0; j < N; j++) >>> { >>> r = 0; >>> for (k = 0; k < N; k++) { >>> r += A[i][k]*B[k][j];} >>> C[i][j] = r; >>> >>> } >>> >>> } >>> } >>> >>> here .s file: * the code that i want to ask is in red color.* >>> >>> .text >>> .intel_syntax noprefix >>> .file "matn_o3.ll" >>> .section .rodata,"a", at progbits >>> .p2align 6 >>> .LCPI0_0: >>> .quad 8 # 0x8 >>> .quad 9 # 0x9 >>> .quad 10 # 0xa >>> .quad 11 # 0xb >>> .quad 12 # 0xc >>> .quad 13 # 0xd >>> .quad 14 # 0xe >>> .quad 15 # 0xf >>> .LCPI0_1: >>> .quad 0 # 0x0 >>> .quad 1 # 0x1 >>> .quad 2 # 0x2 >>> .quad 3 # 0x3 >>> .quad 4 # 0x4 >>> .quad 5 # 0x5 >>> .quad 6 # 0x6 >>> .quad 7 # 0x7 >>> .section .rodata.cst8,"aM", at progbits,8 >>> .p2align 3 >>> .LCPI0_2: >>> .quad 4000 # 0xfa0 >>> .LCPI0_3: >>> .quad 64000 # 0xfa00 >>> .LCPI0_4: >>> .quad 128000 # 0x1f400 >>> .LCPI0_5: >>> .quad 192000 # 0x2ee00 >>> .LCPI0_6: >>> .quad 64 # 0x40 >>> .text >>> .globl multiply >>> .p2align 4, 0x90 >>> .type 
multiply, at function >>> multiply: # @multiply >>> .cfi_startproc >>> # BB#0: >>> push rbp >>> .Lcfi0: >>> .cfi_def_cfa_offset 16 >>> push r15 >>> .Lcfi1: >>> .cfi_def_cfa_offset 24 >>> push r14 >>> .Lcfi2: >>> .cfi_def_cfa_offset 32 >>> push r12 >>> .Lcfi3: >>> .cfi_def_cfa_offset 40 >>> push rbx >>> .Lcfi4: >>> .cfi_def_cfa_offset 48 >>> .Lcfi5: >>> .cfi_offset rbx, -48 >>> .Lcfi6: >>> .cfi_offset r12, -40 >>> .Lcfi7: >>> .cfi_offset r14, -32 >>> .Lcfi8: >>> .cfi_offset r15, -24 >>> .Lcfi9: >>> .cfi_offset rbp, -16 >>> lea r8, [rdi + 3856] >>> xor r9d, r9d >>> vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 >>> [8,9,10,11,12,13,14,15] >>> vmovdqa64 zmm23, zmmword ptr [rip + .LCPI0_1] # zmm23 >>> [0,1,2,3,4,5,6,7] >>> vpbroadcastq zmm2, qword ptr [rip + .LCPI0_2] >>> vpbroadcastq zmm3, rsi >>> add rsi, 3856000 >>> vpbroadcastq zmm4, qword ptr [rip + .LCPI0_3] >>> vpbroadcastq zmm5, qword ptr [rip + .LCPI0_4] >>> vpbroadcastq zmm6, qword ptr [rip + .LCPI0_5] >>> kxnorw k1, k0, k0 >>> kshiftrw k1, k1, 8 >>> vpbroadcastq zmm7, qword ptr [rip + .LCPI0_6] >>> .p2align 4, 0x90 >>> .LBB0_1: # %.preheader26 >>> # =>This Loop Header: Depth=1 >>> # Child Loop BB0_2 Depth 2 >>> # Child Loop BB0_3 Depth 3 >>> # Child Loop BB0_5 Depth 3 >>> xor r11d, r11d >>> .p2align 4, 0x90 >>> .LBB0_2: # %.preheader >>> # Parent Loop BB0_1 Depth=1 >>> # => This Loop Header: Depth=2 >>> # Child Loop BB0_3 Depth 3 >>> # Child Loop BB0_5 Depth 3 >>> vpxord zmm8, zmm8, zmm8 >>> mov ecx, 960 >>> vmovdqa64 zmm9, zmm23 >>> vmovdqa64 zmm10, zmm22 >>> vpxord zmm11, zmm11, zmm11 >>> vpxord zmm12, zmm12, zmm12 >>> vpxord zmm13, zmm13, zmm13 >>> .p2align 4, 0x90 >>> .LBB0_3: # %vector.body >>> # Parent Loop BB0_1 Depth=1 >>> # Parent Loop BB0_2 Depth=2 >>> # => This Inner Loop Header: >>> Depth=3 >>> # this bb will run 15 times >>> vmovq rax, xmm9 >>> imul r10, r9, 4000 >>> lea rbx, [rdi + r10] >>> *vpmuludq zmm14, zmm10, zmm2 ; this is BB for vector here we have >>> to do gather for B due to 
arbitrary addresses so here >>> zmm10=[8,9,10,11,12,13,14,15]. it means zmm10 contains 8 values present in >>> these indexes? and zmm2=[4000, 4000,.....4000]. these are the indexes for B >>> we need to multiple indexes with stride=4000. i know here these indexes are >>> 64 bit but the values stored in these locations are 32 bits then the load >>> using zmm10 index will give 8 elements of 32 bits present in these >>> locations, so do the registers contain 8 elements of 32 bits present at >>> specified indexes?? so after multiplication we get indexes for higher 8 >>> elements of B i.e [3200,3600,40000,.......54000].* >>> >>> * vpsrlq zmm15, zmm10, 32 ; i dont understand the need for >>> this step, please explain the purpose of all these steps. here vpsrlq will >>> shift right zmm10 values by 256 bits (32*8)....zmmm10 initially=**[8,9,10,11,12,13,14,15]. >>> it will now become [0,0,0,0,8,9,10,11]...Am I correct? Please explain me >>> the purpose of this step.* >>> * vpmuludq zmm15, zmm15, zmm2 ; similarly **dont understand the >>> need for this step.* >>> * vpsllq zmm15, zmm15, 32 ; **dont understand the need for this step* >>> * vpaddq zmm14, zmm14, zmm3 ; * >>> * vpaddq zmm14, zmm15, zmm14 ; **dont understand the need for this step* >>> >> >> vpsrlq zmm15, zmm10, 32 shifts every 64-bit element in zmm10 right by 32 >> bits. I believe this effectively taking every odd numbered 32-bit element >> and moving them to the next lowest even numbered 32-bit element. >> >> vmuludq multiplies all even numbered 32-bit elements and creates 64-bit >> results. >> >> The combination of the shifts, vpmuludq, and vpaddq is to multiply 64-bit >> elements and create a 64-bit elements result. We don't have an instruction >> for this so we have to multiply the low 32-bits of each element and the >> high 32-bits of each element separately and add the results together. 
Looks >> like we determined that the high 32-bits of one of the inputs is all zeros >> so we skipped 1 of the multiplies and adds that would normally be required >> for this operation. >> >> >> >>> * vpbroadcastq zmm15, r11 ; **r11 changes when loop variable j changes >>> whats the need of this step?* >>> * vpsllq zmm15, zmm15, 2 ; **dont understand the need for this step* >>> * vpaddq zmm14, zmm14, zmm15 ; **dont understand the need for this step* >>> * vpmuludq zmm16, zmm9, zmm2 ; **here same as before the lower 8 >>> elements of B indexes are computed as Zmm16=[0,4000,8000,.......28000]* >>> * vpsrlq zmm17, zmm9, 32 **; **dont understand the need for this step* >>> * vpmuludq zmm17, zmm17, zmm2 **; **dont understand the need for this >>> step* >>> * vpsllq zmm17, zmm17, 32 **; **dont understand the need for this step* >>> * vpaddq zmm16, zmm16, zmm3 * >>> * vpaddq zmm16, zmm17, zmm16 **; **dont understand the need for this >>> step* >>> * vpaddq zmm15, zmm16, zmm15 **; **dont understand the need for this >>> step* >>> * vpaddq zmm16, zmm15, zmm4* >>> * vpaddq zmm17, zmm14, zmm4* >>> * vpaddq zmm18, zmm15, zmm5* >>> * vpaddq zmm19, zmm14, zmm5* >>> * vpaddq zmm20, zmm15, zmm6* >>> * vpaddq zmm21, zmm14, zmm6* >>> * kmovw k2, k1 **; **dont understand the need for this step* >>> >> >> The gather instruction requires a mask of which elements to read. When >> the gather completes, if there are no faults it will have written the mask >> register to 0. So it needs to reloaded for each gather. >> >> >>> * vpgatherqd ymm0 {k2}, zmmword ptr [zmm14] ; since zmm14 contains 8 >>> indexes ( or values at these 8 indexes???) so it will load 8 elements not >>> 16. here it should be zmm14**=[3200,3600,40000,.......54000]. but by >>> the above computation these indexes are changes??* >>> * kxnorw k2, k0, k0 **; **dont understand the need for this step* >>> >> * vpgatherqd ymm14 {k2}, zmmword ptr [zmm15] **; **here again issues >>> with index zmm15. 
it should be **[0,4000,8000,.......28000] but its >>> different due to above computation.* >>> * vinserti64x4 zmm0, zmm14, ymm0, 1* >>> * kmovw k2, k1* >>> * vpgatherqd ymm14 {k2}, zmmword ptr [zmm17]* >>> * kxnorw k2, k0, k0* >>> * vpgatherqd ymm15 {k2}, zmmword ptr [zmm16]* >>> * vinserti64x4 zmm14, zmm15, ymm14, 1* >>> * kmovw k2, k1* >>> * vpgatherqd ymm15 {k2}, zmmword ptr [zmm19]* >>> * kxnorw k2, k0, k0* >>> * vpgatherqd ymm16 {k2}, zmmword ptr [zmm18]* >>> * vinserti64x4 zmm15, zmm16, ymm15, 1* >>> * kmovw k2, k1* >>> * vpgatherqd ymm1 {k2}, zmmword ptr [zmm21]* >>> * kxnorw k2, k0, k0* >>> * vpgatherqd ymm16 {k2}, zmmword ptr [zmm20]* >>> * vinserti64x4 zmm1, zmm16, ymm1, 1* >>> * vpmulld zmm0, zmm0, zmmword ptr [rbx + 4*rax]* >>> vpmulld zmm14, zmm14, zmmword ptr [rbx + 4*rax + 64] >>> vpmulld zmm15, zmm15, zmmword ptr [rbx + 4*rax + 128] >>> vpmulld zmm1, zmm1, zmmword ptr [rbx + 4*rax + 192] >>> vpaddd zmm8, zmm0, zmm8 >>> vpaddd zmm11, zmm14, zmm11 >>> vpaddd zmm12, zmm15, zmm12 >>> vpaddd zmm13, zmm1, zmm13 >>> vpaddq zmm9, zmm9, zmm7 #zmm7=64 >>> vpaddq zmm10, zmm10, zmm7 >>> add rcx, -64 #decrement counter by 64 >>> jne .LBB0_3 # if rcx not equal to zero goto .lbbo_3 >>> # BB#4: # %middle.block >>> # in Loop: Header=BB0_2 Depth=2 >>> vpaddd zmm0, zmm11, zmm8 >>> vpaddd zmm0, zmm12, zmm0 >>> vpaddd zmm0, zmm13, zmm0 >>> *vshufi64x2 zmm1, zmm0, zmm0, 14 # zmm1 = zmm0[4,5,6,7,0,1,0,1] ; >>> please explain how shuffle instructions work here. i know of llvm ir >>> shuffle, but these assembly ones are difficult for me to understand* >>> >> >> You have to look at the size of the register being mentioned and the >> number of elements in brackets. In this case the regsiter is 512-bits and >> the number of elements is 8. 512/8 is 64. So its a shuffle of a v8i64 >> vector. Then we read the element numbers from left to write just like the >> shuffle IR instruction. >> >> So element 0 of zmm1 gets the value of element 4 of zmm0. 
Element 1 of >> zmm1 gets the value of element 5 of zmm0, etc. >> >> >>> * vpaddd zmm0, zmm0, zmm1* >>> * vshufi64x2 zmm1, zmm0, zmm0, 1 # zmm1 = zmm0[2,3,0,1,0,1,0,1]* >>> * vpaddd zmm0, zmm0, zmm1* >>> * vpshufd zmm1, zmm0, 238 # zmm1 >>> zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]* >>> * vpaddd zmm0, zmm0, zmm1* >>> * vpshufd zmm1, zmm0, 229 # zmm1 >>> zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]* >>> vpaddd zmm0, zmm0, zmm1 >>> vmovd ebx, xmm0 >>> mov rax, r8 >>> xor r14d, r14d >>> .p2align 4, 0x90 >>> .LBB0_5: # Parent Loop BB0_1 Depth=1 >>> # Parent Loop BB0_2 Depth=2 >>> # => This Inner Loop Header: >>> Depth=3 >>> lea r15, [rsi + r14] >>> mov r12d, dword ptr [r15 + 4*r11 - 16000] >>> imul r12d, dword ptr [rax - 16] >>> mov ecx, dword ptr [r15 + 4*r11 - 12000] >>> imul ecx, dword ptr [rax - 12] >>> mov ebp, dword ptr [r15 + 4*r11 - 8000] >>> imul ebp, dword ptr [rax - 8] >>> add r12d, ebx >>> add ecx, r12d >>> add ebp, ecx >>> mov ecx, dword ptr [r15 + 4*r11 - 4000] >>> imul ecx, dword ptr [rax - 4] >>> add ecx, ebp >>> mov ebx, dword ptr [r15 + 4*r11] >>> imul ebx, dword ptr [rax] >>> add ebx, ecx >>> add r14, 20000 >>> add rax, 20 >>> cmp r14, 160000 >>> jne .LBB0_5 >>> # BB#6: # %.loopexit >>> # in Loop: Header=BB0_2 Depth=2 >>> add r10, rdx #rdx is c[][] >>> mov dword ptr [r10 + 4*r11], ebx >>> inc r11 >>> cmp r11, 1000 >>> jne .LBB0_2 >>> # BB#7: # in Loop: Header=BB0_1 Depth=1 >>> inc r9 >>> add r8, 4000 >>> cmp r9, 1000 >>> jne .LBB0_1 >>> # BB#8: >>> pop rbx >>> pop r12 >>> pop r14 >>> pop r15 >>> pop rbp >>> ret >>> >>> >>> Looking forward to your reply >>> >>> Thank You >>> >>> >>> >>> _______________________________________________ >>> LLVM Developers mailing list >>> llvm-dev at lists.llvm.org >>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >>> >>> >> > >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170701/79f39bee/attachment-0001.html>
Craig Topper via llvm-dev
2017-Jun-30 23:45 UTC
[llvm-dev] KNL Assembly Code for Matrix Multiplication
If you see a comment after an instruction that contains LCP in the address, the comment indicates what static value we are loading from the constant pool. So after this instruction bits 63:0 will contain the value 8. Bits 127:64 will contain the value 9. Bits 191:128 will contain the value 10. And so on. The CP in LCP stands for Constant Pool. vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 = [8,9,10,11,12,13,14,15] ~Craig On Fri, Jun 30, 2017 at 4:11 PM, hameeza ahmed <hahmed2305 at gmail.com> wrote:> Further, I need to understand it with putting actual values since it is > very confusing... > > vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] ; i am supposing this will > move 64 bit values from mentioned indexes though i still believe each value > is required to be 32 bit. Now the indexes are [8, 9, 10, 11, 12, 13, 14, > 15]. now when these indexes are added with rip it points to the value > actually present at these locations so zmm22 will contain values not > indexes. suppose [8]={1}, [9]={5}, [10]={4}...... so zmm22 will become > zmm22={1, 5, 4, 3, 8, 7, 6, 2}......these are those 64 bit values loaded > from memory indexes. > > vpbroadcastq zmm2, qword ptr [rip + .LCPI0_2]; here .LCPI0_2=4000 means > broadcast value at this index for eg this location contains 2 so > zmm2={2,2,2,2.....2}. > > vpmuludq zmm14, zmm10, zmm2 ; this step is value multiplication not > index, there seems no point in multiplying these values here since we > havent used A and B yet??? > > > > Please clarify my understanding about these initial steps; if these get > cleared then only i will be able to move forward..... 
> > > Thank You > > > > > > > > > > > > > > On Sat, Jul 1, 2017 at 3:47 AM, hameeza ahmed <hahmed2305 at gmail.com> > wrote: > >> >> ---------- Forwarded message ---------- >> From: hameeza ahmed <hahmed2305 at gmail.com> >> Date: Sat, Jul 1, 2017 at 3:46 AM >> Subject: Re: [llvm-dev] KNL Assembly Code for Matrix Multiplication >> To: Craig Topper <craig.topper at gmail.com> >> >> >> Thank You. >> >> in this step; >> vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 >> [8,9,10,11,12,13,14,15] >> the indexes are 64 bit but the element stored at these position is 32 bit >> since we are dealing with integers and ir also shows this. >> here we are loading 32 bit value from those 64 bit indexes which means >> zmm22 will hold values 32 bit from these 64 bit position so there is >> capacity of 16 32 bit elements then why all this?? >> >> this is mentioned in IR as >> >> %5 = getelementptr inbounds [1000 x i32], [1000 x i32]* %0, i64 >> %indvars.iv34, i64 %4 >> %6 = bitcast i32* %5 to <16 x i32>* >> %wide.load = load <16 x i32>, <16 x i32>* %6, align 4, !tbaa !1 >> >> >> here indvars are 64 bit values but the values loaded from these indexes >> (step 3) is 32 bit??? >> >> Please correct me. >> >> >> >> >> >> On Fri, Jun 30, 2017 at 8:59 PM, Craig Topper <craig.topper at gmail.com> >> wrote: >> >>> Some comments inline, I'll need to look more later. >>> >>> ~Craig >>> >>> On Fri, Jun 30, 2017 at 5:28 AM, hameeza ahmed via llvm-dev < >>> llvm-dev at lists.llvm.org> wrote: >>> >>>> Hello, I want some help in understanding knl intel assembly of matrix >>>> multiplication code. 
some of the things are not clear; >>>> >>>> here .c file: >>>> >>>> #include <stdio.h> >>>> #define N 1000 >>>> >>>> // This function multiplies A[][] and B[][], and stores >>>> // the result in C[][] >>>> void multiply(int A[][N], int B[][N], int C[][N]) >>>> { >>>> int i, j, k, r; >>>> for (i = 0; i < N; i++) >>>> { >>>> for (j = 0; j < N; j++) >>>> { >>>> r = 0; >>>> for (k = 0; k < N; k++) { >>>> r += A[i][k]*B[k][j];} >>>> C[i][j] = r; >>>> >>>> } >>>> >>>> } >>>> } >>>> >>>> here .s file: * the code that i want to ask is in red color.* >>>> >>>> .text >>>> .intel_syntax noprefix >>>> .file "matn_o3.ll" >>>> .section .rodata,"a", at progbits >>>> .p2align 6 >>>> .LCPI0_0: >>>> .quad 8 # 0x8 >>>> .quad 9 # 0x9 >>>> .quad 10 # 0xa >>>> .quad 11 # 0xb >>>> .quad 12 # 0xc >>>> .quad 13 # 0xd >>>> .quad 14 # 0xe >>>> .quad 15 # 0xf >>>> .LCPI0_1: >>>> .quad 0 # 0x0 >>>> .quad 1 # 0x1 >>>> .quad 2 # 0x2 >>>> .quad 3 # 0x3 >>>> .quad 4 # 0x4 >>>> .quad 5 # 0x5 >>>> .quad 6 # 0x6 >>>> .quad 7 # 0x7 >>>> .section .rodata.cst8,"aM", at progbits,8 >>>> .p2align 3 >>>> .LCPI0_2: >>>> .quad 4000 # 0xfa0 >>>> .LCPI0_3: >>>> .quad 64000 # 0xfa00 >>>> .LCPI0_4: >>>> .quad 128000 # 0x1f400 >>>> .LCPI0_5: >>>> .quad 192000 # 0x2ee00 >>>> .LCPI0_6: >>>> .quad 64 # 0x40 >>>> .text >>>> .globl multiply >>>> .p2align 4, 0x90 >>>> .type multiply, at function >>>> multiply: # @multiply >>>> .cfi_startproc >>>> # BB#0: >>>> push rbp >>>> .Lcfi0: >>>> .cfi_def_cfa_offset 16 >>>> push r15 >>>> .Lcfi1: >>>> .cfi_def_cfa_offset 24 >>>> push r14 >>>> .Lcfi2: >>>> .cfi_def_cfa_offset 32 >>>> push r12 >>>> .Lcfi3: >>>> .cfi_def_cfa_offset 40 >>>> push rbx >>>> .Lcfi4: >>>> .cfi_def_cfa_offset 48 >>>> .Lcfi5: >>>> .cfi_offset rbx, -48 >>>> .Lcfi6: >>>> .cfi_offset r12, -40 >>>> .Lcfi7: >>>> .cfi_offset r14, -32 >>>> .Lcfi8: >>>> .cfi_offset r15, -24 >>>> .Lcfi9: >>>> .cfi_offset rbp, -16 >>>> lea r8, [rdi + 3856] >>>> xor r9d, r9d >>>> vmovdqa64 zmm22, zmmword ptr [rip + .LCPI0_0] # zmm22 
>>>> [8,9,10,11,12,13,14,15] >>>> vmovdqa64 zmm23, zmmword ptr [rip + .LCPI0_1] # zmm23 >>>> [0,1,2,3,4,5,6,7] >>>> vpbroadcastq zmm2, qword ptr [rip + .LCPI0_2] >>>> vpbroadcastq zmm3, rsi >>>> add rsi, 3856000 >>>> vpbroadcastq zmm4, qword ptr [rip + .LCPI0_3] >>>> vpbroadcastq zmm5, qword ptr [rip + .LCPI0_4] >>>> vpbroadcastq zmm6, qword ptr [rip + .LCPI0_5] >>>> kxnorw k1, k0, k0 >>>> kshiftrw k1, k1, 8 >>>> vpbroadcastq zmm7, qword ptr [rip + .LCPI0_6] >>>> .p2align 4, 0x90 >>>> .LBB0_1: # %.preheader26 >>>> # =>This Loop Header: Depth=1 >>>> # Child Loop BB0_2 Depth 2 >>>> # Child Loop BB0_3 Depth 3 >>>> # Child Loop BB0_5 Depth 3 >>>> xor r11d, r11d >>>> .p2align 4, 0x90 >>>> .LBB0_2: # %.preheader >>>> # Parent Loop BB0_1 Depth=1 >>>> # => This Loop Header: Depth=2 >>>> # Child Loop BB0_3 Depth 3 >>>> # Child Loop BB0_5 Depth 3 >>>> vpxord zmm8, zmm8, zmm8 >>>> mov ecx, 960 >>>> vmovdqa64 zmm9, zmm23 >>>> vmovdqa64 zmm10, zmm22 >>>> vpxord zmm11, zmm11, zmm11 >>>> vpxord zmm12, zmm12, zmm12 >>>> vpxord zmm13, zmm13, zmm13 >>>> .p2align 4, 0x90 >>>> .LBB0_3: # %vector.body >>>> # Parent Loop BB0_1 Depth=1 >>>> # Parent Loop BB0_2 Depth=2 >>>> # => This Inner Loop Header: >>>> Depth=3 >>>> # this bb will run 15 times >>>> vmovq rax, xmm9 >>>> imul r10, r9, 4000 >>>> lea rbx, [rdi + r10] >>>> *vpmuludq zmm14, zmm10, zmm2 ; this is BB for vector here we have >>>> to do gather for B due to arbitrary addresses so here >>>> zmm10=[8,9,10,11,12,13,14,15]. it means zmm10 contains 8 values present in >>>> these indexes? and zmm2=[4000, 4000,.....4000]. these are the indexes for B >>>> we need to multiple indexes with stride=4000. i know here these indexes are >>>> 64 bit but the values stored in these locations are 32 bits then the load >>>> using zmm10 index will give 8 elements of 32 bits present in these >>>> locations, so do the registers contain 8 elements of 32 bits present at >>>> specified indexes?? 
so after multiplication we get indexes for higher 8 >>>> elements of B i.e [3200,3600,40000,.......54000].* >>>> >>>> * vpsrlq zmm15, zmm10, 32 ; i dont understand the need for >>>> this step, please explain the purpose of all these steps. here vpsrlq will >>>> shift right zmm10 values by 256 bits (32*8)....zmmm10 initially=**[8,9,10,11,12,13,14,15]. >>>> it will now become [0,0,0,0,8,9,10,11]...Am I correct? Please explain me >>>> the purpose of this step.* >>>> * vpmuludq zmm15, zmm15, zmm2 ; similarly **dont understand the >>>> need for this step.* >>>> * vpsllq zmm15, zmm15, 32 ; **dont understand the need for this >>>> step* >>>> * vpaddq zmm14, zmm14, zmm3 ; * >>>> * vpaddq zmm14, zmm15, zmm14 ; **dont understand the need for this >>>> step* >>>> >>> >>> vpsrlq zmm15, zmm10, 32 shifts every 64-bit element in zmm10 right by 32 >>> bits. I believe this effectively taking every odd numbered 32-bit element >>> and moving them to the next lowest even numbered 32-bit element. >>> >>> vmuludq multiplies all even numbered 32-bit elements and creates 64-bit >>> results. >>> >>> The combination of the shifts, vpmuludq, and vpaddq is to multiply >>> 64-bit elements and create a 64-bit elements result. We don't have an >>> instruction for this so we have to multiply the low 32-bits of each element >>> and the high 32-bits of each element separately and add the results >>> together. Looks like we determined that the high 32-bits of one of the >>> inputs is all zeros so we skipped 1 of the multiplies and adds that would >>> normally be required for this operation. 
>>> >>> >>> >>>> * vpbroadcastq zmm15, r11 ; **r11 changes when loop variable j changes >>>> whats the need of this step?* >>>> * vpsllq zmm15, zmm15, 2 ; **dont understand the need for this step* >>>> * vpaddq zmm14, zmm14, zmm15 ; **dont understand the need for this >>>> step* >>>> * vpmuludq zmm16, zmm9, zmm2 ; **here same as before the lower 8 >>>> elements of B indexes are computed as Zmm16=[0,4000,8000,.......28000]* >>>> * vpsrlq zmm17, zmm9, 32 **; **dont understand the need for this >>>> step* >>>> * vpmuludq zmm17, zmm17, zmm2 **; **dont understand the need for this >>>> step* >>>> * vpsllq zmm17, zmm17, 32 **; **dont understand the need for this >>>> step* >>>> * vpaddq zmm16, zmm16, zmm3 * >>>> * vpaddq zmm16, zmm17, zmm16 **; **dont understand the need for this >>>> step* >>>> * vpaddq zmm15, zmm16, zmm15 **; **dont understand the need for this >>>> step* >>>> * vpaddq zmm16, zmm15, zmm4* >>>> * vpaddq zmm17, zmm14, zmm4* >>>> * vpaddq zmm18, zmm15, zmm5* >>>> * vpaddq zmm19, zmm14, zmm5* >>>> * vpaddq zmm20, zmm15, zmm6* >>>> * vpaddq zmm21, zmm14, zmm6* >>>> * kmovw k2, k1 **; **dont understand the need for this step* >>>> >>> >>> The gather instruction requires a mask of which elements to read. When >>> the gather completes, if there are no faults it will have written the mask >>> register to 0. So it needs to reloaded for each gather. >>> >>> >>>> * vpgatherqd ymm0 {k2}, zmmword ptr [zmm14] ; since zmm14 contains 8 >>>> indexes ( or values at these 8 indexes???) so it will load 8 elements not >>>> 16. here it should be zmm14**=[3200,3600,40000,.......54000]. but by >>>> the above computation these indexes are changes??* >>>> * kxnorw k2, k0, k0 **; **dont understand the need for this step* >>>> >>> * vpgatherqd ymm14 {k2}, zmmword ptr [zmm15] **; **here again issues >>>> with index zmm15. 
it should be **[0,4000,8000,.......28000] but its >>>> different due to above computation.* >>>> * vinserti64x4 zmm0, zmm14, ymm0, 1* >>>> * kmovw k2, k1* >>>> * vpgatherqd ymm14 {k2}, zmmword ptr [zmm17]* >>>> * kxnorw k2, k0, k0* >>>> * vpgatherqd ymm15 {k2}, zmmword ptr [zmm16]* >>>> * vinserti64x4 zmm14, zmm15, ymm14, 1* >>>> * kmovw k2, k1* >>>> * vpgatherqd ymm15 {k2}, zmmword ptr [zmm19]* >>>> * kxnorw k2, k0, k0* >>>> * vpgatherqd ymm16 {k2}, zmmword ptr [zmm18]* >>>> * vinserti64x4 zmm15, zmm16, ymm15, 1* >>>> * kmovw k2, k1* >>>> * vpgatherqd ymm1 {k2}, zmmword ptr [zmm21]* >>>> * kxnorw k2, k0, k0* >>>> * vpgatherqd ymm16 {k2}, zmmword ptr [zmm20]* >>>> * vinserti64x4 zmm1, zmm16, ymm1, 1* >>>> * vpmulld zmm0, zmm0, zmmword ptr [rbx + 4*rax]* >>>> vpmulld zmm14, zmm14, zmmword ptr [rbx + 4*rax + 64] >>>> vpmulld zmm15, zmm15, zmmword ptr [rbx + 4*rax + 128] >>>> vpmulld zmm1, zmm1, zmmword ptr [rbx + 4*rax + 192] >>>> vpaddd zmm8, zmm0, zmm8 >>>> vpaddd zmm11, zmm14, zmm11 >>>> vpaddd zmm12, zmm15, zmm12 >>>> vpaddd zmm13, zmm1, zmm13 >>>> vpaddq zmm9, zmm9, zmm7 #zmm7=64 >>>> vpaddq zmm10, zmm10, zmm7 >>>> add rcx, -64 #decrement counter by 64 >>>> jne .LBB0_3 # if rcx not equal to zero goto .lbbo_3 >>>> # BB#4: # %middle.block >>>> # in Loop: Header=BB0_2 >>>> Depth=2 >>>> vpaddd zmm0, zmm11, zmm8 >>>> vpaddd zmm0, zmm12, zmm0 >>>> vpaddd zmm0, zmm13, zmm0 >>>> *vshufi64x2 zmm1, zmm0, zmm0, 14 # zmm1 = zmm0[4,5,6,7,0,1,0,1] ; >>>> please explain how shuffle instructions work here. i know of llvm ir >>>> shuffle, but these assembly ones are difficult for me to understand* >>>> >>> >>> You have to look at the size of the register being mentioned and the >>> number of elements in brackets. In this case the regsiter is 512-bits and >>> the number of elements is 8. 512/8 is 64. So its a shuffle of a v8i64 >>> vector. Then we read the element numbers from left to write just like the >>> shuffle IR instruction. 
>>> >>> So element 0 of zmm1 gets the value of element 4 of zmm0. Element 1 of >>> zmm1 gets the value of element 5 of zmm5, etc. >>> >>> >>>> * vpaddd zmm0, zmm0, zmm1* >>>> * vshufi64x2 zmm1, zmm0, zmm0, 1 # zmm1 = zmm0[2,3,0,1,0,1,0,1]* >>>> * vpaddd zmm0, zmm0, zmm1* >>>> * vpshufd zmm1, zmm0, 238 # zmm1 >>>> zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]* >>>> * vpaddd zmm0, zmm0, zmm1* >>>> * vpshufd zmm1, zmm0, 229 # zmm1 >>>> zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]* >>>> vpaddd zmm0, zmm0, zmm1 >>>> vmovd ebx, xmm0 >>>> mov rax, r8 >>>> xor r14d, r14d >>>> .p2align 4, 0x90 >>>> .LBB0_5: # Parent Loop BB0_1 Depth=1 >>>> # Parent Loop BB0_2 Depth=2 >>>> # => This Inner Loop Header: >>>> Depth=3 >>>> lea r15, [rsi + r14] >>>> mov r12d, dword ptr [r15 + 4*r11 - 16000] >>>> imul r12d, dword ptr [rax - 16] >>>> mov ecx, dword ptr [r15 + 4*r11 - 12000] >>>> imul ecx, dword ptr [rax - 12] >>>> mov ebp, dword ptr [r15 + 4*r11 - 8000] >>>> imul ebp, dword ptr [rax - 8] >>>> add r12d, ebx >>>> add ecx, r12d >>>> add ebp, ecx >>>> mov ecx, dword ptr [r15 + 4*r11 - 4000] >>>> imul ecx, dword ptr [rax - 4] >>>> add ecx, ebp >>>> mov ebx, dword ptr [r15 + 4*r11] >>>> imul ebx, dword ptr [rax] >>>> add ebx, ecx >>>> add r14, 20000 >>>> add rax, 20 >>>> cmp r14, 160000 >>>> jne .LBB0_5 >>>> # BB#6: # %.loopexit >>>> # in Loop: Header=BB0_2 >>>> Depth=2 >>>> add r10, rdx #rdx is c[][] >>>> mov dword ptr [r10 + 4*r11], ebx >>>> inc r11 >>>> cmp r11, 1000 >>>> jne .LBB0_2 >>>> # BB#7: # in Loop: Header=BB0_1 >>>> Depth=1 >>>> inc r9 >>>> add r8, 4000 >>>> cmp r9, 1000 >>>> jne .LBB0_1 >>>> # BB#8: >>>> pop rbx >>>> pop r12 >>>> pop r14 >>>> pop r15 >>>> pop rbp >>>> ret >>>> >>>> >>>> Looking forward to your reply >>>> >>>> Thank You >>>> >>>> >>>> >>>> _______________________________________________ >>>> LLVM Developers mailing list >>>> llvm-dev at lists.llvm.org >>>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >>>> >>>> >>> >> >> >-------------- next part 
-------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170630/85a7135a/attachment.html>