Jorrit Jongma
2020-May-22 18:07 UTC
[PATCH] Optimized assembler version of md5_process() for x86-64
This patch introduces an optimized assembler version of md5_process(), the
inner loop of MD5 checksumming. It affects the performance of all MD5
operations in rsync - including block matching and whole-file checksums.
The performance gain is 5-10%, depending on the specific CPU.

The code was originally created by Marc Bevand and placed in the public
domain, and was later integrated into OpenSSL. This is the original version,
so there is no need to deal with OpenSSL license exemptions.

It applies on top of my previously submitted 'v3' patch for get_checksum1(),
and though this is technically not SIMD, it is wrapped by the same SIMD
defines and ./configure option.

On GitHub:
https://github.com/Chainfire/rsync/commit/dcab47da4f6853974a952f0412f247126a6f1de8
https://github.com/Chainfire/rsync/commit/dcab47da4f6853974a952f0412f247126a6f1de8.patch

Note: the assembly (.s) file is formatted using tabs, which gmail may be
mangling in the patch below. If so, see the GitHub links above.

From dcab47da4f6853974a952f0412f247126a6f1de8 Mon Sep 17 00:00:00 2001
From: Jorrit Jongma <git at jongma.org>
Date: Fri, 22 May 2020 19:38:37 +0200
Subject: [PATCH] Optimized assembler version of md5_process() for x86-64

Originally created by Marc Bevand and placed in the public domain
---
 Makefile.in          |   7 +-
 lib/md5.c            |  13 +
 lib/md5_asm_x86_64.s | 693 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 712 insertions(+), 1 deletion(-)
 create mode 100644 lib/md5_asm_x86_64.s

diff --git a/Makefile.in b/Makefile.in
index af5aaa56..fbe22ccc 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -33,8 +33,10 @@ VERSION=@RSYNC_VERSION@
 .SUFFIXES: .c .o
 CXXOBJ+ASMOBJ
 ifeq ($(SIMD),x86-64)
 CXXOBJ=checksum_simd_x86_64.o
+ASMOBJ=lib/md5_asm_x86_64.o
 endif
 
 GENFILES=configure.sh aclocal.m4 config.h.in proto.h proto.h-tstamp rsync.1 rsync-ssl.1 rsyncd.conf.5
@@ -52,7 +54,7 @@ OBJS3=progress.o pipe.o
 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
 	popt/popthelp.o popt/poptparse.o
-OBJS=$(OBJS1) $(OBJS2) $(OBJS3) $(CXXOBJ) $(DAEMON_OBJ) $(LIBOBJ) @BUILD_ZLIB@ @BUILD_POPT@
+OBJS=$(OBJS1) $(OBJS2) $(OBJS3) $(CXXOBJ) $(ASMOBJ) $(DAEMON_OBJ) $(LIBOBJ) @BUILD_ZLIB@ @BUILD_POPT@
 
 TLS_OBJ = tls.o syscall.o t_stub.o lib/compat.o lib/snprintf.o lib/permstring.o lib/sysxattrs.o @BUILD_POPT@
@@ -126,6 +128,9 @@ rounding.h: rounding.c rsync.h proto.h
 checksum_simd_x86_64.o: checksum_simd_x86_64.cpp
 	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $<
 
+lib/md5_asm_x86_64.o: lib/md5_asm_x86_64.s
+	$(CC) -c -o $@ $<
+
 tls$(EXEEXT): $(TLS_OBJ)
 	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(TLS_OBJ) $(LIBS)

diff --git a/lib/md5.c b/lib/md5.c
index c979d10c..62bb4715 100644
--- a/lib/md5.c
+++ b/lib/md5.c
@@ -147,6 +147,10 @@ static void md5_process(md_context *ctx, const uchar data[CSUM_CHUNK])
 	ctx->D += D;
 }
 
+#if defined(HAVE_SIMD) && (CSUM_CHUNK == 64)
+extern void md5_process_asm(md_context *ctx, const void *data, size_t num);
+#endif
+
 void md5_update(md_context *ctx, const uchar *input, uint32 length)
 {
 	uint32 left, fill;
@@ -171,11 +175,20 @@ void md5_update(md_context *ctx, const uchar *input, uint32 length)
 		left = 0;
 	}
 
+#if defined(HAVE_SIMD) && (CSUM_CHUNK == 64)
+	if (length >= CSUM_CHUNK) {
+		uint32 chunks = length / CSUM_CHUNK;
+		md5_process_asm(ctx, input, chunks);
+		length -= chunks * CSUM_CHUNK;
+		input += chunks * CSUM_CHUNK;
+	}
+#else
 	while (length >= CSUM_CHUNK) {
 		md5_process(ctx, input);
 		length -= CSUM_CHUNK;
 		input += CSUM_CHUNK;
 	}
+#endif
 
 	if (length)
 		memcpy(ctx->buffer + left,
input, length); diff --git a/lib/md5_asm_x86_64.s b/lib/md5_asm_x86_64.s new file mode 100644 index 00000000..a3126151 --- /dev/null +++ b/lib/md5_asm_x86_64.s @@ -0,0 +1,693 @@ +/* + * x86-64 optimized assembler MD5 implementation + * + * Author: Marc Bevand, 2004 + * + * This code was placed in the public domain by the author. The original + * publication can be found at: + * + * https://www.zorinaq.com/papers/md5-amd64.html + */ +/* + * No modifications were made aside from changing the function and file names. + * The MD5_CTX structure as expected here (from OpenSSL) is binary compatible + * with the md_context used by rsync, for the fields accessed. + * + * Benchmarks (in MB/s) C ASM + * - Intel Atom D2700 302 334 + * - Intel i7-7700hq 351 376 + * - AMD ThreadRipper 2950x 728 784 + * + * The original code was also incorporated into OpenSSL. It has since been + * modified there. Those changes have not been made here due to licensing + * incompatibilities. Benchmarks of those changes on the above CPUs did not + * show any significant difference in performance, though. + */ + +.text +.align 16 + +.globl md5_process_asm +.type md5_process_asm, at function +md5_process_asm: + push %rbp + push %rbx + push %r12 + push %r13 # not really useful (r13 is unused) + push %r14 + push %r15 + + # rdi = arg #1 (ctx, MD5_CTX pointer) + # rsi = arg #2 (ptr, data pointer) + # rdx = arg #3 (nbr, number of 16-word blocks to process) + mov %rdi, %rbp # rbp = ctx + shl $6, %rdx # rdx = nbr in bytes + lea (%rsi,%rdx), %rdi # rdi = end + mov 0*4(%rbp), %eax # eax = ctx->A + mov 1*4(%rbp), %ebx # ebx = ctx->B + mov 2*4(%rbp), %ecx # ecx = ctx->C + mov 3*4(%rbp), %edx # edx = ctx->D + # end is 'rdi' + # ptr is 'rsi' + # A is 'eax' + # B is 'ebx' + # C is 'ecx' + # D is 'edx' + + cmp %rdi, %rsi # cmp end with ptr + je 1f # jmp if ptr == end + + # BEGIN of loop over 16-word blocks +2: # save old values of A, B, C, D + mov %eax, %r8d + mov %ebx, %r9d + mov %ecx, %r14d + mov %edx, %r15d + mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */ + mov %edx, %r11d /* (NEXT STEP) z' = %edx */ + xor %ecx, %r11d /* y ^ ... */ + lea -680876936(%eax,%r10d),%eax /* Const + dst + ... */ + and %ebx, %r11d /* x & ... */ + xor %edx, %r11d /* z ^ ... */ + mov 1*4(%rsi),%r10d /* (NEXT STEP) X[1] */ + add %r11d, %eax /* dst += ... */ + rol $7, %eax /* dst <<< s */ + mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */ + add %ebx, %eax /* dst += x */ + xor %ebx, %r11d /* y ^ ... */ + lea -389564586(%edx,%r10d),%edx /* Const + dst + ... */ + and %eax, %r11d /* x & ... */ + xor %ecx, %r11d /* z ^ ... */ + mov 2*4(%rsi),%r10d /* (NEXT STEP) X[2] */ + add %r11d, %edx /* dst += ... */ + rol $12, %edx /* dst <<< s */ + mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */ + add %eax, %edx /* dst += x */ + xor %eax, %r11d /* y ^ ... */ + lea 606105819(%ecx,%r10d),%ecx /* Const + dst + ... */ + and %edx, %r11d /* x & ... */ + xor %ebx, %r11d /* z ^ ... */ + mov 3*4(%rsi),%r10d /* (NEXT STEP) X[3] */ + add %r11d, %ecx /* dst += ... */ + rol $17, %ecx /* dst <<< s */ + mov %eax, %r11d /* (NEXT STEP) z' = %eax */ + add %edx, %ecx /* dst += x */ + xor %edx, %r11d /* y ^ ... */ + lea -1044525330(%ebx,%r10d),%ebx /* Const + dst + ... */ + and %ecx, %r11d /* x & ... */ + xor %eax, %r11d /* z ^ ... */ + mov 4*4(%rsi),%r10d /* (NEXT STEP) X[4] */ + add %r11d, %ebx /* dst += ... */ + rol $22, %ebx /* dst <<< s */ + mov %edx, %r11d /* (NEXT STEP) z' = %edx */ + add %ecx, %ebx /* dst += x */ + xor %ecx, %r11d /* y ^ ... */ + lea -176418897(%eax,%r10d),%eax /* Const + dst + ... 
*/ + and %ebx, %r11d /* x & ... */ + xor %edx, %r11d /* z ^ ... */ + mov 5*4(%rsi),%r10d /* (NEXT STEP) X[5] */ + add %r11d, %eax /* dst += ... */ + rol $7, %eax /* dst <<< s */ + mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */ + add %ebx, %eax /* dst += x */ + xor %ebx, %r11d /* y ^ ... */ + lea 1200080426(%edx,%r10d),%edx /* Const + dst + ... */ + and %eax, %r11d /* x & ... */ + xor %ecx, %r11d /* z ^ ... */ + mov 6*4(%rsi),%r10d /* (NEXT STEP) X[6] */ + add %r11d, %edx /* dst += ... */ + rol $12, %edx /* dst <<< s */ + mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */ + add %eax, %edx /* dst += x */ + xor %eax, %r11d /* y ^ ... */ + lea -1473231341(%ecx,%r10d),%ecx /* Const + dst + ... */ + and %edx, %r11d /* x & ... */ + xor %ebx, %r11d /* z ^ ... */ + mov 7*4(%rsi),%r10d /* (NEXT STEP) X[7] */ + add %r11d, %ecx /* dst += ... */ + rol $17, %ecx /* dst <<< s */ + mov %eax, %r11d /* (NEXT STEP) z' = %eax */ + add %edx, %ecx /* dst += x */ + xor %edx, %r11d /* y ^ ... */ + lea -45705983(%ebx,%r10d),%ebx /* Const + dst + ... */ + and %ecx, %r11d /* x & ... */ + xor %eax, %r11d /* z ^ ... */ + mov 8*4(%rsi),%r10d /* (NEXT STEP) X[8] */ + add %r11d, %ebx /* dst += ... */ + rol $22, %ebx /* dst <<< s */ + mov %edx, %r11d /* (NEXT STEP) z' = %edx */ + add %ecx, %ebx /* dst += x */ + xor %ecx, %r11d /* y ^ ... */ + lea 1770035416(%eax,%r10d),%eax /* Const + dst + ... */ + and %ebx, %r11d /* x & ... */ + xor %edx, %r11d /* z ^ ... */ + mov 9*4(%rsi),%r10d /* (NEXT STEP) X[9] */ + add %r11d, %eax /* dst += ... */ + rol $7, %eax /* dst <<< s */ + mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */ + add %ebx, %eax /* dst += x */ + xor %ebx, %r11d /* y ^ ... */ + lea -1958414417(%edx,%r10d),%edx /* Const + dst + ... */ + and %eax, %r11d /* x & ... */ + xor %ecx, %r11d /* z ^ ... */ + mov 10*4(%rsi),%r10d /* (NEXT STEP) X[10] */ + add %r11d, %edx /* dst += ... */ + rol $12, %edx /* dst <<< s */ + mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */ + add %eax, %edx /* dst += x */ + xor %eax, %r11d /* y ^ ... */ + lea -42063(%ecx,%r10d),%ecx /* Const + dst + ... */ + and %edx, %r11d /* x & ... */ + xor %ebx, %r11d /* z ^ ... */ + mov 11*4(%rsi),%r10d /* (NEXT STEP) X[11] */ + add %r11d, %ecx /* dst += ... */ + rol $17, %ecx /* dst <<< s */ + mov %eax, %r11d /* (NEXT STEP) z' = %eax */ + add %edx, %ecx /* dst += x */ + xor %edx, %r11d /* y ^ ... */ + lea -1990404162(%ebx,%r10d),%ebx /* Const + dst + ... */ + and %ecx, %r11d /* x & ... */ + xor %eax, %r11d /* z ^ ... */ + mov 12*4(%rsi),%r10d /* (NEXT STEP) X[12] */ + add %r11d, %ebx /* dst += ... */ + rol $22, %ebx /* dst <<< s */ + mov %edx, %r11d /* (NEXT STEP) z' = %edx */ + add %ecx, %ebx /* dst += x */ + xor %ecx, %r11d /* y ^ ... */ + lea 1804603682(%eax,%r10d),%eax /* Const + dst + ... */ + and %ebx, %r11d /* x & ... */ + xor %edx, %r11d /* z ^ ... */ + mov 13*4(%rsi),%r10d /* (NEXT STEP) X[13] */ + add %r11d, %eax /* dst += ... */ + rol $7, %eax /* dst <<< s */ + mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */ + add %ebx, %eax /* dst += x */ + xor %ebx, %r11d /* y ^ ... */ + lea -40341101(%edx,%r10d),%edx /* Const + dst + ... */ + and %eax, %r11d /* x & ... */ + xor %ecx, %r11d /* z ^ ... */ + mov 14*4(%rsi),%r10d /* (NEXT STEP) X[14] */ + add %r11d, %edx /* dst += ... */ + rol $12, %edx /* dst <<< s */ + mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */ + add %eax, %edx /* dst += x */ + xor %eax, %r11d /* y ^ ... */ + lea -1502002290(%ecx,%r10d),%ecx /* Const + dst + ... */ + and %edx, %r11d /* x & ... */ + xor %ebx, %r11d /* z ^ ... 
*/ + mov 15*4(%rsi),%r10d /* (NEXT STEP) X[15] */ + add %r11d, %ecx /* dst += ... */ + rol $17, %ecx /* dst <<< s */ + mov %eax, %r11d /* (NEXT STEP) z' = %eax */ + add %edx, %ecx /* dst += x */ + xor %edx, %r11d /* y ^ ... */ + lea 1236535329(%ebx,%r10d),%ebx /* Const + dst + ... */ + and %ecx, %r11d /* x & ... */ + xor %eax, %r11d /* z ^ ... */ + mov 0*4(%rsi),%r10d /* (NEXT STEP) X[0] */ + add %r11d, %ebx /* dst += ... */ + rol $22, %ebx /* dst <<< s */ + mov %edx, %r11d /* (NEXT STEP) z' = %edx */ + add %ecx, %ebx /* dst += x */ + mov 1*4(%rsi), %r10d /* (NEXT STEP) X[1] */ + mov %edx, %r11d /* (NEXT STEP) z' = %edx */ + mov %edx, %r12d /* (NEXT STEP) z' = %edx */ + not %r11d /* not z */ + lea -165796510(%eax,%r10d),%eax /* Const + dst + ... */ + and %ebx, %r12d /* x & z */ + and %ecx, %r11d /* y & (not z) */ + mov 6*4(%rsi),%r10d /* (NEXT STEP) X[6] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */ + add %r12d, %eax /* dst += ... */ + mov %ecx, %r12d /* (NEXT STEP) z' = %ecx */ + rol $5, %eax /* dst <<< s */ + add %ebx, %eax /* dst += x */ + not %r11d /* not z */ + lea -1069501632(%edx,%r10d),%edx /* Const + dst + ... */ + and %eax, %r12d /* x & z */ + and %ebx, %r11d /* y & (not z) */ + mov 11*4(%rsi),%r10d /* (NEXT STEP) X[11] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */ + add %r12d, %edx /* dst += ... */ + mov %ebx, %r12d /* (NEXT STEP) z' = %ebx */ + rol $9, %edx /* dst <<< s */ + add %eax, %edx /* dst += x */ + not %r11d /* not z */ + lea 643717713(%ecx,%r10d),%ecx /* Const + dst + ... */ + and %edx, %r12d /* x & z */ + and %eax, %r11d /* y & (not z) */ + mov 0*4(%rsi),%r10d /* (NEXT STEP) X[0] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %eax, %r11d /* (NEXT STEP) z' = %eax */ + add %r12d, %ecx /* dst += ... */ + mov %eax, %r12d /* (NEXT STEP) z' = %eax */ + rol $14, %ecx /* dst <<< s */ + add %edx, %ecx /* dst += x */ + not %r11d /* not z */ + lea -373897302(%ebx,%r10d),%ebx /* Const + dst + ... */ + and %ecx, %r12d /* x & z */ + and %edx, %r11d /* y & (not z) */ + mov 5*4(%rsi),%r10d /* (NEXT STEP) X[5] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %edx, %r11d /* (NEXT STEP) z' = %edx */ + add %r12d, %ebx /* dst += ... */ + mov %edx, %r12d /* (NEXT STEP) z' = %edx */ + rol $20, %ebx /* dst <<< s */ + add %ecx, %ebx /* dst += x */ + not %r11d /* not z */ + lea -701558691(%eax,%r10d),%eax /* Const + dst + ... */ + and %ebx, %r12d /* x & z */ + and %ecx, %r11d /* y & (not z) */ + mov 10*4(%rsi),%r10d /* (NEXT STEP) X[10] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */ + add %r12d, %eax /* dst += ... */ + mov %ecx, %r12d /* (NEXT STEP) z' = %ecx */ + rol $5, %eax /* dst <<< s */ + add %ebx, %eax /* dst += x */ + not %r11d /* not z */ + lea 38016083(%edx,%r10d),%edx /* Const + dst + ... */ + and %eax, %r12d /* x & z */ + and %ebx, %r11d /* y & (not z) */ + mov 15*4(%rsi),%r10d /* (NEXT STEP) X[15] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */ + add %r12d, %edx /* dst += ... */ + mov %ebx, %r12d /* (NEXT STEP) z' = %ebx */ + rol $9, %edx /* dst <<< s */ + add %eax, %edx /* dst += x */ + not %r11d /* not z */ + lea -660478335(%ecx,%r10d),%ecx /* Const + dst + ... 
*/ + and %edx, %r12d /* x & z */ + and %eax, %r11d /* y & (not z) */ + mov 4*4(%rsi),%r10d /* (NEXT STEP) X[4] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %eax, %r11d /* (NEXT STEP) z' = %eax */ + add %r12d, %ecx /* dst += ... */ + mov %eax, %r12d /* (NEXT STEP) z' = %eax */ + rol $14, %ecx /* dst <<< s */ + add %edx, %ecx /* dst += x */ + not %r11d /* not z */ + lea -405537848(%ebx,%r10d),%ebx /* Const + dst + ... */ + and %ecx, %r12d /* x & z */ + and %edx, %r11d /* y & (not z) */ + mov 9*4(%rsi),%r10d /* (NEXT STEP) X[9] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %edx, %r11d /* (NEXT STEP) z' = %edx */ + add %r12d, %ebx /* dst += ... */ + mov %edx, %r12d /* (NEXT STEP) z' = %edx */ + rol $20, %ebx /* dst <<< s */ + add %ecx, %ebx /* dst += x */ + not %r11d /* not z */ + lea 568446438(%eax,%r10d),%eax /* Const + dst + ... */ + and %ebx, %r12d /* x & z */ + and %ecx, %r11d /* y & (not z) */ + mov 14*4(%rsi),%r10d /* (NEXT STEP) X[14] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */ + add %r12d, %eax /* dst += ... */ + mov %ecx, %r12d /* (NEXT STEP) z' = %ecx */ + rol $5, %eax /* dst <<< s */ + add %ebx, %eax /* dst += x */ + not %r11d /* not z */ + lea -1019803690(%edx,%r10d),%edx /* Const + dst + ... */ + and %eax, %r12d /* x & z */ + and %ebx, %r11d /* y & (not z) */ + mov 3*4(%rsi),%r10d /* (NEXT STEP) X[3] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */ + add %r12d, %edx /* dst += ... */ + mov %ebx, %r12d /* (NEXT STEP) z' = %ebx */ + rol $9, %edx /* dst <<< s */ + add %eax, %edx /* dst += x */ + not %r11d /* not z */ + lea -187363961(%ecx,%r10d),%ecx /* Const + dst + ... */ + and %edx, %r12d /* x & z */ + and %eax, %r11d /* y & (not z) */ + mov 8*4(%rsi),%r10d /* (NEXT STEP) X[8] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %eax, %r11d /* (NEXT STEP) z' = %eax */ + add %r12d, %ecx /* dst += ... */ + mov %eax, %r12d /* (NEXT STEP) z' = %eax */ + rol $14, %ecx /* dst <<< s */ + add %edx, %ecx /* dst += x */ + not %r11d /* not z */ + lea 1163531501(%ebx,%r10d),%ebx /* Const + dst + ... */ + and %ecx, %r12d /* x & z */ + and %edx, %r11d /* y & (not z) */ + mov 13*4(%rsi),%r10d /* (NEXT STEP) X[13] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %edx, %r11d /* (NEXT STEP) z' = %edx */ + add %r12d, %ebx /* dst += ... */ + mov %edx, %r12d /* (NEXT STEP) z' = %edx */ + rol $20, %ebx /* dst <<< s */ + add %ecx, %ebx /* dst += x */ + not %r11d /* not z */ + lea -1444681467(%eax,%r10d),%eax /* Const + dst + ... */ + and %ebx, %r12d /* x & z */ + and %ecx, %r11d /* y & (not z) */ + mov 2*4(%rsi),%r10d /* (NEXT STEP) X[2] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */ + add %r12d, %eax /* dst += ... */ + mov %ecx, %r12d /* (NEXT STEP) z' = %ecx */ + rol $5, %eax /* dst <<< s */ + add %ebx, %eax /* dst += x */ + not %r11d /* not z */ + lea -51403784(%edx,%r10d),%edx /* Const + dst + ... */ + and %eax, %r12d /* x & z */ + and %ebx, %r11d /* y & (not z) */ + mov 7*4(%rsi),%r10d /* (NEXT STEP) X[7] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */ + add %r12d, %edx /* dst += ... */ + mov %ebx, %r12d /* (NEXT STEP) z' = %ebx */ + rol $9, %edx /* dst <<< s */ + add %eax, %edx /* dst += x */ + not %r11d /* not z */ + lea 1735328473(%ecx,%r10d),%ecx /* Const + dst + ... 
*/ + and %edx, %r12d /* x & z */ + and %eax, %r11d /* y & (not z) */ + mov 12*4(%rsi),%r10d /* (NEXT STEP) X[12] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %eax, %r11d /* (NEXT STEP) z' = %eax */ + add %r12d, %ecx /* dst += ... */ + mov %eax, %r12d /* (NEXT STEP) z' = %eax */ + rol $14, %ecx /* dst <<< s */ + add %edx, %ecx /* dst += x */ + not %r11d /* not z */ + lea -1926607734(%ebx,%r10d),%ebx /* Const + dst + ... */ + and %ecx, %r12d /* x & z */ + and %edx, %r11d /* y & (not z) */ + mov 0*4(%rsi),%r10d /* (NEXT STEP) X[0] */ + or %r11d, %r12d /* (y & (not z)) | (x & z) */ + mov %edx, %r11d /* (NEXT STEP) z' = %edx */ + add %r12d, %ebx /* dst += ... */ + mov %edx, %r12d /* (NEXT STEP) z' = %edx */ + rol $20, %ebx /* dst <<< s */ + add %ecx, %ebx /* dst += x */ + mov 5*4(%rsi), %r10d /* (NEXT STEP) X[5] */ + mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */ + lea -378558(%eax,%r10d),%eax /* Const + dst + ... */ + mov 8*4(%rsi),%r10d /* (NEXT STEP) X[8] */ + xor %edx, %r11d /* z ^ ... */ + xor %ebx, %r11d /* x ^ ... */ + add %r11d, %eax /* dst += ... */ + rol $4, %eax /* dst <<< s */ + mov %ebx, %r11d /* (NEXT STEP) y' = %ebx */ + add %ebx, %eax /* dst += x */ + lea -2022574463(%edx,%r10d),%edx /* Const + dst + ... */ + mov 11*4(%rsi),%r10d /* (NEXT STEP) X[11] */ + xor %ecx, %r11d /* z ^ ... */ + xor %eax, %r11d /* x ^ ... */ + add %r11d, %edx /* dst += ... */ + rol $11, %edx /* dst <<< s */ + mov %eax, %r11d /* (NEXT STEP) y' = %eax */ + add %eax, %edx /* dst += x */ + lea 1839030562(%ecx,%r10d),%ecx /* Const + dst + ... */ + mov 14*4(%rsi),%r10d /* (NEXT STEP) X[14] */ + xor %ebx, %r11d /* z ^ ... */ + xor %edx, %r11d /* x ^ ... */ + add %r11d, %ecx /* dst += ... */ + rol $16, %ecx /* dst <<< s */ + mov %edx, %r11d /* (NEXT STEP) y' = %edx */ + add %edx, %ecx /* dst += x */ + lea -35309556(%ebx,%r10d),%ebx /* Const + dst + ... */ + mov 1*4(%rsi),%r10d /* (NEXT STEP) X[1] */ + xor %eax, %r11d /* z ^ ... */ + xor %ecx, %r11d /* x ^ ... */ + add %r11d, %ebx /* dst += ... */ + rol $23, %ebx /* dst <<< s */ + mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */ + add %ecx, %ebx /* dst += x */ + lea -1530992060(%eax,%r10d),%eax /* Const + dst + ... */ + mov 4*4(%rsi),%r10d /* (NEXT STEP) X[4] */ + xor %edx, %r11d /* z ^ ... */ + xor %ebx, %r11d /* x ^ ... */ + add %r11d, %eax /* dst += ... */ + rol $4, %eax /* dst <<< s */ + mov %ebx, %r11d /* (NEXT STEP) y' = %ebx */ + add %ebx, %eax /* dst += x */ + lea 1272893353(%edx,%r10d),%edx /* Const + dst + ... */ + mov 7*4(%rsi),%r10d /* (NEXT STEP) X[7] */ + xor %ecx, %r11d /* z ^ ... */ + xor %eax, %r11d /* x ^ ... */ + add %r11d, %edx /* dst += ... */ + rol $11, %edx /* dst <<< s */ + mov %eax, %r11d /* (NEXT STEP) y' = %eax */ + add %eax, %edx /* dst += x */ + lea -155497632(%ecx,%r10d),%ecx /* Const + dst + ... */ + mov 10*4(%rsi),%r10d /* (NEXT STEP) X[10] */ + xor %ebx, %r11d /* z ^ ... */ + xor %edx, %r11d /* x ^ ... */ + add %r11d, %ecx /* dst += ... */ + rol $16, %ecx /* dst <<< s */ + mov %edx, %r11d /* (NEXT STEP) y' = %edx */ + add %edx, %ecx /* dst += x */ + lea -1094730640(%ebx,%r10d),%ebx /* Const + dst + ... */ + mov 13*4(%rsi),%r10d /* (NEXT STEP) X[13] */ + xor %eax, %r11d /* z ^ ... */ + xor %ecx, %r11d /* x ^ ... */ + add %r11d, %ebx /* dst += ... */ + rol $23, %ebx /* dst <<< s */ + mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */ + add %ecx, %ebx /* dst += x */ + lea 681279174(%eax,%r10d),%eax /* Const + dst + ... */ + mov 0*4(%rsi),%r10d /* (NEXT STEP) X[0] */ + xor %edx, %r11d /* z ^ ... */ + xor %ebx, %r11d /* x ^ ... 
*/ + add %r11d, %eax /* dst += ... */ + rol $4, %eax /* dst <<< s */ + mov %ebx, %r11d /* (NEXT STEP) y' = %ebx */ + add %ebx, %eax /* dst += x */ + lea -358537222(%edx,%r10d),%edx /* Const + dst + ... */ + mov 3*4(%rsi),%r10d /* (NEXT STEP) X[3] */ + xor %ecx, %r11d /* z ^ ... */ + xor %eax, %r11d /* x ^ ... */ + add %r11d, %edx /* dst += ... */ + rol $11, %edx /* dst <<< s */ + mov %eax, %r11d /* (NEXT STEP) y' = %eax */ + add %eax, %edx /* dst += x */ + lea -722521979(%ecx,%r10d),%ecx /* Const + dst + ... */ + mov 6*4(%rsi),%r10d /* (NEXT STEP) X[6] */ + xor %ebx, %r11d /* z ^ ... */ + xor %edx, %r11d /* x ^ ... */ + add %r11d, %ecx /* dst += ... */ + rol $16, %ecx /* dst <<< s */ + mov %edx, %r11d /* (NEXT STEP) y' = %edx */ + add %edx, %ecx /* dst += x */ + lea 76029189(%ebx,%r10d),%ebx /* Const + dst + ... */ + mov 9*4(%rsi),%r10d /* (NEXT STEP) X[9] */ + xor %eax, %r11d /* z ^ ... */ + xor %ecx, %r11d /* x ^ ... */ + add %r11d, %ebx /* dst += ... */ + rol $23, %ebx /* dst <<< s */ + mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */ + add %ecx, %ebx /* dst += x */ + lea -640364487(%eax,%r10d),%eax /* Const + dst + ... */ + mov 12*4(%rsi),%r10d /* (NEXT STEP) X[12] */ + xor %edx, %r11d /* z ^ ... */ + xor %ebx, %r11d /* x ^ ... */ + add %r11d, %eax /* dst += ... */ + rol $4, %eax /* dst <<< s */ + mov %ebx, %r11d /* (NEXT STEP) y' = %ebx */ + add %ebx, %eax /* dst += x */ + lea -421815835(%edx,%r10d),%edx /* Const + dst + ... */ + mov 15*4(%rsi),%r10d /* (NEXT STEP) X[15] */ + xor %ecx, %r11d /* z ^ ... */ + xor %eax, %r11d /* x ^ ... */ + add %r11d, %edx /* dst += ... */ + rol $11, %edx /* dst <<< s */ + mov %eax, %r11d /* (NEXT STEP) y' = %eax */ + add %eax, %edx /* dst += x */ + lea 530742520(%ecx,%r10d),%ecx /* Const + dst + ... */ + mov 2*4(%rsi),%r10d /* (NEXT STEP) X[2] */ + xor %ebx, %r11d /* z ^ ... */ + xor %edx, %r11d /* x ^ ... */ + add %r11d, %ecx /* dst += ... */ + rol $16, %ecx /* dst <<< s */ + mov %edx, %r11d /* (NEXT STEP) y' = %edx */ + add %edx, %ecx /* dst += x */ + lea -995338651(%ebx,%r10d),%ebx /* Const + dst + ... */ + mov 0*4(%rsi),%r10d /* (NEXT STEP) X[0] */ + xor %eax, %r11d /* z ^ ... */ + xor %ecx, %r11d /* x ^ ... */ + add %r11d, %ebx /* dst += ... */ + rol $23, %ebx /* dst <<< s */ + mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */ + add %ecx, %ebx /* dst += x */ + mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */ + mov $0xffffffff, %r11d + xor %edx, %r11d /* (NEXT STEP) not z' = not %edx*/ + lea -198630844(%eax,%r10d),%eax /* Const + dst + ... */ + or %ebx, %r11d /* x | ... */ + xor %ecx, %r11d /* y ^ ... */ + add %r11d, %eax /* dst += ... */ + mov 7*4(%rsi),%r10d /* (NEXT STEP) X[7] */ + mov $0xffffffff, %r11d + rol $6, %eax /* dst <<< s */ + xor %ecx, %r11d /* (NEXT STEP) not z' = not %ecx */ + add %ebx, %eax /* dst += x */ + lea 1126891415(%edx,%r10d),%edx /* Const + dst + ... */ + or %eax, %r11d /* x | ... */ + xor %ebx, %r11d /* y ^ ... */ + add %r11d, %edx /* dst += ... */ + mov 14*4(%rsi),%r10d /* (NEXT STEP) X[14] */ + mov $0xffffffff, %r11d + rol $10, %edx /* dst <<< s */ + xor %ebx, %r11d /* (NEXT STEP) not z' = not %ebx */ + add %eax, %edx /* dst += x */ + lea -1416354905(%ecx,%r10d),%ecx /* Const + dst + ... */ + or %edx, %r11d /* x | ... */ + xor %eax, %r11d /* y ^ ... */ + add %r11d, %ecx /* dst += ... */ + mov 5*4(%rsi),%r10d /* (NEXT STEP) X[5] */ + mov $0xffffffff, %r11d + rol $15, %ecx /* dst <<< s */ + xor %eax, %r11d /* (NEXT STEP) not z' = not %eax */ + add %edx, %ecx /* dst += x */ + lea -57434055(%ebx,%r10d),%ebx /* Const + dst + ... 
*/ + or %ecx, %r11d /* x | ... */ + xor %edx, %r11d /* y ^ ... */ + add %r11d, %ebx /* dst += ... */ + mov 12*4(%rsi),%r10d /* (NEXT STEP) X[12] */ + mov $0xffffffff, %r11d + rol $21, %ebx /* dst <<< s */ + xor %edx, %r11d /* (NEXT STEP) not z' = not %edx */ + add %ecx, %ebx /* dst += x */ + lea 1700485571(%eax,%r10d),%eax /* Const + dst + ... */ + or %ebx, %r11d /* x | ... */ + xor %ecx, %r11d /* y ^ ... */ + add %r11d, %eax /* dst += ... */ + mov 3*4(%rsi),%r10d /* (NEXT STEP) X[3] */ + mov $0xffffffff, %r11d + rol $6, %eax /* dst <<< s */ + xor %ecx, %r11d /* (NEXT STEP) not z' = not %ecx */ + add %ebx, %eax /* dst += x */ + lea -1894986606(%edx,%r10d),%edx /* Const + dst + ... */ + or %eax, %r11d /* x | ... */ + xor %ebx, %r11d /* y ^ ... */ + add %r11d, %edx /* dst += ... */ + mov 10*4(%rsi),%r10d /* (NEXT STEP) X[10] */ + mov $0xffffffff, %r11d + rol $10, %edx /* dst <<< s */ + xor %ebx, %r11d /* (NEXT STEP) not z' = not %ebx */ + add %eax, %edx /* dst += x */ + lea -1051523(%ecx,%r10d),%ecx /* Const + dst + ... */ + or %edx, %r11d /* x | ... */ + xor %eax, %r11d /* y ^ ... */ + add %r11d, %ecx /* dst += ... */ + mov 1*4(%rsi),%r10d /* (NEXT STEP) X[1] */ + mov $0xffffffff, %r11d + rol $15, %ecx /* dst <<< s */ + xor %eax, %r11d /* (NEXT STEP) not z' = not %eax */ + add %edx, %ecx /* dst += x */ + lea -2054922799(%ebx,%r10d),%ebx /* Const + dst + ... */ + or %ecx, %r11d /* x | ... */ + xor %edx, %r11d /* y ^ ... */ + add %r11d, %ebx /* dst += ... */ + mov 8*4(%rsi),%r10d /* (NEXT STEP) X[8] */ + mov $0xffffffff, %r11d + rol $21, %ebx /* dst <<< s */ + xor %edx, %r11d /* (NEXT STEP) not z' = not %edx */ + add %ecx, %ebx /* dst += x */ + lea 1873313359(%eax,%r10d),%eax /* Const + dst + ... */ + or %ebx, %r11d /* x | ... */ + xor %ecx, %r11d /* y ^ ... */ + add %r11d, %eax /* dst += ... */ + mov 15*4(%rsi),%r10d /* (NEXT STEP) X[15] */ + mov $0xffffffff, %r11d + rol $6, %eax /* dst <<< s */ + xor %ecx, %r11d /* (NEXT STEP) not z' = not %ecx */ + add %ebx, %eax /* dst += x */ + lea -30611744(%edx,%r10d),%edx /* Const + dst + ... */ + or %eax, %r11d /* x | ... */ + xor %ebx, %r11d /* y ^ ... */ + add %r11d, %edx /* dst += ... */ + mov 6*4(%rsi),%r10d /* (NEXT STEP) X[6] */ + mov $0xffffffff, %r11d + rol $10, %edx /* dst <<< s */ + xor %ebx, %r11d /* (NEXT STEP) not z' = not %ebx */ + add %eax, %edx /* dst += x */ + lea -1560198380(%ecx,%r10d),%ecx /* Const + dst + ... */ + or %edx, %r11d /* x | ... */ + xor %eax, %r11d /* y ^ ... */ + add %r11d, %ecx /* dst += ... */ + mov 13*4(%rsi),%r10d /* (NEXT STEP) X[13] */ + mov $0xffffffff, %r11d + rol $15, %ecx /* dst <<< s */ + xor %eax, %r11d /* (NEXT STEP) not z' = not %eax */ + add %edx, %ecx /* dst += x */ + lea 1309151649(%ebx,%r10d),%ebx /* Const + dst + ... */ + or %ecx, %r11d /* x | ... */ + xor %edx, %r11d /* y ^ ... */ + add %r11d, %ebx /* dst += ... */ + mov 4*4(%rsi),%r10d /* (NEXT STEP) X[4] */ + mov $0xffffffff, %r11d + rol $21, %ebx /* dst <<< s */ + xor %edx, %r11d /* (NEXT STEP) not z' = not %edx */ + add %ecx, %ebx /* dst += x */ + lea -145523070(%eax,%r10d),%eax /* Const + dst + ... */ + or %ebx, %r11d /* x | ... */ + xor %ecx, %r11d /* y ^ ... */ + add %r11d, %eax /* dst += ... */ + mov 11*4(%rsi),%r10d /* (NEXT STEP) X[11] */ + mov $0xffffffff, %r11d + rol $6, %eax /* dst <<< s */ + xor %ecx, %r11d /* (NEXT STEP) not z' = not %ecx */ + add %ebx, %eax /* dst += x */ + lea -1120210379(%edx,%r10d),%edx /* Const + dst + ... */ + or %eax, %r11d /* x | ... */ + xor %ebx, %r11d /* y ^ ... */ + add %r11d, %edx /* dst += ... 
*/ + mov 2*4(%rsi),%r10d /* (NEXT STEP) X[2] */ + mov $0xffffffff, %r11d + rol $10, %edx /* dst <<< s */ + xor %ebx, %r11d /* (NEXT STEP) not z' = not %ebx */ + add %eax, %edx /* dst += x */ + lea 718787259(%ecx,%r10d),%ecx /* Const + dst + ... */ + or %edx, %r11d /* x | ... */ + xor %eax, %r11d /* y ^ ... */ + add %r11d, %ecx /* dst += ... */ + mov 9*4(%rsi),%r10d /* (NEXT STEP) X[9] */ + mov $0xffffffff, %r11d + rol $15, %ecx /* dst <<< s */ + xor %eax, %r11d /* (NEXT STEP) not z' = not %eax */ + add %edx, %ecx /* dst += x */ + lea -343485551(%ebx,%r10d),%ebx /* Const + dst + ... */ + or %ecx, %r11d /* x | ... */ + xor %edx, %r11d /* y ^ ... */ + add %r11d, %ebx /* dst += ... */ + mov 0*4(%rsi),%r10d /* (NEXT STEP) X[0] */ + mov $0xffffffff, %r11d + rol $21, %ebx /* dst <<< s */ + xor %edx, %r11d /* (NEXT STEP) not z' = not %edx */ + add %ecx, %ebx /* dst += x */ + # add old values of A, B, C, D + add %r8d, %eax + add %r9d, %ebx + add %r14d, %ecx + add %r15d, %edx + + # loop control + add $64, %rsi # ptr += 64 + cmp %rdi, %rsi # cmp end with ptr + jb 2b # jmp if ptr < end + # END of loop over 16-word blocks +1: + mov %eax, 0*4(%rbp) # ctx->A = A + mov %ebx, 1*4(%rbp) # ctx->B = B + mov %ecx, 2*4(%rbp) # ctx->C = C + mov %edx, 3*4(%rbp) # ctx->D = D + + pop %r15 + pop %r14 + pop %r13 # not really useful (r13 is unused) + pop %r12 + pop %rbx + pop %rbp + ret +.L_md5_process_asm_end: +.size md5_process_asm,.L_md5_process_asm_end-md5_process_asm
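
For anyone trying to follow the unrolled assembly above: each repeated group of instructions is one of the 64 steps defined in RFC 1321. Every step computes a = b + ((a + f(b,c,d) + X[k] + T[i]) <<< s), where f is the round function F, G, H or I; the register comments ("y ^ ...", "x & ...", "not z", and so on) track the evaluation of f, while the lea folds the constant T[i] and the message word X[k] into the accumulator. Note also that the asm only reads and writes the first four 32-bit words of the context (A-D at offsets 0, 4, 8 and 12), which is why the OpenSSL-style MD5_CTX layout it was written for is compatible with rsync's md_context. A reference step in C, for comparison only (this restates the textbook algorithm, not code from the patch):

#include <stdint.h>

/* Round functions as in RFC 1321; the asm evaluates F through the
 * equivalent form d ^ (b & (c ^ d)), a standard shortcut. */
#define MD5_F(x, y, z) (((x) & (y)) | (~(x) & (z)))
#define MD5_G(x, y, z) (((x) & (z)) | ((y) & ~(z)))
#define MD5_H(x, y, z) ((x) ^ (y) ^ (z))
#define MD5_I(x, y, z) ((y) ^ ((x) | ~(z)))

#define ROTL32(v, s) (((v) << (s)) | ((v) >> (32 - (s))))

/* One round-1 step: the lea in the asm adds T + X[k] to a, the and/xor
 * sequence evaluates F(b,c,d), rol rotates by s, and the final add of b
 * completes the step. */
static inline uint32_t md5_step_round1(uint32_t a, uint32_t b, uint32_t c,
                                       uint32_t d, uint32_t xk,
                                       uint32_t t, int s)
{
	a += MD5_F(b, c, d) + xk + t;
	return b + ROTL32(a, s);
}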
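
A quick way to sanity-check the assembler path is to hash an RFC 1321 test vector longer than one 64-byte block, so that md5_process_asm() actually runs rather than only the buffering and finalization code. A minimal standalone sketch, assuming rsync's md5_begin()/md5_update()/md5_result() interface and md_context type from lib/md5.[ch] (compile it together with lib/md5.c and, for the SIMD build, the new .s file):

#include <stdio.h>
#include <string.h>
#include "lib/md5.h"   /* assumed to declare md_context, md5_begin, md5_update, md5_result */

int main(void)
{
	/* 80-byte RFC 1321 test vector; its first 64 bytes go through the
	 * block function in one call, the remaining 16 stay buffered. */
	const char *msg = "12345678901234567890123456789012345678901234567890"
	                  "123456789012345678901234567890";
	const char *want = "57edf4a22be3c955ac49da2e2107b67a";
	uchar digest[16];
	char hex[33];
	md_context ctx;
	int i;

	md5_begin(&ctx);
	md5_update(&ctx, (const uchar *)msg, (uint32)strlen(msg));
	md5_result(&ctx, digest);

	for (i = 0; i < 16; i++)
		sprintf(hex + 2 * i, "%02x", digest[i]);
	printf("%s (%s)\n", hex, strcmp(hex, want) == 0 ? "OK" : "MISMATCH");
	return strcmp(hex, want) != 0;
}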
Wayne Davison
2020-May-23 05:54 UTC
[PATCH] Optimized assembler version of md5_process() for x86-64
On Fri, May 22, 2020 at 11:08 AM Jorrit Jongma via rsync
<rsync at lists.samba.org> wrote:

> This patch introduces an optimized assembler version of md5_process(), the
> inner loop of MD5 checksumming. It affects the performance of all MD5
> operations in rsync - including block matching and whole-file checksums.

Thanks for the optimizing patches, Jorrit! I've merged your latest changes
into the git master branch.

..wayne..
Sebastian Andrzej Siewior
2020-May-23 11:37 UTC
[PATCH] Optimized assembler version of md5_process() for x86-64
On 2020-05-22 22:54:18 [-0700], Wayne Davison via rsync wrote:
> Thanks for the optimizing patches, Jorrit! I've merged your latest changes
> into the git master branch.

Wouldn't it be better to add support for a crypto library (like OpenSSL),
which would provide optimized algorithms for more than just one platform,
without the need to maintain them separately?

> ..wayne..

Sebastian
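
For reference, the kind of change being suggested here would route rsync's MD5 through a library that already ships per-platform optimized implementations, instead of carrying its own assembler. A minimal sketch of hashing a buffer with OpenSSL's documented EVP interface (illustration only, not an rsync patch; build with -lcrypto):

#include <stdio.h>
#include <openssl/evp.h>

/* Hash a buffer with whatever MD5 implementation OpenSSL selects for the
 * current platform. Returns 1 on success, 0 on failure. */
static int md5_digest(const void *data, size_t len, unsigned char out[16])
{
	EVP_MD_CTX *ctx = EVP_MD_CTX_new();
	unsigned int out_len = 0;
	int ok = ctx != NULL
	    && EVP_DigestInit_ex(ctx, EVP_md5(), NULL)
	    && EVP_DigestUpdate(ctx, data, len)
	    && EVP_DigestFinal_ex(ctx, out, &out_len);
	EVP_MD_CTX_free(ctx);
	return ok && out_len == 16;
}

int main(void)
{
	unsigned char out[16];
	int i;
	if (!md5_digest("abc", 3, out))
		return 1;
	for (i = 0; i < 16; i++)
		printf("%02x", out[i]);
	printf("\n");   /* prints 900150983cd24fb0d6963f7d28e17f72 */
	return 0;
}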