lvqcl
2013-Aug-22 18:24 UTC
[flac-dev] New routine: FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
libFLAC have three SSE-accelerated functions FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_N (N = 4, 8, 12). They require lpc_order less than N. The best compression preset (flac -8) uses lpc_order up to 12; it means that during encoding FLAC also uses unaccelerated C function. I'm not very familiar with asm so I took FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12, changed it and wrote function ..._sse_lag_16. According to my tests 'flac -8' encoding became 5% faster. The output files are identical to the output of 64-bit FLAC. But as I said I'm not familiar with assembler so please check it. --- a\src\libFLAC\include\private\lpc.h 2013-08-13 13:30:24.000000000 +0400 +++ b\src\libFLAC\include\private\lpc.h 2013-08-22 20:11:17.524302900 +0400 @@ -75,6 +75,7 @@ void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); void FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); # endif # endif --- a\src\libFLAC\stream_encoder.c 2013-08-13 13:30:24.000000000 +0400 +++ b\src\libFLAC\stream_encoder.c 2013-08-22 20:14:38.889820300 +0400 @@ -895,6 +895,8 @@ encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8; else if(encoder->protected_->max_lpc_order < 12) encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12; + else if(encoder->protected_->max_lpc_order < 16) + 
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16; else encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32; } --- a\src\libFLAC\ia32\lpc_asm.nasm 2013-08-13 13:30:24.000000000 +0400 +++ b\src\libFLAC\ia32\lpc_asm.nasm 2013-08-22 13:06:46.497389100 +0400 @@ -39,6 +39,7 @@ cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12 +cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx @@ -596,7 +597,7 @@ movss xmm3, xmm2 movss xmm2, xmm0 - ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm3:xmm3:xmm2 + ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2 movaps xmm1, xmm0 mulps xmm1, xmm2 addps xmm5, xmm1 @@ -619,6 +620,95 @@ ret ALIGN 16 +cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16 + ;[ebp + 20] == autoc[] + ;[ebp + 16] == lag + ;[ebp + 12] == data_len + ;[ebp + 8] == data[] + ;[esp] == __m128 + ;[esp + 16] == __m128 + + push ebp + mov ebp, esp + and esp, -16 ; stack realign for SSE instructions 'movaps' and 'addps' + sub esp, 32 + + ;ASSERT(lag > 0) + ;ASSERT(lag <= 12) + ;ASSERT(lag <= data_len) + ;ASSERT(data_len > 0) + + ; for(coeff = 0; coeff < lag; coeff++) + ; autoc[coeff] = 0.0; + xorps xmm5, xmm5 + xorps xmm6, xmm6 + movaps [esp], xmm5 + movaps [esp + 16], xmm6 + + mov edx, [ebp + 12] ; edx == data_len + mov eax, [ebp + 8] ; eax == &data[sample] <- &data[0] + + movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] + add eax, 4 + movaps xmm1, xmm0 ; xmm1 = 0,0,0,data[0] + shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] + xorps xmm2, xmm2 ; xmm2 = 0,0,0,0 + xorps xmm3, xmm3 
; xmm3 = 0,0,0,0 + xorps xmm4, xmm4 ; xmm4 = 0,0,0,0 + movaps xmm7, xmm0 + mulps xmm7, xmm1 + addps xmm5, xmm7 + dec edx + jz .loop_end + ALIGN 16 +.loop_start: + ; start by reading the next sample + movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] + add eax, 4 + shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample] + + ; shift xmm4:xmm3:xmm2:xmm1 left by one float + shufps xmm1, xmm1, 93h + shufps xmm2, xmm2, 93h + shufps xmm3, xmm3, 93h + shufps xmm4, xmm4, 93h + movss xmm4, xmm3 + movss xmm3, xmm2 + movss xmm2, xmm1 + movss xmm1, xmm0 + + ; xmmB:xmmA:xmm6:xmm5 += xmm0:xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2:xmm1 + movaps xmm7, xmm0 + mulps xmm7, xmm1 + addps xmm5, xmm7 + movaps xmm7, xmm0 + mulps xmm7, xmm2 + addps xmm6, xmm7 + movaps xmm7, xmm0 + mulps xmm7, xmm3 + mulps xmm0, xmm4 + addps xmm7, [esp] + addps xmm0, [esp + 16] + movaps [esp], xmm7 + movaps [esp + 16], xmm0 + + dec edx + jnz .loop_start +.loop_end: + ; store autoc + mov edx, [ebp + 20] ; edx == autoc + movups [edx], xmm5 + movups [edx + 16], xmm6 + movaps xmm5, [esp] + movaps xmm6, [esp + 16] + movups [edx + 32], xmm5 + movups [edx + 48], xmm6 +.end: + mov esp, ebp + pop ebp + ret + + ALIGN 16 cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow ;[ebp + 32] autoc ;[ebp + 28] lag
Erik de Castro Lopo
2013-Aug-26 11:44 UTC
[flac-dev] New routine: FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
lvqcl wrote:
> libFLAC have three SSE-accelerated functions
> FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_N (N = 4, 8, 12).
> They require lpc_order less than N.
> The best compression preset (flac -8) uses lpc_order up to 12; it means
> that during encoding FLAC also uses unaccelerated C function.
>
> I'm not very familiar with asm so I took
> FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12, changed it and
> wrote function ..._sse_lag_16. According to my tests 'flac -8' encoding
> became 5% faster. The output files are identical to the output of 64-bit FLAC.
>
> But as I said I'm not familiar with assembler so please check it.

Unfortunately this patch doesn't apply. How was it generated? Would it be possible to send a patch generated by Git, because those usually do work?

Erik
--
----------------------------------------------------------------------
Erik de Castro Lopo
http://www.mega-nerd.com/
lvqcl
2013-Aug-26 14:38 UTC
[flac-dev] New routine: FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
Erik de Castro Lopo <mle+la at mega-nerd.com> wrote:
> Unfortunately this patch doesn't apply. How was it generated? Would it be
> possible to send a patch generated by Git because those usually do work.
>
> Erik

I'm not very familiar with git... I hope that I have done everything correctly this time. The patch is in the attachment.

-------------- next part --------------
A non-text attachment was scrubbed...
Name: autocorr_16.patch
Type: application/octet-stream
Size: 5155 bytes
Desc: not available
Url : http://lists.xiph.org/pipermail/flac-dev/attachments/20130826/d9b3cfdd/attachment.obj
Possibly Parallel Threads
- New routine: FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
- New routine: FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
- New routine: FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
- Git branch with compiling fixes for win32
- [LLVMdev] LLVM 3.0 oddity