Hello, As we discussed with derf some time ago, it seems it is not necessary to enforce the "forward" order of dct_coeffs: the token expanders can store the coefficients in zig-zag order, and the reconstruction step can apply OC_FZIG_ZAG while dequantizing. This patch gains 0.99366902855226196000%, i.e. approximately a 1% speedup. Measurement method: time nice -n -19 ./dump /mnt/disc4/theora/unix/gripen.ogg > /dev/null Ogg logical stream 310b2968 is Theora 720x480 29.97 fps video Encoded frame content is 720x480 with 0x0 offset 12460 frames This patch is in the state in which I had it working; I just want to submit it before I forget about it :) Derf, please consider applying this to SVN in some form. Thanks, Regards Rudolf -------------- next part -------------- diff -Naur ../mergeSTATE/test/lib/decode.c test/lib/decode.c --- ../mergeSTATE/test/lib/decode.c 2005-08-17 09:58:23.000000000 +0200 +++ test/lib/decode.c 2005-08-20 11:19:04.052143250 +0200 @@ -1083,53 +1083,53 @@ to. This is updated before the function returns.*/ typedef void (*oc_token_expand_func)(int _token,int _extra_bits, - ogg_int16_t _dct_coeffs[64],int *_zzi); + ogg_int16_t _dct_coeffs[128],int *_zzi); /*Expands a zero run token.*/ void oc_token_expand_zrl(int _token,int _extra_bits, - ogg_int16_t _dct_coeffs[64],int *_zzi){ + ogg_int16_t _dct_coeffs[128],int *_zzi){ int zzi; zzi=*_zzi; - do _dct_coeffs[OC_FZIG_ZAG[zzi++]]=0; + do _dct_coeffs[zzi++]=0; while(_extra_bits-->0); *_zzi=zzi; } /*Expands a constant, single-value token.*/ void oc_token_expand_const(int _token,int _extra_bits, - ogg_int16_t _dct_coeffs[64],int *_zzi){ - _dct_coeffs[OC_FZIG_ZAG[(*_zzi)++]]= + ogg_int16_t _dct_coeffs[128],int *_zzi){ + _dct_coeffs[(*_zzi)++]= (ogg_int16_t)oc_token_dec1val_const(_token); } /*Expands category 2 single-valued tokens.*/ void oc_token_expand_cat2(int _token,int _extra_bits, - ogg_int16_t _dct_coeffs[64],int *_zzi){ - _dct_coeffs[OC_FZIG_ZAG[(*_zzi)++]]= + ogg_int16_t _dct_coeffs[128],int *_zzi){ + _dct_coeffs[(*_zzi)++]= (ogg_int16_t)oc_token_dec1val_cat2(_token,_extra_bits); } /*Expands category 3 through 8 single-valued tokens.*/ void 
oc_token_expand_cati(int _token,int _extra_bits, - ogg_int16_t _dct_coeffs[64],int *_zzi){ - _dct_coeffs[OC_FZIG_ZAG[(*_zzi)++]]= + ogg_int16_t _dct_coeffs[128],int *_zzi){ + _dct_coeffs[(*_zzi)++]= (ogg_int16_t)oc_token_dec1val_cati(_token,_extra_bits); } /*Expands a category 1a zero run/value combo token.*/ void oc_token_expand_run_cat1a(int _token,int _extra_bits, - ogg_int16_t _dct_coeffs[64],int *_zzi){ + ogg_int16_t _dct_coeffs[128],int *_zzi){ int zzi; int rl; zzi=*_zzi; - for(rl=_token-OC_DCT_RUN_CAT1A+1;rl-->0;)_dct_coeffs[OC_FZIG_ZAG[zzi++]]=0; - _dct_coeffs[OC_FZIG_ZAG[zzi++]]=(ogg_int16_t)(1-(_extra_bits<<1)); + for(rl=_token-OC_DCT_RUN_CAT1A+1;rl-->0;)_dct_coeffs[zzi++]=0; + _dct_coeffs[zzi++]=(ogg_int16_t)(1-(_extra_bits<<1)); *_zzi=zzi; } /*Expands all other zero run/value combo tokens.*/ void oc_token_expand_run(int _token,int _extra_bits, - ogg_int16_t _dct_coeffs[64],int *_zzi){ + ogg_int16_t _dct_coeffs[128],int *_zzi){ static const int NZEROS_ADJUST[OC_NDCT_RUN_MAX-OC_DCT_RUN_CAT1B]={ 6,10,1,2 }; @@ -1154,11 +1154,11 @@ _token-=OC_DCT_RUN_CAT1B; rl=(_extra_bits&NZEROS_MASK[_token])+NZEROS_ADJUST[_token]; zzi=*_zzi; - while(rl-->0)_dct_coeffs[OC_FZIG_ZAG[zzi++]]=0; + while(rl-->0)_dct_coeffs[zzi++]=0; valsigned[0]=VALUE_ADJUST[_token]+ (_extra_bits>>VALUE_SHIFT[_token]&VALUE_MASK[_token]); valsigned[1]=-valsigned[0]; - _dct_coeffs[OC_FZIG_ZAG[zzi++]]=(ogg_int16_t)valsigned[ + _dct_coeffs[zzi++]=(ogg_int16_t)valsigned[ _extra_bits>>SIGN_SHIFT[_token]]; *_zzi=zzi; } diff -Naur ../mergeSTATE/test/lib/internal.h test/lib/internal.h --- ../mergeSTATE/test/lib/internal.h 2005-08-17 10:05:34.000000000 +0200 +++ test/lib/internal.h 2005-08-20 11:39:38.797310000 +0200 @@ -239,7 +239,7 @@ void (*state_frag_copy)(const oc_theora_state *_state, const int *_fragis,int _nfragis,int _dst_frame,int _src_frame,int _pli); void (*state_frag_recon)(oc_theora_state *_state,const oc_fragment *_frag, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs, + int 
_pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs, ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]); void (*restore_fpu)(void); void (*oc_state_loop_filter_frag_rows)(oc_theora_state *_state,int *_bv, @@ -409,7 +409,7 @@ void oc_state_frag_copy(const oc_theora_state *_state,const int *_fragis, int _nfragis,int _dst_frame,int _src_frame,int _pli); void oc_state_frag_recon(oc_theora_state *_state,const oc_fragment *_frag, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs, + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs, ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]); void oc_restore_fpu(const oc_theora_state *_state); void oc_state_loop_filter_frag_rows(oc_theora_state *_state,int *_bv, @@ -426,7 +426,7 @@ void oc_state_frag_copy_c(const oc_theora_state *_state,const int *_fragis, int _nfragis,int _dst_frame,int _src_frame,int _pli); void oc_state_frag_recon_c(oc_theora_state *_state,const oc_fragment *_frag, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs, + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs, ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]); void oc_restore_fpu_c(void); void oc_state_loop_filter_frag_rows_c(oc_theora_state *_state,int *_bv, diff -Naur ../mergeSTATE/test/lib/state.c test/lib/state.c --- ../mergeSTATE/test/lib/state.c 2005-08-17 10:29:23.797763500 +0200 +++ test/lib/state.c 2005-08-20 11:22:10.211777500 +0200 @@ -788,14 +788,14 @@ } void oc_state_frag_recon(oc_theora_state *_state,const oc_fragment *_frag, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs, + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs, ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){ _state->opt_vtable.state_frag_recon(_state,_frag,_pli,_dct_coeffs, _last_zzi,_ncoefs,_dc_iquant,_ac_iquant); } void oc_state_frag_recon_c(oc_theora_state *_state,const oc_fragment *_frag, - int _pli,ogg_int16_t _dct_coeffs[64],int 
_last_zzi,int _ncoefs, + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs, ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){ ogg_int16_t dct_buf[64]; ogg_int16_t res_buf[64]; @@ -841,7 +841,7 @@ for(zzi=1;zzi<_ncoefs;zzi++){ int ci; ci=OC_FZIG_ZAG[zzi]; - dct_buf[ci]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[ci]*_ac_iquant[ci]); + dct_buf[ci]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*_ac_iquant[ci]); } /*Then, fill in the remainder of the coefficients with 0's, and perform the iDCT.*/ diff -Naur ../mergeSTATE/test/lib/x86/mmxstate.c test/lib/x86/mmxstate.c --- ../mergeSTATE/test/lib/x86/mmxstate.c 2005-08-17 21:03:14.000000000 +0200 +++ test/lib/x86/mmxstate.c 2005-08-20 11:39:04.899191500 +0200 @@ -29,7 +29,7 @@ void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs, + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs, ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){ ogg_int16_t __attribute__((aligned(8))) res_buf[64]; int dst_framei; @@ -131,7 +131,7 @@ for(zzi=1;zzi<_ncoefs;zzi++){ int ci; ci=OC_FZIG_ZAG[zzi]; - res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[ci]* + res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]* _ac_iquant[ci]); } if(_last_zzi<10){ diff -Naur ../mergeSTATE/test/lib/x86/x86int.h test/lib/x86/x86int.h --- ../mergeSTATE/test/lib/x86/x86int.h 2005-08-17 10:11:36.000000000 +0200 +++ test/lib/x86/x86int.h 2005-08-20 11:38:53.890503500 +0200 @@ -14,7 +14,7 @@ void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis, int _nfragis,int _dst_frame,int _src_frame,int _pli); void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs, + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs, ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]); void 
oc_restore_fpu_mmx(void); void oc_idct8x8_mmx(ogg_int16_t _y[64]);