search for: _mm_set1_epi32

Displaying 9 results from an estimated 9 matches for "_mm_set1_epi32".

2014 Oct 13
2
[LLVMdev] Unexpected spilling of vector register during lane extraction on some x86_64 targets
...int32_t(v[1] >> 32); } assert(0); return -1; } #endif __m128 x[] = { (__m128){ .123f, .999f, .123f, .999f } }; __m128 r[1]; static const float table[3] = { 1.0, 2.0, 4.0, }; static __m128 testee( const __m128 x) { const __m128i iexp = _mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x), 23), _mm_set1_epi32(127)); const __m128 s = _mm_or_ps( _mm_andnot_ps(_mm_castsi128_ps(_mm_set1_epi32(0xff << 23)), x), _mm_castsi128_ps(_mm_set1_epi32(0x7f << 23))); const __m128 exp = _mm_cvtepi32_ps(iexp); const __m128i quot = _mm_cvttps_epi32(_mm_div_ps(exp, _mm_set1_ps(3.f))); const __m1...
2016 May 31
2
[PATCH 1/2] Modify autoconf tests for intrinsics to stop clang from optimizing them away.
...[OPUS_X86_MAY_HAVE_SSE2], [OPUS_X86_PRESUME_SSE2], [[#include <emmintrin.h> + #include <time.h> ]], [[ - static __m128i mtest; - mtest = _mm_setzero_si128(); + __m128i mtest; + mtest = _mm_set1_epi32((int)time(NULL)); + mtest = _mm_mul_epu32(mtest, mtest); + return _mm_cvtsi128_si32(mtest); ]] ) AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1" && test x"$OPUS_X86_PRESUME_SSE2" != x"1"], @@ -557,11 +564,13...
2020 May 18
6
[PATCH] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
...1 + __m128i mul_one = _mm_set1_epi8(1); + __m128i add16_1 = sse_maddubs_epi16(mul_one, in8_1); + __m128i add16_2 = sse_maddubs_epi16(mul_one, in8_2); + + // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8] + __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24)); + __m128i mul_add16_1 = sse_maddubs_epi16(mul_const, in8_1); + __m128i mul_add16_2 = sse_maddubs_epi16(mul_const, in8_2); + + // s2 += 32*s1 + ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5)); + +...
2020 May 18
0
[PATCH] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
...e = _mm_set1_epi8(1); > + __m128i add16_1 = sse_maddubs_epi16(mul_one, in8_1); > + __m128i add16_2 = sse_maddubs_epi16(mul_one, in8_2); > + > + // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... > 2*[int16*8] > + __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << > 16) + (1 << 24)); > + __m128i mul_add16_1 = sse_maddubs_epi16(mul_const, in8_1); > + __m128i mul_add16_2 = sse_maddubs_epi16(mul_const, in8_2); > + > + // s2 += 32*s1 > + ss2 = _mm_add_epi32(ss2,...
2020 May 19
5
[PATCHv2] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
...1 + __m128i mul_one = _mm_set1_epi8(1); + __m128i add16_1 = sse_maddubs_epi16(mul_one, in8_1); + __m128i add16_2 = sse_maddubs_epi16(mul_one, in8_2); + + // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8] + __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24)); + __m128i mul_add16_1 = sse_maddubs_epi16(mul_const, in8_1); + __m128i mul_add16_2 = sse_maddubs_epi16(mul_const, in8_2); + + // s2 += 32*s1 + ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5)); + +...
2020 May 18
2
[PATCH] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
...i8(1); >> + __m128i add16_1 = sse_maddubs_epi16(mul_one, in8_1); >> + __m128i add16_2 = sse_maddubs_epi16(mul_one, in8_2); >> + >> + // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8] >> + __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << >> 16) + (1 << 24)); >> + __m128i mul_add16_1 = sse_maddubs_epi16(mul_const, in8_1); >> + __m128i mul_add16_2 = sse_maddubs_epi16(mul_const, in8_2); >> + >> + // s2 += 32*s1 >> + s...
2020 May 20
0
[PATCHv2] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
...ul_one = _mm_set1_epi8(1); > + __m128i add16_1 = sse_maddubs_epi16(mul_one, in8_1); > + __m128i add16_2 = sse_maddubs_epi16(mul_one, in8_2); > + > + // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8] > + __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << > 16) + (1 << 24)); > + __m128i mul_add16_1 = sse_maddubs_epi16(mul_const, in8_1); > + __m128i mul_add16_2 = sse_maddubs_epi16(mul_const, in8_2); > + > + // s2 += 32*s1 > + ss2 = _mm_add_epi32(ss2,...
2020 May 18
3
[PATCH] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
What do you base this on? Per https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html : "For the x86-32 compiler, you must use -march=cpu-type, -msse or -msse2 switches to enable SSE extensions and make this option effective. For the x86-64 compiler, these extensions are enabled by default." That reads to me like we're fine for SSE2. As stated in my comments, SSSE3 support must be...
2009 Jan 31
2
[LLVMdev] Optimized code analysis problems
...i32_si128) #include <pmmintrin.h> #include<sys/time.h> #include<iostream> void foo_opt(unsigned char output[64], int Yc[64], int S_BITS) { __m128i XMM1, XMM2, XMM3, XMM4; __m128i *xmm1 = (__m128i*)Yc; __m128i XMM5 = _mm_cvtsi32_si128(S_BITS + 3) ; XMM2 = _mm_set1_epi32(S_BITS + 2); for (int l = 0; l < 8; l++) { XMM1 = _mm_loadu_si128(xmm1++); XMM3 = _mm_add_epi32(XMM1, XMM2); XMM1 = _mm_cmplt_epi32(XMM1, _mm_setzero_si128()); XMM1 = _mm_srli_epi32(XMM1, 31); XMM3 = _mm_sub_epi32(XMM3, XMM1); XMM3 = _mm...