Displaying 9 results from an estimated 9 matches for "_mm_set1_epi32".
2014 Oct 13
2
[LLVMdev] Unexpected spilling of vector register during lane extraction on some x86_64 targets
...int32_t(v[1] >> 32);
}
assert(0);
return -1;
}
#endif
__m128 x[] = { (__m128){ .123f, .999f, .123f, .999f } };
__m128 r[1];
static const float table[3] = {
1.0,
2.0,
4.0,
};
static __m128 testee(
const __m128 x)
{
const __m128i iexp = _mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x), 23),
_mm_set1_epi32(127));
const __m128 s = _mm_or_ps(
_mm_andnot_ps(_mm_castsi128_ps(_mm_set1_epi32(0xff << 23)), x),
_mm_castsi128_ps(_mm_set1_epi32(0x7f << 23)));
const __m128 exp = _mm_cvtepi32_ps(iexp);
const __m128i quot = _mm_cvttps_epi32(_mm_div_ps(exp, _mm_set1_ps(3.f)));
const __m1...
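The snippet above splits x into exponent and significand with pure integer ops: shift the raw bits right by 23 and subtract the bias 127 to get the exponent, then mask out the exponent field and OR in the bias so the significand lands in [1, 2). A minimal scalar sketch of the same split, assuming IEEE-754 single precision and positive, normal inputs (the helper name split_float is hypothetical, not from the post):

#include <stdint.h>
#include <string.h>

/* Decompose x = s * 2^e with s in [1, 2): one lane of what the
 * _mm_srli_epi32 / _mm_andnot_ps / _mm_or_ps sequence above does. */
static float split_float(float x, int32_t *e)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);      /* bit-level view without UB */
    *e = (int32_t)(bits >> 23) - 127;    /* unbiased exponent */
    bits &= ~(0xffu << 23);              /* clear the exponent field */
    bits |= 0x7fu << 23;                 /* re-bias: value now in [1, 2) */
    memcpy(&x, &bits, sizeof x);
    return x;
}

Like the vector code, this ignores the sign bit, denormals, and NaN/Inf.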
2016 May 31
2
[PATCH 1/2] Modify autoconf tests for intrinsics to stop clang from optimizing them away.
...[OPUS_X86_MAY_HAVE_SSE2],
[OPUS_X86_PRESUME_SSE2],
[[#include <emmintrin.h>
+ #include <time.h>
]],
[[
- static __m128i mtest;
- mtest = _mm_setzero_si128();
+ __m128i mtest;
+ mtest = _mm_set1_epi32((int)time(NULL));
+ mtest = _mm_mul_epu32(mtest, mtest);
+ return _mm_cvtsi128_si32(mtest);
]]
)
AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1" && test x"$OPUS_X86_PRESUME_SSE2" != x"1"],
@@ -557,11 +564,13...
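The point of the change above is that time(NULL) is opaque to the compiler, so clang cannot constant-fold the intrinsics away, and returning the result keeps the whole computation live. As a standalone program, the probe from the patch reads:

#include <emmintrin.h>
#include <time.h>

int main(void)
{
    /* A runtime input plus a used result defeats constant folding and
       dead-code elimination, so the SSE2 ops must actually compile and link. */
    __m128i mtest = _mm_set1_epi32((int)time(NULL));
    mtest = _mm_mul_epu32(mtest, mtest);
    return _mm_cvtsi128_si32(mtest);
}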
2020 May 18
6
[PATCH] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
...1
+ __m128i mul_one = _mm_set1_epi8(1);
+ __m128i add16_1 = sse_maddubs_epi16(mul_one, in8_1);
+ __m128i add16_2 = sse_maddubs_epi16(mul_one, in8_2);
+
+ // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
+ __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
+ __m128i mul_add16_1 = sse_maddubs_epi16(mul_const, in8_1);
+ __m128i mul_add16_2 = sse_maddubs_epi16(mul_const, in8_2);
+
+ // s2 += 32*s1
+ ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));
+
+...
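The core trick in the patch is the maddubs operation: it multiplies unsigned bytes from one operand by signed bytes from the other and adds adjacent products into sixteen-bit lanes, so a repeating 4,3,2,1 coefficient vector yields exactly the weighted byte sums the rolling checksum needs. A minimal sketch using the SSSE3 intrinsic _mm_maddubs_epi16 directly rather than the patch's sse_maddubs_epi16 wrapper (which presumably provides an SSE2 fallback), with small byte values to sidestep the signed-operand caveat:

#include <tmmintrin.h>  /* SSSE3 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t buf[16];
    int16_t out[8];
    for (int i = 0; i < 16; i++) buf[i] = (uint8_t)(i + 1);

    __m128i in8 = _mm_loadu_si128((const __m128i *)buf);

    /* Coefficient bytes 4,3,2,1 repeated; each int16 lane becomes
       4*buf[i] + 3*buf[i+1] or 2*buf[i+2] + buf[i+3]. */
    __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
    __m128i weighted  = _mm_maddubs_epi16(mul_const, in8);

    _mm_storeu_si128((__m128i *)out, weighted);
    for (int i = 0; i < 8; i++)
        printf("%d ", out[i]);   /* 10 10 38 22 66 34 94 46 */
    printf("\n");
    return 0;
}

Build with -mssse3; whether SSSE3 may be assumed at all is exactly what the follow-up discussion below is about.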
2020 May 18
0
[PATCH] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
...e = _mm_set1_epi8(1);
> + __m128i add16_1 = sse_maddubs_epi16(mul_one, in8_1);
> + __m128i add16_2 = sse_maddubs_epi16(mul_one, in8_2);
> +
> + // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
> + __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
> + __m128i mul_add16_1 = sse_maddubs_epi16(mul_const, in8_1);
> + __m128i mul_add16_2 = sse_maddubs_epi16(mul_const, in8_2);
> +
> + // s2 += 32*s1
> + ss2 = _mm_add_epi32(ss2,...
2020 May 19
5
[PATCHv2] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
...1
+ __m128i mul_one = _mm_set1_epi8(1);
+ __m128i add16_1 = sse_maddubs_epi16(mul_one, in8_1);
+ __m128i add16_2 = sse_maddubs_epi16(mul_one, in8_2);
+
+ // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
+ __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
+ __m128i mul_add16_1 = sse_maddubs_epi16(mul_const, in8_1);
+ __m128i mul_add16_2 = sse_maddubs_epi16(mul_const, in8_2);
+
+ // s2 += 32*s1
+ ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));
+
+...
2020 May 18
2
[PATCH] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
...i8(1);
>> + __m128i add16_1 = sse_maddubs_epi16(mul_one, in8_1);
>> + __m128i add16_2 = sse_maddubs_epi16(mul_one, in8_2);
>> +
>> + // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
>> + __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
>> + __m128i mul_add16_1 = sse_maddubs_epi16(mul_const, in8_1);
>> + __m128i mul_add16_2 = sse_maddubs_epi16(mul_const, in8_2);
>> +
>> + // s2 += 32*s1
>> + s...
2020 May 20
0
[PATCHv2] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
...ul_one = _mm_set1_epi8(1);
> + __m128i add16_1 = sse_maddubs_epi16(mul_one, in8_1);
> + __m128i add16_2 = sse_maddubs_epi16(mul_one, in8_2);
> +
> + // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
> + __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
> + __m128i mul_add16_1 = sse_maddubs_epi16(mul_const, in8_1);
> + __m128i mul_add16_2 = sse_maddubs_epi16(mul_const, in8_2);
> +
> + // s2 += 32*s1
> + ss2 = _mm_add_epi32(ss2,...
2020 May 18
3
[PATCH] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
What do you base this on?
Per https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html :
"For the x86-32 compiler, you must use -march=cpu-type, -msse or
-msse2 switches to enable SSE extensions and make this option
effective. For the x86-64 compiler, these extensions are enabled by
default."
That reads to me like we're fine for SSE2. As stated in my comments,
SSSE3 support must be
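Since the x86-64 ABI guarantees SSE2 but not SSSE3, the usual pattern is a runtime check before taking the pmaddubsw path. A minimal sketch using GCC/clang's __builtin_cpu_supports (an assumption for illustration, not necessarily how the patch itself dispatches):

#include <stdio.h>

int main(void)
{
    /* SSE2 is baseline on x86-64; SSSE3 must be detected at runtime
       (or guaranteed at build time with -mssse3 / -march=...). */
    if (__builtin_cpu_supports("ssse3"))
        puts("SSSE3 available: take the maddubs path");
    else
        puts("SSE2 only: take the fallback path");
    return 0;
}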
2009 Jan 31
2
[LLVMdev] Optimized code analysis problems
...i32_si128)
#include <pmmintrin.h>
#include <sys/time.h>
#include <iostream>
void foo_opt(unsigned char output[64], int Yc[64], int S_BITS)
{
__m128i XMM1, XMM2, XMM3, XMM4;
__m128i *xmm1 = (__m128i*)Yc;
__m128i XMM5 = _mm_cvtsi32_si128(S_BITS + 3);
XMM2 = _mm_set1_epi32(S_BITS + 2);
for (int l = 0; l < 8; l++) {
XMM1 = _mm_loadu_si128(xmm1++);
XMM3 = _mm_add_epi32(XMM1, XMM2);
XMM1 = _mm_cmplt_epi32(XMM1, _mm_setzero_si128());
XMM1 = _mm_srli_epi32(XMM1, 31);
XMM3 = _mm_sub_epi32(XMM3, XMM1);
XMM3 = _mm...
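The cmplt/srli pair in the loop is the classic branchless sign correction: _mm_cmplt_epi32 yields all-ones (-1) in lanes where Yc[i] < 0, the logical right shift by 31 reduces that mask to 0 or 1, and the subtraction applies a per-lane adjustment for negative inputs without a branch. The scalar equivalent, as a minimal sketch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int32_t y = -5;
    /* mask is 0xffffffff when y < 0, else 0, mirroring _mm_cmplt_epi32 */
    uint32_t mask = (uint32_t)-(int32_t)(y < 0);
    /* logical shift by 31 turns the mask into a 0/1 correction,
       mirroring _mm_srli_epi32(..., 31) */
    uint32_t correction = mask >> 31;
    printf("correction = %u\n", correction);  /* prints 1 */
    return 0;
}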