search for: __m128i

Displaying 20 results from an estimated 48 matches for "__m128i".

2020 May 19
5
[PATCHv2] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
...+ * detection of CPU capabilities.
+ */
+
+#ifdef __x86_64__
+#ifdef __cplusplus
+
+#include "rsync.h"
+
+#ifdef ENABLE_SSE2
+
+#include <immintrin.h>
+
+/* Compatibility functions to let our SSSE3 algorithm run on SSE2 */
+
+__attribute__ ((target ("sse2"))) static inline __m128i sse_load_si128(__m128i_u* buf) {
+  return _mm_loadu_si128(buf);
+}
+
+__attribute__ ((target ("ssse3"))) static inline __m128i sse_load_si128(__m128i_u* buf) {
+  return _mm_lddqu_si128(buf); // same as loadu on all but the oldest SSSE3 CPUs
+}
+
+__attribute__ ((target ("sse2...
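The patch leans on same-name functions tagged with __attribute__((target(...))). A minimal, self-contained sketch of that mechanism (GCC's function multiversioning, C++ only in GCC, resolved at load time via an ifunc on ELF targets; names here are hypothetical, not from the patch):

// Hedged sketch, assuming GCC (C++) or a recent clang on ELF x86-64.
// A "default" version is mandatory; the loader picks the best match.
__attribute__((target("default")))
int best_isa() { return 0; }

__attribute__((target("sse2")))
int best_isa() { return 2; }

__attribute__((target("ssse3")))
int best_isa() { return 3; }

int main() {
    return best_isa(); // 3 on SSSE3-capable CPUs, 2 on SSE2-only, else 0
}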
2020 May 20
0
[PATCHv2] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
..._64__
> +#ifdef __cplusplus
> +
> +#include "rsync.h"
> +
> +#ifdef ENABLE_SSE2
> +
> +#include <immintrin.h>
> +
> +/* Compatibility functions to let our SSSE3 algorithm run on SSE2 */
> +
> +__attribute__ ((target ("sse2"))) static inline __m128i
> sse_load_si128(__m128i_u* buf) {
> +  return _mm_loadu_si128(buf);
> +}
> +
> +__attribute__ ((target ("ssse3"))) static inline __m128i
> sse_load_si128(__m128i_u* buf) {
> +  return _mm_lddqu_si128(buf); // same as loadu on all but the
> oldest SSSE3 CPUs
...
2020 May 18
6
[PATCH] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
...configure CFLAGS="-mssse3 -O2"
+ */
+
+#ifdef __x86_64__
+#ifdef __SSE2__
+
+#include "rsync.h"
+
+#ifdef __SSSE3__
+#include <immintrin.h>
+#else
+#include <tmmintrin.h>
+#endif
+
+/* Compatibility functions to let our SSSE3 algorithm run on SSE2 */
+
+static inline __m128i sse_load_si128(void const* buf) {
+#ifdef __SSSE3__
+  return _mm_lddqu_si128(buf); // same as loadu on all but the oldest SSSE3 CPUs
+#else
+  return _mm_loadu_si128(buf);
+#endif
+}
+
+#ifndef __SSSE3__
+static inline __m128i sse_interleave_odd_epi16(__m128i a, __m128i b) {
+  return _mm_p...
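The SSE2 body of sse_interleave_odd_epi16() is cut off above. For illustration only, a standard SSE2 idiom that gathers the odd 16-bit lanes of two vectors looks like this (a guess at the shape, not the patch's verbatim code):

#include <emmintrin.h>

// Hedged reconstruction: an arithmetic right shift by 16 sign-extends
// each odd 16-bit lane into its 32-bit slot; _mm_packs_epi32 then
// narrows losslessly because every value already fits in int16.
static inline __m128i odd_lanes_epi16_sse2(__m128i a, __m128i b) {
    return _mm_packs_epi32(_mm_srai_epi32(a, 16), _mm_srai_epi32(b, 16));
}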
2020 May 18
3
[PATCH] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
What do you base this on? Per https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html : "For the x86-32 compiler, you must use -march=cpu-type, -msse or -msse2 switches to enable SSE extensions and make this option effective. For the x86-64 compiler, these extensions are enabled by default." That reads to me like we're fine for SSE2. As stated in my comments, SSSE3 support must be
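Since the disagreement turns on which macros the compiler predefines, a tiny compile-time probe makes the point (nothing here is from the patch; it only checks standard predefined macros):

#include <stdio.h>

int main(void) {
    /* On x86-64, SSE2 is part of the baseline ABI, so __SSE2__ is
     * predefined; __SSSE3__ appears only with -mssse3, -march=...,
     * or a target attribute. */
#ifdef __SSE2__
    puts("SSE2 enabled at compile time");
#endif
#ifdef __SSSE3__
    puts("SSSE3 enabled at compile time");
#endif
    return 0;
}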
2020 May 18
0
[PATCH] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
..._
> +#ifdef __SSE2__
> +
> +#include "rsync.h"
> +
> +#ifdef __SSSE3__
> +#include <immintrin.h>
> +#else
> +#include <tmmintrin.h>
> +#endif
> +
> +/* Compatibility functions to let our SSSE3 algorithm run on SSE2 */
> +
> +static inline __m128i sse_load_si128(void const* buf) {
> +#ifdef __SSSE3__
> +  return _mm_lddqu_si128(buf); // same as loadu on all but the
> oldest SSSE3 CPUs
> +#else
> +  return _mm_loadu_si128(buf);
> +#endif
> +}
> +
> +#ifndef __SSSE3__
> +static inline __m128i sse_interleave_o...
2020 May 18
2
[PATCH] SSE2/SSSE3 optimized version of get_checksum1() for x86-64
...clude "rsync.h" >> + >> +#ifdef __SSSE3__ >> +#include <immintrin.h> >> +#else >> +#include <tmmintrin.h> >> +#endif >> + >> +/* Compatibility functions to let our SSSE3 algorithm run on SSE2 */ >> + >> +static inline __m128i sse_load_si128(void const* buf) { >> +#ifdef __SSSE3__ >> + return _mm_lddqu_si128(buf); // same as loadu on all but the >> oldest SSSE3 CPUs >> +#else >> + return _mm_loadu_si128(buf); >> +#endif >> +} >> + >> +#ifndef __SSSE3__ >>...
2013 Nov 22
0
[LLVMdev] [clang] SSE2 intrinsics (emmintrin.h): _mm_movpi64_pi64 should be _mm_movpi64_epi64?
Hi there, I've recently encountered a piece of code that uses some SSE2 intrinsics and builds with gcc46, but not clang: clang can't find _mm_movpi64_epi64(), while gcc46 defines it in its lib/gcc46/gcc/.../4.6.3/include/emmintrin.h:

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64 (__m64 __A)
{
  return _mm_set_epi64 ((__m64)0LL, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64x (long long __q1, long long __q0)
{...
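A minimal repro might look like this (assumed, reconstructed from the description; _mm_movpi64_epi64 is the SSE2 intrinsic that zero-extends a 64-bit MMX value into an XMM register):

#include <emmintrin.h>

__m128i widen(__m64 a) {
    return _mm_movpi64_epi64(a); // gcc 4.6: OK; the clang of that era: undeclared
}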
2015 Mar 13
1
[RFC PATCH v3] Intrinsics/RTCD related fixes. Mostly x86.
...mathops.h" #include "pitch.h" -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) -#include <smmintrin.h> -#include "x86cpu.h" - -opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y, - int N) -{ - opus_int i, dataSize16; - opus_int32 sum; - __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; - __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; - __m128i inVec1_3210, inVec2_3210; - - sum = 0; - dataSize16 = N & ~15; - - acc1 = _mm_setzero_si128(); - acc2 = _mm_setzero_si128(); - - for (i=0;i<dataSize16;i+=16) { - in...
2015 Mar 12
1
[RFC PATCHv2] Intrinsics/RTCD related fixes. Mostly x86.
...mathops.h" #include "pitch.h" -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) -#include <smmintrin.h> -#include "x86cpu.h" - -opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y, - int N) -{ - opus_int i, dataSize16; - opus_int32 sum; - __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; - __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; - __m128i inVec1_3210, inVec2_3210; - - sum = 0; - dataSize16 = N & ~15; - - acc1 = _mm_setzero_si128(); - acc2 = _mm_setzero_si128(); - - for (i=0;i<dataSize16;i+=16) { - in...
2016 May 31
2
[PATCH 1/2] Modify autoconf tests for intrinsics to stop clang from optimizing them away.
..."1"], @@ -539,10 +543,13 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ [OPUS_X86_MAY_HAVE_SSE2], [OPUS_X86_PRESUME_SSE2], [[#include <emmintrin.h> + #include <time.h> ]], [[ - static __m128i mtest; - mtest = _mm_setzero_si128(); + __m128i mtest; + mtest = _mm_set1_epi32((int)time(NULL)); + mtest = _mm_mul_epu32(mtest, mtest); + return _mm_cvtsi128_si32(mtest); ]] ) AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE2...
2018 May 24
0
X86 Intrinsics : _mm_storel_epi64/ _mm_loadl_epi64 with -m32
Hi, I'm using _mm_storel_epi64/_mm_loadl_epi64 in my test case as below and generating 32-bit code (using -m32 and -msse4.2). The 64-bit load and 64-bit store operations are replaced with two 32-bit mov instructions, presumably due to the use of the uint64_t type. If I use __m128i instead of uint64_t everywhere, then the read and write happen as 64-bit operations using the xmm registers as expected.

void indvbl_write64(volatile void *p, uint64_t v)
{
    __m128i tmp = _mm_loadl_epi64((__m128i const *)&v);
    _mm_storel_epi64((__m128i *)p, tmp);
}

uint64_t indivb...
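For context, a complete pair in the same pattern might look like this (the read side is truncated above, so its body here is an assumption, and the names are mine):

#include <emmintrin.h>
#include <stdint.h>

void write64(volatile void *p, uint64_t v)
{
    __m128i tmp = _mm_loadl_epi64((__m128i const *)&v); // 64-bit load
    _mm_storel_epi64((__m128i *)p, tmp);                // 64-bit store
}

uint64_t read64(const volatile void *p)
{
    __m128i tmp = _mm_loadl_epi64((__m128i const *)p);
    uint64_t v;
    _mm_storel_epi64((__m128i *)&v, tmp);
    return v;
}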
2010 Jun 11
0
[LLVMdev] thinking about timing-test-driven scheduler
On Wed, 2010-06-09 at 17:30 +0200, orthochronous wrote:
> Hi,
>
> I've been thinking about how to implement a framework for attempting
> instruction scheduling of small blocks of code by using (GA/simulated
> annealing/etc) controlled timing-test-evaluations of various
> orderings.

This sounds interesting.

> (I'm particularly interested in small-ish numerical inner
2015 Mar 02
13
Patch cleaning up Opus x86 intrinsics configury
The attached patch cleans up Opus's x86 intrinsics configury. It:

* Makes --enable-intrinsics work with clang and other non-GCC compilers
* Enables RTCD for the floating-point-mode SSE code in Celt.
* Disables use of RTCD in cases where the compiler targets an instruction set by default.
* Enables the SSE4.1 Silk optimizations that apply to the common parts of Silk when Opus is built in
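RTCD (run-time CPU detection) boils down to selecting an implementation once, based on what the running CPU supports. A hedged sketch of the general pattern (hypothetical names, not Opus's actual RTCD tables):

#include <emmintrin.h>

typedef int (*sum_fn)(const short *x, int n);

static int sum_c(const short *x, int n) {
    int s = 0;
    for (int i = 0; i < n; i++) s += x[i];
    return s;
}

static int sum_sse2(const short *x, int n) {
    __m128i acc = _mm_setzero_si128();
    int i = 0;
    for (; i + 8 <= n; i += 8) {
        __m128i v = _mm_loadu_si128((const __m128i *)(x + i));
        // madd by 1 widens int16 pairs into four int32 partial sums
        acc = _mm_add_epi32(acc, _mm_madd_epi16(v, _mm_set1_epi16(1)));
    }
    // horizontal sum of the four 32-bit partials
    acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, 0x4e));
    acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, 0xb1));
    int s = _mm_cvtsi128_si32(acc);
    for (; i < n; i++) s += x[i]; // scalar tail
    return s;
}

// Selected once at startup; __builtin_cpu_supports is a GCC/clang builtin.
static sum_fn select_sum(void) {
    return __builtin_cpu_supports("sse2") ? sum_sse2 : sum_c;
}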
2010 Jun 09
2
[LLVMdev] thinking about timing-test-driven scheduler
Hi, I've been thinking about how to implement a framework for attempting instruction scheduling of small blocks of code by using (GA/simulated annealing/etc) controlled timing-test-evaluations of various orderings. (I'm particularly interested in small-ish numerical inner loop code in low-power CPUs like Atom and various ARMs where the CPU doesn't have the ability to
2014 Oct 13
2
[LLVMdev] Unexpected spilling of vector register during lane extraction on some x86_64 targets
...cked other targets). Here's a test case with spilling/no-spilling code put on conditional compile:

#if __SSE4_1__ != 0
#include <smmintrin.h>
#else
#include <emmintrin.h>
#endif
#include <stdint.h>
#include <assert.h>

#if SPILLING_ENSUES == 1
static int32_t geti(const __m128i v, const size_t i)
{
    switch (i) {
    case 0: return _mm_cvtsi128_si32(v);
    case 1: return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe5));
    case 2: return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe6));
    case 3: return _mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xe7));
    }
    assert(0);
    return -1;
}
#else
static i...
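The alternate branch is truncated above. For illustration, a direct SSE4.1 lane extractor, which avoids the shuffle-then-move pair of the SSE2 path, could read (my reconstruction, not necessarily the post's code):

#include <smmintrin.h>
#include <stdint.h>
#include <stddef.h>

static int32_t geti_sse41(const __m128i v, const size_t i)
{
    // PEXTRD extracts a lane directly; the index must be a constant,
    // hence the switch over literal cases.
    switch (i) {
    case 0: return _mm_extract_epi32(v, 0);
    case 1: return _mm_extract_epi32(v, 1);
    case 2: return _mm_extract_epi32(v, 2);
    case 3: return _mm_extract_epi32(v, 3);
    }
    return -1;
}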
2016 Dec 21
0
Correct way to pass int128 from LLVM to C++ function (MSVC)
...k others. =/

> I've also attempted to bit-cast i128's to <2 x i64> in LLVM. The ABI
> problems are pretty much the same. At a first glance, it seems to me like
> this problem is more general, namely: for all structures larger than 8
> bytes.

My tests show that we translate __m128i to <2 x i64>, and that we are ABI compatible with MSVC when passing __m128i values, so this should actually work. Behind the scenes LLVM will pass these values indirectly by pointer.
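Concretely, the claim is that code like this interoperates (a hedged example of mine, not from the thread):

#include <emmintrin.h>

// Under the MSVC x64 ABI, __m128i arguments are passed indirectly by
// pointer; the reply states LLVM lowers __m128i to <2 x i64> and does
// the same, so a JITed caller and this C++ callee agree.
extern "C" __m128i add64x2(__m128i a, __m128i b) {
    return _mm_add_epi64(a, b);
}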
2009 Jan 31
2
[LLVMdev] Optimized code analysis problems
...included the sample code below. I get the function call names as llvm.x86 something instead of getting function names (e.g. _mm_cvtsi32_si128).

#include <pmmintrin.h>
#include <sys/time.h>
#include <iostream>

void foo_opt(unsigned char output[64], int Yc[64], int S_BITS)
{
    __m128i XMM1, XMM2, XMM3, XMM4;
    __m128i *xmm1 = (__m128i*)Yc;
    __m128i XMM5 = _mm_cvtsi32_si128(S_BITS + 3);
    XMM2 = _mm_set1_epi32(S_BITS + 2);
    for (int l = 0; l < 8; l++) {
        XMM1 = _mm_loadu_si128(xmm1++);
        XMM3 = _mm_add_epi32(XMM1, XMM2);
        XMM1 = _mm_cmplt_...
2011 Oct 28
2
[LLVMdev] instcombine does silly things with vector x+x
....b), and the saturating 128-bit version (llvm.x86.sse2.padds.b). I would just give up and use inline assembly, but it seems I can't JIT that. I'm using the latest llvm 3.1 from svn. I get similar behavior at llvm.org/demo using the following equivalent C code:

#include <emmintrin.h>

__m128i f(__m128i a) {
    return _mm_add_epi8(a, a);
}

The no-optimization compilation of this is better than the optimized version. Any ideas? Should I just not use this pass? - Andrew
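For reference, the saturating variant the post names via its intrinsic ID (llvm.x86.sse2.padds.b) corresponds to this C (a companion example of mine, not from the post):

#include <emmintrin.h>

__m128i f_sat(__m128i a) {
    return _mm_adds_epi8(a, a); // signed saturating byte add: paddsb
}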
2019 Jun 10
2
[RFC] Expose user provided vector function for auto-vectorization.
...or the case mentioned earlier:

float MyAdd(float* a, int b) { return *a + b; }

__declspec(vector_variant(implements(MyAdd(float *a, int b)),
                          linear(a), vectorlength(8),
                          nomask, processor(core_2nd_gen_avx)))
__m256 __regcall MyAddVec(float* v_a, __m128i v_b1, __m128i v_b2)

If FE emitted

;; Alwaysinline
define <8 x float> @MyAddVec.abi_wrapper(float* %v_a, <8 x i32> %v_b) {
  ;; Not sure about the exact values in the mask parameter.
  %v_b1 = shufflevector <8 x i32> %v_b, <8 x i32> undef, <4 x i32><i32 0, i32 1, i...
2019 Jun 10
2
[RFC] Expose user provided vector function for auto-vectorization.
...ant:

> float MyAdd(float* a, int b) { return *a + b; }
> __declspec(vector_variant(implements(MyAdd(float *a, int b)),
>                           linear(a), vectorlength(8),
>                           nomask, processor(core_2nd_gen_avx)))
> __m256 __regcall MyAddVec(float* v_a, __m128i v_b1, __m128i v_b2)

We need to somehow communicate which lanes of the widened "b" would map to the b1 parameter and which would go to b2. If we only care about a single ABI (like the one mandated by OMP) then such things could be put into TTI, but what about other ABIs? Should we...