00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef __MMX_SSE_HPP
00014 #define __MMX_SSE_HPP
00015 #include <numerix/simd.hpp>
00016 #if defined (NUMERIX_ENABLE_SIMD) && defined (__SSE2__)
00017 #include <stdint.h>
00018 #include <emmintrin.h>
00019 #ifdef __SSE3__
00020 #include <pmmintrin.h>
00021 #endif
00022 #ifdef __SSSE3__
00023 #include <tmmintrin.h>
00024 #endif
00025 #ifdef __SSE4_1__
00026 #include <smmintrin.h>
00027 #endif
00028 #ifdef __SSE4_2__
00029 #include <smmintrin.h>
00030 #endif
00031
00032 #include <basix/compound.hpp>
00033 #include <basix/identifiers.hpp>
00034 #include <basix/syntactic.hpp>
00035 #include <numerix/complex.hpp>
00036
00037 namespace mmx {
00038
00039
00040
00041
00042
00043 template<typename C> inline typename Simd_type (C)
00044 simd_load_aligned (const C* v) {
00045 return (typename Simd_type (C))
00046 _mm_load_si128 ((const __m128i*) v); }
00047
00048 template<typename C> inline void
00049 simd_save_aligned (C* v, const typename Simd_type (C)& x) {
00050 _mm_store_si128 ((__m128i*) v, (const __m128i) x); }
00051
00052 template<typename C> inline void
00053 simd_save (C* v, const typename Simd_type(C)& x) {
00054 _mm_storeu_si128 ((__m128i*) v, (const __m128i) x); }
00055
00056 template<typename C> inline typename Simd_type (C)
00057 simd_load (const C* v0, const C* v1) {
00058 return simd_set (*v0, *v1); }
00059
00060 template<typename C> inline void
00061 simd_save (C* v0, C* v1, const typename Simd_type (C)& x) {
00062 static C v[Simd_size (C)];
00063 simd_save_aligned (v, x);
00064 *v0 = v[0]; *v1 = v[1]; }
00065
00066 template<typename C> inline typename Simd_type (C)
00067 simd_load (const C* v0, const C* v1, const C* v2, const C* v3) {
00068 return simd_set (*v0, *v1, *v2, *v3); }
00069
00070 template<typename C> inline void
00071 simd_save (C* v0, C* v1, C* v2, C* v3, const typename Simd_type (C)& x) {
00072 static C v[Simd_size (C)];
00073 simd_save_aligned (v, x);
00074 *v0 = v[0]; *v1 = v[1]; *v2 = v[2]; *v3 = v[3]; }
00075
00076 template<typename C> inline typename Simd_type (C)
00077 simd_load (const C* v0, const C* v1, const C* v2, const C* v3,
00078 const C* v4, const C* v5, const C* v6, const C* v7) {
00079 return simd_set (*v0, *v1, *v2, *v3, *v4, *v5, *v6, *v7); }
00080
00081 template<typename C> inline void
00082 simd_save (C* v0, C* v1, C* v2, C* v3, C* v4, C* v5, C* v6, C* v7,
00083 const typename Simd_type (C)& x) {
00084 static C v[Simd_size (C)];
00085 simd_save_aligned (v, x);
00086 *v0 = v[0]; *v1 = v[1]; *v2 = v[2]; *v3 = v[3];
00087 *v4 = v[4]; *v5 = v[5]; *v6 = v[6]; *v7 = v[7]; }
00088
00089 template<typename C> inline typename Simd_type (C)
00090 simd_load (const C* v0, const C* v1, const C* v2, const C* v3,
00091 const C* v4, const C* v5, const C* v6, const C* v7,
00092 const C* v8, const C* v9, const C* v10, const C* v11,
00093 const C* v12, const C* v13, const C* v14, const C* v15) {
00094 return simd_set (*v0, *v1, *v2, *v3, *v4, *v5, *v6, *v7,
00095 *v8, *v9, *v10, *v11, *v12, *v13, *v14, *v15); }
00096
00097 template<typename C> inline void
00098 simd_save (C* v0, C* v1, C* v2, C* v3, C* v4, C* v5, C* v6, C* v7,
00099 C* v8, C* v9, C* v10, C* v11, C* v12, C* v13, C* v14, C* v15,
00100 const typename Simd_type (C)& x) {
00101 static C v[Simd_size (C)];
00102 simd_save_aligned (v, x);
00103 *v0 = v[0]; *v1 = v[1]; *v2 = v[2]; *v3 = v[3];
00104 *v4 = v[4]; *v5 = v[5]; *v6 = v[6]; *v7 = v[7];
00105 *v8 = v[8]; *v9 = v[9]; *v10 = v[10]; *v11 = v[11];
00106 *v12 = v[12]; *v13 = v[13]; *v14 = v[14]; *v15 = v[15]; }
00107
00108 template<typename C> inline C
00109 simd_big_add (const typename Simd_type (C)& x) {
00110 C r = 0;
00111 for (nat i = 0; i < Simd_size (C); i++)
00112 r += ((C*) &x) [i];
00113 return r; }
00114
00115
00116
00117
00118
00119
00120
00121
00122 template<typename V> inline syntactic
00123 simd_flatten (const V& x) {
00124 typedef typename Simd_base_type(V) C;
00125 static const nat size = Simd_size(C);
00126 C* v = mmx_new<C> (size);
00127 simd_save_aligned (v, x);
00128 vector<syntactic> w = fill <syntactic> (size);;
00129 for (nat i = 0; i < size; i++)
00130 w[i] = flatten (v[i]);
00131 mmx_delete<C> (v, size);
00132 return apply (GEN_SQTUPLE, w);
00133 }
00134
// SIMD_SUGAR(C,V) defines the helpers shared by every vector type V with
// scalar element type C:
//   flatten  - syntactic form of V, via simd_flatten;
//   equal    - true iff x and y are bitwise identical.  _mm_cmpeq_epi32
//              yields all-ones 32-bit lanes where the operands match and
//              _mm_movemask_epi8 collects one bit per byte, so a full
//              match is the 16-bit mask 0xFFFF.  (The comparison is on
//              raw bits, not on numeric value.)
//   unequal  - negation of equal;
//   clear    - sets every lane to C(0);
//   mul      - x = y1 * broadcast(y2);
//   mul_add  - x += y1 * broadcast(y2).
#define SIMD_SUGAR(C,V)                                                 \
  inline syntactic flatten (const V& x) {                               \
    return simd_flatten (x); }                                          \
  inline bool equal (const V& x, const V& y) {                          \
    return _mm_movemask_epi8 (                                          \
      (__m128i) _mm_cmpeq_epi32 ((const __m128i) x,                     \
                                 (const __m128i) y)) == 0xFFFF; }       \
  inline bool unequal (const V& x, const V& y) {                        \
    return ! equal (x, y); }                                            \
  STMPL inline void clear (V& x) {                                      \
    x = simd_set_duplicate (C (0)); }                                   \
  STMPL inline void mul (V& x, const V& y1, const C& y2) {              \
    x = y1 * simd_set_duplicate (y2); }                                 \
  STMPL inline void mul_add (V& x, const V& y1, const C& y2) {          \
    x += y1 * simd_set_duplicate (y2); }
00150
00151
00152
00153
00154
00155 typedef double __attribute__((vector_size(16))) sse_double;
00156
00157 template<>
00158 struct is_simd_helper<sse_double> {
00159 static const bool val = true; };
00160
00161 template<>
00162 struct simd_helper<double> {
00163 typedef sse_double type;
00164 static const nat size = 2; };
00165
00166 template<>
00167 struct simd_base_helper<sse_double> {
00168 typedef double type; };
00169
00170 inline sse_double
00171 simd_load (const double* v) {
00172 return _mm_loadu_pd (v); }
00173
00174 inline sse_double
00175 simd_set_duplicate (const double& x) {
00176 return _mm_set1_pd (x); }
00177
00178 inline sse_double
00179 simd_set (const double& v0, const double& v1) {
00180 return _mm_set_pd (v1, v0); }
00181
#ifdef __SSE3__
// All-zero vector, used as the second operand of the horizontal add below.
static const sse_double sse_double_zero = simd_set_duplicate (0.0);

// Horizontal sum specialised for sse_double: _mm_hadd_pd places
// x[0] + x[1] in the low lane, which is then stored to a scalar.
STMPL inline double
simd_big_add (const sse_double& x) {
  const sse_double sums = _mm_hadd_pd (x, sse_double_zero);
  double total;
  _mm_storel_pd (&total, sums);
  return total;
}
#endif
00193
00194
00195 inline sse_double
00196 simd_equal (const sse_double& x, const sse_double& y) {
00197 return _mm_cmpeq_pd (x, y); }
00198
00199 inline sse_double
00200 simd_unequal (const sse_double& x, const sse_double& y) {
00201 return _mm_cmpneq_pd (x, y); }
00202
00203 inline sse_double
00204 simd_less (const sse_double& x, const sse_double& y) {
00205 return _mm_cmplt_pd (x, y); }
00206
00207 inline sse_double
00208 simd_gtr (const sse_double& x, const sse_double& y) {
00209 return _mm_cmpgt_pd (x, y); }
00210
00211 inline sse_double
00212 simd_lesseq (const sse_double& x, const sse_double& y) {
00213 return _mm_cmple_pd (x, y); }
00214
00215 inline sse_double
00216 simd_gtreq (const sse_double& x, const sse_double& y) {
00217 return _mm_cmpge_pd (x, y); }
00218
00219
00220 inline sse_double
00221 min (const sse_double& x, const sse_double& y) {
00222 return _mm_min_pd (x, y); }
00223
00224 inline sse_double
00225 max (const sse_double& x, const sse_double& y) {
00226 return _mm_max_pd (x, y); }
00227
00228 inline sse_double
00229 simd_shuffle (const sse_double& x, const sse_double& y, int i) {
00230 return _mm_shuffle_pd (x, y, i); }
00231
00232
00233 inline sse_double
00234 simd_unpacklo (const sse_double& x, const sse_double& y) {
00235 return _mm_unpacklo_pd (x, y); }
00236
00237 inline sse_double
00238 simd_unpackhi (const sse_double& x, const sse_double& y) {
00239 return _mm_unpackhi_pd (x, y); }
00240
00241
00242 inline sse_double
00243 simd_load_duplicate (const double* v) {
00244 return _mm_load1_pd (v); }
00245
00246 inline sse_double
00247 simd_load (const double* v0, const double* v1) {
00248 return _mm_loadh_pd (_mm_load1_pd (v0), v1); }
00249
00250 inline void
00251 simd_save (double* v0, double* v1, const sse_double& x) {
00252 _mm_storel_pd (v0, x); _mm_storeh_pd (v1, x); }
00253
00254 inline sse_double
00255 simd_swap (const sse_double& x) {
00256 return _mm_shuffle_pd (x, x, 1); }
00257
00258
00259 SIMD_SUGAR (double, sse_double)
00260
00261
00262
00263
00264
00265 typedef complex< double> complex_double;
00266 typedef complex<sse_double> sse_complex_double;
00267
00268 template<>
00269 struct simd_helper<complex_double> {
00270 typedef sse_complex_double type;
00271 static const nat size = 2; };
00272
00273 inline sse_complex_double
00274 simd_set_duplicate (const complex_double& z) {
00275 return sse_complex_double (simd_set_duplicate (Re (z)),
00276 simd_set_duplicate (Im (z))); }
00277 inline sse_complex_double
00278 simd_set (const complex_double& z0, const complex_double& z1) {
00279 return sse_complex_double (simd_set (Re (z0), Re (z1)),
00280 simd_set (Im (z0), Im (z1))); }
00281 inline sse_complex_double
00282 simd_load_duplicate (const complex_double* v) {
00283 const double* w= (double*) ((void*) v);
00284 return sse_complex_double (simd_load_duplicate (w),
00285 simd_load_duplicate (w + 1)); }
00286
00287 template<> inline syntactic
00288 flatten (const sse_complex_double& z) {
00289 return flatten (Re (z)) + flatten (Im (z)) * Imaginary (syntactic); }
00290
00291
00292
00293
00294
00295 typedef int64_t __attribute__((vector_size(16))) sse_int64_t;
00296
00297 template<>
00298 struct is_simd_helper<sse_int64_t> {
00299 static const bool val = true; };
00300
00301 template<>
00302 struct simd_helper<int64_t> {
00303 typedef sse_int64_t type;
00304 static const nat size = 2; };
00305
00306 template<>
00307 struct simd_base_helper<sse_int64_t> {
00308 typedef int64_t type; };
00309
00310 inline sse_int64_t
00311 simd_set_duplicate (const int64_t& x) {
00312 return sse_int64_t(_mm_set1_epi64 ((__m64) x)); }
00313
00314 inline sse_int64_t
00315 simd_set (const int64_t& x0, const int64_t& x1) {
00316 return sse_int64_t(_mm_set_epi64 ((__m64) x1, (__m64) x0)); }
00317
00318
// Lane-wise comparisons on sse_int64_t.  Each result lane is all-ones
// where the predicate holds and zero elsewhere.

#ifdef __SSE4_1__
// x == y (64-bit equality needs SSE4.1).
inline sse_int64_t
simd_equal (const sse_int64_t& x, const sse_int64_t& y) {
  const __m128i mask = _mm_cmpeq_epi64 ((__m128i) x, (__m128i) y);
  return (sse_int64_t) mask;
}
#endif

#ifdef __SSE4_2__
// Signed x > y (64-bit ordering needs SSE4.2).
inline sse_int64_t
simd_gtr (const sse_int64_t& x, const sse_int64_t& y) {
  const __m128i mask = _mm_cmpgt_epi64 ((__m128i) x, (__m128i) y);
  return (sse_int64_t) mask;
}

// Signed x < y, computed as y > x.
inline sse_int64_t
simd_less (const sse_int64_t& x, const sse_int64_t& y) {
  const __m128i mask = _mm_cmpgt_epi64 ((__m128i) y, (__m128i) x);
  return (sse_int64_t) mask;
}
#endif
00334
00335
00336 inline sse_int64_t
00337 simd_unpacklo (const sse_int64_t& x, const sse_int64_t& y) {
00338 return sse_int64_t(_mm_unpacklo_epi64 ((__m128i) x, (__m128i) y)); }
00339
00340 inline sse_int64_t
00341 simd_unpackhi (const sse_int64_t& x, const sse_int64_t& y) {
00342 return sse_int64_t(_mm_unpackhi_epi64 ((__m128i) x, (__m128i) y)); }
00343
00344
00345 inline sse_int64_t
00346 simd_sll (const sse_int64_t& x, int i) {
00347 return sse_int64_t(_mm_slli_epi64 ((__m128i) x, i)); }
00348
00349 inline sse_int64_t
00350 simd_srl (const sse_int64_t& x, int i) {
00351 return sse_int64_t(_mm_srli_epi64 ((__m128i) x, i)); }
00352
00353
00354 SIMD_SUGAR (int64_t, sse_int64_t)
00355
00356
00357
00358
00359
00360 typedef uint64_t __attribute__((vector_size(16))) sse_uint64_t;
00361
00362 template<>
00363 struct is_simd_helper<sse_uint64_t> {
00364 static const bool val = true; };
00365
00366 template<>
00367 struct simd_helper<uint64_t> {
00368 typedef sse_uint64_t type;
00369 static const nat size = 2; };
00370
00371 template<>
00372 struct simd_base_helper<sse_uint64_t> {
00373 typedef uint64_t type; };
00374
00375 inline sse_uint64_t
00376 simd_set_duplicate (const uint64_t& x) {
00377 return sse_uint64_t(_mm_set1_epi64 ((__m64) x)); }
00378
00379 inline sse_uint64_t
00380 simd_set (const uint64_t& x0, const uint64_t& x1) {
00381 return sse_uint64_t(_mm_set_epi64 ((__m64) x1, (__m64) x0)); }
00382
00383
#ifdef __SSE4_1__
// Lane-wise x == y; each result lane is all-ones on equality, else zero.
// 64-bit equality (_mm_cmpeq_epi64) needs SSE4.1.
inline sse_uint64_t
simd_equal (const sse_uint64_t& x, const sse_uint64_t& y) {
  return sse_uint64_t(_mm_cmpeq_epi64 ((__m128i) x, (__m128i) y));
}
#endif

// The ordering comparisons below rely on _mm_cmpgt_epi64, which is an
// SSE4.2 instruction, so they are guarded by __SSE4_2__.
#ifdef __SSE4_2__
// 2^63 in every lane.  Subtracting it flips the top bit, mapping unsigned
// order onto signed order so that the signed compare can be used.
static const sse_uint64_t _half_max_uint64=
  simd_set_duplicate ((uint64_t) (1ull << 63));

// Lane-wise unsigned x > y; all-ones where true, zero elsewhere.
inline sse_uint64_t
simd_gtr (const sse_uint64_t& x, const sse_uint64_t& y) {
  return sse_uint64_t(_mm_cmpgt_epi64 ((__m128i) (x - _half_max_uint64),
                                       (__m128i) (y - _half_max_uint64)));
}

// Lane-wise unsigned x < y, computed as y > x.
inline sse_uint64_t
simd_less (const sse_uint64_t& x, const sse_uint64_t& y) {
  return simd_gtr (y, x);
}

// Lane-wise unsigned minimum.  The mask is all-ones in the lanes where
// x > y, so it selects y there and x everywhere else.
inline sse_uint64_t
min (const sse_uint64_t& x, const sse_uint64_t& y) {
  const sse_uint64_t x_is_larger = simd_gtr (x, y);
  return (y & x_is_larger) | (x & ~x_is_larger);
}
#endif
00409
00410
00411 inline sse_uint64_t
00412 simd_unpacklo (const sse_uint64_t& x, const sse_uint64_t& y) {
00413 return sse_uint64_t(_mm_unpacklo_epi64 ((__m128i) x, (__m128i) y)); }
00414
00415 inline sse_uint64_t
00416 simd_unpackhi (const sse_uint64_t& x, const sse_uint64_t& y) {
00417 return sse_uint64_t(_mm_unpackhi_epi64 ((__m128i) x, (__m128i) y)); }
00418
00419
00420 inline sse_uint64_t
00421 simd_sll (const sse_uint64_t& x, int i) {
00422 return sse_uint64_t(_mm_slli_epi64 ((__m128i) x, i)); }
00423
00424 inline sse_uint64_t
00425 simd_srl (const sse_uint64_t& x, int i) {
00426 return sse_uint64_t(_mm_srli_epi64 ((__m128i) x, i)); }
00427
00428
00429 SIMD_SUGAR (uint64_t, sse_uint64_t)
00430
00431
00432
00433
00434
00435 typedef int32_t __attribute__((vector_size(16))) sse_int32_t;
00436
00437 template<>
00438 struct is_simd_helper<sse_int32_t> {
00439 static const bool val = true; };
00440
00441 template<>
00442 struct simd_helper<int32_t> {
00443 typedef sse_int32_t type;
00444 static const nat size = 4; };
00445
00446 template<>
00447 struct simd_base_helper<sse_int32_t> {
00448 typedef int32_t type; };
00449
00450 inline sse_int32_t
00451 simd_set_duplicate (const int32_t& x) {
00452 return sse_int32_t(_mm_set1_epi32 (x)); }
00453
00454 inline sse_int32_t
00455 simd_set (const int32_t& x0, const int32_t& x1,
00456 const int32_t& x2, const int32_t& x3) {
00457 return sse_int32_t(_mm_set_epi32 (x3, x2, x1, x0)); }
00458
00459
00460 inline sse_int32_t
00461 simd_equal (const sse_int32_t& x, const sse_int32_t& y) {
00462 return sse_int32_t(_mm_cmpeq_epi32 ((__m128i) x, (__m128i) y)); }
00463
00464 inline sse_int32_t
00465 simd_less (const sse_int32_t& x, const sse_int32_t& y) {
00466 return sse_int32_t(_mm_cmplt_epi32 ((__m128i) x, (__m128i) y)); }
00467
00468 inline sse_int32_t
00469 simd_gtr (const sse_int32_t& x, const sse_int32_t& y) {
00470 return sse_int32_t(_mm_cmpgt_epi32 ((__m128i) x, (__m128i) y)); }
00471
00472
00473 inline sse_int32_t
00474 simd_unpacklo (const sse_int32_t& x, const sse_int32_t& y) {
00475 return sse_int32_t(_mm_unpacklo_epi32 ((__m128i) x, (__m128i) y)); }
00476
00477 inline sse_int32_t
00478 simd_unpackhi (const sse_int32_t& x, const sse_int32_t& y) {
00479 return sse_int32_t(_mm_unpackhi_epi32 ((__m128i) x, (__m128i) y)); }
00480
00481
00482 inline sse_int32_t
00483 simd_sll (const sse_int32_t& x, int i) {
00484 return sse_int32_t(_mm_slli_epi32 ((__m128i) x, i)); }
00485
00486 inline sse_int32_t
00487 simd_srl (const sse_int32_t& x, int i) {
00488 return sse_int32_t(_mm_srli_epi32 ((__m128i) x, i)); }
00489
00490 inline sse_int32_t
00491 simd_sra (const sse_int32_t& x, int i) {
00492 return sse_int32_t(_mm_srai_epi32 ((__m128i) x, i)); }
00493
00494 #ifdef __SSE4_1__
00495 inline sse_int32_t
00496 min (const sse_int32_t& x, const sse_int32_t& y) {
00497 return sse_int32_t(_mm_min_epi32 ((__m128i) x, (__m128i) y)); }
00498
00499 inline sse_int32_t
00500 max (const sse_int32_t& x, const sse_int32_t& y) {
00501 return sse_int32_t(_mm_max_epi32 ((__m128i) x, (__m128i) y)); }
00502 #endif
00503
00504
00505 SIMD_SUGAR (int32_t, sse_int32_t)
00506
00507
00508
00509
00510
00511 typedef uint32_t __attribute__((vector_size(16))) sse_uint32_t;
00512
00513 template<>
00514 struct is_simd_helper<sse_uint32_t> {
00515 static const bool val = true; };
00516
00517 template<>
00518 struct simd_helper<uint32_t> {
00519 typedef sse_uint32_t type;
00520 static const nat size = 4; };
00521
00522 template<>
00523 struct simd_base_helper<sse_uint32_t> {
00524 typedef uint32_t type; };
00525
00526 inline sse_uint32_t
00527 simd_set_duplicate (const uint32_t& x) {
00528 return sse_uint32_t(_mm_set1_epi32 (x)); }
00529
00530 inline sse_uint32_t
00531 simd_set (const uint32_t& x0, const uint32_t& x1,
00532 const uint32_t& x2, const uint32_t& x3) {
00533 return sse_uint32_t(_mm_set_epi32 (x3, x2, x1, x0)); }
00534
00535
00536 #ifdef __SSE4_1__
00537 inline sse_uint32_t
00538 simd_equal (const sse_uint32_t& x, const sse_uint32_t& y) {
00539 return sse_uint32_t(_mm_cmpeq_epi32 ((__m128i) x, (__m128i) y)); }
00540
00541 static const sse_uint32_t _half_max_uint32=
00542 simd_set_duplicate (((uint32_t) 1) << 31);
00543
00544 inline sse_uint32_t
00545 simd_gtr (const sse_uint32_t& x, const sse_uint32_t& y) {
00546 return sse_uint32_t(_mm_cmpgt_epi32 ((__m128i) (x - _half_max_uint32),
00547 (__m128i) (y - _half_max_uint32))); }
00548
00549 inline sse_uint32_t
00550 simd_less (const sse_uint32_t& x, const sse_uint32_t& y) {
00551 return simd_gtr (y, x); }
00552 #endif
00553
00554
00555 inline sse_uint32_t
00556 simd_unpacklo (const sse_uint32_t& x, const sse_uint32_t& y) {
00557 return sse_uint32_t(_mm_unpacklo_epi32 ((__m128i) x, (__m128i) y)); }
00558
00559 inline sse_uint32_t
00560 simd_unpackhi (const sse_uint32_t& x, const sse_uint32_t& y) {
00561 return sse_uint32_t(_mm_unpackhi_epi32 ((__m128i) x, (__m128i) y)); }
00562
00563
00564 inline sse_uint32_t
00565 simd_sll (const sse_uint32_t& x, int i) {
00566 return sse_uint32_t(_mm_slli_epi32 ((__m128i) x, i)); }
00567
00568 inline sse_uint32_t
00569 simd_srl (const sse_uint32_t& x, int i) {
00570 return sse_uint32_t(_mm_srli_epi32 ((__m128i) x, i)); }
00571
00572 #ifdef __SSE4_1__
00573 inline sse_uint32_t
00574 min (const sse_uint32_t& x, const sse_uint32_t& y) {
00575 return sse_uint32_t(_mm_min_epu32 ((__m128i) x, (__m128i) y)); }
00576
00577 inline sse_uint32_t
00578 max (const sse_uint32_t& x, const sse_uint32_t& y) {
00579 return sse_uint32_t(_mm_max_epu32 ((__m128i) x, (__m128i) y)); }
00580 #endif
00581
00582
00583 SIMD_SUGAR (uint32_t, sse_uint32_t)
00584
00585
00586
00587
00588
00589 typedef int16_t __attribute__((vector_size(16))) sse_int16_t;
00590
00591 template<>
00592 struct is_simd_helper<sse_int16_t> {
00593 static const bool val = true; };
00594
00595 template<>
00596 struct simd_helper<int16_t> {
00597 typedef sse_int16_t type;
00598 static const nat size = 8; };
00599
00600 template<>
00601 struct simd_base_helper<sse_int16_t> {
00602 typedef int16_t type; };
00603
00604 inline sse_int16_t
00605 simd_set_duplicate (const int16_t& x) {
00606 return sse_int16_t(_mm_set1_epi16 (x)); }
00607
00608 inline sse_int16_t
00609 simd_set (const int16_t& x0, const int16_t& x1,
00610 const int16_t& x2, const int16_t& x3,
00611 const int16_t& x4, const int16_t& x5,
00612 const int16_t& x6, const int16_t& x7) {
00613 return sse_int16_t(_mm_set_epi16 (x7, x6, x5, x4, x3, x2, x1, x0)); }
00614
00615
00616 inline sse_int16_t
00617 simd_equal (const sse_int16_t& x, const sse_int16_t& y) {
00618 return sse_int16_t(_mm_cmpeq_epi16 ((__m128i) x, (__m128i) y)); }
00619
00620 inline sse_int16_t
00621 simd_less (const sse_int16_t& x, const sse_int16_t& y) {
00622 return sse_int16_t(_mm_cmplt_epi16 ((__m128i) x, (__m128i) y)); }
00623
00624 inline sse_int16_t
00625 simd_gtr (const sse_int16_t& x, const sse_int16_t& y) {
00626 return sse_int16_t(_mm_cmpgt_epi16 ((__m128i) x, (__m128i) y)); }
00627
00628
00629 inline sse_int16_t
00630 min (const sse_int16_t& x, const sse_int16_t& y) {
00631 return sse_int16_t(_mm_min_epi16 ((__m128i) x, (__m128i) y)); }
00632
00633 inline sse_int16_t
00634 max (const sse_int16_t& x, const sse_int16_t& y) {
00635 return sse_int16_t(_mm_max_epi16 ((__m128i) x, (__m128i) y)); }
00636
00637
00638 inline sse_int16_t
00639 simd_unpacklo (const sse_int16_t& x, const sse_int16_t& y) {
00640 return sse_int16_t(_mm_unpacklo_epi16 ((__m128i) x, (__m128i) y)); }
00641
00642 inline sse_int16_t
00643 simd_unpackhi (const sse_int16_t& x, const sse_int16_t& y) {
00644 return sse_int16_t(_mm_unpackhi_epi16 ((__m128i) x, (__m128i) y)); }
00645
00646 inline sse_int16_t
00647 simd_pack (const sse_int32_t& x, const sse_int32_t& y) {
00648 return sse_int16_t(_mm_packs_epi32 ((__m128i) x, (__m128i) y)); }
00649
00650
00651 inline sse_int16_t
00652 simd_sll (const sse_int16_t& x, int i) {
00653 return sse_int16_t(_mm_slli_epi16 ((__m128i) x, i)); }
00654
00655 inline sse_int16_t
00656 simd_srl (const sse_int16_t& x, int i) {
00657 return sse_int16_t(_mm_srli_epi16 ((__m128i) x, i)); }
00658
00659 inline sse_int16_t
00660 simd_sra (const sse_int16_t& x, int i) {
00661 return sse_int16_t(_mm_srai_epi16 ((__m128i) x, i)); }
00662
00663
00664 SIMD_SUGAR (int16_t, sse_int16_t)
00665
00666
00667
00668
00669
00670 typedef uint16_t __attribute__((vector_size(16))) sse_uint16_t;
00671
00672 template<>
00673 struct is_simd_helper<sse_uint16_t> {
00674 static const bool val = true; };
00675
00676 template<>
00677 struct simd_helper<uint16_t> {
00678 typedef sse_uint16_t type;
00679 static const nat size = 8; };
00680
00681 template<>
00682 struct simd_base_helper<sse_uint16_t> {
00683 typedef uint16_t type; };
00684
00685 inline sse_uint16_t
00686 simd_set_duplicate (const uint16_t& x) {
00687 return sse_uint16_t(_mm_set1_epi16 ((short) x)); }
00688
00689 inline sse_uint16_t
00690 simd_set (const uint16_t& x0, const uint16_t& x1,
00691 const uint16_t& x2, const uint16_t& x3,
00692 const uint16_t& x4, const uint16_t& x5,
00693 const uint16_t& x6, const uint16_t& x7) {
00694 return sse_uint16_t(_mm_set_epi16 ((short) x7, (short) x6, (short) x5,
00695 (short) x4, (short) x3, (short) x2, (short) x1, (short) x0)); }
00696
00697
00698 inline sse_uint16_t
00699 simd_equal (const sse_uint16_t& x, const sse_uint16_t& y) {
00700 return sse_uint16_t(_mm_cmpeq_epi16 ((__m128i) x, (__m128i) y)); }
00701
00702 static const sse_uint16_t _half_max_uint16=
00703 sse_uint16_t(simd_set_duplicate (((uint16_t) 1) << 15));
00704
00705 inline sse_uint16_t
00706 simd_gtr (const sse_uint16_t& x, const sse_uint16_t& y) {
00707 return sse_uint16_t(_mm_cmpgt_epi16 ((__m128i) (x - _half_max_uint16),
00708 (__m128i) (y - _half_max_uint16))); }
00709
00710 inline sse_uint16_t
00711 simd_less (const sse_uint16_t& x, const sse_uint16_t& y) {
00712 return simd_gtr (y, x); }
00713
00714
00715 inline sse_uint16_t
00716 simd_unpacklo (const sse_uint16_t& x, const sse_uint16_t& y) {
00717 return sse_uint16_t(_mm_unpacklo_epi16 ((__m128i) x, (__m128i) y)); }
00718
00719 inline sse_uint16_t
00720 simd_unpackhi (const sse_uint16_t& x, const sse_uint16_t& y) {
00721 return sse_uint16_t(_mm_unpackhi_epi16 ((__m128i) x, (__m128i) y)); }
00722
00723 #ifdef __SSE4_1__
00724 inline sse_uint16_t
00725 simd_pack (const sse_uint32_t& x, const sse_uint32_t& y) {
00726 return sse_uint16_t(_mm_packus_epi32 ((__m128i) x, (__m128i) y)); }
00727 #endif
00728
00729
00730 inline sse_uint16_t
00731 simd_sll (const sse_uint16_t& x, int i) {
00732 return sse_uint16_t(_mm_slli_epi16 ((__m128i) x, i)); }
00733
00734 inline sse_uint16_t
00735 simd_srl (const sse_uint16_t& x, int i) {
00736 return sse_uint16_t(_mm_srli_epi16 ((__m128i) x, i)); }
00737
00738 #ifdef __SSE4_1__
00739 inline sse_uint16_t
00740 min (const sse_uint16_t& x, const sse_uint16_t& y) {
00741 return sse_uint16_t(_mm_min_epu16 ((__m128i) x, (__m128i) y)); }
00742
00743 inline sse_uint16_t
00744 max (const sse_uint16_t& x, const sse_uint16_t& y) {
00745 return sse_uint16_t(_mm_max_epu16 ((__m128i) x, (__m128i) y)); }
00746 #endif
00747
00748
00749 SIMD_SUGAR (uint16_t, sse_uint16_t)
00750
00751
00752
00753
00754
00755 typedef int8_t __attribute__((vector_size(16))) sse_int8_t;
00756
00757 template<>
00758 struct is_simd_helper<sse_int8_t> {
00759 static const bool val = true; };
00760
00761 template<>
00762 struct simd_helper<int8_t> {
00763 typedef sse_int8_t type;
00764 static const nat size = 16;
00765 };
00766
00767 template<>
00768 struct simd_base_helper<sse_int8_t> {
00769 typedef int8_t type; };
00770
00771 inline sse_int8_t
00772 simd_set_duplicate (const int8_t& x) {
00773 return sse_int8_t(_mm_set1_epi8 ((char) x)); }
00774
00775 inline sse_int8_t
00776 simd_set (const int8_t& x0, const int8_t& x1,
00777 const int8_t& x2, const int8_t& x3,
00778 const int8_t& x4, const int8_t& x5,
00779 const int8_t& x6, const int8_t& x7,
00780 const int8_t& x8, const int8_t& x9,
00781 const int8_t& x10, const int8_t& x11,
00782 const int8_t& x12, const int8_t& x13,
00783 const int8_t& x14, const int8_t& x15) {
00784 return sse_int8_t(_mm_set_epi8 (x15, x14, x13, x12, x11, x10, x9, x8,
00785 x7, x6, x5, x4, x3, x2, x1, x0)); }
00786
00787
00788 inline sse_int8_t
00789 simd_equal (const sse_int8_t& x, const sse_int8_t& y) {
00790 return sse_int8_t(_mm_cmpeq_epi8 ((__m128i) x, (__m128i) y)); }
00791
00792 inline sse_int8_t
00793 simd_less (const sse_int8_t& x, const sse_int8_t& y) {
00794 return sse_int8_t(_mm_cmplt_epi8 ((__m128i) x, (__m128i) y)); }
00795
00796 inline sse_int8_t
00797 simd_gtr (const sse_int8_t& x, const sse_int8_t& y) {
00798 return sse_int8_t(_mm_cmpgt_epi8 ((__m128i) x, (__m128i) y)); }
00799
00800
00801 inline sse_int8_t
00802 simd_unpacklo (const sse_int8_t& x, const sse_int8_t& y) {
00803 return sse_int8_t(_mm_unpacklo_epi8 ((__m128i) x, (__m128i) y)); }
00804
00805 inline sse_int8_t
00806 simd_unpackhi (const sse_int8_t& x, const sse_int8_t& y) {
00807 return sse_int8_t(_mm_unpackhi_epi8 ((__m128i) x, (__m128i) y)); }
00808
00809 inline sse_int8_t
00810 simd_pack (const sse_int16_t& x, const sse_int16_t& y) {
00811 return sse_int8_t(_mm_packs_epi16 ((__m128i) x, (__m128i) y)); }
00812
00813
00814 #ifdef __SSE4_1__
00815 inline sse_int8_t
00816 min (const sse_int8_t& x, const sse_int8_t& y) {
00817 return sse_int8_t(_mm_min_epi8 ((__m128i) x, (__m128i) y)); }
00818
00819 inline sse_int8_t
00820 max (const sse_int8_t& x, const sse_int8_t& y) {
00821 return sse_int8_t(_mm_max_epi8 ((__m128i) x, (__m128i) y)); }
00822 #endif
00823
00824
00825 SIMD_SUGAR (int8_t, sse_int8_t)
00826
00827
00828
00829
00830
00831 typedef uint8_t __attribute__((vector_size(16))) sse_uint8_t;
00832
00833 template<>
00834 struct is_simd_helper<sse_uint8_t> {
00835 static const bool val = true; };
00836
00837 template<>
00838 struct simd_helper<uint8_t> {
00839 typedef sse_uint8_t type;
00840 static const nat size = 16;
00841 };
00842
00843 template<>
00844 struct simd_base_helper<sse_uint8_t> {
00845 typedef uint8_t type; };
00846
00847 inline sse_uint8_t
00848 simd_set_duplicate (const uint8_t& x) {
00849 return sse_uint8_t(_mm_set1_epi8 (x)); }
00850
00851 inline sse_uint8_t
00852 simd_set (const uint8_t& x0, const uint8_t& x1,
00853 const uint8_t& x2, const uint8_t& x3,
00854 const uint8_t& x4, const uint8_t& x5,
00855 const uint8_t& x6, const uint8_t& x7,
00856 const uint8_t& x8, const uint8_t& x9,
00857 const uint8_t& x10, const uint8_t& x11,
00858 const uint8_t& x12, const uint8_t& x13,
00859 const uint8_t& x14, const uint8_t& x15) {
00860 return sse_uint8_t(_mm_set_epi8 (x15, x14, x13, x12, x11, x10, x9, x8,
00861 x7, x6, x5, x4, x3, x2, x1, x0)); }
00862
00863
00864 inline sse_uint8_t
00865 simd_equal (const sse_uint8_t& x, const sse_uint8_t& y) {
00866 return sse_uint8_t(_mm_cmpeq_epi8 ((__m128i) x, (__m128i) y)); }
00867
00868 static const sse_uint8_t _half_max_uint8=
00869 sse_uint8_t(simd_set_duplicate (((uint8_t) 1) << 7));
00870
00871 inline sse_uint8_t
00872 simd_gtr (const sse_uint8_t& x, const sse_uint8_t& y) {
00873 return sse_uint8_t(_mm_cmpgt_epi8 ((__m128i) (x - _half_max_uint8),
00874 (__m128i) (y - _half_max_uint8))); }
00875
00876 inline sse_uint8_t
00877 simd_less (const sse_uint8_t& x, const sse_uint8_t& y) {
00878 return simd_gtr (y, x); }
00879
00880
00881 inline sse_uint8_t
00882 simd_unpacklo (const sse_uint8_t& x, const sse_uint8_t& y) {
00883 return sse_uint8_t(_mm_unpacklo_epi8 ((__m128i) x, (__m128i) y)); }
00884
00885 inline sse_uint8_t
00886 simd_unpackhi (const sse_uint8_t& x, const sse_uint8_t& y) {
00887 return sse_uint8_t(_mm_unpackhi_epi8 ((__m128i) x, (__m128i) y)); }
00888
00889 inline sse_uint8_t
00890 simd_pack (const sse_uint16_t& x, const sse_uint16_t& y) {
00891 return sse_uint8_t(_mm_packus_epi16 ((__m128i) x, (__m128i) y)); }
00892
00893
00894 inline sse_uint8_t
00895 min (const sse_uint8_t& x, const sse_uint8_t& y) {
00896 return sse_uint8_t(_mm_min_epu8 ((__m128i) x, (__m128i) y)); }
00897
00898 inline sse_uint8_t
00899 max (const sse_uint8_t& x, const sse_uint8_t& y) {
00900 return sse_uint8_t(_mm_max_epu8 ((__m128i) x, (__m128i) y)); }
00901
00902
00903 SIMD_SUGAR (uint8_t, sse_uint8_t)
00904
00905 #undef SIMD_SUGAR
00906
00907
00908 template<typename C>
00909 struct simd_set_duplicate_helper {
00910 typedef typename Simd_base_type(C) I;
00911 static inline C op (const I& x) {
00912 return simd_set_duplicate (x); }
00913 };
00914
00915 }
00916 #endif // NUMERIX_ENABLE_SIMD
00917 #endif // __MMX_SSE_HPP