00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 #ifndef __MMX_SSE_HPP
00014 #define __MMX_SSE_HPP
00015 #include <numerix/simd.hpp>
00016 #if defined (NUMERIX_ENABLE_SIMD) && defined (__SSE2__)
00017 #include <stdint.h>
00018 #include <emmintrin.h>
00019 #ifdef __SSE3__
00020 #include <pmmintrin.h>
00021 #endif
00022 #ifdef __SSSE3__
00023 #include <tmmintrin.h>
00024 #endif
00025 #ifdef __SSE4_1__ 
00026 #include <smmintrin.h>
00027 #endif
00028 #ifdef __SSE4_2__ 
00029 #include <smmintrin.h>
00030 #endif
00031 
00032 #include <basix/compound.hpp>
00033 #include <basix/identifiers.hpp>
00034 #include <basix/syntactic.hpp>
00035 #include <numerix/complex.hpp>
00036 
00037 namespace mmx {
00038 
00039 
00040 
00041 
00042   
00043 template<typename C> inline typename Simd_type (C)
00044 simd_load_aligned (const C* v) {
00045   return (typename Simd_type (C))
00046     _mm_load_si128 ((const __m128i*) v); }
00047 
00048 template<typename C> inline void
00049 simd_save_aligned (C* v, const typename Simd_type (C)& x) {
00050   _mm_store_si128 ((__m128i*) v, (const __m128i) x); }
00051 
00052 template<typename C> inline void
00053 simd_save (C* v, const typename Simd_type(C)& x) {
00054   _mm_storeu_si128 ((__m128i*) v, (const __m128i) x); }
00055 
00056 template<typename C> inline typename Simd_type (C)
00057 simd_load (const C* v0, const C* v1) {
00058   return simd_set (*v0, *v1); }
00059 
00060 template<typename C> inline void
00061 simd_save (C* v0, C* v1, const typename Simd_type (C)& x) {
00062   static C v[Simd_size (C)]; 
00063   simd_save_aligned (v, x);
00064   *v0 = v[0]; *v1 = v[1]; }
00065 
00066 template<typename C> inline typename Simd_type (C)
00067 simd_load (const C* v0, const C* v1, const C* v2, const C* v3) {
00068   return simd_set (*v0, *v1, *v2, *v3); }
00069 
00070 template<typename C> inline void
00071 simd_save (C* v0, C* v1, C* v2, C* v3, const typename Simd_type (C)& x) {
00072   static C v[Simd_size (C)]; 
00073   simd_save_aligned (v, x);
00074   *v0 = v[0]; *v1 = v[1]; *v2 = v[2]; *v3 = v[3]; }
00075 
00076 template<typename C> inline typename Simd_type (C)
00077 simd_load (const C* v0, const C* v1, const C* v2, const C* v3,
00078            const C* v4, const C* v5, const C* v6, const C* v7) {
00079   return simd_set (*v0, *v1, *v2, *v3, *v4, *v5, *v6, *v7); }
00080 
00081 template<typename C> inline void
00082 simd_save (C* v0, C* v1, C* v2, C* v3, C* v4, C* v5, C* v6, C* v7,
00083            const typename Simd_type (C)& x) {
00084   static C v[Simd_size (C)]; 
00085   simd_save_aligned (v, x);
00086   *v0 = v[0]; *v1 = v[1]; *v2 = v[2]; *v3 = v[3];
00087   *v4 = v[4]; *v5 = v[5]; *v6 = v[6]; *v7 = v[7]; }
00088 
00089 template<typename C> inline typename Simd_type (C)
00090 simd_load (const C* v0, const C* v1, const C* v2, const C* v3,
00091            const C* v4, const C* v5, const C* v6, const C* v7,
00092            const C* v8, const C* v9, const C* v10, const C* v11,
00093            const C* v12, const C* v13, const C* v14, const C* v15) {
00094   return simd_set (*v0, *v1, *v2, *v3, *v4, *v5, *v6, *v7,
00095                    *v8, *v9, *v10, *v11, *v12, *v13, *v14, *v15); }
00096 
00097 template<typename C> inline void
00098 simd_save (C* v0, C* v1, C* v2, C* v3, C* v4, C* v5, C* v6, C* v7,
00099            C* v8, C* v9, C* v10, C* v11, C* v12, C* v13, C* v14, C* v15,
00100            const typename Simd_type (C)& x) {
00101   static C v[Simd_size (C)]; 
00102   simd_save_aligned (v, x);
00103   *v0 = v[0]; *v1 = v[1]; *v2 = v[2]; *v3 = v[3];
00104   *v4 = v[4]; *v5 = v[5]; *v6 = v[6]; *v7 = v[7];
00105   *v8 = v[8]; *v9 = v[9]; *v10 = v[10]; *v11 = v[11];
00106   *v12 = v[12]; *v13 = v[13]; *v14 = v[14]; *v15 = v[15]; }
00107 
00108 template<typename C> inline C
00109 simd_big_add (const typename Simd_type (C)& x) {
00110   C r = 0;
00111   for (nat i = 0; i < Simd_size (C); i++)
00112     r += ((C*) &x) [i];
00113   return r; }
00114  
00115 
00116 
00117 
00118 
00119 
00120 
00121 
00122 template<typename V> inline syntactic
00123 simd_flatten (const V& x) {
00124   typedef typename Simd_base_type(V) C;
00125   static const nat size = Simd_size(C);
00126   C* v = mmx_new<C> (size);
00127   simd_save_aligned (v, x);
00128   vector<syntactic> w = fill <syntactic> (size);;
00129   for (nat i = 0; i < size; i++)
00130     w[i] = flatten (v[i]);
00131   mmx_delete<C> (v, size);
00132   return apply (GEN_SQTUPLE, w); 
00133 }
00134 
// SIMD_SUGAR (C, V) generates the convenience overloads shared by every
// vector type V with scalar element type C:
//   flatten (x)        -- forwards to simd_flatten
//   equal (x, y)       -- true iff x and y have identical bit patterns; the
//                         operands are compared as raw 32-bit words with
//                         _mm_cmpeq_epi32 whatever the lane type is
//   unequal (x, y)     -- negation of equal
//   clear (x)          -- x = broadcast of C (0)
//   mul (x, y1, y2)    -- x  = y1 * broadcast (y2)
//   mul_add (x, y1, y2)-- x += y1 * broadcast (y2)
// _mm_movemask_epi8 collects one bit per byte, i.e. 16 bits for a 128-bit
// vector, so "all lanes equal" corresponds to the mask 0xFFFF.
// NOTE(review): STMPL is defined outside this file; confirm its expansion.
#define SIMD_SUGAR(C,V)                                                 \
  inline syntactic flatten (const V& x) {                               \
    return simd_flatten (x); }                                          \
  inline bool equal (const V& x, const V& y) {                          \
    return _mm_movemask_epi8 (                                          \
      (__m128i) _mm_cmpeq_epi32 ((const __m128i) x,                     \
                                 (const __m128i) y)) == 0xFFFF; }       \
  inline bool unequal (const V& x, const V& y) {                        \
    return ! equal (x, y); }                                            \
  STMPL inline void clear (V& x) {                                      \
    x = simd_set_duplicate (C (0)); }                                   \
  STMPL inline void mul (V& x, const V& y1, const C& y2) {              \
    x = y1 * simd_set_duplicate (y2); }                                 \
  STMPL inline void mul_add (V& x, const V& y1, const C& y2) {          \
    x += y1 * simd_set_duplicate (y2); }
00150 
00151 
00152 
00153 
00154 
00155 typedef double __attribute__((vector_size(16))) sse_double;
00156 
00157 template<>
00158 struct is_simd_helper<sse_double> {
00159   static const bool val = true; };
00160 
00161 template<>
00162 struct simd_helper<double> {
00163   typedef sse_double type;
00164   static const nat size = 2; };
00165 
00166 template<>
00167 struct simd_base_helper<sse_double> {
00168   typedef double type; };
00169 
00170 inline sse_double
00171 simd_load (const double* v) {
00172   return _mm_loadu_pd (v); }
00173 
00174 inline sse_double
00175 simd_set_duplicate (const double& x) {
00176   return _mm_set1_pd (x); }
00177 
00178 inline sse_double
00179 simd_set (const double& v0, const double& v1) {
00180   return _mm_set_pd (v1, v0); }
00181 
#ifdef __SSE3__
// All-zero vector used as second operand of the horizontal add below.
static const sse_double sse_double_zero= simd_set_duplicate((double) 0);

// SSE3 horizontal sum of the two lanes of x: _mm_hadd_pd (x, 0) followed by
// a store of the low lane of the result.
STMPL inline double
simd_big_add (const sse_double& x) {
  const sse_double pair_sum = _mm_hadd_pd (x, sse_double_zero);
  double total;
  _mm_storel_pd (&total, pair_sum);
  return total;
}
#endif
00193 
00194 
00195 inline sse_double
00196 simd_equal (const sse_double& x, const sse_double& y) {
00197   return _mm_cmpeq_pd (x, y); }
00198   
00199 inline sse_double
00200 simd_unequal (const sse_double& x, const sse_double& y) {
00201   return _mm_cmpneq_pd (x, y); }
00202 
00203 inline sse_double
00204 simd_less (const sse_double& x, const sse_double& y) {
00205   return _mm_cmplt_pd (x, y); }
00206 
00207 inline sse_double
00208 simd_gtr (const sse_double& x, const sse_double& y) {
00209   return _mm_cmpgt_pd (x, y); }
00210 
00211 inline sse_double
00212 simd_lesseq (const sse_double& x, const sse_double& y) {
00213   return _mm_cmple_pd (x, y); }
00214 
00215 inline sse_double
00216 simd_gtreq (const sse_double& x, const sse_double& y) {
00217   return _mm_cmpge_pd (x, y); }
00218 
00219 
00220 inline sse_double
00221 min (const sse_double& x, const sse_double& y) { 
00222   return _mm_min_pd (x, y); }
00223 
00224 inline sse_double
00225 max (const sse_double& x, const sse_double& y) {
00226   return _mm_max_pd (x, y); }
00227 
00228 inline sse_double
00229 simd_shuffle (const sse_double& x, const sse_double& y, int i) {
00230   return _mm_shuffle_pd (x, y, i); }
00231 
00232 
00233 inline sse_double
00234 simd_unpacklo (const sse_double& x, const sse_double& y) { 
00235   return _mm_unpacklo_pd (x, y); }
00236 
00237 inline sse_double
00238 simd_unpackhi (const sse_double& x, const sse_double& y) { 
00239   return _mm_unpackhi_pd (x, y); }
00240 
00241 
00242 inline sse_double
00243 simd_load_duplicate (const double* v) {
00244   return _mm_load1_pd (v); }
00245 
00246 inline sse_double
00247 simd_load (const double* v0, const double* v1) {
00248   return _mm_loadh_pd (_mm_load1_pd (v0), v1); }
00249 
00250 inline void
00251 simd_save (double* v0, double* v1, const sse_double& x) {
00252   _mm_storel_pd (v0, x); _mm_storeh_pd (v1, x); }
00253 
00254 inline sse_double
00255 simd_swap (const sse_double& x) {
00256   return _mm_shuffle_pd (x, x, 1); }
00257 
00258 
00259 SIMD_SUGAR (double, sse_double)
00260 
00261 
00262 
00263 
00264 
00265 typedef complex<    double>      complex_double;
00266 typedef complex<sse_double>  sse_complex_double;
00267 
00268 template<>
00269 struct simd_helper<complex_double> {
00270   typedef sse_complex_double type;
00271   static const nat size = 2; };
00272 
00273 inline sse_complex_double
00274 simd_set_duplicate (const complex_double& z) {
00275   return sse_complex_double (simd_set_duplicate (Re (z)),
00276                              simd_set_duplicate (Im (z))); }
00277 inline sse_complex_double
00278 simd_set (const complex_double& z0, const complex_double& z1) {
00279   return sse_complex_double (simd_set (Re (z0), Re (z1)),
00280                              simd_set (Im (z0), Im (z1))); }
00281 inline sse_complex_double
00282 simd_load_duplicate (const complex_double* v) {
00283   const double* w= (double*) ((void*) v);
00284   return sse_complex_double (simd_load_duplicate (w),
00285                              simd_load_duplicate (w + 1)); }
00286 
00287 template<> inline syntactic
00288 flatten (const sse_complex_double& z) {
00289   return flatten (Re (z)) + flatten (Im (z)) * Imaginary (syntactic); }
00290 
00291 
00292 
00293 
00294 
00295 typedef int64_t __attribute__((vector_size(16))) sse_int64_t;
00296 
00297 template<>
00298 struct is_simd_helper<sse_int64_t> {
00299   static const bool val = true; };
00300 
00301 template<>
00302 struct simd_helper<int64_t> {
00303   typedef sse_int64_t type;
00304   static const nat size = 2; };
00305 
00306 template<>
00307 struct simd_base_helper<sse_int64_t> {
00308   typedef int64_t type; };
00309 
00310 inline sse_int64_t
00311 simd_set_duplicate (const int64_t& x) {
00312   return sse_int64_t(_mm_set1_epi64 ((__m64) x)); }
00313 
00314 inline sse_int64_t
00315 simd_set (const int64_t& x0, const int64_t& x1) {
00316   return sse_int64_t(_mm_set_epi64 ((__m64) x1, (__m64) x0)); }
00317 
00318 
#ifdef __SSE4_1__
// Lane-wise x == y mask (_mm_cmpeq_epi64).
inline sse_int64_t
simd_equal (const sse_int64_t& x, const sse_int64_t& y) {
  const __m128i mask = _mm_cmpeq_epi64 ((__m128i) x, (__m128i) y);
  return (sse_int64_t) mask;
}
#endif

#ifdef __SSE4_2__
// Lane-wise signed x > y mask (_mm_cmpgt_epi64).
inline sse_int64_t
simd_gtr (const sse_int64_t& x, const sse_int64_t& y) {
  const __m128i mask = _mm_cmpgt_epi64 ((__m128i) x, (__m128i) y);
  return (sse_int64_t) mask;
}

// Lane-wise signed x < y mask, computed as y > x.
inline sse_int64_t
simd_less (const sse_int64_t& x, const sse_int64_t& y) {
  const __m128i mask = _mm_cmpgt_epi64 ((__m128i) y, (__m128i) x);
  return (sse_int64_t) mask;
}
#endif
00334 
00335 
00336 inline sse_int64_t
00337 simd_unpacklo (const sse_int64_t& x, const sse_int64_t& y) { 
00338   return sse_int64_t(_mm_unpacklo_epi64 ((__m128i) x, (__m128i) y)); }
00339 
00340 inline sse_int64_t
00341 simd_unpackhi (const sse_int64_t& x, const sse_int64_t& y) { 
00342   return sse_int64_t(_mm_unpackhi_epi64 ((__m128i) x, (__m128i) y)); }
00343 
00344 
00345 inline sse_int64_t
00346 simd_sll (const sse_int64_t& x, int i) {
00347   return sse_int64_t(_mm_slli_epi64 ((__m128i) x, i)); }
00348 
00349 inline sse_int64_t
00350 simd_srl (const sse_int64_t& x, int i) {
00351   return sse_int64_t(_mm_srli_epi64 ((__m128i) x, i)); }
00352 
00353 
00354 SIMD_SUGAR (int64_t, sse_int64_t)
00355 
00356 
00357 
00358 
00359 
00360 typedef uint64_t __attribute__((vector_size(16))) sse_uint64_t;
00361 
00362 template<>
00363 struct is_simd_helper<sse_uint64_t> {
00364   static const bool val = true; };
00365 
00366 template<>
00367 struct simd_helper<uint64_t> {
00368   typedef sse_uint64_t type;
00369   static const nat size = 2; };
00370 
00371 template<>
00372 struct simd_base_helper<sse_uint64_t> {
00373   typedef uint64_t type; };
00374 
00375 inline sse_uint64_t
00376 simd_set_duplicate (const uint64_t& x) {
00377   return sse_uint64_t(_mm_set1_epi64 ((__m64) x)); }
00378 
00379 inline sse_uint64_t
00380 simd_set (const uint64_t& x0, const uint64_t& x1) {
00381   return sse_uint64_t(_mm_set_epi64 ((__m64) x1, (__m64) x0)); }
00382 
00383 
#ifdef __SSE4_1__
// Lane-wise x == y mask (_mm_cmpeq_epi64, an SSE4.1 intrinsic).
inline sse_uint64_t
simd_equal (const sse_uint64_t& x, const sse_uint64_t& y) {
  return sse_uint64_t(_mm_cmpeq_epi64 ((__m128i) x, (__m128i) y)); }

// 2^63 in every lane: the bias used to turn an unsigned comparison into a
// signed one.
static const sse_uint64_t _half_max_uint64=
  simd_set_duplicate ((uint64_t) (1ull << 63));

// _mm_cmpgt_epi64 belongs to SSE4.2, so everything built on it is only
// provided when that instruction set is enabled.
#ifdef __SSE4_2__
// Lane-wise unsigned x > y mask.  _mm_cmpgt_epi64 compares signed lanes;
// subtracting 2^63 from both operands flips their top bit, which maps the
// unsigned order onto the signed order.
inline sse_uint64_t
simd_gtr (const sse_uint64_t& x, const sse_uint64_t& y) {
  return sse_uint64_t(_mm_cmpgt_epi64 ((__m128i) (x - _half_max_uint64),
                                       (__m128i) (y - _half_max_uint64))); }

// Lane-wise unsigned x < y mask, computed as y > x.
inline sse_uint64_t
simd_less (const sse_uint64_t& x, const sse_uint64_t& y) {
  return simd_gtr (y, x); }

// Lane-wise unsigned minimum: take y in the lanes where x > y and x in all
// other lanes.
inline sse_uint64_t
min (const sse_uint64_t& x, const sse_uint64_t& y) {
  const sse_uint64_t x_is_greater = simd_gtr (x, y);
  return (y & x_is_greater) | (x & ~x_is_greater); }
#endif
#endif
00409 
00410 
00411 inline sse_uint64_t
00412 simd_unpacklo (const sse_uint64_t& x, const sse_uint64_t& y) { 
00413   return sse_uint64_t(_mm_unpacklo_epi64 ((__m128i) x, (__m128i) y)); }
00414 
00415 inline sse_uint64_t
00416 simd_unpackhi (const sse_uint64_t& x, const sse_uint64_t& y) { 
00417   return sse_uint64_t(_mm_unpackhi_epi64 ((__m128i) x, (__m128i) y)); }
00418 
00419 
00420 inline sse_uint64_t
00421 simd_sll (const sse_uint64_t& x, int i) {
00422   return sse_uint64_t(_mm_slli_epi64 ((__m128i) x, i)); }
00423 
00424 inline sse_uint64_t
00425 simd_srl (const sse_uint64_t& x, int i) {
00426   return sse_uint64_t(_mm_srli_epi64 ((__m128i) x, i)); }
00427 
00428 
00429 SIMD_SUGAR (uint64_t, sse_uint64_t)
00430 
00431 
00432 
00433 
00434 
00435 typedef int32_t __attribute__((vector_size(16))) sse_int32_t;
00436 
00437 template<>
00438 struct is_simd_helper<sse_int32_t> {
00439   static const bool val = true; };
00440 
00441 template<>
00442 struct simd_helper<int32_t> {
00443   typedef sse_int32_t type;
00444   static const nat size = 4; };
00445 
00446 template<>
00447 struct simd_base_helper<sse_int32_t> {
00448   typedef int32_t type; };
00449 
00450 inline sse_int32_t
00451 simd_set_duplicate (const int32_t& x) {
00452   return sse_int32_t(_mm_set1_epi32 (x)); }
00453 
00454 inline sse_int32_t
00455 simd_set (const int32_t& x0, const int32_t& x1,
00456           const int32_t& x2, const int32_t& x3) {
00457   return sse_int32_t(_mm_set_epi32 (x3, x2, x1, x0)); }
00458 
00459 
00460 inline sse_int32_t
00461 simd_equal (const sse_int32_t& x, const sse_int32_t& y) {
00462   return sse_int32_t(_mm_cmpeq_epi32 ((__m128i) x, (__m128i) y)); }
00463   
00464 inline sse_int32_t
00465 simd_less (const sse_int32_t& x, const sse_int32_t& y) {
00466   return sse_int32_t(_mm_cmplt_epi32 ((__m128i) x, (__m128i) y)); }
00467 
00468 inline sse_int32_t
00469 simd_gtr (const sse_int32_t& x, const sse_int32_t& y) {
00470   return sse_int32_t(_mm_cmpgt_epi32 ((__m128i) x, (__m128i) y)); }
00471 
00472 
00473 inline sse_int32_t
00474 simd_unpacklo (const sse_int32_t& x, const sse_int32_t& y) { 
00475   return sse_int32_t(_mm_unpacklo_epi32 ((__m128i) x, (__m128i) y)); }
00476 
00477 inline sse_int32_t
00478 simd_unpackhi (const sse_int32_t& x, const sse_int32_t& y) { 
00479   return sse_int32_t(_mm_unpackhi_epi32 ((__m128i) x, (__m128i) y)); }
00480 
00481 
00482 inline sse_int32_t
00483 simd_sll (const sse_int32_t& x, int i) {
00484   return sse_int32_t(_mm_slli_epi32 ((__m128i) x, i)); }
00485 
00486 inline sse_int32_t
00487 simd_srl (const sse_int32_t& x, int i) {
00488   return sse_int32_t(_mm_srli_epi32 ((__m128i) x, i)); }
00489 
00490 inline sse_int32_t
00491 simd_sra (const sse_int32_t& x, int i) {
00492   return sse_int32_t(_mm_srai_epi32 ((__m128i) x, i)); }
00493 
00494 #ifdef __SSE4_1__
00495 inline sse_int32_t
00496 min (const sse_int32_t& x, const sse_int32_t& y) { 
00497   return sse_int32_t(_mm_min_epi32 ((__m128i) x, (__m128i) y)); }
00498 
00499 inline sse_int32_t
00500 max (const sse_int32_t& x, const sse_int32_t& y) {
00501   return sse_int32_t(_mm_max_epi32 ((__m128i) x, (__m128i) y)); }
00502 #endif
00503 
00504 
00505 SIMD_SUGAR (int32_t, sse_int32_t)
00506 
00507 
00508 
00509 
00510 
00511 typedef uint32_t __attribute__((vector_size(16))) sse_uint32_t;
00512 
00513 template<>
00514 struct is_simd_helper<sse_uint32_t> {
00515   static const bool val = true; };
00516 
00517 template<>
00518 struct simd_helper<uint32_t> {
00519   typedef sse_uint32_t type;
00520   static const nat size = 4; };
00521 
00522 template<>
00523 struct simd_base_helper<sse_uint32_t> {
00524   typedef uint32_t type; };
00525 
00526 inline sse_uint32_t
00527 simd_set_duplicate (const uint32_t& x) {
00528   return sse_uint32_t(_mm_set1_epi32 (x)); }
00529 
00530 inline sse_uint32_t
00531 simd_set (const uint32_t& x0, const uint32_t& x1,
00532           const uint32_t& x2, const uint32_t& x3) {
00533   return sse_uint32_t(_mm_set_epi32 (x3, x2, x1, x0)); }
00534 
00535 
00536 #ifdef __SSE4_1__
00537 inline sse_uint32_t
00538 simd_equal (const sse_uint32_t& x, const sse_uint32_t& y) {
00539   return sse_uint32_t(_mm_cmpeq_epi32 ((__m128i) x, (__m128i) y)); }
00540 
00541 static const sse_uint32_t _half_max_uint32=
00542   simd_set_duplicate (((uint32_t) 1) << 31);
00543 
00544 inline sse_uint32_t
00545 simd_gtr (const sse_uint32_t& x, const sse_uint32_t& y) {
00546   return sse_uint32_t(_mm_cmpgt_epi32 ((__m128i) (x - _half_max_uint32),
00547                                        (__m128i) (y - _half_max_uint32))); }
00548 
00549 inline sse_uint32_t
00550 simd_less (const sse_uint32_t& x, const sse_uint32_t& y) {
00551   return simd_gtr (y, x); }
00552 #endif
00553 
00554 
00555 inline sse_uint32_t
00556 simd_unpacklo (const sse_uint32_t& x, const sse_uint32_t& y) { 
00557   return sse_uint32_t(_mm_unpacklo_epi32 ((__m128i) x, (__m128i) y)); }
00558 
00559 inline sse_uint32_t
00560 simd_unpackhi (const sse_uint32_t& x, const sse_uint32_t& y) { 
00561   return sse_uint32_t(_mm_unpackhi_epi32 ((__m128i) x, (__m128i) y)); }
00562 
00563 
00564 inline sse_uint32_t
00565 simd_sll (const sse_uint32_t& x, int i) {
00566   return sse_uint32_t(_mm_slli_epi32 ((__m128i) x, i)); }
00567 
00568 inline sse_uint32_t
00569 simd_srl (const sse_uint32_t& x, int i) {
00570   return sse_uint32_t(_mm_srli_epi32 ((__m128i) x, i)); }
00571 
00572 #ifdef __SSE4_1__
00573 inline sse_uint32_t
00574 min (const sse_uint32_t& x, const sse_uint32_t& y) { 
00575   return sse_uint32_t(_mm_min_epu32 ((__m128i) x, (__m128i) y)); }
00576 
00577 inline sse_uint32_t
00578 max (const sse_uint32_t& x, const sse_uint32_t& y) {
00579   return sse_uint32_t(_mm_max_epu32 ((__m128i) x, (__m128i) y)); }
00580 #endif
00581 
00582 
00583 SIMD_SUGAR (uint32_t, sse_uint32_t)
00584 
00585 
00586 
00587 
00588 
00589 typedef int16_t __attribute__((vector_size(16))) sse_int16_t;
00590 
00591 template<>
00592 struct is_simd_helper<sse_int16_t> {
00593   static const bool val = true; };
00594 
00595 template<>
00596 struct simd_helper<int16_t> {
00597   typedef sse_int16_t type;
00598   static const nat size = 8; };
00599 
00600 template<>
00601 struct simd_base_helper<sse_int16_t> {
00602   typedef int16_t type; };
00603 
00604 inline sse_int16_t
00605 simd_set_duplicate (const int16_t& x) {
00606   return sse_int16_t(_mm_set1_epi16 (x)); }
00607 
00608 inline sse_int16_t
00609 simd_set (const int16_t& x0, const int16_t& x1,
00610           const int16_t& x2, const int16_t& x3,
00611           const int16_t& x4, const int16_t& x5,
00612           const int16_t& x6, const int16_t& x7) {
00613   return sse_int16_t(_mm_set_epi16 (x7, x6, x5, x4, x3, x2, x1, x0)); }
00614 
00615 
00616 inline sse_int16_t
00617 simd_equal (const sse_int16_t& x, const sse_int16_t& y) {
00618   return sse_int16_t(_mm_cmpeq_epi16 ((__m128i) x, (__m128i) y)); }
00619   
00620 inline sse_int16_t
00621 simd_less (const sse_int16_t& x, const sse_int16_t& y) {
00622   return sse_int16_t(_mm_cmplt_epi16 ((__m128i) x, (__m128i) y)); }
00623 
00624 inline sse_int16_t
00625 simd_gtr (const sse_int16_t& x, const sse_int16_t& y) {
00626   return sse_int16_t(_mm_cmpgt_epi16 ((__m128i) x, (__m128i) y)); }
00627 
00628 
00629 inline sse_int16_t
00630 min (const sse_int16_t& x, const sse_int16_t& y) { 
00631   return sse_int16_t(_mm_min_epi16 ((__m128i) x, (__m128i) y)); }
00632 
00633 inline sse_int16_t
00634 max (const sse_int16_t& x, const sse_int16_t& y) {
00635   return sse_int16_t(_mm_max_epi16 ((__m128i) x, (__m128i) y)); }
00636 
00637 
00638 inline sse_int16_t
00639 simd_unpacklo (const sse_int16_t& x, const sse_int16_t& y) { 
00640   return sse_int16_t(_mm_unpacklo_epi16 ((__m128i) x, (__m128i) y)); }
00641 
00642 inline sse_int16_t
00643 simd_unpackhi (const sse_int16_t& x, const sse_int16_t& y) { 
00644   return sse_int16_t(_mm_unpackhi_epi16 ((__m128i) x, (__m128i) y)); }
00645 
00646 inline sse_int16_t
00647 simd_pack (const sse_int32_t& x, const sse_int32_t& y) { 
00648   return sse_int16_t(_mm_packs_epi32 ((__m128i) x, (__m128i) y)); }
00649 
00650 
00651 inline sse_int16_t
00652 simd_sll (const sse_int16_t& x, int i) {
00653   return sse_int16_t(_mm_slli_epi16 ((__m128i) x, i)); }
00654 
00655 inline sse_int16_t
00656 simd_srl (const sse_int16_t& x, int i) {
00657   return sse_int16_t(_mm_srli_epi16 ((__m128i) x, i)); }
00658 
00659 inline sse_int16_t
00660 simd_sra (const sse_int16_t& x, int i) {
00661   return sse_int16_t(_mm_srai_epi16 ((__m128i) x, i)); }
00662 
00663 
00664 SIMD_SUGAR (int16_t, sse_int16_t)
00665 
00666 
00667 
00668 
00669 
00670 typedef uint16_t __attribute__((vector_size(16))) sse_uint16_t;
00671 
00672 template<>
00673 struct is_simd_helper<sse_uint16_t> {
00674   static const bool val = true; };
00675 
00676 template<>
00677 struct simd_helper<uint16_t> {
00678   typedef sse_uint16_t type;
00679   static const nat size = 8; };
00680 
00681 template<>
00682 struct simd_base_helper<sse_uint16_t> {
00683   typedef uint16_t type; };
00684 
00685 inline sse_uint16_t
00686 simd_set_duplicate (const uint16_t& x) {
00687   return sse_uint16_t(_mm_set1_epi16 ((short) x)); }
00688 
00689 inline sse_uint16_t
00690 simd_set (const uint16_t& x0, const uint16_t& x1,
00691           const uint16_t& x2, const uint16_t& x3,
00692           const uint16_t& x4, const uint16_t& x5,
00693           const uint16_t& x6, const uint16_t& x7) {
00694   return sse_uint16_t(_mm_set_epi16 ((short) x7, (short) x6, (short) x5,
00695         (short) x4, (short) x3, (short) x2, (short)  x1, (short) x0)); }
00696 
00697 
00698 inline sse_uint16_t
00699 simd_equal (const sse_uint16_t& x, const sse_uint16_t& y) {
00700   return sse_uint16_t(_mm_cmpeq_epi16 ((__m128i) x, (__m128i) y)); }
00701 
00702 static const sse_uint16_t _half_max_uint16=
00703   sse_uint16_t(simd_set_duplicate (((uint16_t) 1) << 15));
00704 
00705 inline sse_uint16_t
00706 simd_gtr (const sse_uint16_t& x, const sse_uint16_t& y) {
00707   return sse_uint16_t(_mm_cmpgt_epi16 ((__m128i) (x - _half_max_uint16),
00708                                        (__m128i) (y - _half_max_uint16))); }
00709 
00710 inline sse_uint16_t
00711 simd_less (const sse_uint16_t& x, const sse_uint16_t& y) {
00712   return simd_gtr (y, x); }
00713 
00714 
00715 inline sse_uint16_t
00716 simd_unpacklo (const sse_uint16_t& x, const sse_uint16_t& y) { 
00717   return sse_uint16_t(_mm_unpacklo_epi16 ((__m128i) x, (__m128i) y)); }
00718 
00719 inline sse_uint16_t
00720 simd_unpackhi (const sse_uint16_t& x, const sse_uint16_t& y) { 
00721   return sse_uint16_t(_mm_unpackhi_epi16 ((__m128i) x, (__m128i) y)); }
00722 
00723 #ifdef __SSE4_1__
00724 inline sse_uint16_t
00725 simd_pack (const sse_uint32_t& x, const sse_uint32_t& y) { 
00726   return sse_uint16_t(_mm_packus_epi32 ((__m128i) x, (__m128i) y)); }
00727 #endif
00728 
00729 
00730 inline sse_uint16_t
00731 simd_sll (const sse_uint16_t& x, int i) {
00732   return sse_uint16_t(_mm_slli_epi16 ((__m128i) x, i)); }
00733 
00734 inline sse_uint16_t
00735 simd_srl (const sse_uint16_t& x, int i) {
00736   return sse_uint16_t(_mm_srli_epi16 ((__m128i) x, i)); }
00737 
00738 #ifdef __SSE4_1__
00739 inline sse_uint16_t
00740 min (const sse_uint16_t& x, const sse_uint16_t& y) { 
00741   return sse_uint16_t(_mm_min_epu16 ((__m128i) x, (__m128i) y)); }
00742 
00743 inline sse_uint16_t
00744 max (const sse_uint16_t& x, const sse_uint16_t& y) {
00745   return sse_uint16_t(_mm_max_epu16 ((__m128i) x, (__m128i) y)); }
00746 #endif
00747 
00748 
00749 SIMD_SUGAR (uint16_t, sse_uint16_t)
00750 
00751 
00752 
00753 
00754 
00755 typedef int8_t __attribute__((vector_size(16))) sse_int8_t;
00756 
00757 template<>
00758 struct is_simd_helper<sse_int8_t> {
00759   static const bool val = true; };
00760 
00761 template<>
00762 struct simd_helper<int8_t> {
00763   typedef sse_int8_t type;
00764   static const nat size = 16;
00765 };
00766 
00767 template<>
00768 struct simd_base_helper<sse_int8_t> {
00769   typedef int8_t type; };
00770 
00771 inline sse_int8_t
00772 simd_set_duplicate (const int8_t& x) {
00773   return sse_int8_t(_mm_set1_epi8 ((char) x)); }
00774 
00775 inline sse_int8_t 
00776 simd_set (const int8_t& x0, const int8_t& x1, 
00777           const int8_t& x2, const int8_t& x3,
00778           const int8_t& x4, const int8_t& x5,
00779           const int8_t& x6, const int8_t& x7,
00780           const int8_t& x8, const int8_t& x9,
00781           const int8_t& x10, const int8_t& x11,
00782           const int8_t& x12, const int8_t& x13,
00783           const int8_t& x14, const int8_t& x15) {
00784   return sse_int8_t(_mm_set_epi8 (x15, x14,  x13,  x12,  x11,  x10,  x9,  x8,
00785                                   x7, x6, x5, x4, x3, x2, x1, x0)); }
00786 
00787 
00788 inline sse_int8_t
00789 simd_equal (const sse_int8_t& x, const sse_int8_t& y) {
00790   return sse_int8_t(_mm_cmpeq_epi8 ((__m128i) x, (__m128i) y)); }
00791   
00792 inline sse_int8_t
00793 simd_less (const sse_int8_t& x, const sse_int8_t& y) {
00794   return sse_int8_t(_mm_cmplt_epi8 ((__m128i) x, (__m128i) y)); }
00795 
00796 inline sse_int8_t
00797 simd_gtr (const sse_int8_t& x, const sse_int8_t& y) {
00798   return sse_int8_t(_mm_cmpgt_epi8 ((__m128i) x, (__m128i) y)); }
00799 
00800 
00801 inline sse_int8_t
00802 simd_unpacklo (const sse_int8_t& x, const sse_int8_t& y) { 
00803   return sse_int8_t(_mm_unpacklo_epi8 ((__m128i) x, (__m128i) y)); }
00804 
00805 inline sse_int8_t
00806 simd_unpackhi (const sse_int8_t& x, const sse_int8_t& y) { 
00807   return sse_int8_t(_mm_unpackhi_epi8 ((__m128i) x, (__m128i) y)); }
00808 
00809 inline sse_int8_t
00810 simd_pack (const sse_int16_t& x, const sse_int16_t& y) { 
00811   return sse_int8_t(_mm_packs_epi16 ((__m128i) x, (__m128i) y)); }
00812 
00813 
00814 #ifdef __SSE4_1__
00815 inline sse_int8_t
00816 min (const sse_int8_t& x, const sse_int8_t& y) { 
00817   return sse_int8_t(_mm_min_epi8 ((__m128i) x, (__m128i) y)); }
00818 
00819 inline sse_int8_t
00820 max (const sse_int8_t& x, const sse_int8_t& y) {
00821   return sse_int8_t(_mm_max_epi8 ((__m128i) x, (__m128i) y)); }
00822 #endif
00823 
00824 
00825 SIMD_SUGAR (int8_t, sse_int8_t)
00826 
00827 
00828 
00829 
00830 
00831 typedef uint8_t __attribute__((vector_size(16))) sse_uint8_t;
00832 
00833 template<>
00834 struct is_simd_helper<sse_uint8_t> {
00835   static const bool val = true; };
00836 
00837 template<>
00838 struct simd_helper<uint8_t> {
00839   typedef sse_uint8_t type;
00840   static const nat size = 16;
00841 };
00842 
00843 template<>
00844 struct simd_base_helper<sse_uint8_t> {
00845   typedef uint8_t type; };
00846 
00847 inline sse_uint8_t
00848 simd_set_duplicate (const uint8_t& x) {
00849   return sse_uint8_t(_mm_set1_epi8 (x)); }
00850 
00851 inline sse_uint8_t 
00852 simd_set (const uint8_t& x0, const uint8_t& x1,
00853           const uint8_t& x2, const uint8_t& x3,
00854           const uint8_t& x4, const uint8_t& x5,
00855           const uint8_t& x6, const uint8_t& x7,
00856           const uint8_t& x8, const uint8_t& x9,
00857           const uint8_t& x10, const uint8_t& x11,
00858           const uint8_t& x12, const uint8_t& x13,
00859           const uint8_t& x14, const uint8_t& x15) {
00860   return sse_uint8_t(_mm_set_epi8 (x15, x14,  x13,  x12,  x11,  x10,  x9,  x8,
00861                                    x7, x6, x5, x4, x3, x2, x1, x0)); }
00862 
00863 
00864 inline sse_uint8_t
00865 simd_equal (const sse_uint8_t& x, const sse_uint8_t& y) {
00866   return sse_uint8_t(_mm_cmpeq_epi8 ((__m128i) x, (__m128i) y)); }
00867 
00868 static const sse_uint8_t _half_max_uint8=
00869   sse_uint8_t(simd_set_duplicate (((uint8_t) 1) << 7));
00870 
00871 inline sse_uint8_t
00872 simd_gtr (const sse_uint8_t& x, const sse_uint8_t& y) {
00873   return sse_uint8_t(_mm_cmpgt_epi8 ((__m128i) (x - _half_max_uint8),
00874                                      (__m128i) (y - _half_max_uint8))); }
00875 
00876 inline sse_uint8_t
00877 simd_less (const sse_uint8_t& x, const sse_uint8_t& y) {
00878   return simd_gtr (y, x); }
00879 
00880 
00881 inline sse_uint8_t
00882 simd_unpacklo (const sse_uint8_t& x, const sse_uint8_t& y) { 
00883   return sse_uint8_t(_mm_unpacklo_epi8 ((__m128i) x, (__m128i) y)); }
00884 
00885 inline sse_uint8_t
00886 simd_unpackhi (const sse_uint8_t& x, const sse_uint8_t& y) { 
00887   return sse_uint8_t(_mm_unpackhi_epi8 ((__m128i) x, (__m128i) y)); }
00888 
00889 inline sse_uint8_t
00890 simd_pack (const sse_uint16_t& x, const sse_uint16_t& y) { 
00891   return sse_uint8_t(_mm_packus_epi16 ((__m128i) x, (__m128i) y)); }
00892 
00893 
00894 inline sse_uint8_t
00895 min (const sse_uint8_t& x, const sse_uint8_t& y) { 
00896   return sse_uint8_t(_mm_min_epu8 ((__m128i) x, (__m128i) y)); }
00897 
00898 inline sse_uint8_t
00899 max (const sse_uint8_t& x, const sse_uint8_t& y) {
00900   return sse_uint8_t(_mm_max_epu8 ((__m128i) x, (__m128i) y)); }
00901 
00902 
00903 SIMD_SUGAR (uint8_t, sse_uint8_t)
00904 
00905 #undef SIMD_SUGAR
00906 
00907 
00908 template<typename C>
00909 struct simd_set_duplicate_helper {
00910   typedef typename Simd_base_type(C) I;
00911   static inline C op (const I& x) {
00912     return simd_set_duplicate (x); }
00913 };
00914 
00915 } 
00916 #endif // NUMERIX_ENABLE_SIMD
00917 #endif // __MMX_SSE_HPP