00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef __MMX__POLYNOMIAL_SIMD__HPP
00014 #define __MMX__POLYNOMIAL_SIMD__HPP
00015 #include <numerix/simd.hpp>
00016 #ifdef NUMERIX_ENABLE_SIMD
00017 #include <algebramix/polynomial_unrolled.hpp>
00018
00019 namespace mmx {
00020
00021
00022
00023
00024
00025 #ifdef __SSE2__
00026
00027 template<typename V, typename C, typename K>
00028 struct polynomial_mul_simd_helper {
00029 typedef implementation<polynomial_linear,V> Pol;
00030 static const nat m = Simd_size (C);
00031
00032 static void
00033 op (C* dest, const C* s1, const K* s2, nat n1, nat n2) {
00034 if (n1 < n2) {
00035 op (dest, s2, s1, n2, n1);
00036 return;
00037 }
00038 nat l = aligned_size<K,V> (n2+m);
00039 K* rev_s2 = mmx_new<K> (l);
00040 for (nat j = 0; j < m; j++) {
00041 nat i = 1 + ((n2 + m - j - 1) % m);
00042 K* d = dest + i - 1;
00043 Pol::reverse (rev_s2 + j, s2, n2);
00044 if (j > 0) rev_s2[j-1] = (C) 0;
00045 for (; i <= n2; i += m, d += m)
00046 *d = Pol::Vec::inn_prod (s1, rev_s2 + j + n2 - i, i);
00047 for (i = 1 + ((m+j-1) % m); i < n1; i += m, d += m)
00048 *d = Pol::Vec::inn_prod (s1 + i - j, rev_s2, min (n2, n1 - i) + j);
00049 }
00050 mmx_delete<K> (rev_s2, l);
00051 }
00052 };
00053
00054 #define POLYNOMIAL_MUL_IMPL(C) \
00055 template<typename V, typename K> \
00056 struct polynomial_mul_helper<V,C,K> { \
00057 static inline void \
00058 op (C* dest, const C* s1, const C* s2, nat n1, nat n2) { \
00059 polynomial_mul_simd_helper<V,C,K>::op (dest, s1, s2, n1, n2); } \
00060 };
00061
00062 template<typename V, typename C>
00063 struct polynomial_tmul_simd_helper {
00064 typedef implementation<polynomial_linear,V> Pol;
00065 static const nat m = Simd_size (C);
00066 static void
00067 op (C* dest, const C* s1, const C* s2, nat n1, nat n2) {
00068 nat l = aligned_size<C,V> (n1+m);
00069 C* aligned_s1 = mmx_new<C> (l);
00070 for (nat j = 0; j < m; j++) {
00071 C* d = dest + j;
00072 const C* t2 = s2;
00073 Pol::copy (aligned_s1 + j, s1, n1);
00074 if (j > 0) aligned_s1[j-1] = (C) 0;
00075 for (nat i = j; i < n2; i+=m, d+=m, t2+=m)
00076 *d = Pol::Vec::inn_prod (aligned_s1, t2, n1+j);
00077 }
00078 mmx_delete<C> (aligned_s1, l);
00079 }
00080 };
00081
00082 #define POLYNOMIAL_TMUL_IMPL(C) \
00083 template<typename V> \
00084 struct polynomial_tmul_helper<V,C> { \
00085 static inline void \
00086 op (C* dest, const C* s1, const C* s2, nat n1, nat n2) { \
00087 polynomial_tmul_simd_helper<V,C>::op (dest, s1, s2, n1, n2); } \
00088 };
00089
00090 POLYNOMIAL_MUL_IMPL (double);
00091 POLYNOMIAL_TMUL_IMPL (double);
00092
00093 #undef POLYNOMIAL_MUL_IMPL
00094 #undef POLYNOMIAL_TMUL_IMPL
00095
00096
00097
00098
00099
00100 template<typename V, typename C>
00101 struct polynomial_quo_rem_simd_helper {
00102 typedef implementation<polynomial_linear,V> Pol;
00103 static const nat m = Simd_size (C);
00104 static void
00105 op (C* dest, C* s1, const C* s2, nat n1, nat n2) {
00106
00107
00108
00109 C* aligned_s2 [m];
00110 for (nat j = 1; j < m; j++) {
00111 aligned_s2[j] = mmx_new<C> (aligned_size<C,V> (n2+j));
00112 for (nat k = 0; k < j; k++)
00113 aligned_s2[j][k] = C (0);
00114 Pol::copy (aligned_s2[j] + j, s2, n2);
00115 }
00116 while (n1 >= n2 && n1 != 0) {
00117 int d= n1-n2;
00118 nat j= d % m;
00119 C q= quo (s1[n1-1], s2[n2-1]);
00120 dest[d]= q;
00121 if (j == 0)
00122 Pol::Vec::template vec_binary_scalar<mul_add_op,C,C,C>
00123 (s1 + d, s2, -q, n2);
00124 else
00125 Pol::Vec::template vec_binary_scalar<mul_add_op,C,C,C>
00126 (s1 + d - j, aligned_s2 [j], -q, n2 + j);
00127 n1--;
00128 }
00129 for (nat j = 1; j < m; j++)
00130 mmx_delete<C> (aligned_s2[j], aligned_size<C,V> (n2+j));
00131 }
00132 };
00133
00134 #define POLYNOMIAL_QUO_REM_IMPL(C) \
00135 template<typename V> \
00136 struct polynomial_quo_rem_helper<V,C> { \
00137 static inline void \
00138 op (C* dest, C* s1, const C* s2, nat n1, nat n2) { \
00139 polynomial_quo_rem_simd_helper<V,C>::op (dest, s1, s2, n1, n2); } \
00140 };
00141
00142 POLYNOMIAL_QUO_REM_IMPL (double);
00143
00144 #undef POLYNOMIAL_QUO_REM_IMPL
00145
00146
00147
00148
00149
00150 template<typename V, typename C>
00151 struct polynomial_evaluate_simd_helper {
00152 static const nat m = Simd_size (C);
00153 typedef typename Simd_type (C) sC;
00154 static inline C
00155 op (const C* p, const C& x, nat l) {
00156 if (l == 0) return 0;
00157 nat k = (l+m-1) / m;
00158 const C* q = p + (k-1) * m;
00159 sC sr = simd_load_aligned (q);
00160 nat i = 1; q -= m;
00161 sC sxm= simd_set_duplicate (binpow (x, m));
00162 for (; i + 4 < k; i += 4, q -= 4*m) {
00163 sr = simd_load_aligned (q) + sxm * sr;
00164 sr = simd_load_aligned (q-m) + sxm * sr;
00165 sr = simd_load_aligned (q-2*m) + sxm * sr;
00166 sr = simd_load_aligned (q-3*m) + sxm * sr;
00167 }
00168 for (; i < k; i++, q -= m)
00169 sr = simd_load_aligned (q) + sxm * sr;
00170 C r = ((C*) &sr) [m-1];
00171 for (i = m-2; i != ((nat) -1); i--)
00172 r = r * x + ((C*) &sr) [i];
00173 return r;
00174 }
00175 };
00176
00177 #define POLYNOMIAL_EVALUATE_IMPL(C) \
00178 template<typename V> \
00179 struct polynomial_evaluate_helper<V,C> { \
00180 static inline C \
00181 op (const C* p, const C& x, nat l) { \
00182 return polynomial_evaluate_simd_helper<V,C>::op (p, x, l); } \
00183 };
00184
00185 POLYNOMIAL_EVALUATE_IMPL (double);
00186
00187 #undef POLYNOMIAL_EVALUATE_IMPL
00188
00189 #endif // __SSE2__
00190
00191 }
00192
00193 #endif // NUMERIX_ENABLE_SIMD
00194 #endif //__MMX__POLYNOMIAL_SIMD__HPP