00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef __MMX__MATRIX_SSE__HPP
00014 #define __MMX__MATRIX_SSE__HPP
00015 #include <algebramix/vector_sse.hpp>
00016 #include <algebramix/matrix_aligned.hpp>
00017
00018 namespace mmx {
00019 #if defined (ALGEBRAMIX_ENABLE_SIMD) & defined (__SSE2__) \
00020 & defined (ALGEBRAMIX_HAVE_STDINT_H)
00021
00022
00023
00024
00025
00026 template<bool b, typename V, typename W, typename Op, typename C>
00027 struct mat_mul_simd_helper {
00028 static inline void
00029 mul (C* d, const C* s1, const C* s2,
00030 nat r, nat rr, nat l, nat ll, nat c, nat cc)
00031 {
00032 typedef implementation<matrix_multiply,V> Mat;
00033 Mat::template mul<Op> (d, s1, s2, r, rr, l, ll, c, cc);
00034 }
00035 };
00036
00037 template<typename V, typename W, typename Op, typename C>
00038 struct mat_mul_simd_helper<true,V,W,Op,C> {
00039 static inline void
00040 mul (C* d, const C* s1, const C* s2,
00041 nat r, nat rr, nat l, nat ll, nat c, nat cc)
00042 {
00043 typedef typename Simd_type (C) simd_C;
00044 typedef implementation<matrix_multiply,W> SMat;
00045 static const nat m= Simd_size (C);
00046 VERIFY (r % m == 0, "alignment problem");
00047 VERIFY (rr % m == 0, "alignment problem");
00048 SMat::template mul<Op> ((simd_C*) d, (simd_C*) s1, s2,
00049 r / m, rr / m, l, ll, c, cc);
00050 }
00051 };
00052
00053 template<typename V, typename W,
00054 typename Op, typename C>
00055 struct mat_mul_aligned_helper<V,W,Op,C,C,C> {
00056 static inline void
00057 mul (C* d, const C* s1, const C* s2,
00058 nat r, nat rr, nat l, nat ll, nat c, nat cc)
00059 {
00060 mat_mul_simd_helper<sse_has_helper<Op, C>::value,V,W,Op,C>
00061 ::mul (d, s1, s2, r, rr, l, ll, c, cc);
00062 }
00063 };
00064
00065
00066
00067
00068
template<typename Op>
struct _matrix_sse_mul_3_2 {
  // One row step of the generic 3x2 tile kernel: combine three source
  // columns (s1, s2, s3) into two destination columns (d1, d2) using the
  // broadcast coefficients c11..c23, then advance all five cursors.  The
  // first write to each destination uses Op, subsequent ones accumulate
  // with Op::acc_op.
  template<typename C> static inline void
  op (C*& d1, C*&d2, C*& s1, C*& s2, C*& s3,
      const C& c11, const C& c12, const C& c13,
      const C& c21, const C& c22, const C& c23) {
    typedef typename Op::acc_op Acc;
    Op ::set_op (*d1, c11, *s1);
    Acc::set_op (*d1, c12, *s2);
    Acc::set_op (*d1, c13, *s3);
    Op ::set_op (*d2, c21, *s1);
    Acc::set_op (*d2, c22, *s2);
    Acc::set_op (*d2, c23, *s3);
    ++d1; ++d2; ++s1; ++s2; ++s3; }
};
00083
00084 STMPL
00085 struct _matrix_sse_mul_3_2<mul_op> {
00086 template<typename C> static inline void
00087 op (C*& d1, C*&d2, C*& s1, C*& s2, C*& s3,
00088 const C& c11, const C& c12, const C& c13,
00089 const C& c21, const C& c22, const C& c23) {
00090 *(d1++)= c11 * *(s1) + c12 * *(s2) + c13 * *(s3);
00091 *(d2++)= c21 * *(s1++) + c22 * *(s2++) + c23 * *(s3++); }
00092 };
00093
// Right and left scalar products behave like the plain product here,
// so both reuse the mul_op kernel.
STMPL
struct _matrix_sse_mul_3_2<rmul_op>: _matrix_sse_mul_3_2<mul_op> {};
STMPL
struct _matrix_sse_mul_3_2<lmul_op>: _matrix_sse_mul_3_2<mul_op> {};
00098
00099 STMPL
00100 struct _matrix_sse_mul_3_2<mul_add_op> {
00101 template<typename C> static inline void
00102 op (C*& d1, C*&d2, C*& s1, C*& s2, C*& s3,
00103 const C& c11, const C& c12, const C& c13,
00104 const C& c21, const C& c22, const C& c23) {
00105 *(d1++)+= c11 * *(s1) + c12 * *(s2) + c13 * *(s3);
00106 *(d2++)+= c21 * *(s1++) + c22 * *(s2++) + c23 * *(s3++); }
00107 };
00108
// Right and left accumulating products reuse the mul_add_op kernel.
STMPL
struct _matrix_sse_mul_3_2<rmul_add_op>: _matrix_sse_mul_3_2<mul_add_op> {};
STMPL
struct _matrix_sse_mul_3_2<lmul_add_op>: _matrix_sse_mul_3_2<mul_add_op> {};
00113
template<typename C>
struct mul_unrolled_sse_helper {
  // SSE back-end for the unrolled matrix product over scalar type C.
  // Entries of 'dest' and 'src1' are packed into SIMD vectors of
  // Simd_size(C) scalars (row counts r and rr are expressed in vector
  // units); 'src2' stays scalar and its entries are broadcast on demand.
  typedef typename Simd_type(C) Simd_C;
#if 0
  // Earlier rank-one-update implementation, disabled but kept for
  // reference.
  template<typename Op, nat ur, nat ul, nat uc, typename V, typename W>
  static void
  op (Simd_C* dest, const Simd_C* src1, const C* src2,
      nat r, nat rr, nat l, nat ll, nat c, nat cc) {
    typedef vector_unrolled<4 * Simd_size(C), vector_naive> VV;
    typedef implementation<vector_abstractions,VV> Vec;
    typedef implementation<matrix_multiply,W> Mat;
    typedef typename Op::acc_op Acc;
    if (r < 4 * Simd_size(C)) {
      mul_unrolled<Op, ur, ul, uc, W>
        (dest, src1, src2, r, rr, l, ll, c, cc);
      return;
    }
    for (nat ic=0; ic<c; ic++) {
      Vec::template vec_binary_scalar<Op>
        (dest + Mat::index (0, ic, rr, cc),
         src1 + Mat::index (0, 0, rr, ll),
         simd_set_duplicate (*(src2 + Mat::index (0, ic, ll, cc))),
         r);
      for (nat il= 1; il<l; il++) {
        Vec::template vec_binary_scalar<Acc>
          (dest + Mat::index (0, ic, rr, cc),
           src1 + Mat::index (0, il, rr, ll),
           simd_set_duplicate (*(src2 + Mat::index (il, ic, ll, cc))),
           r);
      }
    }
  }
#endif

  // dest (r x c) <- Op (src1 (r x l) * src2 (l x c)); rr, ll, cc are the
  // allocated strides passed to Mat::index.  The main path processes
  // tiles of szl = 3 source columns times szc = 2 destination columns so
  // that six broadcast coefficients stay live across the inner row loop;
  // remainders fall back to rank-one updates.
  template<typename Op, nat ur, nat ul, nat uc, typename V, typename W>
  static void
  op (Simd_C* dest, const Simd_C* src1, const C* src2,
      nat r, nat rr, nat l, nat ll, nat c, nat cc) {
    typedef typename Vector_simd_variant (C) VV;
    typedef implementation<vector_allocate,VV> Alloc;
    typedef implementation<vector_abstractions,VV> Vec;
    typedef implementation<matrix_multiply,W> Mat;
    typedef typename Op::acc_op Acc;
    ASSERT (Alloc::vec_is_aligned (dest), "wrong alignment");
    ASSERT (Alloc::vec_is_aligned (src1), "wrong alignment");

    // Small dimensions: the tiling overhead does not pay off, so
    // delegate to the fully unrolled kernel.
    if (r < 2 * Simd_size (C) ||
        l < 2 * Simd_size (C) ||
        c < 2 * Simd_size (C)) {
      mul_full_unrolled<Op, ur, ul, uc, W>
        (dest, src1, src2, r, rr, l, ll, c, cc);
      return;
    }

    Simd_C* d1, * d2, * s1, * s2, * s3;
    Simd_C c11, c12, c13;
    Simd_C c21, c22, c23;

    nat szl= 3;  // tile size along the inner dimension l
    nat szc= 2;  // tile size along the column dimension c
    nat ic, il;
    for (ic= 0; ic + szc - 1 < c; ic += szc) {
      // First inner tile (il = 0..2): written with Op so that the two
      // destination columns get initialized (or combined, depending on
      // the semantics of Op).
      d1= dest + Mat::index (0, ic , rr, cc);
      d2= dest + Mat::index (0, ic+1, rr, cc);
      s1= (Simd_C*) src1 + Mat::index (0, 0, rr, ll);
      s2= (Simd_C*) src1 + Mat::index (0, 1, rr, ll);
      s3= (Simd_C*) src1 + Mat::index (0, 2, rr, ll);
      c11= simd_set_duplicate (*(src2 + Mat::index (0, ic , ll, cc)));
      c12= simd_set_duplicate (*(src2 + Mat::index (1, ic , ll, cc)));
      c13= simd_set_duplicate (*(src2 + Mat::index (2, ic , ll, cc)));
      c21= simd_set_duplicate (*(src2 + Mat::index (0, ic+1, ll, cc)));
      c22= simd_set_duplicate (*(src2 + Mat::index (1, ic+1, ll, cc)));
      c23= simd_set_duplicate (*(src2 + Mat::index (2, ic+1, ll, cc)));
      for (nat ir= 0; ir < r; ir++)
        _matrix_sse_mul_3_2<Op>::op (d1, d2, s1, s2, s3,
                                     c11, c12, c13, c21, c22, c23);
      // Remaining full inner tiles: accumulate with Acc.
      for (il= szl; il + szl - 1 < l; il += szl) {
        d1= dest + Mat::index (0, ic , rr, cc);
        d2= dest + Mat::index (0, ic+1, rr, cc);
        s1= (Simd_C*) src1 + Mat::index (0, il , rr, ll);
        s2= (Simd_C*) src1 + Mat::index (0, il+1, rr, ll);
        s3= (Simd_C*) src1 + Mat::index (0, il+2, rr, ll);
        c11= simd_set_duplicate (*(src2 + Mat::index (il+0, ic , ll, cc)));
        c12= simd_set_duplicate (*(src2 + Mat::index (il+1, ic , ll, cc)));
        c13= simd_set_duplicate (*(src2 + Mat::index (il+2, ic , ll, cc)));
        c21= simd_set_duplicate (*(src2 + Mat::index (il+0, ic+1, ll, cc)));
        c22= simd_set_duplicate (*(src2 + Mat::index (il+1, ic+1, ll, cc)));
        c23= simd_set_duplicate (*(src2 + Mat::index (il+2, ic+1, ll, cc)));
        for (nat ir= 0; ir < r; ir++)
          _matrix_sse_mul_3_2<Acc>::op (d1, d2, s1, s2, s3,
                                        c11, c12, c13, c21, c22, c23);
      }
      // Inner remainder (l mod szl source columns): accumulate one source
      // column into both destination columns.
      for (; il < l; il++) {
        d1= dest + Mat::index (0, ic , rr, cc);
        d2= dest + Mat::index (0, ic+1, rr, cc);
        s1= (Simd_C*) src1 + Mat::index (0, il, rr, ll);
        c11= simd_set_duplicate (*(src2 + Mat::index (il, ic , ll, cc)));
        c21= simd_set_duplicate (*(src2 + Mat::index (il, ic+1, ll, cc)));
        for (nat ir= 0; ir < r; ir++) {
          Acc::set_op (*(d1++), c11, *(s1));
          Acc::set_op (*(d2++), c21, *(s1++));
        }
      }
    }
    // Column remainder (c mod szc): handle each leftover destination
    // column by rank-one updates (Op for the first, Acc afterwards).
    for (; ic<c; ic++) {
      Vec::template vec_binary_scalar<Op>
        (dest + Mat::index (0, ic, rr, cc),
         src1 + Mat::index (0, 0, rr, ll),
         simd_set_duplicate (*(src2 + Mat::index (0, ic, ll, cc))),
         r);
      for (nat il= 1; il<l; il++) {
        Vec::template vec_binary_scalar<Acc>
          (dest + Mat::index (0, ic, rr, cc),
           src1 + Mat::index (0, il, rr, ll),
           simd_set_duplicate (*(src2 + Mat::index (il, ic, ll, cc))),
           r);
      }
    }
  }
};
00235
// Plug the SSE kernel into mul_unrolled_helper for each scalar type C
// whose SIMD representation is declared here.  The 8-bit types are
// handled separately below via unpacking to 16 bits.
#define DECLARE_HELPER(C) \
STMPL \
struct mul_unrolled_helper<Simd_type(C), Simd_type(C), C>: \
  mul_unrolled_sse_helper<C> {};

DECLARE_HELPER(double)
DECLARE_HELPER(uint16_t)
DECLARE_HELPER(int16_t)
DECLARE_HELPER(uint32_t)
DECLARE_HELPER(int32_t)
#undef DECLARE_HELPER
00247
00248
00249
00250
00251
template<typename Op>
struct _matrix_unpacked_sse_mul_3_2_uint8 {
  // Generic 3x2 tile step for uint8 entries.  SSE2 offers no 8-bit
  // multiply, so each 16-byte vector is unpacked into two halves of
  // eight 16-bit lanes, combined there with the 16-bit broadcast
  // coefficients, masked back to the low byte of each lane and repacked;
  // results thus agree with arithmetic modulo 256.  The current
  // destination vectors are unpacked as well so that accumulating
  // operations can read them.
  typedef Simd_type(uint8_t) C;
  typedef Simd_type(uint16_t) D;
  static inline void
  op (C*& d1, C*&d2, C*& s1, C*& s2, C*& s3,
      const D& c11, const D& c12, const D& c13,
      const D& c21, const D& c22, const D& c23) {
    typedef typename Op::acc_op Acc;
    D r1l, r1h, r2l, r2h, t1, t2, t3;
    // Low halves of the sources and of the current destinations.
    t1= (D) (__m128i) simd_unpacklo (*s1, _zero_uint8);
    t2= (D) (__m128i) simd_unpacklo (*s2, _zero_uint8);
    t3= (D) (__m128i) simd_unpacklo (*s3, _zero_uint8);
    r1l= (D) (__m128i) simd_unpacklo (*d1, _zero_uint8);
    r2l= (D) (__m128i) simd_unpacklo (*d2, _zero_uint8);
    Op ::set_op (r1l, c11, t1);
    Acc::set_op (r1l, c12, t2);
    Acc::set_op (r1l, c13, t3);
    Op ::set_op (r2l, c21, t1);
    Acc::set_op (r2l, c22, t2);
    Acc::set_op (r2l, c23, t3);
    // High halves; the source cursors advance on their last read.
    t1= (D) (__m128i) simd_unpackhi (*(s1++), _zero_uint8);
    t2= (D) (__m128i) simd_unpackhi (*(s2++), _zero_uint8);
    t3= (D) (__m128i) simd_unpackhi (*(s3++), _zero_uint8);
    r1h= (D) (__m128i) simd_unpackhi (*d1, _zero_uint8);
    r2h= (D) (__m128i) simd_unpackhi (*d2, _zero_uint8);
    Op ::set_op (r1h, c11, t1);
    Acc::set_op (r1h, c12, t2);
    Acc::set_op (r1h, c13, t3);
    Op ::set_op (r2h, c21, t1);
    Acc::set_op (r2h, c22, t2);
    Acc::set_op (r2h, c23, t3);
    // Keep only the low byte of each 16-bit lane and repack; the
    // destination cursors advance here.
    *(d1++)= simd_pack (r1l & _low_mask_uint16, r1h & _low_mask_uint16);
    *(d2++)= simd_pack (r2l & _low_mask_uint16, r2h & _low_mask_uint16); }
};
00287
00288 template<typename Op>
00289 struct _matrix_unpacked_sse_mul_2_1_uint8 {
00290 typedef Simd_type(uint8_t) C;
00291 typedef Simd_type(uint16_t) D;
00292 static inline void
00293 op (C*& d1, C*&d2, C*& s1, const D& c11, const D& c21) {
00294 typedef typename Op::acc_op Acc;
00295 D r1l, r1h, r2l, r2h, t1;
00296 t1 = (D) (__m128i) simd_unpacklo (*s1, _zero_uint8);
00297 r1l= (D) (__m128i) simd_unpacklo (*d1, _zero_uint8);
00298 r2l= (D) (__m128i) simd_unpacklo (*d2, _zero_uint8);
00299 Op::set_op (r1l, c11, t1);
00300 Op::set_op (r2l, c21, t1);
00301 t1 = (D) (__m128i) simd_unpackhi (*(s1++), _zero_uint8);
00302 r1h= (D) (__m128i) simd_unpackhi (*d1, _zero_uint8);
00303 r2h= (D) (__m128i) simd_unpackhi (*d2, _zero_uint8);
00304 Op::set_op (r1h, c11, t1);
00305 Op::set_op (r2h, c21, t1);
00306 *(d1++)= simd_pack (r1l & _low_mask_uint16, r1h & _low_mask_uint16);
00307 *(d2++)= simd_pack (r2l & _low_mask_uint16, r2h & _low_mask_uint16); }
00308 };
00309
STMPL
struct _matrix_unpacked_sse_mul_3_2_uint8<mul_op> {
  // Plain product specialization: the destinations are overwritten, so
  // the previous contents of *d1 and *d2 need not be unpacked.
  typedef Simd_type(uint8_t) C;
  typedef Simd_type(uint16_t) D;
  static inline void
  op (C*& d1, C*&d2, C*& s1, C*& s2, C*& s3,
      const D& c11, const D& c12, const D& c13,
      const D& c21, const D& c22, const D& c23) {
    D r1l, r1h, r2l, r2h, t1, t2, t3;
    // Low halves of the three sources, widened to 16-bit lanes.
    t1= (D) (__m128i) simd_unpacklo (*s1, _zero_uint8);
    t2= (D) (__m128i) simd_unpacklo (*s2, _zero_uint8);
    t3= (D) (__m128i) simd_unpacklo (*s3, _zero_uint8);
    r1l= c11 * t1 + c12 * t2 + c13 * t3;
    r2l= c21 * t1 + c22 * t2 + c23 * t3;
    // High halves; the source cursors advance on their last read.
    t1= (D) (__m128i) simd_unpackhi (*(s1++), _zero_uint8);
    t2= (D) (__m128i) simd_unpackhi (*(s2++), _zero_uint8);
    t3= (D) (__m128i) simd_unpackhi (*(s3++), _zero_uint8);
    r1h= c11 * t1 + c12 * t2 + c13 * t3;
    r2h= c21 * t1 + c22 * t2 + c23 * t3;
    // Mask to the low byte of each lane and repack (modulo 256).
    *(d1++)= simd_pack (r1l & _low_mask_uint16, r1h & _low_mask_uint16);
    *(d2++)= simd_pack (r2l & _low_mask_uint16, r2h & _low_mask_uint16); }
};
00332
// Right and left scalar products reuse the plain product kernel.
STMPL
struct _matrix_unpacked_sse_mul_3_2_uint8<rmul_op>:
  _matrix_unpacked_sse_mul_3_2_uint8<mul_op> {};
STMPL
struct _matrix_unpacked_sse_mul_3_2_uint8<lmul_op>:
  _matrix_unpacked_sse_mul_3_2_uint8<mul_op> {};
00339
STMPL
struct _matrix_unpacked_sse_mul_3_2_uint8<mul_add_op> {
  // Accumulating product specialization: the combination is computed in
  // 16-bit lanes as in the mul_op case, repacked to bytes, and then
  // added onto the destinations with packed 8-bit addition (wrapping
  // modulo 256), which avoids unpacking *d1 and *d2.
  typedef Simd_type(uint8_t) C;
  typedef Simd_type(uint16_t) D;
  static inline void
  op (C*& d1, C*&d2, C*& s1, C*& s2, C*& s3,
      const D& c11, const D& c12, const D& c13,
      const D& c21, const D& c22, const D& c23) {
    D r1l, r1h, r2l, r2h, t1, t2, t3;
    // Low halves of the three sources, widened to 16-bit lanes.
    t1= (D) (__m128i) simd_unpacklo (*s1, _zero_uint8);
    t2= (D) (__m128i) simd_unpacklo (*s2, _zero_uint8);
    t3= (D) (__m128i) simd_unpacklo (*s3, _zero_uint8);
    r1l= c11 * t1 + c12 * t2 + c13 * t3;
    r2l= c21 * t1 + c22 * t2 + c23 * t3;
    // High halves; the source cursors advance on their last read.
    t1= (D) (__m128i) simd_unpackhi (*(s1++), _zero_uint8);
    t2= (D) (__m128i) simd_unpackhi (*(s2++), _zero_uint8);
    t3= (D) (__m128i) simd_unpackhi (*(s3++), _zero_uint8);
    r1h= c11 * t1 + c12 * t2 + c13 * t3;
    r2h= c21 * t1 + c22 * t2 + c23 * t3;
    // Mask, repack and accumulate onto the destinations.
    *(d1++)+= simd_pack (r1l & _low_mask_uint16, r1h & _low_mask_uint16);
    *(d2++)+= simd_pack (r2l & _low_mask_uint16, r2h & _low_mask_uint16); }
};
00362
// Right and left accumulating products reuse the mul_add_op kernel.
STMPL
struct _matrix_unpacked_sse_mul_3_2_uint8<rmul_add_op>:
  _matrix_unpacked_sse_mul_3_2_uint8<mul_add_op> {};
STMPL
struct _matrix_unpacked_sse_mul_3_2_uint8<lmul_add_op>:
  _matrix_unpacked_sse_mul_3_2_uint8<mul_add_op> {};
00369
struct mul_unrolled_unpacked_sse_helper_uint8 {
  // SSE kernel for uint8 matrices.  SSE2 has no 8-bit multiply, so the
  // 3x2 and 2x1 tile kernels above unpack each vector into 16-bit lanes;
  // the coefficients taken from src2 are widened to uint16 before being
  // broadcast.
  typedef uint8_t C;
  typedef Simd_type(C) Simd_C;
  typedef uint16_t D;
  typedef Simd_type(D) Simd_D;

  // Same tiling strategy as mul_unrolled_sse_helper::op: dest (r x c,
  // rows counted in SIMD vectors) <- Op (src1 (r x l) * src2 (l x c)),
  // with rr, ll, cc the allocated strides used by Mat::index.
  template<typename Op, nat ur, nat ul, nat uc, typename V, typename W>
  static void
  op (Simd_C* dest, const Simd_C* src1, const C* src2,
      nat r, nat rr, nat l, nat ll, nat c, nat cc) {
    typedef typename Vector_simd_variant (C) VV;
    typedef implementation<vector_allocate,VV> Alloc;
    typedef implementation<vector_abstractions,VV> Vec;
    typedef implementation<matrix_multiply,W> Mat;
    typedef typename Op::acc_op Acc;
    ASSERT (Alloc::vec_is_aligned (dest), "wrong alignment");
    ASSERT (Alloc::vec_is_aligned (src1), "wrong alignment");

    // Degenerate or small dimensions: delegate to the fully unrolled
    // kernel.
    if (r == 0 ||
        l < Simd_size (C) ||
        c < Simd_size (C)) {
      mul_full_unrolled<Op, ur, ul, uc, W>
        (dest, src1, src2, r, rr, l, ll, c, cc);
      return;
    }
    Simd_C* d1, * d2, * s1, * s2, * s3;
    Simd_D c11, c12, c13;
    Simd_D c21, c22, c23;

    nat szl= 3;  // tile size along the inner dimension l
    nat szc= 2;  // tile size along the column dimension c
    nat ic, il;
    for (ic= 0; ic + szc - 1 < c; ic += szc) {
      // First inner tile (il = 0..2): written with Op.
      d1= dest + Mat::index (0, ic , rr, cc);
      d2= dest + Mat::index (0, ic+1, rr, cc);
      s1= (Simd_C*) src1 + Mat::index (0, 0, rr, ll);
      s2= (Simd_C*) src1 + Mat::index (0, 1, rr, ll);
      s3= (Simd_C*) src1 + Mat::index (0, 2, rr, ll);
      c11= simd_set_duplicate ((D) *(src2 + Mat::index (0, ic , ll, cc)));
      c12= simd_set_duplicate ((D) *(src2 + Mat::index (1, ic , ll, cc)));
      c13= simd_set_duplicate ((D) *(src2 + Mat::index (2, ic , ll, cc)));
      c21= simd_set_duplicate ((D) *(src2 + Mat::index (0, ic+1, ll, cc)));
      c22= simd_set_duplicate ((D) *(src2 + Mat::index (1, ic+1, ll, cc)));
      c23= simd_set_duplicate ((D) *(src2 + Mat::index (2, ic+1, ll, cc)));
      for (nat ir= 0; ir < r; ir++)
        _matrix_unpacked_sse_mul_3_2_uint8<Op>::op
          (d1, d2, s1, s2, s3, c11, c12, c13, c21, c22, c23);
      // Remaining full inner tiles: accumulate with Acc.
      for (il= szl; il + szl - 1 < l; il += szl) {
        d1= dest + Mat::index (0, ic , rr, cc);
        d2= dest + Mat::index (0, ic+1, rr, cc);
        s1= (Simd_C*) src1 + Mat::index (0, il , rr, ll);
        s2= (Simd_C*) src1 + Mat::index (0, il+1, rr, ll);
        s3= (Simd_C*) src1 + Mat::index (0, il+2, rr, ll);
        c11= simd_set_duplicate
          ((D) *(src2 + Mat::index (il+0, ic , ll, cc)));
        c12= simd_set_duplicate
          ((D) *(src2 + Mat::index (il+1, ic , ll, cc)));
        c13= simd_set_duplicate
          ((D) *(src2 + Mat::index (il+2, ic , ll, cc)));
        c21= simd_set_duplicate
          ((D) *(src2 + Mat::index (il+0, ic+1, ll, cc)));
        c22= simd_set_duplicate
          ((D) *(src2 + Mat::index (il+1, ic+1, ll, cc)));
        c23= simd_set_duplicate
          ((D) *(src2 + Mat::index (il+2, ic+1, ll, cc)));
        for (nat ir= 0; ir < r; ir++)
          _matrix_unpacked_sse_mul_3_2_uint8<Acc>::op
            (d1, d2, s1, s2, s3, c11, c12, c13, c21, c22, c23);
      }
      // Inner remainder: accumulate through the 2x1 kernel.
      for (; il < l; il++) {
        d1= dest + Mat::index (0, ic , rr, cc);
        d2= dest + Mat::index (0, ic+1, rr, cc);
        s1= (Simd_C*) src1 + Mat::index (0, il, rr, ll);
        c11= simd_set_duplicate ((D) *(src2 + Mat::index (il, ic , ll, cc)));
        c21= simd_set_duplicate ((D) *(src2 + Mat::index (il, ic+1, ll, cc)));
        for (nat ir= 0; ir < r; ir++)
          _matrix_unpacked_sse_mul_2_1_uint8<Acc>::op (d1, d2, s1, c11, c21);
      }
    }
    // Column remainder: rank-one updates on single destination columns.
    // NOTE(review): unlike the tiled path above, the src2 entries here
    // are broadcast without widening to uint16 — presumably the variant
    // Vector_simd_variant (uint8_t) handles byte products itself; verify.
    for (; ic<c; ic++) {
      Vec::template vec_binary_scalar<Op>
        (dest + Mat::index (0, ic, rr, cc),
         src1 + Mat::index (0, 0, rr, ll),
         simd_set_duplicate (*(src2 + Mat::index (0, ic, ll, cc))),
         r);
      for (nat il= 1; il<l; il++) {
        Vec::template vec_binary_scalar<Acc>
          (dest + Mat::index (0, ic, rr, cc),
           src1 + Mat::index (0, il, rr, ll),
           simd_set_duplicate (*(src2 + Mat::index (il, ic, ll, cc))),
           r);
      }
    }
  }
};
00466
// Register the unpacking kernel for uint8 matrices.
STMPL
struct mul_unrolled_helper<sse_uint8_t,sse_uint8_t,uint8_t>:
  mul_unrolled_unpacked_sse_helper_uint8 {};
00470
00471
00472
00473
00474
// Signed 8-bit products are delegated to the unsigned kernel: for
// multiplication and addition the low eight bits of the result do not
// depend on signedness (both coincide with arithmetic modulo 2^8), so
// the pointers may simply be reinterpreted.
struct mul_unrolled_unpacked_sse_helper_int8 {
  template<typename Op, nat ur, nat ul, nat uc, typename V, typename W>
  static void
  op (sse_int8_t* dest, const sse_int8_t* src1, const int8_t* src2,
      nat r, nat rr, nat l, nat ll, nat c, nat cc) {
    mul_unrolled_unpacked_sse_helper_uint8::op<Op,ur,ul,uc,V,W>
      ((sse_uint8_t*) dest, (const sse_uint8_t*) src1, (const uint8_t*) src2,
       r, rr, l, ll, c, cc); }
};
00484
// Register the delegating kernel for int8 matrices.
STMPL
struct mul_unrolled_helper<sse_int8_t,sse_int8_t,int8_t>:
  mul_unrolled_unpacked_sse_helper_int8 {};
00488
#endif // ALGEBRAMIX_ENABLE_SIMD && __SSE2__ && ALGEBRAMIX_HAVE_STDINT_H
00490 }
#endif // __MMX__MATRIX_SSE__HPP
00492