00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef __MMX__MATRIX_SSE__HPP
00014 #define __MMX__MATRIX_SSE__HPP
00015 #include <algebramix/vector_sse.hpp>
00016 #include <algebramix/matrix_aligned.hpp>
00017
00018 namespace mmx {
00019 #if defined (ALGEBRAMIX_ENABLE_SIMD) & defined (__SSE2__) \
00020 & defined (ALGEBRAMIX_HAVE_STDINT_H)
00021
00022
00023
00024
00025
00026 template<bool b, typename V, typename W, typename Op, typename C>
00027 struct mat_mul_simd_helper {
00028 static inline void
00029 mul (C* d, const C* s1, const C* s2,
00030 nat r, nat rr, nat l, nat ll, nat c, nat cc)
00031 {
00032 typedef implementation<matrix_multiply,V> Mat;
00033 Mat::template mul<Op> (d, s1, s2, r, rr, l, ll, c, cc);
00034 }
00035 };
00036
00037 template<typename V, typename W, typename Op, typename C>
00038 struct mat_mul_simd_helper<true,V,W,Op,C> {
00039 static inline void
00040 mul (C* d, const C* s1, const C* s2,
00041 nat r, nat rr, nat l, nat ll, nat c, nat cc)
00042 {
00043 typedef typename Simd_type (C) simd_C;
00044 typedef implementation<matrix_multiply,W> SMat;
00045 static const nat m= Simd_size (C);
00046 VERIFY (r % m == 0, "alignment problem");
00047 VERIFY (rr % m == 0, "alignment problem");
00048 SMat::template mul<Op> ((simd_C*) d, (simd_C*) s1, s2,
00049 r / m, rr / m, l, ll, c, cc);
00050 }
00051 };
00052
00053 template<typename V, typename W,
00054 typename Op, typename C>
00055 struct mat_mul_aligned_helper<V,W,Op,C,C,C> {
00056 static inline void
00057 mul (C* d, const C* s1, const C* s2,
00058 nat r, nat rr, nat l, nat ll, nat c, nat cc)
00059 {
00060 mat_mul_simd_helper<sse_has_helper<Op, C>::value,V,W,Op,C>
00061 ::mul (d, s1, s2, r, rr, l, ll, c, cc);
00062 }
00063 };
00064
00065
00066
00067
00068
template<typename Op>
struct _matrix_sse_mul_3_2 {
  // One row step of the generic 3x2 tile kernel: combine three source
  // columns (s1, s2, s3) into two destination columns (d1, d2) using the
  // broadcast coefficients c11..c23, then advance all five cursors.  The
  // first write to each destination uses Op, subsequent ones accumulate
  // with Op::acc_op.
  template<typename C> static inline void
  op (C*& d1, C*&d2, C*& s1, C*& s2, C*& s3,
      const C& c11, const C& c12, const C& c13,
      const C& c21, const C& c22, const C& c23) {
    typedef typename Op::acc_op Acc;
    Op ::set_op (*d1, c11, *s1);
    Acc::set_op (*d1, c12, *s2);
    Acc::set_op (*d1, c13, *s3);
    Op ::set_op (*d2, c21, *s1);
    Acc::set_op (*d2, c22, *s2);
    Acc::set_op (*d2, c23, *s3);
    ++d1; ++d2; ++s1; ++s2; ++s3; }
};
00083
00084 STMPL
00085 struct _matrix_sse_mul_3_2<mul_op> {
00086 template<typename C> static inline void
00087 op (C*& d1, C*&d2, C*& s1, C*& s2, C*& s3,
00088 const C& c11, const C& c12, const C& c13,
00089 const C& c21, const C& c22, const C& c23) {
00090 *(d1++)= c11 * *(s1) + c12 * *(s2) + c13 * *(s3);
00091 *(d2++)= c21 * *(s1++) + c22 * *(s2++) + c23 * *(s3++); }
00092 };
00093
// Right and left scalar products behave like the plain product here,
// so both reuse the mul_op kernel.
STMPL
struct _matrix_sse_mul_3_2<rmul_op>: _matrix_sse_mul_3_2<mul_op> {};
STMPL
struct _matrix_sse_mul_3_2<lmul_op>: _matrix_sse_mul_3_2<mul_op> {};
00098
00099 STMPL
00100 struct _matrix_sse_mul_3_2<mul_add_op> {
00101 template<typename C> static inline void
00102 op (C*& d1, C*&d2, C*& s1, C*& s2, C*& s3,
00103 const C& c11, const C& c12, const C& c13,
00104 const C& c21, const C& c22, const C& c23) {
00105 *(d1++)+= c11 * *(s1) + c12 * *(s2) + c13 * *(s3);
00106 *(d2++)+= c21 * *(s1++) + c22 * *(s2++) + c23 * *(s3++); }
00107 };
00108
// Right and left accumulating products reuse the mul_add_op kernel.
STMPL
struct _matrix_sse_mul_3_2<rmul_add_op>: _matrix_sse_mul_3_2<mul_add_op> {};
STMPL
struct _matrix_sse_mul_3_2<lmul_add_op>: _matrix_sse_mul_3_2<mul_add_op> {};
00113
template<typename C>
struct mul_unrolled_sse_helper {
  // SSE back-end for the unrolled matrix product over scalar type C.
  // Entries of 'dest' and 'src1' are packed into SIMD vectors of
  // Simd_size(C) scalars (row counts r and rr are expressed in vector
  // units); 'src2' stays scalar and its entries are broadcast on demand.
  typedef typename Simd_type(C) Simd_C;
#if 0
  // Earlier rank-one-update implementation, disabled but kept for
  // reference.
  template<typename Op, nat ur, nat ul, nat uc, typename V, typename W>
  static void
  op (Simd_C* dest, const Simd_C* src1, const C* src2,
      nat r, nat rr, nat l, nat ll, nat c, nat cc) {
    typedef vector_unrolled<4 * Simd_size(C), vector_naive> VV;
    typedef implementation<vector_abstractions,VV> Vec;
    typedef implementation<matrix_multiply,W> Mat;
    typedef typename Op::acc_op Acc;
    if (r < 4 * Simd_size(C)) {
      mul_unrolled<Op, ur, ul, uc, W>
        (dest, src1, src2, r, rr, l, ll, c, cc);
      return;
    }
    for (nat ic=0; ic<c; ic++) {
      Vec::template vec_binary_scalar<Op>
        (dest + Mat::index (0, ic, rr, cc),
         src1 + Mat::index (0, 0, rr, ll),
         simd_set_duplicate (*(src2 + Mat::index (0, ic, ll, cc))),
         r);
      for (nat il= 1; il<l; il++) {
        Vec::template vec_binary_scalar<Acc>
          (dest + Mat::index (0, ic, rr, cc),
           src1 + Mat::index (0, il, rr, ll),
           simd_set_duplicate (*(src2 + Mat::index (il, ic, ll, cc))),
           r);
      }
    }
  }
#endif

  // dest (r x c) <- Op (src1 (r x l) * src2 (l x c)); rr, ll, cc are the
  // allocated strides passed to Mat::index.  The main path processes
  // tiles of szl = 3 source columns times szc = 2 destination columns so
  // that six broadcast coefficients stay live across the inner row loop;
  // remainders fall back to rank-one updates.
  template<typename Op, nat ur, nat ul, nat uc, typename V, typename W>
  static void
  op (Simd_C* dest, const Simd_C* src1, const C* src2,
      nat r, nat rr, nat l, nat ll, nat c, nat cc) {
    typedef typename Vector_simd_variant (C) VV;
    typedef implementation<vector_allocate,VV> Alloc;
    typedef implementation<vector_abstractions,VV> Vec;
    typedef implementation<matrix_multiply,W> Mat;
    typedef typename Op::acc_op Acc;
    ASSERT (Alloc::vec_is_aligned (dest), "wrong alignment");
    ASSERT (Alloc::vec_is_aligned (src1), "wrong alignment");

    // Small dimensions: the tiling overhead does not pay off, so
    // delegate to the fully unrolled kernel.
    if (r < 2 * Simd_size (C) ||
        l < 2 * Simd_size (C) ||
        c < 2 * Simd_size (C)) {
      mul_full_unrolled<Op, ur, ul, uc, W>
        (dest, src1, src2, r, rr, l, ll, c, cc);
      return;
    }

    Simd_C* d1, * d2, * s1, * s2, * s3;
    Simd_C c11, c12, c13;
    Simd_C c21, c22, c23;

    nat szl= 3;  // tile size along the inner dimension l
    nat szc= 2;  // tile size along the column dimension c
    nat ic, il;
    for (ic= 0; ic + szc - 1 < c; ic += szc) {
      // First inner tile (il = 0..2): written with Op so that the two
      // destination columns get initialized (or combined, depending on
      // the semantics of Op).
      d1= dest + Mat::index (0, ic , rr, cc);
      d2= dest + Mat::index (0, ic+1, rr, cc);
      s1= (Simd_C*) src1 + Mat::index (0, 0, rr, ll);
      s2= (Simd_C*) src1 + Mat::index (0, 1, rr, ll);
      s3= (Simd_C*) src1 + Mat::index (0, 2, rr, ll);
      c11= simd_set_duplicate (*(src2 + Mat::index (0, ic , ll, cc)));
      c12= simd_set_duplicate (*(src2 + Mat::index (1, ic , ll, cc)));
      c13= simd_set_duplicate (*(src2 + Mat::index (2, ic , ll, cc)));
      c21= simd_set_duplicate (*(src2 + Mat::index (0, ic+1, ll, cc)));
      c22= simd_set_duplicate (*(src2 + Mat::index (1, ic+1, ll, cc)));
      c23= simd_set_duplicate (*(src2 + Mat::index (2, ic+1, ll, cc)));
      for (nat ir= 0; ir < r; ir++)
        _matrix_sse_mul_3_2<Op>::op (d1, d2, s1, s2, s3,
                                     c11, c12, c13, c21, c22, c23);
      // Remaining full inner tiles: accumulate with Acc.
      for (il= szl; il + szl - 1 < l; il += szl) {
        d1= dest + Mat::index (0, ic , rr, cc);
        d2= dest + Mat::index (0, ic+1, rr, cc);
        s1= (Simd_C*) src1 + Mat::index (0, il , rr, ll);
        s2= (Simd_C*) src1 + Mat::index (0, il+1, rr, ll);
        s3= (Simd_C*) src1 + Mat::index (0, il+2, rr, ll);
        c11= simd_set_duplicate (*(src2 + Mat::index (il+0, ic , ll, cc)));
        c12= simd_set_duplicate (*(src2 + Mat::index (il+1, ic , ll, cc)));
        c13= simd_set_duplicate (*(src2 + Mat::index (il+2, ic , ll, cc)));
        c21= simd_set_duplicate (*(src2 + Mat::index (il+0, ic+1, ll, cc)));
        c22= simd_set_duplicate (*(src2 + Mat::index (il+1, ic+1, ll, cc)));
        c23= simd_set_duplicate (*(src2 + Mat::index (il+2, ic+1, ll, cc)));
        for (nat ir= 0; ir < r; ir++)
          _matrix_sse_mul_3_2<Acc>::op (d1, d2, s1, s2, s3,
                                        c11, c12, c13, c21, c22, c23);
      }
      // Inner remainder (l mod szl source columns): accumulate one source
      // column into both destination columns.
      for (; il < l; il++) {
        d1= dest + Mat::index (0, ic , rr, cc);
        d2= dest + Mat::index (0, ic+1, rr, cc);
        s1= (Simd_C*) src1 + Mat::index (0, il, rr, ll);
        c11= simd_set_duplicate (*(src2 + Mat::index (il, ic , ll, cc)));
        c21= simd_set_duplicate (*(src2 + Mat::index (il, ic+1, ll, cc)));
        for (nat ir= 0; ir < r; ir++) {
          Acc::set_op (*(d1++), c11, *(s1));
          Acc::set_op (*(d2++), c21, *(s1++));
        }
      }
    }
    // Column remainder (c mod szc): handle each leftover destination
    // column by rank-one updates (Op for the first, Acc afterwards).
    for (; ic<c; ic++) {
      Vec::template vec_binary_scalar<Op>
        (dest + Mat::index (0, ic, rr, cc),
         src1 + Mat::index (0, 0, rr, ll),
         simd_set_duplicate (*(src2 + Mat::index (0, ic, ll, cc))),
         r);
      for (nat il= 1; il<l; il++) {
        Vec::template vec_binary_scalar<Acc>
          (dest + Mat::index (0, ic, rr, cc),
           src1 + Mat::index (0, il, rr, ll),
           simd_set_duplicate (*(src2 + Mat::index (il, ic, ll, cc))),
           r);
      }
    }
  }
};
00235
// Plug the SSE kernel into mul_unrolled_helper for each scalar type C
// whose SIMD representation is declared here.  The 8-bit types are
// handled separately below via unpacking to 16 bits.
#define DECLARE_HELPER(C) \
STMPL \
struct mul_unrolled_helper<Simd_type(C), Simd_type(C), C>: \
  mul_unrolled_sse_helper<C> {};

DECLARE_HELPER(double)
DECLARE_HELPER(uint16_t)
DECLARE_HELPER(int16_t)
DECLARE_HELPER(uint32_t)
DECLARE_HELPER(int32_t)
#undef DECLARE_HELPER
00247
00248
00249
00250
00251
template<typename Op>
struct _matrix_unpacked_sse_mul_3_2_uint8 {
  // Generic 3x2 tile step for uint8 entries.  SSE2 offers no 8-bit
  // multiply, so each 16-byte vector is unpacked into two halves of
  // eight 16-bit lanes, combined there with the 16-bit broadcast
  // coefficients, masked back to the low byte of each lane and repacked;
  // results thus agree with arithmetic modulo 256.  The current
  // destination vectors are unpacked as well so that accumulating
  // operations can read them.
  typedef Simd_type(uint8_t) C;
  typedef Simd_type(uint16_t) D;
  static inline void
  op (C*& d1, C*&d2, C*& s1, C*& s2, C*& s3,
      const D& c11, const D& c12, const D& c13,
      const D& c21, const D& c22, const D& c23) {
    typedef typename Op::acc_op Acc;
    D r1l, r1h, r2l, r2h, t1, t2, t3;
    // Low halves of the sources and of the current destinations.
    t1= (D) (__m128i) simd_unpacklo (*s1, _zero_uint8);
    t2= (D) (__m128i) simd_unpacklo (*s2, _zero_uint8);
    t3= (D) (__m128i) simd_unpacklo (*s3, _zero_uint8);
    r1l= (D) (__m128i) simd_unpacklo (*d1, _zero_uint8);
    r2l= (D) (__m128i) simd_unpacklo (*d2, _zero_uint8);
    Op ::set_op (r1l, c11, t1);
    Acc::set_op (r1l, c12, t2);
    Acc::set_op (r1l, c13, t3);
    Op ::set_op (r2l, c21, t1);
    Acc::set_op (r2l, c22, t2);
    Acc::set_op (r2l, c23, t3);
    // High halves; the source cursors advance on their last read.
    t1= (D) (__m128i) simd_unpackhi (*(s1++), _zero_uint8);
    t2= (D) (__m128i) simd_unpackhi (*(s2++), _zero_uint8);
    t3= (D) (__m128i) simd_unpackhi (*(s3++), _zero_uint8);
    r1h= (D) (__m128i) simd_unpackhi (*d1, _zero_uint8);
    r2h= (D) (__m128i) simd_unpackhi (*d2, _zero_uint8);
    Op ::set_op (r1h, c11, t1);
    Acc::set_op (r1h, c12, t2);
    Acc::set_op (r1h, c13, t3);
    Op ::set_op (r2h, c21, t1);
    Acc::set_op (r2h, c22, t2);
    Acc::set_op (r2h, c23, t3);
    // Keep only the low byte of each 16-bit lane and repack; the
    // destination cursors advance here.
    *(d1++)= simd_pack (r1l & _low_mask_uint16, r1h & _low_mask_uint16);
    *(d2++)= simd_pack (r2l & _low_mask_uint16, r2h & _low_mask_uint16); }
};
00287
00288 template<typename Op>
00289 struct _matrix_unpacked_sse_mul_2_1_uint8 {
00290 typedef Simd_type(uint8_t) C;
00291 typedef Simd_type(uint16_t) D;
00292 static inline void
00293 op (C*& d1, C*&d2, C*& s1, const D& c11, const D& c21) {
00294 typedef typename Op::acc_op Acc;
00295 D r1l, r1h, r2l, r2h, t1;
00296 t1 = (D) (__m128i) simd_unpacklo (*s1, _zero_uint8);
00297 r1l= (D) (__m128i) simd_unpacklo (*d1, _zero_uint8);
00298 r2l= (D) (__m128i) simd_unpacklo (*d2, _zero_uint8);
00299 Op::set_op (r1l, c11, t1);
00300 Op::set_op (r2l, c21, t1);
00301 t1 = (D) (__m128i) simd_unpackhi (*(s1++), _zero_uint8);
00302 r1h= (D) (__m128i) simd_unpackhi (*d1, _zero_uint8);
00303 r2h= (D) (__m128i) simd_unpackhi (*d2, _zero_uint8);
00304 Op::set_op (r1h, c11, t1);
00305 Op::set_op (r2h, c21, t1);
00306 *(d1++)= simd_pack (r1l & _low_mask_uint16, r1h & _low_mask_uint16);
00307 *(d2++)= simd_pack (r2l & _low_mask_uint16, r2h & _low_mask_uint16); }
00308 };
00309
STMPL
struct _matrix_unpacked_sse_mul_3_2_uint8<mul_op> {
  // Plain product specialization: the destinations are overwritten, so
  // the previous contents of *d1 and *d2 need not be unpacked.
  typedef Simd_type(uint8_t) C;
  typedef Simd_type(uint16_t) D;
  static inline void
  op (C*& d1, C*&d2, C*& s1, C*& s2, C*& s3,
      const D& c11, const D& c12, const D& c13,
      const D& c21, const D& c22, const D& c23) {
    D r1l, r1h, r2l, r2h, t1, t2, t3;
    // Low halves of the three sources, widened to 16-bit lanes.
    t1= (D) (__m128i) simd_unpacklo (*s1, _zero_uint8);
    t2= (D) (__m128i) simd_unpacklo (*s2, _zero_uint8);
    t3= (D) (__m128i) simd_unpacklo (*s3, _zero_uint8);
    r1l= c11 * t1 + c12 * t2 + c13 * t3;
    r2l= c21 * t1 + c22 * t2 + c23 * t3;
    // High halves; the source cursors advance on their last read.
    t1= (D) (__m128i) simd_unpackhi (*(s1++), _zero_uint8);
    t2= (D) (__m128i) simd_unpackhi (*(s2++), _zero_uint8);
    t3= (D) (__m128i) simd_unpackhi (*(s3++), _zero_uint8);
    r1h= c11 * t1 + c12 * t2 + c13 * t3;
    r2h= c21 * t1 + c22 * t2 + c23 * t3;
    // Mask to the low byte of each lane and repack (modulo 256).
    *(d1++)= simd_pack (r1l & _low_mask_uint16, r1h & _low_mask_uint16);
    *(d2++)= simd_pack (r2l & _low_mask_uint16, r2h & _low_mask_uint16); }
};
00332
// Right and left scalar products reuse the plain product kernel.
STMPL
struct _matrix_unpacked_sse_mul_3_2_uint8<rmul_op>:
  _matrix_unpacked_sse_mul_3_2_uint8<mul_op> {};
STMPL
struct _matrix_unpacked_sse_mul_3_2_uint8<lmul_op>:
  _matrix_unpacked_sse_mul_3_2_uint8<mul_op> {};
00339
STMPL
struct _matrix_unpacked_sse_mul_3_2_uint8<mul_add_op> {
  // Accumulating product specialization: the combination is computed in
  // 16-bit lanes as in the mul_op case, repacked to bytes, and then
  // added onto the destinations with packed 8-bit addition (wrapping
  // modulo 256), which avoids unpacking *d1 and *d2.
  typedef Simd_type(uint8_t) C;
  typedef Simd_type(uint16_t) D;
  static inline void
  op (C*& d1, C*&d2, C*& s1, C*& s2, C*& s3,
      const D& c11, const D& c12, const D& c13,
      const D& c21, const D& c22, const D& c23) {
    D r1l, r1h, r2l, r2h, t1, t2, t3;
    // Low halves of the three sources, widened to 16-bit lanes.
    t1= (D) (__m128i) simd_unpacklo (*s1, _zero_uint8);
    t2= (D) (__m128i) simd_unpacklo (*s2, _zero_uint8);
    t3= (D) (__m128i) simd_unpacklo (*s3, _zero_uint8);
    r1l= c11 * t1 + c12 * t2 + c13 * t3;
    r2l= c21 * t1 + c22 * t2 + c23 * t3;
    // High halves; the source cursors advance on their last read.
    t1= (D) (__m128i) simd_unpackhi (*(s1++), _zero_uint8);
    t2= (D) (__m128i) simd_unpackhi (*(s2++), _zero_uint8);
    t3= (D) (__m128i) simd_unpackhi (*(s3++), _zero_uint8);
    r1h= c11 * t1 + c12 * t2 + c13 * t3;
    r2h= c21 * t1 + c22 * t2 + c23 * t3;
    // Mask, repack and accumulate onto the destinations.
    *(d1++)+= simd_pack (r1l & _low_mask_uint16, r1h & _low_mask_uint16);
    *(d2++)+= simd_pack (r2l & _low_mask_uint16, r2h & _low_mask_uint16); }
};
00362
// Right and left accumulating products reuse the mul_add_op kernel.
STMPL
struct _matrix_unpacked_sse_mul_3_2_uint8<rmul_add_op>:
  _matrix_unpacked_sse_mul_3_2_uint8<mul_add_op> {};
STMPL
struct _matrix_unpacked_sse_mul_3_2_uint8<lmul_add_op>:
  _matrix_unpacked_sse_mul_3_2_uint8<mul_add_op> {};
00369
struct mul_unrolled_unpacked_sse_helper_uint8 {
  // SSE kernel for uint8 matrices.  SSE2 has no 8-bit multiply, so the
  // 3x2 and 2x1 tile kernels above unpack each vector into 16-bit lanes;
  // the coefficients taken from src2 are widened to uint16 before being
  // broadcast.
  typedef uint8_t C;
  typedef Simd_type(C) Simd_C;
  typedef uint16_t D;
  typedef Simd_type(D) Simd_D;

  // Same tiling strategy as mul_unrolled_sse_helper::op: dest (r x c,
  // rows counted in SIMD vectors) <- Op (src1 (r x l) * src2 (l x c)),
  // with rr, ll, cc the allocated strides used by Mat::index.
  template<typename Op, nat ur, nat ul, nat uc, typename V, typename W>
  static void
  op (Simd_C* dest, const Simd_C* src1, const C* src2,
      nat r, nat rr, nat l, nat ll, nat c, nat cc) {
    typedef typename Vector_simd_variant (C) VV;
    typedef implementation<vector_allocate,VV> Alloc;
    typedef implementation<vector_abstractions,VV> Vec;
    typedef implementation<matrix_multiply,W> Mat;
    typedef typename Op::acc_op Acc;
    ASSERT (Alloc::vec_is_aligned (dest), "wrong alignment");
    ASSERT (Alloc::vec_is_aligned (src1), "wrong alignment");

    // Degenerate or small dimensions: delegate to the fully unrolled
    // kernel.
    if (r == 0 ||
        l < Simd_size (C) ||
        c < Simd_size (C)) {
      mul_full_unrolled<Op, ur, ul, uc, W>
        (dest, src1, src2, r, rr, l, ll, c, cc);
      return;
    }
    Simd_C* d1, * d2, * s1, * s2, * s3;
    Simd_D c11, c12, c13;
    Simd_D c21, c22, c23;

    nat szl= 3;  // tile size along the inner dimension l
    nat szc= 2;  // tile size along the column dimension c
    nat ic, il;
    for (ic= 0; ic + szc - 1 < c; ic += szc) {
      // First inner tile (il = 0..2): written with Op.
      d1= dest + Mat::index (0, ic , rr, cc);
      d2= dest + Mat::index (0, ic+1, rr, cc);
      s1= (Simd_C*) src1 + Mat::index (0, 0, rr, ll);
      s2= (Simd_C*) src1 + Mat::index (0, 1, rr, ll);
      s3= (Simd_C*) src1 + Mat::index (0, 2, rr, ll);
      c11= simd_set_duplicate ((D) *(src2 + Mat::index (0, ic , ll, cc)));
      c12= simd_set_duplicate ((D) *(src2 + Mat::index (1, ic , ll, cc)));
      c13= simd_set_duplicate ((D) *(src2 + Mat::index (2, ic , ll, cc)));
      c21= simd_set_duplicate ((D) *(src2 + Mat::index (0, ic+1, ll, cc)));
      c22= simd_set_duplicate ((D) *(src2 + Mat::index (1, ic+1, ll, cc)));
      c23= simd_set_duplicate ((D) *(src2 + Mat::index (2, ic+1, ll, cc)));
      for (nat ir= 0; ir < r; ir++)
        _matrix_unpacked_sse_mul_3_2_uint8<Op>::op
          (d1, d2, s1, s2, s3, c11, c12, c13, c21, c22, c23);
      // Remaining full inner tiles: accumulate with Acc.
      for (il= szl; il + szl - 1 < l; il += szl) {
        d1= dest + Mat::index (0, ic , rr, cc);
        d2= dest + Mat::index (0, ic+1, rr, cc);
        s1= (Simd_C*) src1 + Mat::index (0, il , rr, ll);
        s2= (Simd_C*) src1 + Mat::index (0, il+1, rr, ll);
        s3= (Simd_C*) src1 + Mat::index (0, il+2, rr, ll);
        c11= simd_set_duplicate
          ((D) *(src2 + Mat::index (il+0, ic , ll, cc)));
        c12= simd_set_duplicate
          ((D) *(src2 + Mat::index (il+1, ic , ll, cc)));
        c13= simd_set_duplicate
          ((D) *(src2 + Mat::index (il+2, ic , ll, cc)));
        c21= simd_set_duplicate
          ((D) *(src2 + Mat::index (il+0, ic+1, ll, cc)));
        c22= simd_set_duplicate
          ((D) *(src2 + Mat::index (il+1, ic+1, ll, cc)));
        c23= simd_set_duplicate
          ((D) *(src2 + Mat::index (il+2, ic+1, ll, cc)));
        for (nat ir= 0; ir < r; ir++)
          _matrix_unpacked_sse_mul_3_2_uint8<Acc>::op
            (d1, d2, s1, s2, s3, c11, c12, c13, c21, c22, c23);
      }
      // Inner remainder: accumulate through the 2x1 kernel.
      for (; il < l; il++) {
        d1= dest + Mat::index (0, ic , rr, cc);
        d2= dest + Mat::index (0, ic+1, rr, cc);
        s1= (Simd_C*) src1 + Mat::index (0, il, rr, ll);
        c11= simd_set_duplicate ((D) *(src2 + Mat::index (il, ic , ll, cc)));
        c21= simd_set_duplicate ((D) *(src2 + Mat::index (il, ic+1, ll, cc)));
        for (nat ir= 0; ir < r; ir++)
          _matrix_unpacked_sse_mul_2_1_uint8<Acc>::op (d1, d2, s1, c11, c21);
      }
    }
    // Column remainder: rank-one updates on single destination columns.
    // NOTE(review): unlike the tiled path above, the src2 entries here
    // are broadcast without widening to uint16 — presumably the variant
    // Vector_simd_variant (uint8_t) handles byte products itself; verify.
    for (; ic<c; ic++) {
      Vec::template vec_binary_scalar<Op>
        (dest + Mat::index (0, ic, rr, cc),
         src1 + Mat::index (0, 0, rr, ll),
         simd_set_duplicate (*(src2 + Mat::index (0, ic, ll, cc))),
         r);
      for (nat il= 1; il<l; il++) {
        Vec::template vec_binary_scalar<Acc>
          (dest + Mat::index (0, ic, rr, cc),
           src1 + Mat::index (0, il, rr, ll),
           simd_set_duplicate (*(src2 + Mat::index (il, ic, ll, cc))),
           r);
      }
    }
  }
};
00466
// Register the unpacking kernel for uint8 matrices.
STMPL
struct mul_unrolled_helper<sse_uint8_t,sse_uint8_t,uint8_t>:
  mul_unrolled_unpacked_sse_helper_uint8 {};
00470
00471
00472
00473
00474
// Signed 8-bit products are delegated to the unsigned kernel: for
// multiplication and addition the low eight bits of the result do not
// depend on signedness (both coincide with arithmetic modulo 2^8), so
// the pointers may simply be reinterpreted.
struct mul_unrolled_unpacked_sse_helper_int8 {
  template<typename Op, nat ur, nat ul, nat uc, typename V, typename W>
  static void
  op (sse_int8_t* dest, const sse_int8_t* src1, const int8_t* src2,
      nat r, nat rr, nat l, nat ll, nat c, nat cc) {
    mul_unrolled_unpacked_sse_helper_uint8::op<Op,ur,ul,uc,V,W>
      ((sse_uint8_t*) dest, (const sse_uint8_t*) src1, (const uint8_t*) src2,
       r, rr, l, ll, c, cc); }
};
00484
// Register the delegating kernel for int8 matrices.
STMPL
struct mul_unrolled_helper<sse_int8_t,sse_int8_t,int8_t>:
  mul_unrolled_unpacked_sse_helper_int8 {};
00488
#endif // ALGEBRAMIX_ENABLE_SIMD && __SSE2__ && ALGEBRAMIX_HAVE_STDINT_H
00490 }
#endif // __MMX__MATRIX_SSE__HPP
00492