Main Page | Class Hierarchy | Class List | File List | Class Members | File Members

fermiqcd_sse.h

Go to the documentation of this file.
00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
// Shorthand for a GNU inline-assembly statement that the compiler must not
// optimise away.
#define _ASM __asm__ __volatile__
// GNU alignment attributes: force 16-byte or 64-byte alignment on the
// declaration they are attached to.
#define ALIGN64 __attribute__ ((aligned (64)))
#define ALIGN16 __attribute__ ((aligned (16)))
00013 
00014 typedef struct { float c1,c2,c3,c4;  } _sse_float  ALIGN16;  
00015 typedef struct { _sse_float c1,c2,c3;} _sse_vector ALIGN16;
00016 typedef struct { int c1,c2,c3,c4;}     _sse_int    ALIGN16;
00017 typedef struct { double c1,c2; }       _sse_double ALIGN16;
00018 
00019 typedef struct {mdp_complex c11,c12,c13,c21,c22,c23,c31,c32,c33; } _sse_su3;
00020 typedef struct {mdp_complex c1,c2,c3; } _sse_su3_vector;
00021 typedef struct {_sse_su3_vector c1,c2,c3,c4; } _sse_spinor;
00022 
00023 static _sse_float _sse_float_sgn12 __attribute__ ((unused)) = {-1.0f,-1.0f,1.0f,1.0f};
00024 static _sse_float _sse_float_sgn13 __attribute__ ((unused)) = {-1.0f,1.0f,-1.0f,1.0f};
00025 static _sse_float _sse_float_sgn14 __attribute__ ((unused)) = {-1.0f,1.0f,1.0f,-1.0f};
00026 static _sse_float _sse_float_sgn23 __attribute__ ((unused)) = {1.0f,-1.0f,-1.0f,1.0f};
00027 static _sse_float _sse_float_sgn24 __attribute__ ((unused)) = {1.0f,-1.0f,1.0f,-1.0f};
00028 static _sse_float _sse_float_sgn34 __attribute__ ((unused)) = {1.0f,1.0f,-1.0f,-1.0f};
00029 static _sse_int   _sse_double_sgn  __attribute__ ((unused)) = {0x0,0x80000000,0x0,0x0};
00030 static _sse_int   _sse_double_sgn2 __attribute__ ((unused)) = {0x0,0x0,0x0,0x80000000};
00031 
00032 // //////////////////////////////////////////////////////////////////////////
00033 // Cache manipulation macros (float)
00034 // //////////////////////////////////////////////////////////////////////////
00035 
00036 
// Issues prefetcht0 for the 128-byte-aligned block containing addr and for
// the block starting 128 bytes after it.
// The address is rounded down in a pointer-sized unsigned integer
// (__SIZE_TYPE__, predefined by GCC-compatible compilers); casting through
// unsigned int would drop the upper address bits on 64-bit targets.
#define _sse_float_prefetch_spinor(addr) \
_ASM ("prefetcht0 %0 \n\t" \
      "prefetcht0 %1" \
      : \
      : \
      "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f)))), \
      "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f))+128)))
00044      
// Issues prefetcht0 for the 128-byte-aligned block containing addr and for
// the block starting 128 bytes after it.
// The address is rounded down in a pointer-sized unsigned integer
// (__SIZE_TYPE__, predefined by GCC-compatible compilers); casting through
// unsigned int would drop the upper address bits on 64-bit targets.
#define _sse_float_prefetch_su3(addr) \
_ASM ("prefetcht0 %0 \n\t" \
      "prefetcht0 %1" \
      : \
      : \
      "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f)))), \
      "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f))+128)))
00052 
00053 // //////////////////////////////////////////////////////////////////////////
00054 //
00055 // Macros for su3 vectors 
00056 //
00057 // Most of these macros operate on pairs of su3 vectors that are stored
00058 // in the low and high words of xmm0,xmm1,xmm2 or xmm3,xmm4,xmm5. For example,
00059 //
00060 // xmm0 -> sl.c1.real(),sl.c1.imag(),sh.c1.real(),sh.c1.imag()
00061 // xmm1 -> sl.c2.real(),sl.c2.imag(),sh.c2.real(),sh.c2.imag()
00062 // xmm2 -> sl.c3.real(),sl.c3.imag(),sh.c3.real(),sh.c3.imag()
00063 //
00064 // (where sl and sh are of type su3_vector). This can also be interpreted as
00065 // an _sse_vector s that is stored in these registers according to
00066 //
00067 // xmm0 -> s.c1.c1,s.c1.c2,s.c1.c3,s.c1.c4
00068 // xmm1 -> s.c2.c1,s.c2.c2,s.c2.c3,s.c2.c4
00069 // xmm2 -> s.c3.c1,s.c3.c2,s.c3.c3,s.c3.c4
00070 //
00071 // The load and store macros can be used to move data in either format
00072 // from and to the xmm registers
00073 //
00074 // //////////////////////////////////////////////////////////////////////////
00075 
00076 // //////////////////////////////////////////////////////////////////////////
00077 //
00078 // Operations for SU3 color linear algebra used in mdp_matrix (float)
00079 //
00080 // //////////////////////////////////////////////////////////////////////////
00081 
00082 // //////////////////////////////////////////////////////////////////////////
00083 // Loads two su3 vectors sl and sh to the low and high words of xmm0,xmm1,xmm2
00084 // //////////////////////////////////////////////////////////////////////////
00085 
// Gathers two su3 vectors into xmm0,xmm1,xmm2: movlps puts component k of sl
// in the low 64 bits and movhps puts component k of sh in the high 64 bits.
#define _sse_float_pair_load(sl,sh) \
_ASM ("movlps %[lo_c1], %%xmm0 \n\t" \
      "movlps %[lo_c2], %%xmm1 \n\t" \
      "movlps %[lo_c3], %%xmm2 \n\t" \
      "movhps %[hi_c1], %%xmm0 \n\t" \
      "movhps %[hi_c2], %%xmm1 \n\t" \
      "movhps %[hi_c3], %%xmm2" \
      : /* no outputs */ \
      : [lo_c1] "m" ((sl).c1), [lo_c2] "m" ((sl).c2), [lo_c3] "m" ((sl).c3), \
        [hi_c1] "m" ((sh).c1), [hi_c2] "m" ((sh).c2), [hi_c3] "m" ((sh).c3))
00101 
00102 // //////////////////////////////////////////////////////////////////////////
00103 // Loads two su3 vectors sl and sh to the low and high words of xmm3,xmm4,xmm5
00104 // //////////////////////////////////////////////////////////////////////////  
00105 
// Same as _sse_float_pair_load but targets xmm3,xmm4,xmm5: sl goes to the
// low 64 bits (movlps) and sh to the high 64 bits (movhps).
#define _sse_float_pair_load_up(sl,sh) \
_ASM ("movlps %[lo_c1], %%xmm3 \n\t" \
      "movlps %[lo_c2], %%xmm4 \n\t" \
      "movlps %[lo_c3], %%xmm5 \n\t" \
      "movhps %[hi_c1], %%xmm3 \n\t" \
      "movhps %[hi_c2], %%xmm4 \n\t" \
      "movhps %[hi_c3], %%xmm5" \
      : /* no outputs */ \
      : [lo_c1] "m" ((sl).c1), [lo_c2] "m" ((sl).c2), [lo_c3] "m" ((sl).c3), \
        [hi_c1] "m" ((sh).c1), [hi_c2] "m" ((sh).c2), [hi_c3] "m" ((sh).c3))
00121 
00122 
00123 // //////////////////////////////////////////////////////////////////////////  
00124 // Stores the low and high words of xmm0,xmm1,xmm2 to the su3 vectors rl and rh
00125 // //////////////////////////////////////////////////////////////////////////  
00126 
// Scatters xmm0,xmm1,xmm2 into two su3 vectors: the low 64 bits of register k
// go to component k of rl (movlps), the high 64 bits to component k of rh.
#define _sse_float_pair_store(rl,rh) \
_ASM ("movlps %%xmm0, %[lo_c1] \n\t" \
      "movlps %%xmm1, %[lo_c2] \n\t" \
      "movlps %%xmm2, %[lo_c3] \n\t" \
      "movhps %%xmm0, %[hi_c1] \n\t" \
      "movhps %%xmm1, %[hi_c2] \n\t" \
      "movhps %%xmm2, %[hi_c3]" \
      : [lo_c1] "=m" ((rl).c1), [lo_c2] "=m" ((rl).c2), [lo_c3] "=m" ((rl).c3), \
        [hi_c1] "=m" ((rh).c1), [hi_c2] "=m" ((rh).c2), [hi_c3] "=m" ((rh).c3))
00141 
00142 // //////////////////////////////////////////////////////////////////////////  
00143 // Stores the low and high words of xmm3,xmm4,xmm5 to the su3 vectors rl and rh
00144 // //////////////////////////////////////////////////////////////////////////  
00145 
// Scatters xmm3,xmm4,xmm5 into two su3 vectors: the low 64 bits of each
// register go to rl (movlps), the high 64 bits to rh (movhps).
#define _sse_float_pair_store_up(rl,rh) \
_ASM ("movlps %%xmm3, %[lo_c1] \n\t" \
      "movlps %%xmm4, %[lo_c2] \n\t" \
      "movlps %%xmm5, %[lo_c3] \n\t" \
      "movhps %%xmm3, %[hi_c1] \n\t" \
      "movhps %%xmm4, %[hi_c2] \n\t" \
      "movhps %%xmm5, %[hi_c3]" \
      : [lo_c1] "=m" ((rl).c1), [lo_c2] "=m" ((rl).c2), [lo_c3] "=m" ((rl).c3), \
        [hi_c1] "=m" ((rh).c1), [hi_c2] "=m" ((rh).c2), [hi_c3] "=m" ((rh).c3))
00160 
00161 // //////////////////////////////////////////////////////////////////////////  
00162 // Loads the components s.c1,s.c2,s.c3 of an _sse_vector s to xmm0,xmm1,xmm2
00163 // //////////////////////////////////////////////////////////////////////////  
00164 
// Aligned 16-byte loads (movaps): s.c1 -> xmm0, s.c2 -> xmm1, s.c3 -> xmm2.
#define _sse_float_vector_load(s) \
_ASM ("movaps %[w1], %%xmm0 \n\t" \
      "movaps %[w2], %%xmm1 \n\t" \
      "movaps %[w3], %%xmm2" \
      : /* no outputs */ \
      : [w1] "m" ((s).c1), [w2] "m" ((s).c2), [w3] "m" ((s).c3))
00174 
// Aligned 16-byte loads (movaps): s.c1 -> xmm3, s.c2 -> xmm4, s.c3 -> xmm5.
#define _sse_float_vector_load_up(s) \
_ASM ("movaps %[w1], %%xmm3 \n\t" \
      "movaps %[w2], %%xmm4 \n\t" \
      "movaps %[w3], %%xmm5" \
      : /* no outputs */ \
      : [w1] "m" ((s).c1), [w2] "m" ((s).c2), [w3] "m" ((s).c3))
00184 // //////////////////////////////////////////////////////////////////////////  
00185 // Stores xmm0,xmm1,xmm2 to the components r.c1,r.c2,r.c3 of an _sse_vector r
00186 // //////////////////////////////////////////////////////////////////////////  
00187 
// Aligned 16-byte stores (movaps): xmm0 -> r.c1, xmm1 -> r.c2, xmm2 -> r.c3.
#define _sse_float_vector_store(r) \
_ASM ("movaps %%xmm0, %[w1] \n\t" \
      "movaps %%xmm1, %[w2] \n\t" \
      "movaps %%xmm2, %[w3]" \
      : [w1] "=m" ((r).c1), [w2] "=m" ((r).c2), [w3] "=m" ((r).c3))
00196 
00197 // //////////////////////////////////////////////////////////////////////////  
00198 // Multiplies xmm0,xmm1,xmm2 with a constant _sse_float c
00199 // //////////////////////////////////////////////////////////////////////////  
00200 
// Lane-wise multiply (mulps) of xmm0, xmm1 and xmm2 by the 16-byte memory
// operand c.
#define _sse_float_vector_mul(c) \
_ASM ("mulps %[factor], %%xmm0 \n\t" \
      "mulps %[factor], %%xmm1 \n\t" \
      "mulps %[factor], %%xmm2" \
      : /* no outputs */ \
      : [factor] "m" (c))
00208 
00209 // //////////////////////////////////////////////////////////////////////////  
00210 // multiplies xmm0, xmm1, xmm2 for complex=a*(b+I)
00211 // (written by Massimo Di Pierro)
00212 // //////////////////////////////////////////////////////////////////////////  
00213 /* deprecated
00214 
00215 #define _sse_float_vector_mulc(a,b) \
00216 _ASM ("mulps %0, %%xmm0 \n\t" \
00217       "mulps %0, %%xmm1 \n\t" \
00218       "mulps %0, %%xmm2 \n\t" \
00219       "movaps %%xmm0, %%xmm3 \n\t" \
00220       "movaps %%xmm1, %%xmm4 \n\t" \
00221       "movaps %%xmm2, %%xmm5 \n\t" \
00222       "mulps %1, %%xmm0 \n\t" \
00223       "mulps %1, %%xmm1 \n\t" \
00224       "mulps %1, %%xmm2 \n\t" \
00225       "shufps $0xb1, %%xmm3, %%xmm3 \n\t" \
00226       "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \
00227       "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \
00228       "mulps %2, %%xmm3 \n\t" \
00229       "mulps %2, %%xmm4 \n\t" \
00230       "mulps %2, %%xmm5 \n\t" \
00231       "addps %%xmm3, %%xmm0 \n\t" \
00232       "addps %%xmm4, %%xmm1 \n\t" \
00233       "addps %%xmm5, %%xmm2" \
00234       : \
00235       : \
00236       "m" (y), \
00237       "m" (x/y), \
00238       "m" (_sse_float_sgn13))
00239 */
00240 // //////////////////////////////////////////////////////////////////////////  
00241 // Adds xmm3,xmm4,xmm5 to xmm0,xmm1,xmm2
00242 // (modified by Massimo Di Pierro to work with g++ instead of gcc)
00243 // //////////////////////////////////////////////////////////////////////////  
00244 
// xmm0 += xmm3, xmm1 += xmm4, xmm2 += xmm5 (addps, lane-wise).
// The two variants differ only in how register names are escaped in the
// assembler template: a single '%' when SSE2FIX is defined, '%%' otherwise.
#if defined(SSE2FIX)
#define _sse_float_vector_add() \
_ASM ("addps %xmm3, %xmm0 \n\t" \
      "addps %xmm4, %xmm1 \n\t" \
      "addps %xmm5, %xmm2 \n\t" \
      : /* no outputs */ \
      : /* no inputs */ )
#else
#define _sse_float_vector_add() \
_ASM ("addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2 \n\t" \
      : /* no outputs */ \
      : /* no inputs */ )
#endif
00260 
00261 // //////////////////////////////////////////////////////////////////////////  
00262 // Subtracts xmm3,xmm4,xmm5 from xmm0,xmm1,xmm2
00263 // (modified by Massimo Di Pierro to work with g++ instead of gcc)
00264 // //////////////////////////////////////////////////////////////////////////  
00265 
// xmm0 -= xmm3, xmm1 -= xmm4, xmm2 -= xmm5 (subps, lane-wise).
// The two variants differ only in how register names are escaped in the
// assembler template: a single '%' when SSE2FIX is defined, '%%' otherwise.
#if defined(SSE2FIX)
#define _sse_float_vector_sub() \
_ASM ("subps %xmm3, %xmm0 \n\t" \
      "subps %xmm4, %xmm1 \n\t" \
      "subps %xmm5, %xmm2" \
      : /* no outputs */ \
      : /* no inputs */ )
#else
#define _sse_float_vector_sub() \
_ASM ("subps %%xmm3, %%xmm0 \n\t" \
      "subps %%xmm4, %%xmm1 \n\t" \
      "subps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : /* no inputs */ )
#endif
00281 
00282 // //////////////////////////////////////////////////////////////////////////  
00283 // Multiplies the high words xmm3,xmm4,xmm5 with -1 and adds these registers to xmm0,xmm1,xmm2
00284 // //////////////////////////////////////////////////////////////////////////  
00285 
// Multiplies xmm3,xmm4,xmm5 by _sse_float_sgn34 (negating lanes 3 and 4, the
// high half) and then adds them to xmm0,xmm1,xmm2. xmm3..xmm5 are modified.
#define _sse_float_vector_addsub() \
_ASM ("mulps %[sgn], %%xmm3 \n\t" \
      "mulps %[sgn], %%xmm4 \n\t" \
      "mulps %[sgn], %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : [sgn] "m" (_sse_float_sgn34))
00296 
00297 // //////////////////////////////////////////////////////////////////////////  
00298 // Multiplies a pair sl,sh of su3 vectors with an su3 matrix u,
00299 // assuming sl and sh are in the low and high words of xmm0,xmm1,xmm2
00300 //
00301 // On output the result is in xmm3,xmm4,xmm5 and the registers 
00302 // xmm0,xmm1,xmm2 are changed
00303 // //////////////////////////////////////////////////////////////////////////  
00304 
// Multiplies the pair of su3 vectors held in xmm0,xmm1,xmm2 by the matrix u.
// The result is accumulated in xmm3,xmm4,xmm5; xmm6 and xmm7 serve as scratch
// and xmm0,xmm1,xmm2 are overwritten along the way.
// The work is split over four asm statements that rely on the xmm registers
// keeping their contents from one statement to the next.
// Operand names follow the pattern uRC{r,i}: real/imaginary part of u.cRC.
#define _sse_float_su3_multiply(u) { \
_ASM ("movss %[u11r], %%xmm3 \n\t" \
      "movss %[u12r], %%xmm6 \n\t" \
      "movss %[u21r], %%xmm4 \n\t" \
      "movss %[u23r], %%xmm7 \n\t" \
      "movss %[u31r], %%xmm5 " \
      : /* no outputs */ \
      : [u11r] "m" ((u).c11.real()), \
        [u12r] "m" ((u).c12.real()), \
        [u21r] "m" ((u).c21.real()), \
        [u23r] "m" ((u).c23.real()), \
        [u31r] "m" ((u).c31.real())); \
_ASM ("shufps $0x0, %%xmm3, %%xmm3 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm4, %%xmm4 \n\t" \
      "mulps %%xmm0, %%xmm3 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm5, %%xmm5 \n\t" \
      "mulps %%xmm0, %%xmm4 \n\t" \
      "addps %%xmm6, %%xmm3 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "mulps %%xmm0, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm4 \n\t" \
      "movss %[u32r], %%xmm6 \n\t" \
      "movss %[u13r], %%xmm7 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm3 \n\t" \
      "movss %[u22r], %%xmm6 \n\t" \
      "movss %[u33r], %%xmm7 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm4 \n\t" \
      "addps %%xmm7, %%xmm5" \
      : /* no outputs */ \
      : [u32r] "m" ((u).c32.real()), \
        [u13r] "m" ((u).c13.real()), \
        [u22r] "m" ((u).c22.real()), \
        [u33r] "m" ((u).c33.real())); \
_ASM ("movss %[u11i], %%xmm6 \n\t" \
      "movss %[u22i], %%xmm7 \n\t" \
      "shufps $0xb1, %%xmm0, %%xmm0 \n\t" \
      "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \
      "shufps $0xb1, %%xmm2, %%xmm2 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %[sgn], %%xmm0 \n\t" \
      "mulps %[sgn], %%xmm1 \n\t" \
      "mulps %[sgn], %%xmm2 \n\t" \
      "mulps %%xmm0, %%xmm6 \n\t" \
      "mulps %%xmm1, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm3 \n\t" \
      "addps %%xmm7, %%xmm4 \n\t" \
      "movss %[u33i], %%xmm6 \n\t" \
      "movss %[u21i], %%xmm7 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm2, %%xmm6 \n\t" \
      "mulps %%xmm0, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm4 " \
      : /* no outputs */ \
      : [u11i] "m" ((u).c11.imag()), \
        [u22i] "m" ((u).c22.imag()), \
        [u33i] "m" ((u).c33.imag()), \
        [u21i] "m" ((u).c21.imag()), \
        [sgn]  "m" (_sse_float_sgn13)); \
_ASM ("movss %[u12i], %%xmm6 \n\t" \
      "movss %[u31i], %%xmm7 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm0, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm3 \n\t" \
      "addps %%xmm7, %%xmm5 \n\t" \
      "movss %[u13i], %%xmm0 \n\t" \
      "movss %[u32i], %%xmm6 \n\t" \
      "movss %[u23i], %%xmm7 \n\t" \
      "shufps $0x0, %%xmm0, %%xmm0 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm2, %%xmm0 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "addps %%xmm0, %%xmm3 \n\t" \
      "addps %%xmm6, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm4" \
      : /* no outputs */ \
      : [u12i] "m" ((u).c12.imag()), \
        [u31i] "m" ((u).c31.imag()), \
        [u13i] "m" ((u).c13.imag()), \
        [u32i] "m" ((u).c32.imag()), \
        [u23i] "m" ((u).c23.imag())); }
00408 
00409 // //////////////////////////////////////////////////////////////////////////  
00410 // Multiplies a pair sl,sh of su3 vectors with an su3 matrix u^dagger, 
00411 // assuming sl and sh are in the low and high words of xmm0,xmm1,xmm2
00412 //
00413 // On output the result is in xmm3,xmm4,xmm5 and the registers 
00414 // xmm0,xmm1,xmm2 are changed
00415 // //////////////////////////////////////////////////////////////////////////  
00416 
// Multiplies the pair of su3 vectors held in xmm0,xmm1,xmm2 by u^dagger.
// Compared with _sse_float_su3_multiply, the matrix entries are read with
// row and column swapped and the sign mask is _sse_float_sgn24.
// The result is accumulated in xmm3,xmm4,xmm5; xmm6 and xmm7 serve as scratch
// and xmm0,xmm1,xmm2 are overwritten along the way.
// The work is split over four asm statements that rely on the xmm registers
// keeping their contents from one statement to the next.
// Operand names follow the pattern uRC{r,i}: real/imaginary part of u.cRC.
#define _sse_float_su3_inverse_multiply(u) { \
_ASM ("movss %[u11r], %%xmm3 \n\t" \
      "movss %[u21r], %%xmm6 \n\t" \
      "movss %[u12r], %%xmm4 \n\t" \
      "movss %[u32r], %%xmm7 \n\t" \
      "movss %[u13r], %%xmm5 " \
      : /* no outputs */ \
      : [u11r] "m" ((u).c11.real()), \
        [u21r] "m" ((u).c21.real()), \
        [u12r] "m" ((u).c12.real()), \
        [u32r] "m" ((u).c32.real()), \
        [u13r] "m" ((u).c13.real())); \
_ASM ("shufps $0x0, %%xmm3, %%xmm3 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm4, %%xmm4 \n\t" \
      "mulps %%xmm0, %%xmm3 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm5, %%xmm5 \n\t" \
      "mulps %%xmm0, %%xmm4 \n\t" \
      "addps %%xmm6, %%xmm3 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "mulps %%xmm0, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm4 \n\t" \
      "movss %[u23r], %%xmm6 \n\t" \
      "movss %[u31r], %%xmm7 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm3 \n\t" \
      "movss %[u22r], %%xmm6 \n\t" \
      "movss %[u33r], %%xmm7 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm4 \n\t" \
      "addps %%xmm7, %%xmm5 " \
      : /* no outputs */ \
      : [u23r] "m" ((u).c23.real()), \
        [u31r] "m" ((u).c31.real()), \
        [u22r] "m" ((u).c22.real()), \
        [u33r] "m" ((u).c33.real())); \
_ASM ("movss %[u11i], %%xmm6 \n\t" \
      "movss %[u22i], %%xmm7 \n\t" \
      "shufps $0xb1, %%xmm0, %%xmm0 \n\t" \
      "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \
      "shufps $0xb1, %%xmm2, %%xmm2 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %[sgn], %%xmm0 \n\t" \
      "mulps %[sgn], %%xmm1 \n\t" \
      "mulps %[sgn], %%xmm2 \n\t" \
      "mulps %%xmm0, %%xmm6 \n\t" \
      "mulps %%xmm1, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm3 \n\t" \
      "addps %%xmm7, %%xmm4 \n\t" \
      "movss %[u33i], %%xmm6 \n\t" \
      "movss %[u12i], %%xmm7 " \
      : /* no outputs */ \
      : [u11i] "m" ((u).c11.imag()), \
        [u22i] "m" ((u).c22.imag()), \
        [u33i] "m" ((u).c33.imag()), \
        [u12i] "m" ((u).c12.imag()), \
        [sgn]  "m" (_sse_float_sgn24)); \
_ASM ("shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm2, %%xmm6 \n\t" \
      "mulps %%xmm0, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm4 \n\t" \
      "movss %[u21i], %%xmm6 \n\t" \
      "movss %[u13i], %%xmm7 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm0, %%xmm7 \n\t" \
      "addps %%xmm6, %%xmm3 \n\t" \
      "addps %%xmm7, %%xmm5 \n\t" \
      "movss %[u31i], %%xmm0 \n\t" \
      "movss %[u23i], %%xmm6 \n\t" \
      "movss %[u32i], %%xmm7 \n\t" \
      "shufps $0x0, %%xmm0, %%xmm0 \n\t" \
      "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
      "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
      "mulps %%xmm2, %%xmm0 \n\t" \
      "mulps %%xmm1, %%xmm6 \n\t" \
      "mulps %%xmm2, %%xmm7 \n\t" \
      "addps %%xmm0, %%xmm3 \n\t" \
      "addps %%xmm6, %%xmm5 \n\t" \
      "addps %%xmm7, %%xmm4" \
      : /* no outputs */ \
      : [u21i] "m" ((u).c21.imag()), \
        [u13i] "m" ((u).c13.imag()), \
        [u31i] "m" ((u).c31.imag()), \
        [u23i] "m" ((u).c23.imag()), \
        [u32i] "m" ((u).c32.imag())); }
00520 
00521 // //////////////////////////////////////////////////////////////////////////  
00522 //
00523 // stuff used for optimized gamma matrix algebra (float)
00524 //
00525 // //////////////////////////////////////////////////////////////////////////  
00526 
00527 
00528 // //////////////////////////////////////////////////////////////////////////  
00529 // Multiplies the low words xmm3,xmm4,xmm5 with -1 and adds these registers
00530 // to xmm0,xmm1,xmm2
00531 // //////////////////////////////////////////////////////////////////////////  
00532 
// Multiplies xmm3,xmm4,xmm5 by _sse_float_sgn12 (negating lanes 1 and 2, the
// low half) and then adds them to xmm0,xmm1,xmm2. xmm3..xmm5 are modified.
#define _sse_float_vector_subadd() \
_ASM ("mulps %[sgn], %%xmm3 \n\t" \
      "mulps %[sgn], %%xmm4 \n\t" \
      "mulps %[sgn], %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : [sgn] "m" (_sse_float_sgn12))
00543 
00544 // //////////////////////////////////////////////////////////////////////////  
00545 // Multiplies xmm3,xmm4,xmm5 with i and adds them to xmm0,xmm1,xmm2
00546 // //////////////////////////////////////////////////////////////////////////  
00547 
// Swaps adjacent lanes of xmm3,xmm4,xmm5 (shufps $0xb1), multiplies by
// _sse_float_sgn13 and adds the result to xmm0,xmm1,xmm2. For lanes laid out
// as (re,im) pairs this adds i times the source. xmm3..xmm5 are modified.
#define _sse_float_vector_i_add() \
_ASM ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \
      "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \
      "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \
      "mulps %[sgn], %%xmm3 \n\t" \
      "mulps %[sgn], %%xmm4 \n\t" \
      "mulps %[sgn], %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : [sgn] "m" (_sse_float_sgn13))
00561 
00562 // //////////////////////////////////////////////////////////////////////////  
00563 // Multiplies xmm3,xmm4,xmm5 with i and subtracts them from xmm0,xmm1,xmm2
00564 // //////////////////////////////////////////////////////////////////////////  
00565 
// Swaps adjacent lanes of xmm3,xmm4,xmm5 (shufps $0xb1), multiplies by
// _sse_float_sgn24 and adds the result to xmm0,xmm1,xmm2. For lanes laid out
// as (re,im) pairs this subtracts i times the source. xmm3..xmm5 are modified.
#define _sse_float_vector_i_sub() \
_ASM ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \
      "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \
      "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \
      "mulps %[sgn], %%xmm3 \n\t" \
      "mulps %[sgn], %%xmm4 \n\t" \
      "mulps %[sgn], %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : [sgn] "m" (_sse_float_sgn24))
00579 
00580 // //////////////////////////////////////////////////////////////////////////  
00581 // Exchanges the high and low words of xmm3,xmm4,xmm5, multiplies them with i
00582 // and adds the result to xmm0,xmm1,xmm2
00583 // //////////////////////////////////////////////////////////////////////////  
00584 
// Reverses the four lanes of xmm3,xmm4,xmm5 (shufps $0x1b), multiplies by
// _sse_float_sgn13 and adds the result to xmm0,xmm1,xmm2. For (re,im) lane
// pairs this adds i times the source with its low and high words exchanged.
// xmm3..xmm5 are modified.
#define _sse_float_vector_xch_i_add() \
_ASM ("shufps $0x1b, %%xmm3, %%xmm3 \n\t" \
      "shufps $0x1b, %%xmm4, %%xmm4 \n\t" \
      "shufps $0x1b, %%xmm5, %%xmm5 \n\t" \
      "mulps %[sgn], %%xmm3 \n\t" \
      "mulps %[sgn], %%xmm4 \n\t" \
      "mulps %[sgn], %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : [sgn] "m" (_sse_float_sgn13))
00598 
00599 // //////////////////////////////////////////////////////////////////////////  
00600 // Exchanges the high and low words of xmm3,xmm4,xmm5, multiplies them with i
00601 // and subtracts the result from xmm0,xmm1,xmm2
00602 // //////////////////////////////////////////////////////////////////////////  
00603 
// Reverses the four lanes of xmm3,xmm4,xmm5 (shufps $0x1b), multiplies by
// _sse_float_sgn24 and adds the result to xmm0,xmm1,xmm2. For (re,im) lane
// pairs this subtracts i times the source with its low and high words
// exchanged. xmm3..xmm5 are modified.
#define _sse_float_vector_xch_i_sub() \
_ASM ("shufps $0x1b, %%xmm3, %%xmm3 \n\t" \
      "shufps $0x1b, %%xmm4, %%xmm4 \n\t" \
      "shufps $0x1b, %%xmm5, %%xmm5 \n\t" \
      "mulps %[sgn], %%xmm3 \n\t" \
      "mulps %[sgn], %%xmm4 \n\t" \
      "mulps %[sgn], %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : [sgn] "m" (_sse_float_sgn24))
00617 
00618 // //////////////////////////////////////////////////////////////////////////  
00619 // Multiplies the low and high words of xmm3,xmm4,xmm5 with i and -i
00620 // respectively and adds these registers to xmm0,xmm1,xmm2
00621 // //////////////////////////////////////////////////////////////////////////  
00622 
// Swaps adjacent lanes of xmm3,xmm4,xmm5 (shufps $0xb1), multiplies by
// _sse_float_sgn14 and adds the result to xmm0,xmm1,xmm2. For (re,im) lane
// pairs this adds +i times the low word and -i times the high word.
// xmm3..xmm5 are modified.
#define _sse_float_vector_i_addsub() \
_ASM ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \
      "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \
      "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \
      "mulps %[sgn], %%xmm3 \n\t" \
      "mulps %[sgn], %%xmm4 \n\t" \
      "mulps %[sgn], %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : [sgn] "m" (_sse_float_sgn14))
00636 
00637 // //////////////////////////////////////////////////////////////////////////  
00638 // Multiplies the low and high words of xmm3,xmm4,xmm5 with -i and i
00639 // respectively and adds these registers to xmm0,xmm1,xmm2
00640 // //////////////////////////////////////////////////////////////////////////  
00641 
// Swaps adjacent lanes of xmm3,xmm4,xmm5 (shufps $0xb1), multiplies by
// _sse_float_sgn23 and adds the result to xmm0,xmm1,xmm2. For (re,im) lane
// pairs this adds -i times the low word and +i times the high word.
// xmm3..xmm5 are modified.
#define _sse_float_vector_i_subadd() \
_ASM ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \
      "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \
      "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \
      "mulps %[sgn], %%xmm3 \n\t" \
      "mulps %[sgn], %%xmm4 \n\t" \
      "mulps %[sgn], %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : [sgn] "m" (_sse_float_sgn23))
00655 
00656 // //////////////////////////////////////////////////////////////////////////  
00657 // Exchanges the high and low words in xmm3,xmm4,xmm5 
00658 // //////////////////////////////////////////////////////////////////////////  
00659 
// Swaps the low and high 64-bit halves of xmm3, xmm4 and xmm5 in place
// (shufps $0x4e).
// The two variants differ only in how register names are escaped in the
// assembler template: a single '%' when SSE2FIX is defined, '%%' otherwise.
#if defined(SSE2FIX)
#define _sse_float_vector_xch() \
_ASM ("shufps $0x4e, %xmm3, %xmm3 \n\t" \
      "shufps $0x4e, %xmm4, %xmm4 \n\t" \
      "shufps $0x4e, %xmm5, %xmm5" \
      : /* no outputs */ \
      : /* no inputs */ )
#else
#define _sse_float_vector_xch() \
_ASM ("shufps $0x4e, %%xmm3, %%xmm3 \n\t" \
      "shufps $0x4e, %%xmm4, %%xmm4 \n\t" \
      "shufps $0x4e, %%xmm5, %%xmm5" \
      : /* no outputs */ \
      : /* no inputs */ )
#endif
00675 
00676 
00677 
00678 // //////////////////////////////////////////////////////////////////////////
00679 // Cache manipulation macros (double)
00680 // //////////////////////////////////////////////////////////////////////////
00681 
// Issues a single prefetcht0 for the memory location addr points to.
#define _sse_double_prefetch_16(addr) \
_ASM ("prefetcht0 %[target]" \
      : /* no outputs */ \
      : [target] "m" (*(addr)))
00686 
// Issues prefetcht0 for the 128-byte-aligned block containing addr and for
// the block starting 128 bytes after it.
// The address is rounded down in a pointer-sized unsigned integer
// (__SIZE_TYPE__, predefined by GCC-compatible compilers); casting through
// unsigned int would drop the upper address bits on 64-bit targets.
#define _sse_double_prefetch_spinor(addr) \
_ASM ("prefetcht0 %0 \n\t" \
      "prefetcht0 %1" \
      : \
      : \
      "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f)))), \
      "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f))+128)))
00694 
// Issues prefetchnta (non-temporal hint) for the 128-byte-aligned block
// containing addr and for the block starting 128 bytes after it.
// The address is rounded down in a pointer-sized unsigned integer
// (__SIZE_TYPE__, predefined by GCC-compatible compilers); casting through
// unsigned int would drop the upper address bits on 64-bit targets.
#define _sse_double_prefetch_nta_spinor(addr) \
_ASM ("prefetchnta %0 \n\t" \
      "prefetchnta %1" \
      : \
      : \
      "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f)))), \
      "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f))+128)))
00702 
// Issues prefetcht0 for the 128-byte-aligned block containing addr and for
// the block starting 128 bytes after it.
// The address is rounded down in a pointer-sized unsigned integer
// (__SIZE_TYPE__, predefined by GCC-compatible compilers); casting through
// unsigned int would drop the upper address bits on 64-bit targets.
#define _sse_double_prefetch_su3(addr) \
_ASM ("prefetcht0 %0 \n\t" \
      "prefetcht0 %1" \
      : \
      : \
      "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f)))), \
      "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f))+128)))
00710 
00711 
00712 // //////////////////////////////////////////////////////////////////////////
00713 //
00714 // Operations for SU3 color linear algebra used in mdp_matrix (double)
00715 //
00716 // //////////////////////////////////////////////////////////////////////////
00717 
00718 // //////////////////////////////////////////////////////////////////////////
00719 // Loads an su3 vector s to xmm0,xmm1,xmm2
00720 // //////////////////////////////////////////////////////////////////////////
00721 
// Aligned 16-byte loads (movapd): s.c1 -> xmm0, s.c2 -> xmm1, s.c3 -> xmm2.
#define _sse_double_load(s) \
_ASM ("movapd %[w1], %%xmm0 \n\t" \
      "movapd %[w2], %%xmm1 \n\t" \
      "movapd %[w3], %%xmm2" \
      : /* no outputs */ \
      : [w1] "m" ((s).c1), [w2] "m" ((s).c2), [w3] "m" ((s).c3))
00731 
// Aligned 16-byte loads (movapd) of three separately supplied operands:
// c1 -> xmm0, c2 -> xmm1, c3 -> xmm2.
#define _sse_double_load_123(c1, c2, c3) \
_ASM ("movapd %[w1], %%xmm0 \n\t" \
      "movapd %[w2], %%xmm1 \n\t" \
      "movapd %[w3], %%xmm2" \
      : /* no outputs */ \
      : [w1] "m" (c1), [w2] "m" (c2), [w3] "m" (c3))
00741 
00742 
00743 // //////////////////////////////////////////////////////////////////////////
00744 // Loads an su3 vector s to xmm3,xmm4,xmm5
00745 // //////////////////////////////////////////////////////////////////////////  
00746 
// Aligned 16-byte loads (movapd): s.c1 -> xmm3, s.c2 -> xmm4, s.c3 -> xmm5.
#define _sse_double_load_up(s) \
_ASM ("movapd %[w1], %%xmm3 \n\t" \
      "movapd %[w2], %%xmm4 \n\t" \
      "movapd %[w3], %%xmm5" \
      : /* no outputs */ \
      : [w1] "m" ((s).c1), [w2] "m" ((s).c2), [w3] "m" ((s).c3))
00756 
// Aligned 16-byte loads (movapd) of three separately supplied operands:
// c1 -> xmm3, c2 -> xmm4, c3 -> xmm5.
#define _sse_double_load_up_123(c1, c2, c3) \
_ASM ("movapd %[w1], %%xmm3 \n\t" \
      "movapd %[w2], %%xmm4 \n\t" \
      "movapd %[w3], %%xmm5" \
      : /* no outputs */ \
      : [w1] "m" (c1), [w2] "m" (c2), [w3] "m" (c3))
00766 
00767 // //////////////////////////////////////////////////////////////////////////
00768 // Stores xmm0,xmm1,xmm2 to the components r.c1,r.c2,r.c3 of an su3 vector
00769 // //////////////////////////////////////////////////////////////////////////
00770 
// Aligned 16-byte stores (movapd): xmm0 -> r.c1, xmm1 -> r.c2, xmm2 -> r.c3.
#define _sse_double_store(r) \
_ASM ("movapd %%xmm0, %[w1] \n\t" \
      "movapd %%xmm1, %[w2] \n\t" \
      "movapd %%xmm2, %[w3]" \
      : [w1] "=m" ((r).c1), [w2] "=m" ((r).c2), [w3] "=m" ((r).c3))
00779 
// Aligned 16-byte stores (movapd) to three separately supplied destinations:
// xmm0 -> c1, xmm1 -> c2, xmm2 -> c3.
#define _sse_double_store_123(c1, c2, c3) \
_ASM ("movapd %%xmm0, %[w1] \n\t" \
      "movapd %%xmm1, %[w2] \n\t" \
      "movapd %%xmm2, %[w3]" \
      : [w1] "=m" (c1), [w2] "=m" (c2), [w3] "=m" (c3))
00788 
00789 // //////////////////////////////////////////////////////////////////////////
00790 // Stores xmm3,xmm4,xmm5 to the components r.c1,r.c2,r.c3 of an su3 vector
00791 // //////////////////////////////////////////////////////////////////////////
00792 
// Aligned 16-byte stores (movapd): xmm3 -> r.c1, xmm4 -> r.c2, xmm5 -> r.c3.
#define _sse_double_store_up(r) \
_ASM ("movapd %%xmm3, %[w1] \n\t" \
      "movapd %%xmm4, %[w2] \n\t" \
      "movapd %%xmm5, %[w3]" \
      : [w1] "=m" ((r).c1), [w2] "=m" ((r).c2), [w3] "=m" ((r).c3))
00801 
// Aligned 16-byte stores (movapd) to three separately supplied destinations:
// xmm3 -> c1, xmm4 -> c2, xmm5 -> c3.
#define _sse_double_store_up_123(c1, c2, c3) \
_ASM ("movapd %%xmm3, %[w1] \n\t" \
      "movapd %%xmm4, %[w2] \n\t" \
      "movapd %%xmm5, %[w3]" \
      : [w1] "=m" (c1), [w2] "=m" (c2), [w3] "=m" (c3))
00810 
00811 // //////////////////////////////////////////////////////////////////////////
00812 // Multiplies xmm0,xmm1,xmm2 with a constant _sse_double c
      // The multiply is lane-wise (mulpd): the low lane of each register is
      // scaled by the low double of c and the high lane by the high double, so
      // both doubles of c must be equal to scale by a single real number.
      // The result replaces xmm0,xmm1,xmm2; c is read from memory.
00813 // //////////////////////////////////////////////////////////////////////////
00814 
00815 #define _sse_double_vector_mul(c) \
00816 _ASM ("mulpd %0, %%xmm0 \n\t" \
00817       "mulpd %0, %%xmm1 \n\t" \
00818       "mulpd %0, %%xmm2" \
00819       : \
00820       : \
00821       "m" (c))
00822 
00823 // //////////////////////////////////////////////////////////////////////////
00824 // multiplies xmm0, xmm1, xmm2 for complex=a*(b+I)
00825 // //////////////////////////////////////////////////////////////////////////
00826 /* deprecated
00827 #define _sse_double_vector_mulc(a,b) \
00828 _ASM ("mulpd %0, %%xmm0 \n\t" \
00829       "mulpd %0, %%xmm1 \n\t" \
00830       "mulpd %0, %%xmm2 \n\t" \
00831       "movapd %%xmm0, %%xmm3 \n\t" \
00832       "movapd %%xmm1, %%xmm4 \n\t" \
00833       "movapd %%xmm2, %%xmm5 \n\t" \
00834       "mulpd %1, %%xmm0 \n\t" \
00835       "mulpd %1, %%xmm1 \n\t" \
00836       "mulpd %1, %%xmm2 \n\t" \
00837       "shufpd $0x1, %%xmm3, %%xmm3 \n\t" \
00838       "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \
00839       "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \
00840       "xorpd %2, %%xmm3 \n\t" \
00841       "xorpd %2, %%xmm4 \n\t" \
00842       "xorpd %2, %%xmm5 \n\t" \
00843       "addpd %%xmm3, %%xmm0 \n\t" \
00844       "addpd %%xmm4, %%xmm1 \n\t" \
00845       "addpd %%xmm5, %%xmm2" \
00846       : \
00847       : \
00848       "m" (a), \
00849       "m" (b), \
00850       "m" (_sse_double_sgn))      
00851 */
00852 // //////////////////////////////////////////////////////////////////////////
00853 // Multiplies the vector held in xmm0,xmm1,xmm2 by the complex number x+I*y
      //
      // Each register is treated as one complex number (low lane = real part,
      // high lane = imaginary part).  x and y are 16-byte memory operands that
      // are multiplied lane-wise; NOTE(review): presumably each holds its value
      // duplicated in both lanes - confirm against the callers.
      //
      // Steps per register v: t = y*v, swap the lanes of t, flip the sign of the
      // low lane (xor with _sse_double_sgn) giving I*y*v, then add x*v.
      //
      // On output the product is in xmm3,xmm4,xmm5 (NOT in xmm0,xmm1,xmm2);
      // xmm0,xmm1,xmm2 are overwritten with x*v.
00854 // //////////////////////////////////////////////////////////////////////////
00855 
00856 #define _sse_double_vector_mul_complex(x,y) \
00857 _ASM ("movapd %%xmm0, %%xmm3 \n\t" \
00858       "movapd %%xmm1, %%xmm4 \n\t" \
00859       "movapd %%xmm2, %%xmm5 \n\t" \
00860       "mulpd %1, %%xmm3 \n\t" \
00861       "mulpd %1, %%xmm4 \n\t" \
00862       "mulpd %1, %%xmm5 \n\t" \
00863       "shufpd $0x1, %%xmm3, %%xmm3 \n\t" \
00864       "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \
00865       "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \
00866       "xorpd %2, %%xmm3 \n\t" \
00867       "xorpd %2, %%xmm4 \n\t" \
00868       "xorpd %2, %%xmm5 \n\t" \
00869       "mulpd %0, %%xmm0 \n\t" \
00870       "mulpd %0, %%xmm1 \n\t" \
00871       "mulpd %0, %%xmm2 \n\t" \
00872       "addpd %%xmm0, %%xmm3 \n\t" \
00873       "addpd %%xmm1, %%xmm4 \n\t" \
00874       "addpd %%xmm2, %%xmm5" \
00875       : \
00876       : \
00877       "m" (x), \
00878       "m" (y), \
00879       "m" (_sse_double_sgn))      
00880 
00881 // //////////////////////////////////////////////////////////////////////////
00882 // Adds xmm3,xmm4,xmm5 to xmm0,xmm1,xmm2
      // (xmm0+=xmm3, xmm1+=xmm4, xmm2+=xmm5; xmm3,xmm4,xmm5 are unchanged).
      //
      // The two variants differ only in how the register names are escaped:
      // with SSE2FIX defined a single '%' is used, otherwise '%%'.
      // NOTE(review): presumably SSE2FIX is for compilers that treat an asm
      // template without operands as basic asm - confirm before changing.
00883 // //////////////////////////////////////////////////////////////////////////
00884 
00885 #ifdef SSE2FIX
00886 #define _sse_double_vector_add() \
00887 _ASM ("addpd %xmm3, %xmm0 \n\t" \
00888       "addpd %xmm4, %xmm1 \n\t" \
00889       "addpd %xmm5, %xmm2" \
00890       : \
00891       :)
00892 #else
00893 #define _sse_double_vector_add() \
00894 _ASM ("addpd %%xmm3, %%xmm0 \n\t" \
00895       "addpd %%xmm4, %%xmm1 \n\t" \
00896       "addpd %%xmm5, %%xmm2" \
00897       : \
00898       :)
00899 #endif
00900 
00901 // //////////////////////////////////////////////////////////////////////////
00902 // Subtracts xmm3,xmm4,xmm5 from xmm0,xmm1,xmm2
      // (xmm0-=xmm3, xmm1-=xmm4, xmm2-=xmm5; xmm3,xmm4,xmm5 are unchanged).
      //
      // The two variants differ only in how the register names are escaped:
      // with SSE2FIX defined a single '%' is used, otherwise '%%'.
      // NOTE(review): presumably SSE2FIX is for compilers that treat an asm
      // template without operands as basic asm - confirm before changing.
00903 // //////////////////////////////////////////////////////////////////////////
00904 
00905 #ifdef SSE2FIX
00906 #define _sse_double_vector_sub() \
00907 _ASM ("subpd %xmm3, %xmm0 \n\t" \
00908       "subpd %xmm4, %xmm1 \n\t" \
00909       "subpd %xmm5, %xmm2" \
00910       : \
00911       :)
00912 #else
00913 #define _sse_double_vector_sub() \
00914 _ASM ("subpd %%xmm3, %%xmm0 \n\t" \
00915       "subpd %%xmm4, %%xmm1 \n\t" \
00916       "subpd %%xmm5, %%xmm2" \
00917       : \
00918       :)
00919 #endif
00920 
00921 // //////////////////////////////////////////////////////////////////////////
00922 // Multiplies an su3 vector s with an su3 matrix u, assuming s is
00923 // stored in  xmm0,xmm1,xmm2
      // (one complex component per register: low lane real, high lane imaginary)
      //
      // The macro is four consecutive asm statements that hand values to each
      // other through the xmm registers, so their order must not be changed:
      //  1. load the real parts of u.c11,c12,c21,c23,c31 into the low lanes of
      //     xmm3,xmm6,xmm4,xmm7,xmm5;
      //  2. broadcast each loaded value to both lanes (unpcklpd) and accumulate
      //     Re(u.cij)*s_j for all nine entries into xmm3 (row 1), xmm4 (row 2)
      //     and xmm5 (row 3);
      //  3. replace s_j in xmm0,xmm1,xmm2 by I*s_j (swap the lanes, then flip
      //     the sign of the low lane with _sse_double_sgn) and start adding
      //     the Im(u.cij)*(I*s_j) terms;
      //  4. add the remaining imaginary-part terms.
      //
      // u.cij.real() and u.cij.imag() are used as "m" operands, so they must
      // yield lvalues that the asm can address in memory.
      //
00924 // On output the result is in xmm3,xmm4,xmm5 and the registers 
00925 // xmm0,xmm1,xmm2 are changed
      // (they hold intermediate values); xmm6,xmm7 are used as scratch.
00926 // //////////////////////////////////////////////////////////////////////////
00927 
00928 #define _sse_double_su3_multiply(u) { \
00929 _ASM ("movsd %0, %%xmm3 \n\t" \
00930       "movsd %1, %%xmm6 \n\t" \
00931       "movsd %2, %%xmm4 \n\t" \
00932       "movsd %3, %%xmm7 \n\t" \
00933       "movsd %4, %%xmm5 " \
00934       : \
00935       : \
00936       "m" ((u).c11.real()), \
00937       "m" ((u).c12.real()), \
00938       "m" ((u).c21.real()), \
00939       "m" ((u).c23.real()), \
00940       "m" ((u).c31.real())); \
00941 _ASM ("unpcklpd %%xmm3, %%xmm3 \n\t" \
00942       "unpcklpd %%xmm6, %%xmm6 \n\t" \
00943       "unpcklpd %%xmm4, %%xmm4 \n\t" \
00944       "mulpd %%xmm0, %%xmm3 \n\t" \
00945       "unpcklpd %%xmm7, %%xmm7 \n\t" \
00946       "mulpd %%xmm1, %%xmm6 \n\t" \
00947       "unpcklpd %%xmm5, %%xmm5 \n\t" \
00948       "mulpd %%xmm0, %%xmm4 \n\t" \
00949       "addpd %%xmm6, %%xmm3 \n\t" \
00950       "mulpd %%xmm2, %%xmm7 \n\t" \
00951       "mulpd %%xmm0, %%xmm5 \n\t" \
00952       "addpd %%xmm7, %%xmm4 \n\t" \
00953       "movsd %0, %%xmm6 \n\t" \
00954       "movsd %1, %%xmm7 \n\t" \
00955       "unpcklpd %%xmm6, %%xmm6 \n\t" \
00956       "unpcklpd %%xmm7, %%xmm7 \n\t" \
00957       "mulpd %%xmm1, %%xmm6 \n\t" \
00958       "mulpd %%xmm2, %%xmm7 \n\t" \
00959       "addpd %%xmm6, %%xmm5 \n\t" \
00960       "addpd %%xmm7, %%xmm3 \n\t" \
00961       "movsd %2, %%xmm6 \n\t" \
00962       "movsd %3, %%xmm7 \n\t" \
00963       "unpcklpd %%xmm6, %%xmm6 \n\t" \
00964       "unpcklpd %%xmm7, %%xmm7 \n\t" \
00965       "mulpd %%xmm1, %%xmm6 \n\t" \
00966       "mulpd %%xmm2, %%xmm7 \n\t" \
00967       "addpd %%xmm6, %%xmm4 \n\t" \
00968       "addpd %%xmm7, %%xmm5 " \
00969       : \
00970       : \
00971       "m" ((u).c32.real()), \
00972       "m" ((u).c13.real()), \
00973       "m" ((u).c22.real()), \
00974       "m" ((u).c33.real())); \
00975 _ASM ("movsd %0, %%xmm6 \n\t" \
00976       "movsd %1, %%xmm7 \n\t" \
00977       "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \
00978       "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \
00979       "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \
00980       "unpcklpd %%xmm6, %%xmm6 \n\t" \
00981       "unpcklpd %%xmm7, %%xmm7 \n\t" \
00982       "xorpd %4, %%xmm0 \n\t" \
00983       "xorpd %4, %%xmm1 \n\t" \
00984       "xorpd %4, %%xmm2 \n\t" \
00985       "mulpd %%xmm0, %%xmm6 \n\t" \
00986       "mulpd %%xmm1, %%xmm7 \n\t" \
00987       "addpd %%xmm6, %%xmm3 \n\t" \
00988       "addpd %%xmm7, %%xmm4 \n\t" \
00989       "movsd %2, %%xmm6 \n\t" \
00990       "movsd %3, %%xmm7 " \
00991       : \
00992       : \
00993       "m" ((u).c11.imag()), \
00994       "m" ((u).c22.imag()), \
00995       "m" ((u).c33.imag()), \
00996       "m" ((u).c21.imag()), \
00997       "m" (_sse_double_sgn)); \
00998 _ASM ("unpcklpd %%xmm6, %%xmm6 \n\t" \
00999       "unpcklpd %%xmm7, %%xmm7 \n\t" \
01000       "mulpd %%xmm2, %%xmm6 \n\t" \
01001       "mulpd %%xmm0, %%xmm7 \n\t" \
01002       "addpd %%xmm6, %%xmm5 \n\t" \
01003       "addpd %%xmm7, %%xmm4 \n\t" \
01004       "movsd %0, %%xmm6 \n\t" \
01005       "movsd %1, %%xmm7 \n\t" \
01006       "unpcklpd %%xmm6, %%xmm6 \n\t" \
01007       "unpcklpd %%xmm7, %%xmm7 \n\t" \
01008       "mulpd %%xmm1, %%xmm6 \n\t" \
01009       "mulpd %%xmm0, %%xmm7 \n\t" \
01010       "addpd %%xmm6, %%xmm3 \n\t" \
01011       "addpd %%xmm7, %%xmm5 \n\t" \
01012       "movsd %2, %%xmm0 \n\t" \
01013       "movsd %3, %%xmm6 \n\t" \
01014       "movsd %4, %%xmm7 \n\t" \
01015       "unpcklpd %%xmm0, %%xmm0 \n\t" \
01016       "unpcklpd %%xmm6, %%xmm6 \n\t" \
01017       "unpcklpd %%xmm7, %%xmm7 \n\t" \
01018       "mulpd %%xmm2, %%xmm0 \n\t" \
01019       "mulpd %%xmm1, %%xmm6 \n\t" \
01020       "mulpd %%xmm2, %%xmm7 \n\t" \
01021       "addpd %%xmm0, %%xmm3 \n\t" \
01022       "addpd %%xmm6, %%xmm5 \n\t" \
01023       "addpd %%xmm7, %%xmm4 " \
01024       : \
01025       : \
01026       "m" ((u).c12.imag()), \
01027       "m" ((u).c31.imag()), \
01028       "m" ((u).c13.imag()), \
01029       "m" ((u).c32.imag()), \
01030       "m" ((u).c23.imag())); }
01031 
01032 // //////////////////////////////////////////////////////////////////////////
01033 // Multiplies an su3 vector s with an su3 matrix u^dagger, assuming s is
01034 // stored in  xmm0,xmm1,xmm2
      // (one complex component per register: low lane real, high lane imaginary)
01035 //
      // Same four-statement scheme as _sse_double_su3_multiply, with two
      // differences that together implement the conjugate transpose:
      //  - the matrix entries are read with row and column swapped, so result
      //    row i accumulates the entries u.c1i, u.c2i, u.c3i;
      //  - before the imaginary-part terms, s_j in xmm0,xmm1,xmm2 is replaced by
      //    -I*s_j (flip the sign of the low lane with _sse_double_sgn, THEN swap
      //    the lanes), which conjugates every matrix entry.
      // The asm statements hand values to each other through the xmm registers,
      // so their order must not be changed.
      //
      // u.cij.real() and u.cij.imag() are used as "m" operands, so they must
      // yield lvalues that the asm can address in memory.
      //
01036 // On output the result is in xmm3,xmm4,xmm5 and the registers 
01037 // xmm0,xmm1,xmm2 are changed
      // (they hold intermediate values); xmm6,xmm7 are used as scratch.
01038 // //////////////////////////////////////////////////////////////////////////
01039 
01040 #define _sse_double_su3_inverse_multiply(u) { \
01041 _ASM ("movsd %0, %%xmm3 \n\t" \
01042       "movsd %1, %%xmm6 \n\t" \
01043       "movsd %2, %%xmm4 \n\t" \
01044       "movsd %3, %%xmm7 \n\t" \
01045       "movsd %4, %%xmm5 " \
01046       : \
01047       : \
01048       "m" ((u).c11.real()), \
01049       "m" ((u).c21.real()), \
01050       "m" ((u).c12.real()), \
01051       "m" ((u).c32.real()), \
01052       "m" ((u).c13.real())); \
01053 _ASM ("unpcklpd %%xmm3, %%xmm3 \n\t" \
01054       "unpcklpd %%xmm6, %%xmm6 \n\t" \
01055       "unpcklpd %%xmm4, %%xmm4 \n\t" \
01056       "mulpd %%xmm0, %%xmm3 \n\t" \
01057       "unpcklpd %%xmm7, %%xmm7 \n\t" \
01058       "mulpd %%xmm1, %%xmm6 \n\t" \
01059       "unpcklpd %%xmm5, %%xmm5 \n\t" \
01060       "mulpd %%xmm0, %%xmm4 \n\t" \
01061       "addpd %%xmm6, %%xmm3 \n\t" \
01062       "mulpd %%xmm2, %%xmm7 \n\t" \
01063       "mulpd %%xmm0, %%xmm5 \n\t" \
01064       "addpd %%xmm7, %%xmm4 \n\t" \
01065       "movsd %0, %%xmm6 \n\t" \
01066       "movsd %1, %%xmm7 \n\t" \
01067       "unpcklpd %%xmm6, %%xmm6 \n\t" \
01068       "unpcklpd %%xmm7, %%xmm7 \n\t" \
01069       "mulpd %%xmm1, %%xmm6 \n\t" \
01070       "mulpd %%xmm2, %%xmm7 \n\t" \
01071       "addpd %%xmm6, %%xmm5 \n\t" \
01072       "addpd %%xmm7, %%xmm3 \n\t" \
01073       "movsd %2, %%xmm6 \n\t" \
01074       "movsd %3, %%xmm7 \n\t" \
01075       "unpcklpd %%xmm6, %%xmm6 \n\t" \
01076       "unpcklpd %%xmm7, %%xmm7 \n\t" \
01077       "mulpd %%xmm1, %%xmm6 \n\t" \
01078       "mulpd %%xmm2, %%xmm7 \n\t" \
01079       "addpd %%xmm6, %%xmm4 \n\t" \
01080       "addpd %%xmm7, %%xmm5" \
01081       : \
01082       : \
01083       "m" ((u).c23.real()), \
01084       "m" ((u).c31.real()), \
01085       "m" ((u).c22.real()), \
01086       "m" ((u).c33.real())); \
01087 _ASM ("movsd %0, %%xmm6 \n\t" \
01088       "movsd %1, %%xmm7 \n\t" \
01089       "xorpd %4, %%xmm0 \n\t" \
01090       "xorpd %4, %%xmm1 \n\t" \
01091       "xorpd %4, %%xmm2 \n\t" \
01092       "unpcklpd %%xmm6, %%xmm6 \n\t" \
01093       "unpcklpd %%xmm7, %%xmm7 \n\t" \
01094       "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \
01095       "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \
01096       "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \
01097       "mulpd %%xmm0, %%xmm6 \n\t" \
01098       "mulpd %%xmm1, %%xmm7 \n\t" \
01099       "addpd %%xmm6, %%xmm3 \n\t" \
01100       "addpd %%xmm7, %%xmm4 \n\t" \
01101       "movsd %2, %%xmm6 \n\t" \
01102       "movsd %3, %%xmm7 " \
01103       : \
01104       : \
01105       "m" ((u).c11.imag()), \
01106       "m" ((u).c22.imag()), \
01107       "m" ((u).c33.imag()), \
01108       "m" ((u).c12.imag()), \
01109       "m" (_sse_double_sgn)); \
01110 _ASM ("unpcklpd %%xmm6, %%xmm6 \n\t" \
01111       "unpcklpd %%xmm7, %%xmm7 \n\t" \
01112       "mulpd %%xmm2, %%xmm6 \n\t" \
01113       "mulpd %%xmm0, %%xmm7 \n\t" \
01114       "addpd %%xmm6, %%xmm5 \n\t" \
01115       "addpd %%xmm7, %%xmm4 \n\t" \
01116       "movsd %0, %%xmm6 \n\t" \
01117       "movsd %1, %%xmm7 \n\t" \
01118       "unpcklpd %%xmm6, %%xmm6 \n\t" \
01119       "unpcklpd %%xmm7, %%xmm7 \n\t" \
01120       "mulpd %%xmm1, %%xmm6 \n\t" \
01121       "mulpd %%xmm0, %%xmm7 \n\t" \
01122       "addpd %%xmm6, %%xmm3 \n\t" \
01123       "addpd %%xmm7, %%xmm5 \n\t" \
01124       "movsd %2, %%xmm0 \n\t" \
01125       "movsd %3, %%xmm6 \n\t" \
01126       "movsd %4, %%xmm7 \n\t" \
01127       "unpcklpd %%xmm0, %%xmm0 \n\t" \
01128       "unpcklpd %%xmm6, %%xmm6 \n\t" \
01129       "unpcklpd %%xmm7, %%xmm7 \n\t" \
01130       "mulpd %%xmm2, %%xmm0 \n\t" \
01131       "mulpd %%xmm1, %%xmm6 \n\t" \
01132       "mulpd %%xmm2, %%xmm7 \n\t" \
01133       "addpd %%xmm0, %%xmm3 \n\t" \
01134       "addpd %%xmm6, %%xmm5 \n\t" \
01135       "addpd %%xmm7, %%xmm4 " \
01136       : \
01137       : \
01138       "m" ((u).c21.imag()), \
01139       "m" ((u).c13.imag()), \
01140       "m" ((u).c31.imag()), \
01141       "m" ((u).c23.imag()), \
01142       "m" ((u).c32.imag())); }
01143 
01144 // //////////////////////////////////////////////////////////////////////////  
01145 //
01146 // stuff used for optimized gamma matrix algebra (double)
01147 //
01148 // //////////////////////////////////////////////////////////////////////////  
01149 
01150 // //////////////////////////////////////////////////////////////////////////
01151 // Multiplies xmm3,xmm4,xmm5 with i
      // Each register holds one complex number (low lane real, high lane
      // imaginary).  The lanes are swapped first and then the sign of the low
      // lane is flipped with _sse_double_sgn: (re,im) -> (im,re) -> (-im,re).
      // The result replaces xmm3,xmm4,xmm5.
01152 // //////////////////////////////////////////////////////////////////////////
01153 
01154 #define _sse_double_vector_i_mul() \
01155 _ASM ("shufpd $0x1, %%xmm3, %%xmm3 \n\t" \
01156       "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \
01157       "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \
01158       "xorpd %0, %%xmm3 \n\t" \
01159       "xorpd %0, %%xmm4 \n\t" \
01160       "xorpd %0, %%xmm5" \
01161       : \
01162       : \
01163       "m" (_sse_double_sgn))
01164 
01165 // //////////////////////////////////////////////////////////////////////////
01166 // Multiplies xmm3,xmm4,xmm5 with -i
      // Each register holds one complex number (low lane real, high lane
      // imaginary).  The sign of the low lane is flipped first with
      // _sse_double_sgn and then the lanes are swapped:
      // (re,im) -> (-re,im) -> (im,-re).  The order is the reverse of
      // _sse_double_vector_i_mul, which is what changes the sign of the result.
      // The result replaces xmm3,xmm4,xmm5.
01167 // //////////////////////////////////////////////////////////////////////////
01168 
01169 #define _sse_double_vector_minus_i_mul() \
01170 _ASM ("xorpd %0, %%xmm3 \n\t" \
01171       "xorpd %0, %%xmm4 \n\t" \
01172       "xorpd %0, %%xmm5 \n\t" \
01173       "shufpd $0x1, %%xmm3, %%xmm3 \n\t" \
01174       "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \
01175       "shufpd $0x1, %%xmm5, %%xmm5" \
01176       : \
01177       : \
01178       "m" (_sse_double_sgn))
01179 
01180 
01181 // //////////////////////////////////////////////////////////////////////////  
01182 //
01183 // stuff used for optimized vector operations
01184 //
01185 // //////////////////////////////////////////////////////////////////////////  
01186 
01187 // //////////////////////////////////////////////////////////////////////////
01188 // c+=r[0]*r[0]+r[1]*r[1]+...+r[7]*r[7];   (lane-wise)
      //
      // r must point to eight consecutive 16-byte aligned elements (assumed to
      // be _sse_double, i.e. 16 doubles in total - the pointer arithmetic (r)+k
      // depends on the element type).  Every element is squared lane-wise and
      // the eight results are summed, so the low double of c accumulates the
      // squares of the low doubles and the high double those of the high ones.
      // The caller has to add the two doubles of c to get the full norm square.
      //
      // c is read AND written by the last asm statement, therefore it is
      // declared with the read-write constraint "+m".  (With "=m" the compiler
      // is told the old value of c is dead and may drop the store that
      // initialised it.)
      //
      // All of xmm0..xmm7 are overwritten.
01189 // //////////////////////////////////////////////////////////////////////////
01190 
01191 #define _sse_double_add_norm_square_16(r,c) { \
01192 _ASM ("movapd %0, %%xmm0 \n\t" \
01193       "movapd %1, %%xmm1 \n\t" \
01194       "movapd %2, %%xmm2 \n\t" \
01195       "movapd %3, %%xmm3" \
01196       : \
01197       : \
01198       "m" (*((r))), \
01199       "m" (*((r)+1)), \
01200       "m" (*((r)+2)), \
01201       "m" (*((r)+3))); \
01202 _ASM ("movapd %0, %%xmm4 \n\t" \
01203       "movapd %1, %%xmm5 \n\t" \
01204       "movapd %2, %%xmm6 \n\t" \
01205       "movapd %3, %%xmm7 \n\t" \
01206       "mulpd %%xmm0, %%xmm0 \n\t" \
01207       "mulpd %%xmm1, %%xmm1 \n\t" \
01208       "mulpd %%xmm2, %%xmm2 \n\t" \
01209       "mulpd %%xmm3, %%xmm3 \n\t" \
01210       "mulpd %%xmm4, %%xmm4 \n\t" \
01211       "mulpd %%xmm5, %%xmm5 \n\t" \
01212       "mulpd %%xmm6, %%xmm6 \n\t" \
01213       "mulpd %%xmm7, %%xmm7 \n\t" \
01214       "addpd %%xmm0, %%xmm1 \n\t" \
01215       "addpd %%xmm2, %%xmm3 \n\t" \
01216       "addpd %%xmm4, %%xmm5 \n\t" \
01217       "addpd %%xmm6, %%xmm7 \n\t" \
01218       "addpd %%xmm1, %%xmm3 \n\t" \
01219       "addpd %%xmm5, %%xmm7 \n\t" \
01220       "addpd %%xmm3, %%xmm7" \
01221       : \
01222       : \
01223       "m" (*((r)+4)), \
01224       "m" (*((r)+5)), \
01225       "m" (*((r)+6)), \
01226       "m" (*((r)+7))); \
01227 _ASM ("movapd %0,     %%xmm1 \n\t" \
01228       "addpd  %%xmm1, %%xmm7 \n\t" \
01229       "movapd %%xmm7, %0" \
01230       : \
01231       "+m" (c)); }
01232 
01233 // //////////////////////////////////////////////////////////////////////////
01234 // c+=r[0]*s[0]+r[1]*s[1]+...+r[7]*s[7];   (lane-wise)
      //
      // r and s must each point to eight consecutive 16-byte aligned elements
      // (assumed to be _sse_double - the pointer arithmetic (r)+k, (s)+k depends
      // on the element type).  Corresponding elements are multiplied lane-wise
      // and summed: the low double of c accumulates low*low products, the high
      // double high*high products.
      //
      // c is read AND written by the last asm statement, therefore it is
      // declared with the read-write constraint "+m".  (With "=m" the compiler
      // is told the old value of c is dead and may drop the store that
      // initialised it.)
      //
      // All of xmm0..xmm7 are overwritten.
01235 // //////////////////////////////////////////////////////////////////////////
01236 
01237 #define _sse_double_add_real_scalar_product_16(r,s,c) { \
01238 _ASM ("movapd %0, %%xmm0 \n\t" \
01239       "movapd %1, %%xmm1 \n\t" \
01240       "movapd %2, %%xmm2 \n\t" \
01241       "movapd %3, %%xmm3 \n\t" \
01242       : \
01243       : \
01244       "m" (*((r))), \
01245       "m" (*((r)+1)), \
01246       "m" (*((r)+2)), \
01247       "m" (*((r)+3))); \
01248 _ASM ("mulpd %0, %%xmm0 \n\t" \
01249       "mulpd %1, %%xmm1 \n\t" \
01250       "mulpd %2, %%xmm2 \n\t" \
01251       "mulpd %3, %%xmm3 \n\t" \
01252       : \
01253       : \
01254       "m" (*((s))), \
01255       "m" (*((s)+1)), \
01256       "m" (*((s)+2)), \
01257       "m" (*((s)+3))); \
01258 _ASM ("movapd %0, %%xmm4 \n\t" \
01259       "movapd %1, %%xmm5 \n\t" \
01260       "movapd %2, %%xmm6 \n\t" \
01261       "movapd %3, %%xmm7 \n\t" \
01262       : \
01263       : \
01264       "m" (*((r)+4)), \
01265       "m" (*((r)+5)), \
01266       "m" (*((r)+6)), \
01267       "m" (*((r)+7))); \
01268 _ASM ("mulpd %0, %%xmm4 \n\t" \
01269       "mulpd %1, %%xmm5 \n\t" \
01270       "mulpd %2, %%xmm6 \n\t" \
01271       "mulpd %3, %%xmm7 \n\t" \
01272       : \
01273       : \
01274       "m" (*((s)+4)), \
01275       "m" (*((s)+5)), \
01276       "m" (*((s)+6)), \
01277       "m" (*((s)+7))); \
01278 _ASM ("addpd %%xmm0, %%xmm1 \n\t" \
01279       "addpd %%xmm2, %%xmm3 \n\t" \
01280       "addpd %%xmm4, %%xmm5 \n\t" \
01281       "addpd %%xmm6, %%xmm7 \n\t" \
01282       "addpd %%xmm1, %%xmm3 \n\t" \
01283       "addpd %%xmm5, %%xmm7 \n\t" \
01284       "addpd %%xmm3, %%xmm7 \n\t" \
01285       "movapd %0, %%xmm1    \n\t" \
01286       "addpd %%xmm1, %%xmm7 \n\t" \
01287       "movapd %%xmm7, %0    \n\t" \
01288       : \
01289       "+m" (c)); }
01290 
      // //////////////////////////////////////////////////////////////////////////
      // c+=swap(r[0])*s[0]+swap(r[1])*s[1]+...+swap(r[7])*s[7];   (lane-wise)
      //
      // Like _sse_double_add_real_scalar_product_16, except that the two
      // doubles of every r element are swapped (shufpd $0x1) before the
      // lane-wise multiply: the low double of c accumulates r.high*s.low and the
      // high double accumulates r.low*s.high.
      // NOTE(review): presumably the caller takes the difference of the two
      // doubles of c to obtain the imaginary part - confirm against the callers.
      //
      // c is read AND written by the last asm statement, therefore it is
      // declared with the read-write constraint "+m".  (With "=m" the compiler
      // is told the old value of c is dead and may drop the store that
      // initialised it.)
      //
      // All of xmm0..xmm7 are overwritten.
      // //////////////////////////////////////////////////////////////////////////
01291 #define _sse_double_add_imag_scalar_product_16(r,s,c) { \
01292 _ASM ("movapd %0, %%xmm0 \n\t" \
01293       "movapd %1, %%xmm1 \n\t" \
01294       "movapd %2, %%xmm2 \n\t" \
01295       "movapd %3, %%xmm3 \n\t" \
01296       : \
01297       : \
01298       "m" (*((r))), \
01299       "m" (*((r)+1)), \
01300       "m" (*((r)+2)), \
01301       "m" (*((r)+3))); \
01302 _ASM ("shufpd $0x1, %%xmm0, %%xmm0 \n\t" \
01303       "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \
01304       "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \
01305       "shufpd $0x1, %%xmm3, %%xmm3 \n\t" \
01306       "mulpd %0, %%xmm0 \n\t" \
01307       "mulpd %1, %%xmm1 \n\t" \
01308       "mulpd %2, %%xmm2 \n\t" \
01309       "mulpd %3, %%xmm3 \n\t" \
01310       : \
01311       : \
01312       "m" (*((s))), \
01313       "m" (*((s)+1)), \
01314       "m" (*((s)+2)), \
01315       "m" (*((s)+3))); \
01316 _ASM ("movapd %0, %%xmm4 \n\t" \
01317       "movapd %1, %%xmm5 \n\t" \
01318       "movapd %2, %%xmm6 \n\t" \
01319       "movapd %3, %%xmm7 \n\t" \
01320       : \
01321       : \
01322       "m" (*((r)+4)), \
01323       "m" (*((r)+5)), \
01324       "m" (*((r)+6)), \
01325       "m" (*((r)+7))); \
01326 _ASM ("shufpd $0x1, %%xmm4, %%xmm4 \n\t" \
01327       "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \
01328       "shufpd $0x1, %%xmm6, %%xmm6 \n\t" \
01329       "shufpd $0x1, %%xmm7, %%xmm7 \n\t" \
01330       "mulpd %0, %%xmm4 \n\t" \
01331       "mulpd %1, %%xmm5 \n\t" \
01332       "mulpd %2, %%xmm6 \n\t" \
01333       "mulpd %3, %%xmm7 \n\t" \
01334       : \
01335       : \
01336       "m" (*((s)+4)), \
01337       "m" (*((s)+5)), \
01338       "m" (*((s)+6)), \
01339       "m" (*((s)+7))); \
01340 _ASM ("addpd %%xmm0, %%xmm1 \n\t" \
01341       "addpd %%xmm2, %%xmm3 \n\t" \
01342       "addpd %%xmm4, %%xmm5 \n\t" \
01343       "addpd %%xmm6, %%xmm7 \n\t" \
01344       "addpd %%xmm1, %%xmm3 \n\t" \
01345       "addpd %%xmm5, %%xmm7 \n\t" \
01346       "addpd %%xmm3, %%xmm7 \n\t" \
01347       "movapd %0, %%xmm1    \n\t" \
01348       "addpd %%xmm1, %%xmm7 \n\t" \
01349       "movapd %%xmm7, %0    \n\t" \
01350       : \
01351       "+m" (c)); }
01352 
      // //////////////////////////////////////////////////////////////////////////
      // r = hermitian conjugate (conjugate transpose) of the 3x3 matrix s
      //
      // r and s are indexed as nine consecutive 16-byte aligned elements in
      // row-major order (element 3*i+j is entry i,j), each holding one complex
      // number (low double real, high double imaginary) - assumed from the
      // index pattern below; confirm against the callers.
      //
      // Every element is conjugated by flipping the sign of its high double
      // (xor with _sse_double_sgn2) and stored at the transposed index:
      //   0->0, 4->4, 8->8     (diagonal)
      //   1->3, 2->6, 5->7     (upper triangle to lower)
      //   3->1, 6->2, 7->5     (lower triangle to upper)
      //
      // r must not be the same matrix as s: elements 3,6,7 of r are written
      // before elements 3,6,7 of s are read.
      //
      // xmm0,xmm1,xmm2 are overwritten.
      //
      // NOTE(review): the backslash after the closing brace splices the next
      // line into the macro; this is harmless only while that line stays blank.
      // //////////////////////////////////////////////////////////////////////////
01353 #define _sse_double_hermitian_su3(r,s) { \
01354 _ASM ("movapd %0, %%xmm0 \n\t"\
01355       "xorpd  %3, %%xmm0 \n\t" \
01356       "movapd %1, %%xmm1 \n\t"\
01357       "xorpd  %3, %%xmm1 \n\t" \
01358       "movapd %2, %%xmm2 \n\t"\
01359       "xorpd  %3, %%xmm2 \n\t" \
01360       : \
01361       : \
01362       "m" (*((s))), \
01363       "m" (*((s)+4)), \
01364       "m" (*((s)+8)), \
01365       "m" (_sse_double_sgn2)); \
01366 _ASM ("movapd %%xmm0, %0 \n\t"\
01367       "movapd %%xmm1, %1 \n\t"\
01368       "movapd %%xmm2, %2 \n\t"\
01369       : \
01370       "=m" (*((r))), \
01371       "=m" (*((r)+4)), \
01372       "=m" (*((r)+8))); \
01373 _ASM ("movapd %0, %%xmm0 \n\t"\
01374       "xorpd  %3, %%xmm0 \n\t" \
01375       "movapd %1, %%xmm1 \n\t"\
01376       "xorpd  %3, %%xmm1 \n\t" \
01377       "movapd %2, %%xmm2 \n\t"\
01378       "xorpd  %3, %%xmm2 \n\t" \
01379       : \
01380       : \
01381       "m" (*((s)+1)), \
01382       "m" (*((s)+2)), \
01383       "m" (*((s)+5)), \
01384       "m" (_sse_double_sgn2)); \
01385 _ASM ("movapd %%xmm0, %0 \n\t"\
01386       "movapd %%xmm1, %1 \n\t"\
01387       "movapd %%xmm2, %2 \n\t"\
01388       : \
01389       "=m" (*((r)+3)), \
01390       "=m" (*((r)+6)), \
01391       "=m" (*((r)+7))); \
01392 _ASM ("movapd %0, %%xmm0 \n\t"\
01393       "xorpd  %3, %%xmm0 \n\t" \
01394       "movapd %1, %%xmm1 \n\t"\
01395       "xorpd  %3, %%xmm1 \n\t" \
01396       "movapd %2, %%xmm2 \n\t"\
01397       "xorpd  %3, %%xmm2 \n\t" \
01398       : \
01399       : \
01400       "m" (*((s)+3)), \
01401       "m" (*((s)+6)), \
01402       "m" (*((s)+7)), \
01403       "m" (_sse_double_sgn2)); \
01404 _ASM ("movapd %%xmm0, %0 \n\t"\
01405       "movapd %%xmm1, %1 \n\t"\
01406       "movapd %%xmm2, %2 \n\t"\
01407       : \
01408       "=m" (*((r)+1)), \
01409       "=m" (*((r)+2)), \
01410       "=m" (*((r)+5))); } \
01411 
01412 // //////////////////////////////////////////////////////////////////////////
01413 // r[0].c1=s[0].c1, r[0].c2=s[0].c2 ... r[7].c1=s[7].c1, r[7].c2=s[7].c2;
      //
      // Copies eight consecutive 16-byte aligned elements from s to r (assumed
      // to be _sse_double, i.e. 16 doubles - the pointer arithmetic depends on
      // the element type).  All eight elements are loaded into xmm0..xmm7
      // before the first one is stored.  All of xmm0..xmm7 are overwritten.
01414 // //////////////////////////////////////////////////////////////////////////
01415 
01416 #define _sse_double_copy_16(r,s) { \
01417 _ASM ("movapd %0, %%xmm0 \n\t" \
01418       "movapd %1, %%xmm1 \n\t" \
01419       "movapd %2, %%xmm2 \n\t" \
01420       "movapd %3, %%xmm3 \n\t" \
01421       : \
01422       : \
01423       "m" (*((s))), \
01424       "m" (*((s)+1)), \
01425       "m" (*((s)+2)), \
01426       "m" (*((s)+3)));         \
01427 _ASM ("movapd %0, %%xmm4 \n\t"  \
01428       "movapd %1, %%xmm5 \n\t" \
01429       "movapd %2, %%xmm6 \n\t" \
01430       "movapd %3, %%xmm7 \n\t" \
01431       : \
01432       : \
01433       "m" (*((s)+4)), \
01434       "m" (*((s)+5)), \
01435       "m" (*((s)+6)), \
01436       "m" (*((s)+7))); \
01437 _ASM ("movapd %%xmm0, %0 \n\t" \
01438       "movapd %%xmm1, %1 \n\t" \
01439       "movapd %%xmm2, %2 \n\t" \
01440       "movapd %%xmm3, %3 \n\t" \
01441       : \
01442       "=m" (*((r))), \
01443       "=m" (*((r)+1)), \
01444       "=m" (*((r)+2)), \
01445       "=m" (*((r)+3)));        \
01446 _ASM ("movapd %%xmm4, %0 \n\t"  \
01447       "movapd %%xmm5, %1 \n\t" \
01448       "movapd %%xmm6, %2 \n\t" \
01449       "movapd %%xmm7, %3 \n\t" \
01450       : \
01451       "=m" (*((r)+4)), \
01452       "=m" (*((r)+5)), \
01453       "=m" (*((r)+6)), \
01454       "=m" (*((r)+7))); }
01455 
01456 // //////////////////////////////////////////////////////////////////////////
01457 // r[0].c1+=s[0].c1, r[0].c2+=s[0].c2 ... r[7].c2+=s[7].c2;
      //
      // r and s must each point to eight consecutive 16-byte aligned elements
      // (assumed to be _sse_double - the pointer arithmetic depends on the
      // element type).  The eight s elements are loaded into xmm0..xmm7, the
      // r elements are added to them from memory, and the sums are stored back
      // to r.  All of xmm0..xmm7 are overwritten.
01458 // //////////////////////////////////////////////////////////////////////////
01459 
01460 #define _sse_double_add_16(r,s) { \
01461 _ASM ("movapd %0, %%xmm0 \n\t" \
01462      "movapd %1, %%xmm1 \n\t" \
01463      "movapd %2, %%xmm2 \n\t" \
01464      "movapd %3, %%xmm3 \n\t" \
01465      : \
01466      : \
01467      "m" (*((s))), \
01468      "m" (*((s)+1)), \
01469      "m" (*((s)+2)), \
01470      "m" (*((s)+3)));         \
01471 _ASM ("movapd %0, %%xmm4 \n\t"          \
01472      "movapd %1, %%xmm5 \n\t" \
01473      "movapd %2, %%xmm6 \n\t" \
01474      "movapd %3, %%xmm7 \n\t" \
01475      : \
01476      : \
01477      "m" (*((s)+4)), \
01478      "m" (*((s)+5)), \
01479      "m" (*((s)+6)), \
01480      "m" (*((s)+7))); \
01481 _ASM ("addpd %0, %%xmm0 \n\t" \
01482       "addpd %1, %%xmm1 \n\t" \
01483       "addpd %2, %%xmm2 \n\t" \
01484       "addpd %3, %%xmm3 \n\t" \
01485       : \
01486       : \
01487       "m" (*((r))), \
01488       "m" (*((r)+1)), \
01489       "m" (*((r)+2)), \
01490       "m" (*((r)+3)));        \
01491 _ASM ("addpd %0, %%xmm4 \n\t"           \
01492       "addpd %1, %%xmm5 \n\t" \
01493       "addpd %2, %%xmm6 \n\t" \
01494       "addpd %3, %%xmm7 \n\t" \
01495       : \
01496       : \
01497       "m" (*((r)+4)), \
01498       "m" (*((r)+5)), \
01499       "m" (*((r)+6)), \
01500       "m" (*((r)+7))); \
01501 _ASM ("movapd %%xmm0, %0 \n\t" \
01502       "movapd %%xmm1, %1 \n\t" \
01503       "movapd %%xmm2, %2 \n\t" \
01504       "movapd %%xmm3, %3 \n\t" \
01505       :                        \
01506       "=m" (*((r))),           \
01507       "=m" (*((r)+1)),         \
01508       "=m" (*((r)+2)),         \
01509       "=m" (*((r)+3)));        \
01510 _ASM ("movapd %%xmm4, %0 \n\t"          \
01511       "movapd %%xmm5, %1 \n\t"                  \
01512       "movapd %%xmm6, %2 \n\t"                  \
01513       "movapd %%xmm7, %3 \n\t"                  \
01514       :                                         \
01515       "=m" (*((r)+4)),                          \
01516       "=m" (*((r)+5)),                          \
01517       "=m" (*((r)+6)),                          \
01518       "=m" (*((r)+7))); }
01519 
01520 // //////////////////////////////////////////////////////////////////////////
01521 // r[0].c1-=s[0].c1, r[0].c2-=s[0].c2 ... r[7].c2-=s[7].c2;
      //
      // r and s must each point to eight consecutive 16-byte aligned elements
      // (assumed to be _sse_double - the pointer arithmetic depends on the
      // element type).
      //
      // Subtraction is not commutative, so (unlike _sse_double_add_16) the
      // minuend r is the one loaded into xmm0..xmm7 and s is subtracted from
      // it in memory-operand form (subpd mem,reg computes reg-=mem).  Loading s
      // and subtracting r would store s-r, i.e. the result with the wrong sign.
      //
      // All of xmm0..xmm7 are overwritten.
01522 // //////////////////////////////////////////////////////////////////////////
01523 
01524 #define _sse_double_sub_16(r,s) { \
01525 _ASM ("movapd %0, %%xmm0 \n\t" \
01526       "movapd %1, %%xmm1 \n\t" \
01527       "movapd %2, %%xmm2 \n\t" \
01528       "movapd %3, %%xmm3 \n\t" \
01529       : \
01530       : \
01531       "m" (*((r))), \
01532       "m" (*((r)+1)), \
01533       "m" (*((r)+2)), \
01534       "m" (*((r)+3))); \
01535 _ASM ("movapd %0, %%xmm4 \n\t" \
01536       "movapd %1, %%xmm5 \n\t" \
01537       "movapd %2, %%xmm6 \n\t" \
01538       "movapd %3, %%xmm7 \n\t" \
01539       : \
01540       : \
01541       "m" (*((r)+4)), \
01542       "m" (*((r)+5)), \
01543       "m" (*((r)+6)), \
01544       "m" (*((r)+7))); \
01545 _ASM ("subpd %0, %%xmm0 \n\t" \
01546       "subpd %1, %%xmm1 \n\t" \
01547       "subpd %2, %%xmm2 \n\t" \
01548       "subpd %3, %%xmm3 \n\t" \
01549       : \
01550       : \
01551       "m" (*((s))), \
01552       "m" (*((s)+1)), \
01553       "m" (*((s)+2)), \
01554       "m" (*((s)+3))); \
01555 _ASM ("subpd %0, %%xmm4 \n\t" \
01556       "subpd %1, %%xmm5 \n\t" \
01557       "subpd %2, %%xmm6 \n\t" \
01558       "subpd %3, %%xmm7 \n\t" \
01559       : \
01560       : \
01561       "m" (*((s)+4)), \
01562       "m" (*((s)+5)), \
01563       "m" (*((s)+6)), \
01564       "m" (*((s)+7))); \
01565 _ASM ("movapd %%xmm0, %0 \n\t" \
01566       "movapd %%xmm1, %1 \n\t" \
01567       "movapd %%xmm2, %2 \n\t" \
01568       "movapd %%xmm3, %3 \n\t" \
01569       : \
01570       "=m" (*((r))), \
01571       "=m" (*((r)+1)), \
01572       "=m" (*((r)+2)), \
01573       "=m" (*((r)+3))); \
01574 _ASM ("movapd %%xmm4, %0 \n\t" \
01575       "movapd %%xmm5, %1 \n\t" \
01576       "movapd %%xmm6, %2 \n\t" \
01577       "movapd %%xmm7, %3 \n\t" \
01578       : \
01579       "=m" (*((r)+4)), \
01580       "=m" (*((r)+5)), \
01581       "=m" (*((r)+6)), \
01582       "=m" (*((r)+7))); }
01583 
01584 // //////////////////////////////////////////////////////////////////////////
01585 // r[0].c1+=c*s[0].c1, r[0].c2+=c*s[0].c2 ... r[7].c2+=c*s[7].c2;
      //
      // r and s must each point to eight consecutive 16-byte aligned elements
      // (assumed to be _sse_double - the pointer arithmetic depends on the
      // element type).  c is a 16-byte memory operand multiplied lane-wise, so
      // both of its doubles must be equal to scale by a single real number.
      //
      // The s elements are loaded into xmm0..xmm7 and scaled by c, the r
      // elements are added from memory, and the sums are stored back to r.
      // All of xmm0..xmm7 are overwritten.
01586 // //////////////////////////////////////////////////////////////////////////
01587 
01588 #define  _sse_double_add_multiply_16(r,c,s) { \
01589 _ASM ("movapd %0, %%xmm0 \n\t" \
01590      "movapd %1, %%xmm1 \n\t" \
01591      "movapd %2, %%xmm2 \n\t" \
01592      "movapd %3, %%xmm3 \n\t" \
01593      : \
01594      : \
01595      "m" (*((s))), \
01596      "m" (*((s)+1)), \
01597      "m" (*((s)+2)), \
01598       "m" (*((s)+3)));        \
01599 _ASM ("movapd %0, %%xmm4 \n\t"          \
01600      "movapd %1, %%xmm5 \n\t" \
01601      "movapd %2, %%xmm6 \n\t" \
01602      "movapd %3, %%xmm7 \n\t" \
01603      "mulpd %4, %%xmm0 \n\t" \
01604      "mulpd %4, %%xmm1 \n\t" \
01605      "mulpd %4, %%xmm2 \n\t" \
01606      "mulpd %4, %%xmm3 \n\t" \
01607      "mulpd %4, %%xmm4 \n\t" \
01608      "mulpd %4, %%xmm5 \n\t" \
01609      "mulpd %4, %%xmm6 \n\t" \
01610      "mulpd %4, %%xmm7 \n\t" \
01611      : \
01612      : \
01613      "m" (*((s)+4)), \
01614      "m" (*((s)+5)), \
01615      "m" (*((s)+6)), \
01616      "m" (*((s)+7)), \
01617      "m" (c)); \
01618 _ASM ("addpd %0, %%xmm0 \n\t" \
01619       "addpd %1, %%xmm1 \n\t" \
01620       "addpd %2, %%xmm2 \n\t" \
01621       "addpd %3, %%xmm3 \n\t" \
01622       : \
01623       : \
01624       "m" (*((r))), \
01625       "m" (*((r)+1)), \
01626       "m" (*((r)+2)), \
01627       "m" (*((r)+3)));        \
01628 _ASM ("addpd %0, %%xmm4 \n\t"           \
01629       "addpd %1, %%xmm5 \n\t" \
01630       "addpd %2, %%xmm6 \n\t" \
01631       "addpd %3, %%xmm7 \n\t" \
01632       : \
01633       : \
01634       "m" (*((r)+4)), \
01635       "m" (*((r)+5)), \
01636       "m" (*((r)+6)), \
01637       "m" (*((r)+7))); \
01638 _ASM ("movapd %%xmm0, %0 \n\t" \
01639       "movapd %%xmm1, %1 \n\t" \
01640       "movapd %%xmm2, %2 \n\t" \
01641       "movapd %%xmm3, %3 \n\t" \
01642       : \
01643       "=m" (*((r))), \
01644       "=m" (*((r)+1)), \
01645       "=m" (*((r)+2)), \
01646       "=m" (*((r)+3)));        \
01647 _ASM ("movapd %%xmm4, %0 \n\t"  \
01648       "movapd %%xmm5, %1 \n\t" \
01649       "movapd %%xmm6, %2 \n\t" \
01650       "movapd %%xmm7, %3 \n\t" \
01651       : \
01652       "=m" (*((r)+4)), \
01653       "=m" (*((r)+5)), \
01654       "=m" (*((r)+6)), \
01655       "=m" (*((r)+7))); }
01656 
      // //////////////////////////////////////////////////////////////////////////
      // r[0].c1=c*s[0].c1, r[0].c2=c*s[0].c2 ... r[7].c2=c*s[7].c2;
      //
      // r and s must each point to eight consecutive 16-byte aligned elements
      // (assumed to be _sse_double - the pointer arithmetic depends on the
      // element type).  c is a 16-byte memory operand multiplied lane-wise, so
      // both of its doubles must be equal to scale by a single real number.
      //
      // The s elements are loaded into xmm0..xmm7, scaled by c and stored to r;
      // the previous contents of r are not read.
      // All of xmm0..xmm7 are overwritten.
      // //////////////////////////////////////////////////////////////////////////
01657 #define  _sse_double_multiply_16(r,c,s) { \
01658 _ASM ("movapd %0, %%xmm0 \n\t" \
01659       "movapd %1, %%xmm1 \n\t" \
01660       "movapd %2, %%xmm2 \n\t" \
01661       "movapd %3, %%xmm3 \n\t" \
01662       : \
01663       : \
01664       "m" (*((s))), \
01665       "m" (*((s)+1)), \
01666       "m" (*((s)+2)), \
01667       "m" (*((s)+3)));         \
01668 _ASM ("movapd %0, %%xmm4 \n\t"  \
01669       "movapd %1, %%xmm5 \n\t" \
01670       "movapd %2, %%xmm6 \n\t" \
01671       "movapd %3, %%xmm7 \n\t" \
01672       : \
01673       : \
01674       "m" (*((s)+4)), \
01675       "m" (*((s)+5)), \
01676       "m" (*((s)+6)), \
01677       "m" (*((s)+7))); \
01678 _ASM ("mulpd  %0, %%xmm0 \n\t" \
01679       "mulpd  %0, %%xmm1 \n\t" \
01680       "mulpd  %0, %%xmm2 \n\t" \
01681       "mulpd  %0, %%xmm3 \n\t" \
01682       "mulpd  %0, %%xmm4 \n\t" \
01683       "mulpd  %0, %%xmm5 \n\t" \
01684       "mulpd  %0, %%xmm6 \n\t" \
01685       "mulpd  %0, %%xmm7 \n\t" \
01686       : \
01687       : \
01688       "m" (c)); \
01689 _ASM ("movapd %%xmm0, %0 \n\t" \
01690       "movapd %%xmm1, %1 \n\t" \
01691       "movapd %%xmm2, %2 \n\t" \
01692       "movapd %%xmm3, %3 \n\t" \
01693       : \
01694       "=m" (*((r))), \
01695       "=m" (*((r)+1)), \
01696       "=m" (*((r)+2)), \
01697       "=m" (*((r)+3)));        \
01698 _ASM ("movapd %%xmm4, %0 \n\t"  \
01699       "movapd %%xmm5, %1 \n\t" \
01700       "movapd %%xmm6, %2 \n\t" \
01701       "movapd %%xmm7, %3 \n\t" \
01702       : \
01703       "=m" (*((r)+4)), \
01704       "=m" (*((r)+5)), \
01705       "=m" (*((r)+6)), \
01706       "=m" (*((r)+7))); }
01707 
01708 
      // //////////////////////////////////////////////////////////////////////////
      // Reports an error (via error()) if the address var is not aligned.
      //
      // var  : address to check
      // base : bit mask of the low address bits that must all be zero
      //        (presumably 0xf for the 16-byte alignment movaps/movapd need -
      //        confirm against the callers)
      //
      // The address is converted with a cast to unsigned long, the type of the
      // variable that receives it.  A cast to unsigned int would discard the
      // upper half of a 64-bit pointer and is rejected by C++ compilers on
      // such targets ("cast loses precision").
      // //////////////////////////////////////////////////////////////////////////
01709 static void _sse_check_alignment(void* var, unsigned long base) {
01710   unsigned long af1=(unsigned long) var;
01711   if ((af1&base)!=0) {
01712     error("_sse_check_alignment()\nVariable not aligned properly");
01713   }
01714 }
01715 

Generated on Sun Feb 27 15:12:19 2005 by  doxygen 1.4.1