00001
00002
00003
00004
00005
00006
00007
00008
00009
/* Compiler shorthands (GCC attribute / inline-assembly syntax). */
#define ALIGN16 __attribute__ ((__aligned__ (16)))   /* request 16-byte alignment */
#define ALIGN64 __attribute__ ((__aligned__ (64)))   /* request 64-byte alignment */
#define _ASM    __asm__ __volatile__                 /* volatile inline-asm statement */
00013
00014 typedef struct { float c1,c2,c3,c4; } _sse_float ALIGN16;
00015 typedef struct { _sse_float c1,c2,c3;} _sse_vector ALIGN16;
00016 typedef struct { int c1,c2,c3,c4;} _sse_int ALIGN16;
00017 typedef struct { double c1,c2; } _sse_double ALIGN16;
00018
00019 typedef struct {mdp_complex c11,c12,c13,c21,c22,c23,c31,c32,c33; } _sse_su3;
00020 typedef struct {mdp_complex c1,c2,c3; } _sse_su3_vector;
00021 typedef struct {_sse_su3_vector c1,c2,c3,c4; } _sse_spinor;
00022
00023 static _sse_float _sse_float_sgn12 __attribute__ ((unused)) = {-1.0f,-1.0f,1.0f,1.0f};
00024 static _sse_float _sse_float_sgn13 __attribute__ ((unused)) = {-1.0f,1.0f,-1.0f,1.0f};
00025 static _sse_float _sse_float_sgn14 __attribute__ ((unused)) = {-1.0f,1.0f,1.0f,-1.0f};
00026 static _sse_float _sse_float_sgn23 __attribute__ ((unused)) = {1.0f,-1.0f,-1.0f,1.0f};
00027 static _sse_float _sse_float_sgn24 __attribute__ ((unused)) = {1.0f,-1.0f,1.0f,-1.0f};
00028 static _sse_float _sse_float_sgn34 __attribute__ ((unused)) = {1.0f,1.0f,-1.0f,-1.0f};
00029 static _sse_int _sse_double_sgn __attribute__ ((unused)) = {0x0,0x80000000,0x0,0x0};
00030 static _sse_int _sse_double_sgn2 __attribute__ ((unused)) = {0x0,0x0,0x0,0x80000000};
00031
00032
00033
00034
00035
00036
/*
 * Issue two prefetcht0 hints: one for the address obtained by rounding addr
 * down to a multiple of 128, and one for the address 128 bytes after that.
 *
 * The rounding is done in __SIZE_TYPE__ (predefined by GCC-compatible
 * compilers, as wide as a pointer on x86 and x86-64) instead of
 * `unsigned int`, so the upper address bits are not discarded on 64-bit
 * targets.
 */
#define _sse_float_prefetch_spinor(addr) \
_ASM ("prefetcht0 %0 \n\t" \
      "prefetcht0 %1" \
      : /* no outputs */ \
      : "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f)))), \
        "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f))+128)))
00044
/*
 * Issue two prefetcht0 hints: one for the address obtained by rounding addr
 * down to a multiple of 128, and one for the address 128 bytes after that.
 *
 * The rounding is done in __SIZE_TYPE__ (predefined by GCC-compatible
 * compilers, as wide as a pointer on x86 and x86-64) instead of
 * `unsigned int`, so the upper address bits are not discarded on 64-bit
 * targets.
 */
#define _sse_float_prefetch_su3(addr) \
_ASM ("prefetcht0 %0 \n\t" \
      "prefetcht0 %1" \
      : /* no outputs */ \
      : "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f)))), \
        "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f))+128)))
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
/*
 * Fill xmm0, xmm1, xmm2 from two sources: the low 64 bits of each register
 * come from (sl).c1/.c2/.c3 and the high 64 bits from (sh).c1/.c2/.c3.
 * movlps/movhps move 8 bytes each, so every member is read as an 8-byte
 * quantity (presumably one single-precision complex number -- confirm
 * against the callers' types).
 */
#define _sse_float_pair_load(sl,sh) \
_ASM ("movlps %0, %%xmm0 \n\t"   /* xmm0 low  half <- (sl).c1 */ \
      "movlps %1, %%xmm1 \n\t"   /* xmm1 low  half <- (sl).c2 */ \
      "movlps %2, %%xmm2 \n\t"   /* xmm2 low  half <- (sl).c3 */ \
      "movhps %3, %%xmm0 \n\t"   /* xmm0 high half <- (sh).c1 */ \
      "movhps %4, %%xmm1 \n\t"   /* xmm1 high half <- (sh).c2 */ \
      "movhps %5, %%xmm2"        /* xmm2 high half <- (sh).c3 */ \
      : /* no outputs */ \
      : "m" ((sl).c1), "m" ((sl).c2), "m" ((sl).c3), \
        "m" ((sh).c1), "m" ((sh).c2), "m" ((sh).c3))
00101
00102
00103
00104
00105
/*
 * Same as the pair load into xmm0..xmm2, but targeting xmm3, xmm4, xmm5:
 * low 64 bits from (sl).c1/.c2/.c3, high 64 bits from (sh).c1/.c2/.c3.
 */
#define _sse_float_pair_load_up(sl,sh) \
_ASM ("movlps %0, %%xmm3 \n\t"   /* xmm3 low  half <- (sl).c1 */ \
      "movlps %1, %%xmm4 \n\t"   /* xmm4 low  half <- (sl).c2 */ \
      "movlps %2, %%xmm5 \n\t"   /* xmm5 low  half <- (sl).c3 */ \
      "movhps %3, %%xmm3 \n\t"   /* xmm3 high half <- (sh).c1 */ \
      "movhps %4, %%xmm4 \n\t"   /* xmm4 high half <- (sh).c2 */ \
      "movhps %5, %%xmm5"        /* xmm5 high half <- (sh).c3 */ \
      : /* no outputs */ \
      : "m" ((sl).c1), "m" ((sl).c2), "m" ((sl).c3), \
        "m" ((sh).c1), "m" ((sh).c2), "m" ((sh).c3))
00121
00122
00123
00124
00125
00126
/*
 * Write xmm0, xmm1, xmm2 back to two destinations: the low 64 bits of each
 * register go to (rl).c1/.c2/.c3 and the high 64 bits to (rh).c1/.c2/.c3.
 */
#define _sse_float_pair_store(rl,rh) \
_ASM ("movlps %%xmm0, %0 \n\t"   /* (rl).c1 <- xmm0 low  half */ \
      "movlps %%xmm1, %1 \n\t"   /* (rl).c2 <- xmm1 low  half */ \
      "movlps %%xmm2, %2 \n\t"   /* (rl).c3 <- xmm2 low  half */ \
      "movhps %%xmm0, %3 \n\t"   /* (rh).c1 <- xmm0 high half */ \
      "movhps %%xmm1, %4 \n\t"   /* (rh).c2 <- xmm1 high half */ \
      "movhps %%xmm2, %5"        /* (rh).c3 <- xmm2 high half */ \
      : "=m" ((rl).c1), "=m" ((rl).c2), "=m" ((rl).c3), \
        "=m" ((rh).c1), "=m" ((rh).c2), "=m" ((rh).c3))
00141
00142
00143
00144
00145
/*
 * Write xmm3, xmm4, xmm5 back to two destinations: the low 64 bits of each
 * register go to (rl).c1/.c2/.c3 and the high 64 bits to (rh).c1/.c2/.c3.
 */
#define _sse_float_pair_store_up(rl,rh) \
_ASM ("movlps %%xmm3, %0 \n\t"   /* (rl).c1 <- xmm3 low  half */ \
      "movlps %%xmm4, %1 \n\t"   /* (rl).c2 <- xmm4 low  half */ \
      "movlps %%xmm5, %2 \n\t"   /* (rl).c3 <- xmm5 low  half */ \
      "movhps %%xmm3, %3 \n\t"   /* (rh).c1 <- xmm3 high half */ \
      "movhps %%xmm4, %4 \n\t"   /* (rh).c2 <- xmm4 high half */ \
      "movhps %%xmm5, %5"        /* (rh).c3 <- xmm5 high half */ \
      : "=m" ((rl).c1), "=m" ((rl).c2), "=m" ((rl).c3), \
        "=m" ((rh).c1), "=m" ((rh).c2), "=m" ((rh).c3))
00160
00161
00162
00163
00164
/*
 * Load (s).c1, (s).c2, (s).c3 into xmm0, xmm1, xmm2 with movaps, i.e. as
 * three 16-byte packed-float values.  movaps requires each memory operand
 * to be 16-byte aligned.
 */
#define _sse_float_vector_load(s) \
_ASM ("movaps %0, %%xmm0 \n\t" \
      "movaps %1, %%xmm1 \n\t" \
      "movaps %2, %%xmm2" \
      : /* no outputs */ \
      : "m" ((s).c1), "m" ((s).c2), "m" ((s).c3))
00174
/*
 * Load (s).c1, (s).c2, (s).c3 into xmm3, xmm4, xmm5 with movaps, i.e. as
 * three 16-byte packed-float values.  movaps requires each memory operand
 * to be 16-byte aligned.
 */
#define _sse_float_vector_load_up(s) \
_ASM ("movaps %0, %%xmm3 \n\t" \
      "movaps %1, %%xmm4 \n\t" \
      "movaps %2, %%xmm5" \
      : /* no outputs */ \
      : "m" ((s).c1), "m" ((s).c2), "m" ((s).c3))
00184
00185
00186
00187
/*
 * Store xmm0, xmm1, xmm2 to (r).c1, (r).c2, (r).c3 with movaps (16 bytes
 * each; the destinations must be 16-byte aligned).
 */
#define _sse_float_vector_store(r) \
_ASM ("movaps %%xmm0, %0 \n\t" \
      "movaps %%xmm1, %1 \n\t" \
      "movaps %%xmm2, %2" \
      : "=m" ((r).c1), "=m" ((r).c2), "=m" ((r).c3))
00196
00197
00198
00199
00200
/*
 * Multiply xmm0, xmm1 and xmm2 lane by lane with the four packed floats
 * stored at c (results stay in xmm0..xmm2).  c is used directly as a 16-byte
 * memory operand of mulps.
 */
#define _sse_float_vector_mul(c) \
_ASM ("mulps %0, %%xmm0 \n\t" \
      "mulps %0, %%xmm1 \n\t" \
      "mulps %0, %%xmm2" \
      : /* no outputs */ \
      : "m" (c))
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239
00240
00241
00242
00243
00244
/*
 * xmm0 += xmm3, xmm1 += xmm4, xmm2 += xmm5 (packed single precision).
 *
 * Two spellings of the same instructions are provided.  When SSE2FIX is
 * defined the register names are written with a single '%'.
 * NOTE(review): presumably needed by compilers that do not un-escape "%%"
 * in an asm statement without operands -- confirm which toolchains
 * require it.
 */
#ifndef SSE2FIX
#define _sse_float_vector_add() \
_ASM ("addps %%xmm3, %%xmm0 \n\t" \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2 \n\t" \
      : /* no outputs */ \
      : /* no inputs */ )
#else
#define _sse_float_vector_add() \
_ASM ("addps %xmm3, %xmm0 \n\t" \
      "addps %xmm4, %xmm1 \n\t" \
      "addps %xmm5, %xmm2 \n\t" \
      : /* no outputs */ \
      : /* no inputs */ )
#endif
00260
00261
00262
00263
00264
00265
/*
 * xmm0 -= xmm3, xmm1 -= xmm4, xmm2 -= xmm5 (packed single precision).
 *
 * Two spellings of the same instructions are provided.  When SSE2FIX is
 * defined the register names are written with a single '%'.
 * NOTE(review): presumably needed by compilers that do not un-escape "%%"
 * in an asm statement without operands -- confirm which toolchains
 * require it.
 */
#ifndef SSE2FIX
#define _sse_float_vector_sub() \
_ASM ("subps %%xmm3, %%xmm0 \n\t" \
      "subps %%xmm4, %%xmm1 \n\t" \
      "subps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : /* no inputs */ )
#else
#define _sse_float_vector_sub() \
_ASM ("subps %xmm3, %xmm0 \n\t" \
      "subps %xmm4, %xmm1 \n\t" \
      "subps %xmm5, %xmm2" \
      : /* no outputs */ \
      : /* no inputs */ )
#endif
00281
00282
00283
00284
00285
/*
 * Multiply xmm3, xmm4, xmm5 by _sse_float_sgn34 (+1,+1,-1,-1) and add them
 * to xmm0, xmm1, xmm2.  Net effect on xmm0..xmm2: the two low lanes are
 * added, the two high lanes are subtracted.  xmm3..xmm5 are left holding
 * their sign-adjusted values.
 */
#define _sse_float_vector_addsub() \
_ASM ("mulps %0, %%xmm3 \n\t"       /* flip the sign of the two high lanes */ \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t"   /* accumulate */ \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : "m" (_sse_float_sgn34))
00296
00297
00298
00299
00300
00301
00302
00303
00304
/*
 * Complex 3x3 matrix times 3-vector, single precision.
 *
 * Input : xmm0, xmm1, xmm2 hold vector components 1, 2, 3; adjacent lanes
 *         are treated as (re, im) pairs, so each register carries two
 *         complex numbers.
 * Output: xmm3 = u11*v1 + u12*v2 + u13*v3
 *         xmm4 = u21*v1 + u22*v2 + u23*v3
 *         xmm5 = u31*v1 + u32*v2 + u33*v3
 *
 * The real parts of u are applied first (scalar broadcast to all lanes with
 * shufps $0x0, then mulps).  The inputs are then turned into i*v (swap re/im
 * with shufps $0xb1, multiply by _sse_float_sgn13 = -1,+1,-1,+1) and the
 * imaginary parts of u are applied the same way.
 *
 * xmm0..xmm2 are overwritten, xmm6/xmm7 are scratch.  Values are passed
 * between the four asm statements in registers and no clobbers are
 * declared, so the statements must stay adjacent.  Each (u).cXY.real() /
 * .imag() is used as a memory operand and must therefore be an lvalue.
 */
#define _sse_float_su3_multiply(u) { \
   /* 1: real parts of u11, u12, u21, u23, u31 into lane 0 */ \
   _ASM ("movss %0, %%xmm3 \n\t" \
         "movss %1, %%xmm6 \n\t" \
         "movss %2, %%xmm4 \n\t" \
         "movss %3, %%xmm7 \n\t" \
         "movss %4, %%xmm5 " \
         : /* no outputs */ \
         : "m" ((u).c11.real()), \
           "m" ((u).c12.real()), \
           "m" ((u).c21.real()), \
           "m" ((u).c23.real()), \
           "m" ((u).c31.real())); \
   /* 2: accumulate every Re(u_ij) * v_j term */ \
   _ASM ("shufps $0x0, %%xmm3, %%xmm3 \n\t" \
         "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm4, %%xmm4 \n\t" \
         "mulps %%xmm0, %%xmm3 \n\t" \
         "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
         "mulps %%xmm1, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm5, %%xmm5 \n\t" \
         "mulps %%xmm0, %%xmm4 \n\t" \
         "addps %%xmm6, %%xmm3 \n\t" \
         "mulps %%xmm2, %%xmm7 \n\t" \
         "mulps %%xmm0, %%xmm5 \n\t" \
         "addps %%xmm7, %%xmm4 \n\t" \
         "movss %0, %%xmm6 \n\t"              /* Re u32 */ \
         "movss %1, %%xmm7 \n\t"              /* Re u13 */ \
         "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
         "mulps %%xmm1, %%xmm6 \n\t" \
         "mulps %%xmm2, %%xmm7 \n\t" \
         "addps %%xmm6, %%xmm5 \n\t" \
         "addps %%xmm7, %%xmm3 \n\t" \
         "movss %2, %%xmm6 \n\t"              /* Re u22 */ \
         "movss %3, %%xmm7 \n\t"              /* Re u33 */ \
         "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
         "mulps %%xmm1, %%xmm6 \n\t" \
         "mulps %%xmm2, %%xmm7 \n\t" \
         "addps %%xmm6, %%xmm4 \n\t" \
         "addps %%xmm7, %%xmm5" \
         : /* no outputs */ \
         : "m" ((u).c32.real()), \
           "m" ((u).c13.real()), \
           "m" ((u).c22.real()), \
           "m" ((u).c33.real())); \
   /* 3: v_j -> i*v_j, then Im u11, u22, u33, u21 terms */ \
   _ASM ("movss %0, %%xmm6 \n\t" \
         "movss %1, %%xmm7 \n\t" \
         "shufps $0xb1, %%xmm0, %%xmm0 \n\t"  /* swap re <-> im */ \
         "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \
         "shufps $0xb1, %%xmm2, %%xmm2 \n\t" \
         "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
         "mulps %4, %%xmm0 \n\t"              /* negate the new real lanes */ \
         "mulps %4, %%xmm1 \n\t" \
         "mulps %4, %%xmm2 \n\t" \
         "mulps %%xmm0, %%xmm6 \n\t" \
         "mulps %%xmm1, %%xmm7 \n\t" \
         "addps %%xmm6, %%xmm3 \n\t" \
         "addps %%xmm7, %%xmm4 \n\t" \
         "movss %2, %%xmm6 \n\t"              /* Im u33 */ \
         "movss %3, %%xmm7 \n\t"              /* Im u21 */ \
         "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
         "mulps %%xmm2, %%xmm6 \n\t" \
         "mulps %%xmm0, %%xmm7 \n\t" \
         "addps %%xmm6, %%xmm5 \n\t" \
         "addps %%xmm7, %%xmm4 " \
         : /* no outputs */ \
         : "m" ((u).c11.imag()), \
           "m" ((u).c22.imag()), \
           "m" ((u).c33.imag()), \
           "m" ((u).c21.imag()), \
           "m" (_sse_float_sgn13)); \
   /* 4: remaining imaginary parts u12, u31, u13, u32, u23 */ \
   _ASM ("movss %0, %%xmm6 \n\t" \
         "movss %1, %%xmm7 \n\t" \
         "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
         "mulps %%xmm1, %%xmm6 \n\t" \
         "mulps %%xmm0, %%xmm7 \n\t" \
         "addps %%xmm6, %%xmm3 \n\t" \
         "addps %%xmm7, %%xmm5 \n\t" \
         "movss %2, %%xmm0 \n\t"              /* xmm0 is free from here on */ \
         "movss %3, %%xmm6 \n\t" \
         "movss %4, %%xmm7 \n\t" \
         "shufps $0x0, %%xmm0, %%xmm0 \n\t" \
         "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
         "mulps %%xmm2, %%xmm0 \n\t" \
         "mulps %%xmm1, %%xmm6 \n\t" \
         "mulps %%xmm2, %%xmm7 \n\t" \
         "addps %%xmm0, %%xmm3 \n\t" \
         "addps %%xmm6, %%xmm5 \n\t" \
         "addps %%xmm7, %%xmm4" \
         : /* no outputs */ \
         : "m" ((u).c12.imag()), \
           "m" ((u).c31.imag()), \
           "m" ((u).c13.imag()), \
           "m" ((u).c32.imag()), \
           "m" ((u).c23.imag())); }
00408
00409
00410
00411
00412
00413
00414
00415
00416
/*
 * Conjugate-transposed complex 3x3 matrix times 3-vector, single precision.
 *
 * Input : xmm0, xmm1, xmm2 hold vector components 1, 2, 3; adjacent lanes
 *         are treated as (re, im) pairs.
 * Output: xmm3 = conj(u11)*v1 + conj(u21)*v2 + conj(u31)*v3
 *         xmm4 = conj(u12)*v1 + conj(u22)*v2 + conj(u32)*v3
 *         xmm5 = conj(u13)*v1 + conj(u23)*v2 + conj(u33)*v3
 *
 * Same scheme as the forward product with the matrix indices transposed;
 * the imaginary parts are applied to -i*v (swap re/im with shufps $0xb1,
 * multiply by _sse_float_sgn24 = +1,-1,+1,-1), which yields the complex
 * conjugate of each entry.
 *
 * xmm0..xmm2 are overwritten, xmm6/xmm7 are scratch.  Values are passed
 * between the four asm statements in registers (xmm6/xmm7 loaded at the end
 * of statement 3 are consumed at the start of statement 4) and no clobbers
 * are declared, so the statements must stay adjacent.
 */
#define _sse_float_su3_inverse_multiply(u) { \
   /* 1: real parts of u11, u21, u12, u32, u13 into lane 0 */ \
   _ASM ("movss %0, %%xmm3 \n\t" \
         "movss %1, %%xmm6 \n\t" \
         "movss %2, %%xmm4 \n\t" \
         "movss %3, %%xmm7 \n\t" \
         "movss %4, %%xmm5 " \
         : /* no outputs */ \
         : "m" ((u).c11.real()), \
           "m" ((u).c21.real()), \
           "m" ((u).c12.real()), \
           "m" ((u).c32.real()), \
           "m" ((u).c13.real())); \
   /* 2: accumulate every Re(u_ji) * v_j term */ \
   _ASM ("shufps $0x0, %%xmm3, %%xmm3 \n\t" \
         "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm4, %%xmm4 \n\t" \
         "mulps %%xmm0, %%xmm3 \n\t" \
         "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
         "mulps %%xmm1, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm5, %%xmm5 \n\t" \
         "mulps %%xmm0, %%xmm4 \n\t" \
         "addps %%xmm6, %%xmm3 \n\t" \
         "mulps %%xmm2, %%xmm7 \n\t" \
         "mulps %%xmm0, %%xmm5 \n\t" \
         "addps %%xmm7, %%xmm4 \n\t" \
         "movss %0, %%xmm6 \n\t"              /* Re u23 */ \
         "movss %1, %%xmm7 \n\t"              /* Re u31 */ \
         "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
         "mulps %%xmm1, %%xmm6 \n\t" \
         "mulps %%xmm2, %%xmm7 \n\t" \
         "addps %%xmm6, %%xmm5 \n\t" \
         "addps %%xmm7, %%xmm3 \n\t" \
         "movss %2, %%xmm6 \n\t"              /* Re u22 */ \
         "movss %3, %%xmm7 \n\t"              /* Re u33 */ \
         "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
         "mulps %%xmm1, %%xmm6 \n\t" \
         "mulps %%xmm2, %%xmm7 \n\t" \
         "addps %%xmm6, %%xmm4 \n\t" \
         "addps %%xmm7, %%xmm5 " \
         : /* no outputs */ \
         : "m" ((u).c23.real()), \
           "m" ((u).c31.real()), \
           "m" ((u).c22.real()), \
           "m" ((u).c33.real())); \
   /* 3: v_j -> -i*v_j, Im u11 and u22 terms; preload Im u33, Im u12 */ \
   _ASM ("movss %0, %%xmm6 \n\t" \
         "movss %1, %%xmm7 \n\t" \
         "shufps $0xb1, %%xmm0, %%xmm0 \n\t"  /* swap re <-> im */ \
         "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \
         "shufps $0xb1, %%xmm2, %%xmm2 \n\t" \
         "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
         "mulps %4, %%xmm0 \n\t"              /* negate the new imaginary lanes */ \
         "mulps %4, %%xmm1 \n\t" \
         "mulps %4, %%xmm2 \n\t" \
         "mulps %%xmm0, %%xmm6 \n\t" \
         "mulps %%xmm1, %%xmm7 \n\t" \
         "addps %%xmm6, %%xmm3 \n\t" \
         "addps %%xmm7, %%xmm4 \n\t" \
         "movss %2, %%xmm6 \n\t"              /* Im u33, used by statement 4 */ \
         "movss %3, %%xmm7 "                  /* Im u12, used by statement 4 */ \
         : /* no outputs */ \
         : "m" ((u).c11.imag()), \
           "m" ((u).c22.imag()), \
           "m" ((u).c33.imag()), \
           "m" ((u).c12.imag()), \
           "m" (_sse_float_sgn24)); \
   /* 4: finish Im u33 / Im u12, then Im u21, u13, u31, u23, u32 */ \
   _ASM ("shufps $0x0, %%xmm6, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
         "mulps %%xmm2, %%xmm6 \n\t" \
         "mulps %%xmm0, %%xmm7 \n\t" \
         "addps %%xmm6, %%xmm5 \n\t" \
         "addps %%xmm7, %%xmm4 \n\t" \
         "movss %0, %%xmm6 \n\t" \
         "movss %1, %%xmm7 \n\t" \
         "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
         "mulps %%xmm1, %%xmm6 \n\t" \
         "mulps %%xmm0, %%xmm7 \n\t" \
         "addps %%xmm6, %%xmm3 \n\t" \
         "addps %%xmm7, %%xmm5 \n\t" \
         "movss %2, %%xmm0 \n\t"              /* xmm0 is free from here on */ \
         "movss %3, %%xmm6 \n\t" \
         "movss %4, %%xmm7 \n\t" \
         "shufps $0x0, %%xmm0, %%xmm0 \n\t" \
         "shufps $0x0, %%xmm6, %%xmm6 \n\t" \
         "shufps $0x0, %%xmm7, %%xmm7 \n\t" \
         "mulps %%xmm2, %%xmm0 \n\t" \
         "mulps %%xmm1, %%xmm6 \n\t" \
         "mulps %%xmm2, %%xmm7 \n\t" \
         "addps %%xmm0, %%xmm3 \n\t" \
         "addps %%xmm6, %%xmm5 \n\t" \
         "addps %%xmm7, %%xmm4" \
         : /* no outputs */ \
         : "m" ((u).c21.imag()), \
           "m" ((u).c13.imag()), \
           "m" ((u).c31.imag()), \
           "m" ((u).c23.imag()), \
           "m" ((u).c32.imag())); }
00520
00521
00522
00523
00524
00525
00526
00527
00528
00529
00530
00531
00532
/*
 * Multiply xmm3, xmm4, xmm5 by _sse_float_sgn12 (-1,-1,+1,+1) and add them
 * to xmm0, xmm1, xmm2.  Net effect on xmm0..xmm2: the two low lanes are
 * subtracted, the two high lanes are added.  xmm3..xmm5 are left holding
 * their sign-adjusted values.
 */
#define _sse_float_vector_subadd() \
_ASM ("mulps %0, %%xmm3 \n\t"       /* flip the sign of the two low lanes */ \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t"   /* accumulate */ \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : "m" (_sse_float_sgn12))
00543
00544
00545
00546
00547
/*
 * xmm0..xmm2 += i * xmm3..xmm5, treating adjacent lanes as (re, im) pairs:
 * shufps $0xb1 swaps the two lanes of each pair and _sse_float_sgn13
 * (-1,+1,-1,+1) negates the new real lanes, turning (re, im) into
 * (-im, re).  xmm3..xmm5 are modified in place.
 */
#define _sse_float_vector_i_add() \
_ASM ("shufps $0xb1, %%xmm3, %%xmm3 \n\t"   /* swap re <-> im */ \
      "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \
      "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \
      "mulps %0, %%xmm3 \n\t"               /* apply the sign pattern */ \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t"           /* accumulate */ \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : "m" (_sse_float_sgn13))
00561
00562
00563
00564
00565
/*
 * xmm0..xmm2 -= i * xmm3..xmm5, treating adjacent lanes as (re, im) pairs:
 * shufps $0xb1 swaps the two lanes of each pair and _sse_float_sgn24
 * (+1,-1,+1,-1) negates the new imaginary lanes, turning (re, im) into
 * (im, -re).  xmm3..xmm5 are modified in place.
 */
#define _sse_float_vector_i_sub() \
_ASM ("shufps $0xb1, %%xmm3, %%xmm3 \n\t"   /* swap re <-> im */ \
      "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \
      "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \
      "mulps %0, %%xmm3 \n\t"               /* apply the sign pattern */ \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t"           /* accumulate */ \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : "m" (_sse_float_sgn24))
00579
00580
00581
00582
00583
00584
/*
 * Reverse the four lanes of xmm3, xmm4, xmm5 (shufps $0x1b), multiply by
 * _sse_float_sgn13 (-1,+1,-1,+1) and add to xmm0, xmm1, xmm2.  With lanes
 * read as two (re, im) pairs z_lo, z_hi this adds (i*z_hi, i*z_lo), i.e.
 * the two halves are exchanged and multiplied by i.  xmm3..xmm5 are
 * modified in place.
 */
#define _sse_float_vector_xch_i_add() \
_ASM ("shufps $0x1b, %%xmm3, %%xmm3 \n\t"   /* lanes 0,1,2,3 -> 3,2,1,0 */ \
      "shufps $0x1b, %%xmm4, %%xmm4 \n\t" \
      "shufps $0x1b, %%xmm5, %%xmm5 \n\t" \
      "mulps %0, %%xmm3 \n\t"               /* apply the sign pattern */ \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t"           /* accumulate */ \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : "m" (_sse_float_sgn13))
00598
00599
00600
00601
00602
00603
/*
 * Reverse the four lanes of xmm3, xmm4, xmm5 (shufps $0x1b), multiply by
 * _sse_float_sgn24 (+1,-1,+1,-1) and add to xmm0, xmm1, xmm2.  With lanes
 * read as two (re, im) pairs z_lo, z_hi this adds (-i*z_hi, -i*z_lo), i.e.
 * the two halves are exchanged and multiplied by -i.  xmm3..xmm5 are
 * modified in place.
 */
#define _sse_float_vector_xch_i_sub() \
_ASM ("shufps $0x1b, %%xmm3, %%xmm3 \n\t"   /* lanes 0,1,2,3 -> 3,2,1,0 */ \
      "shufps $0x1b, %%xmm4, %%xmm4 \n\t" \
      "shufps $0x1b, %%xmm5, %%xmm5 \n\t" \
      "mulps %0, %%xmm3 \n\t"               /* apply the sign pattern */ \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t"           /* accumulate */ \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : "m" (_sse_float_sgn24))
00617
00618
00619
00620
00621
00622
/*
 * Swap the lanes of each (re, im) pair in xmm3, xmm4, xmm5 (shufps $0xb1),
 * multiply by _sse_float_sgn14 (-1,+1,+1,-1) and add to xmm0, xmm1, xmm2.
 * With lanes read as two pairs z_lo, z_hi this adds (i*z_lo, -i*z_hi).
 * xmm3..xmm5 are modified in place.
 */
#define _sse_float_vector_i_addsub() \
_ASM ("shufps $0xb1, %%xmm3, %%xmm3 \n\t"   /* swap re <-> im */ \
      "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \
      "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \
      "mulps %0, %%xmm3 \n\t"               /* apply the sign pattern */ \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t"           /* accumulate */ \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : "m" (_sse_float_sgn14))
00636
00637
00638
00639
00640
00641
/*
 * Swap the lanes of each (re, im) pair in xmm3, xmm4, xmm5 (shufps $0xb1),
 * multiply by _sse_float_sgn23 (+1,-1,-1,+1) and add to xmm0, xmm1, xmm2.
 * With lanes read as two pairs z_lo, z_hi this adds (-i*z_lo, i*z_hi).
 * xmm3..xmm5 are modified in place.
 */
#define _sse_float_vector_i_subadd() \
_ASM ("shufps $0xb1, %%xmm3, %%xmm3 \n\t"   /* swap re <-> im */ \
      "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \
      "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \
      "mulps %0, %%xmm3 \n\t"               /* apply the sign pattern */ \
      "mulps %0, %%xmm4 \n\t" \
      "mulps %0, %%xmm5 \n\t" \
      "addps %%xmm3, %%xmm0 \n\t"           /* accumulate */ \
      "addps %%xmm4, %%xmm1 \n\t" \
      "addps %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : "m" (_sse_float_sgn23))
00655
00656
00657
00658
00659
/*
 * Exchange the low and high 64-bit halves of xmm3, xmm4 and xmm5
 * (shufps $0x4e maps lanes 0,1,2,3 to 2,3,0,1).
 *
 * Two spellings of the same instructions are provided.  When SSE2FIX is
 * defined the register names are written with a single '%'.
 * NOTE(review): presumably needed by compilers that do not un-escape "%%"
 * in an asm statement without operands -- confirm which toolchains
 * require it.
 */
#ifndef SSE2FIX
#define _sse_float_vector_xch() \
_ASM ("shufps $0x4e, %%xmm3, %%xmm3 \n\t" \
      "shufps $0x4e, %%xmm4, %%xmm4 \n\t" \
      "shufps $0x4e, %%xmm5, %%xmm5" \
      : /* no outputs */ \
      : /* no inputs */ )
#else
#define _sse_float_vector_xch() \
_ASM ("shufps $0x4e, %xmm3, %xmm3 \n\t" \
      "shufps $0x4e, %xmm4, %xmm4 \n\t" \
      "shufps $0x4e, %xmm5, %xmm5" \
      : /* no outputs */ \
      : /* no inputs */ )
#endif
00675
00676
00677
00678
00679
00680
00681
/* Issue a single prefetcht0 hint for the object addr points to. */
#define _sse_double_prefetch_16(addr) \
_ASM ("prefetcht0 %0" \
      : /* no outputs */ \
      : "m" (*(addr)))
00686
/*
 * Issue two prefetcht0 hints: one for the address obtained by rounding addr
 * down to a multiple of 128, and one for the address 128 bytes after that.
 *
 * The rounding is done in __SIZE_TYPE__ (predefined by GCC-compatible
 * compilers, as wide as a pointer on x86 and x86-64) instead of
 * `unsigned int`, so the upper address bits are not discarded on 64-bit
 * targets.
 */
#define _sse_double_prefetch_spinor(addr) \
_ASM ("prefetcht0 %0 \n\t" \
      "prefetcht0 %1" \
      : /* no outputs */ \
      : "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f)))), \
        "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f))+128)))
00694
/*
 * Issue two prefetchnta (non-temporal) hints: one for the address obtained
 * by rounding addr down to a multiple of 128, and one for the address
 * 128 bytes after that.
 *
 * The rounding is done in __SIZE_TYPE__ (predefined by GCC-compatible
 * compilers, as wide as a pointer on x86 and x86-64) instead of
 * `unsigned int`, so the upper address bits are not discarded on 64-bit
 * targets.
 */
#define _sse_double_prefetch_nta_spinor(addr) \
_ASM ("prefetchnta %0 \n\t" \
      "prefetchnta %1" \
      : /* no outputs */ \
      : "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f)))), \
        "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f))+128)))
00702
/*
 * Issue two prefetcht0 hints: one for the address obtained by rounding addr
 * down to a multiple of 128, and one for the address 128 bytes after that.
 *
 * The rounding is done in __SIZE_TYPE__ (predefined by GCC-compatible
 * compilers, as wide as a pointer on x86 and x86-64) instead of
 * `unsigned int`, so the upper address bits are not discarded on 64-bit
 * targets.
 */
#define _sse_double_prefetch_su3(addr) \
_ASM ("prefetcht0 %0 \n\t" \
      "prefetcht0 %1" \
      : /* no outputs */ \
      : "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f)))), \
        "m" (*(((char*)(((__SIZE_TYPE__)(addr))&~(__SIZE_TYPE__)0x7f))+128)))
00710
00711
00712
00713
00714
00715
00716
00717
00718
00719
00720
00721
/*
 * Load (s).c1, (s).c2, (s).c3 into xmm0, xmm1, xmm2 with movapd, i.e. as
 * three 16-byte packed-double values (two doubles each).  movapd requires
 * 16-byte aligned memory operands.
 */
#define _sse_double_load(s) \
_ASM ("movapd %0, %%xmm0 \n\t" \
      "movapd %1, %%xmm1 \n\t" \
      "movapd %2, %%xmm2" \
      : /* no outputs */ \
      : "m" ((s).c1), "m" ((s).c2), "m" ((s).c3))
00731
/*
 * Load three separately named 16-byte objects c1, c2, c3 into xmm0, xmm1,
 * xmm2 with movapd (16-byte aligned memory operands required).
 */
#define _sse_double_load_123(c1, c2, c3) \
_ASM ("movapd %0, %%xmm0 \n\t" \
      "movapd %1, %%xmm1 \n\t" \
      "movapd %2, %%xmm2" \
      : /* no outputs */ \
      : "m" (c1), "m" (c2), "m" (c3))
00741
00742
00743
00744
00745
00746
/*
 * Load (s).c1, (s).c2, (s).c3 into xmm3, xmm4, xmm5 with movapd, i.e. as
 * three 16-byte packed-double values (two doubles each).  movapd requires
 * 16-byte aligned memory operands.
 */
#define _sse_double_load_up(s) \
_ASM ("movapd %0, %%xmm3 \n\t" \
      "movapd %1, %%xmm4 \n\t" \
      "movapd %2, %%xmm5" \
      : /* no outputs */ \
      : "m" ((s).c1), "m" ((s).c2), "m" ((s).c3))
00756
/*
 * Load three separately named 16-byte objects c1, c2, c3 into xmm3, xmm4,
 * xmm5 with movapd (16-byte aligned memory operands required).
 */
#define _sse_double_load_up_123(c1, c2, c3) \
_ASM ("movapd %0, %%xmm3 \n\t" \
      "movapd %1, %%xmm4 \n\t" \
      "movapd %2, %%xmm5" \
      : /* no outputs */ \
      : "m" (c1), "m" (c2), "m" (c3))
00766
00767
00768
00769
00770
/*
 * Store xmm0, xmm1, xmm2 to (r).c1, (r).c2, (r).c3 with movapd (16 bytes
 * each; the destinations must be 16-byte aligned).
 */
#define _sse_double_store(r) \
_ASM ("movapd %%xmm0, %0 \n\t" \
      "movapd %%xmm1, %1 \n\t" \
      "movapd %%xmm2, %2" \
      : "=m" ((r).c1), "=m" ((r).c2), "=m" ((r).c3))
00779
/*
 * Store xmm0, xmm1, xmm2 to three separately named 16-byte objects c1, c2,
 * c3 with movapd (the destinations must be 16-byte aligned).
 */
#define _sse_double_store_123(c1, c2, c3) \
_ASM ("movapd %%xmm0, %0 \n\t" \
      "movapd %%xmm1, %1 \n\t" \
      "movapd %%xmm2, %2" \
      : "=m" (c1), "=m" (c2), "=m" (c3))
00788
00789
00790
00791
00792
/*
 * Store xmm3, xmm4, xmm5 to (r).c1, (r).c2, (r).c3 with movapd (16 bytes
 * each; the destinations must be 16-byte aligned).
 */
#define _sse_double_store_up(r) \
_ASM ("movapd %%xmm3, %0 \n\t" \
      "movapd %%xmm4, %1 \n\t" \
      "movapd %%xmm5, %2" \
      : "=m" ((r).c1), "=m" ((r).c2), "=m" ((r).c3))
00801
/*
 * Store xmm3, xmm4, xmm5 to three separately named 16-byte objects c1, c2,
 * c3 with movapd (the destinations must be 16-byte aligned).
 */
#define _sse_double_store_up_123(c1, c2, c3) \
_ASM ("movapd %%xmm3, %0 \n\t" \
      "movapd %%xmm4, %1 \n\t" \
      "movapd %%xmm5, %2" \
      : "=m" (c1), "=m" (c2), "=m" (c3))
00810
00811
00812
00813
00814
/*
 * Multiply xmm0, xmm1 and xmm2 lane by lane with the two packed doubles
 * stored at c (results stay in xmm0..xmm2).  c is used directly as a 16-byte
 * memory operand of mulpd.
 */
#define _sse_double_vector_mul(c) \
_ASM ("mulpd %0, %%xmm0 \n\t" \
      "mulpd %0, %%xmm1 \n\t" \
      "mulpd %0, %%xmm2" \
      : /* no outputs */ \
      : "m" (c))
00822
00823
00824
00825
00826
00827
00828
00829
00830
00831
00832
00833
00834
00835
00836
00837
00838
00839
00840
00841
00842
00843
00844
00845
00846
00847
00848
00849
00850
00851
00852
00853
00854
00855
/*
 * For each of xmm0, xmm1, xmm2 (call the value v = (v_lo, v_hi)) compute,
 * into xmm3, xmm4, xmm5 respectively,
 *
 *    ( x_lo*v_lo - y_hi*v_hi ,  x_hi*v_hi + y_lo*v_lo )
 *
 * where x and y are 16-byte memory operands holding two doubles each.
 * xmm0..xmm2 are overwritten with the lane-wise product x*v.
 * NOTE(review): if x holds the same real value in both lanes and y the same
 * imaginary value in both lanes, this is the complex product (x + i*y) * v
 * with v read as (re, im) -- confirm that callers pass x and y that way.
 */
#define _sse_double_vector_mul_complex(x,y) \
_ASM ("movapd %%xmm0, %%xmm3 \n\t"         /* copy the inputs */ \
      "movapd %%xmm1, %%xmm4 \n\t" \
      "movapd %%xmm2, %%xmm5 \n\t" \
      "mulpd %1, %%xmm3 \n\t"              /* copies *= y */ \
      "mulpd %1, %%xmm4 \n\t" \
      "mulpd %1, %%xmm5 \n\t" \
      "shufpd $0x1, %%xmm3, %%xmm3 \n\t"   /* swap low and high double */ \
      "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \
      "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \
      "xorpd %2, %%xmm3 \n\t"              /* negate the low double */ \
      "xorpd %2, %%xmm4 \n\t" \
      "xorpd %2, %%xmm5 \n\t" \
      "mulpd %0, %%xmm0 \n\t"              /* inputs *= x */ \
      "mulpd %0, %%xmm1 \n\t" \
      "mulpd %0, %%xmm2 \n\t" \
      "addpd %%xmm0, %%xmm3 \n\t"          /* combine both parts */ \
      "addpd %%xmm1, %%xmm4 \n\t" \
      "addpd %%xmm2, %%xmm5" \
      : /* no outputs */ \
      : "m" (x), "m" (y), "m" (_sse_double_sgn))
00880
00881
00882
00883
00884
/*
 * xmm0 += xmm3, xmm1 += xmm4, xmm2 += xmm5 (packed double precision).
 *
 * Two spellings of the same instructions are provided.  When SSE2FIX is
 * defined the register names are written with a single '%'.
 * NOTE(review): presumably needed by compilers that do not un-escape "%%"
 * in an asm statement without operands -- confirm which toolchains
 * require it.
 */
#ifndef SSE2FIX
#define _sse_double_vector_add() \
_ASM ("addpd %%xmm3, %%xmm0 \n\t" \
      "addpd %%xmm4, %%xmm1 \n\t" \
      "addpd %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : /* no inputs */ )
#else
#define _sse_double_vector_add() \
_ASM ("addpd %xmm3, %xmm0 \n\t" \
      "addpd %xmm4, %xmm1 \n\t" \
      "addpd %xmm5, %xmm2" \
      : /* no outputs */ \
      : /* no inputs */ )
#endif
00900
00901
00902
00903
00904
/*
 * xmm0 -= xmm3, xmm1 -= xmm4, xmm2 -= xmm5 (packed double precision).
 *
 * Two spellings of the same instructions are provided.  When SSE2FIX is
 * defined the register names are written with a single '%'.
 * NOTE(review): presumably needed by compilers that do not un-escape "%%"
 * in an asm statement without operands -- confirm which toolchains
 * require it.
 */
#ifndef SSE2FIX
#define _sse_double_vector_sub() \
_ASM ("subpd %%xmm3, %%xmm0 \n\t" \
      "subpd %%xmm4, %%xmm1 \n\t" \
      "subpd %%xmm5, %%xmm2" \
      : /* no outputs */ \
      : /* no inputs */ )
#else
#define _sse_double_vector_sub() \
_ASM ("subpd %xmm3, %xmm0 \n\t" \
      "subpd %xmm4, %xmm1 \n\t" \
      "subpd %xmm5, %xmm2" \
      : /* no outputs */ \
      : /* no inputs */ )
#endif
00920
00921
00922
00923
00924
00925
00926
00927
/*
 * Complex 3x3 matrix times 3-vector, double precision.
 *
 * Input : xmm0, xmm1, xmm2 hold vector components 1, 2, 3, each read as one
 *         complex number (low double = re, high double = im).
 * Output: xmm3 = u11*v1 + u12*v2 + u13*v3
 *         xmm4 = u21*v1 + u22*v2 + u23*v3
 *         xmm5 = u31*v1 + u32*v2 + u33*v3
 *
 * The real parts of u are applied first (scalar duplicated into both lanes
 * with unpcklpd, then mulpd).  The inputs are then turned into i*v (swap
 * with shufpd $0x1, negate the low double by xorpd with _sse_double_sgn)
 * and the imaginary parts of u are applied the same way.
 *
 * xmm0..xmm2 are overwritten, xmm6/xmm7 are scratch.  Values are passed
 * between the four asm statements in registers (xmm6/xmm7 loaded at the end
 * of statement 3 are consumed at the start of statement 4) and no clobbers
 * are declared, so the statements must stay adjacent.  Each
 * (u).cXY.real() / .imag() is used as a memory operand and must therefore
 * be an lvalue.
 */
#define _sse_double_su3_multiply(u) { \
   /* 1: real parts of u11, u12, u21, u23, u31 into the low double */ \
   _ASM ("movsd %0, %%xmm3 \n\t" \
         "movsd %1, %%xmm6 \n\t" \
         "movsd %2, %%xmm4 \n\t" \
         "movsd %3, %%xmm7 \n\t" \
         "movsd %4, %%xmm5 " \
         : /* no outputs */ \
         : "m" ((u).c11.real()), \
           "m" ((u).c12.real()), \
           "m" ((u).c21.real()), \
           "m" ((u).c23.real()), \
           "m" ((u).c31.real())); \
   /* 2: accumulate every Re(u_ij) * v_j term */ \
   _ASM ("unpcklpd %%xmm3, %%xmm3 \n\t" \
         "unpcklpd %%xmm6, %%xmm6 \n\t" \
         "unpcklpd %%xmm4, %%xmm4 \n\t" \
         "mulpd %%xmm0, %%xmm3 \n\t" \
         "unpcklpd %%xmm7, %%xmm7 \n\t" \
         "mulpd %%xmm1, %%xmm6 \n\t" \
         "unpcklpd %%xmm5, %%xmm5 \n\t" \
         "mulpd %%xmm0, %%xmm4 \n\t" \
         "addpd %%xmm6, %%xmm3 \n\t" \
         "mulpd %%xmm2, %%xmm7 \n\t" \
         "mulpd %%xmm0, %%xmm5 \n\t" \
         "addpd %%xmm7, %%xmm4 \n\t" \
         "movsd %0, %%xmm6 \n\t"              /* Re u32 */ \
         "movsd %1, %%xmm7 \n\t"              /* Re u13 */ \
         "unpcklpd %%xmm6, %%xmm6 \n\t" \
         "unpcklpd %%xmm7, %%xmm7 \n\t" \
         "mulpd %%xmm1, %%xmm6 \n\t" \
         "mulpd %%xmm2, %%xmm7 \n\t" \
         "addpd %%xmm6, %%xmm5 \n\t" \
         "addpd %%xmm7, %%xmm3 \n\t" \
         "movsd %2, %%xmm6 \n\t"              /* Re u22 */ \
         "movsd %3, %%xmm7 \n\t"              /* Re u33 */ \
         "unpcklpd %%xmm6, %%xmm6 \n\t" \
         "unpcklpd %%xmm7, %%xmm7 \n\t" \
         "mulpd %%xmm1, %%xmm6 \n\t" \
         "mulpd %%xmm2, %%xmm7 \n\t" \
         "addpd %%xmm6, %%xmm4 \n\t" \
         "addpd %%xmm7, %%xmm5 " \
         : /* no outputs */ \
         : "m" ((u).c32.real()), \
           "m" ((u).c13.real()), \
           "m" ((u).c22.real()), \
           "m" ((u).c33.real())); \
   /* 3: v_j -> i*v_j, Im u11 and u22 terms; preload Im u33, Im u21 */ \
   _ASM ("movsd %0, %%xmm6 \n\t" \
         "movsd %1, %%xmm7 \n\t" \
         "shufpd $0x1, %%xmm0, %%xmm0 \n\t"   /* swap re <-> im */ \
         "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \
         "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \
         "unpcklpd %%xmm6, %%xmm6 \n\t" \
         "unpcklpd %%xmm7, %%xmm7 \n\t" \
         "xorpd %4, %%xmm0 \n\t"              /* negate the low double */ \
         "xorpd %4, %%xmm1 \n\t" \
         "xorpd %4, %%xmm2 \n\t" \
         "mulpd %%xmm0, %%xmm6 \n\t" \
         "mulpd %%xmm1, %%xmm7 \n\t" \
         "addpd %%xmm6, %%xmm3 \n\t" \
         "addpd %%xmm7, %%xmm4 \n\t" \
         "movsd %2, %%xmm6 \n\t"              /* Im u33, used by statement 4 */ \
         "movsd %3, %%xmm7 "                  /* Im u21, used by statement 4 */ \
         : /* no outputs */ \
         : "m" ((u).c11.imag()), \
           "m" ((u).c22.imag()), \
           "m" ((u).c33.imag()), \
           "m" ((u).c21.imag()), \
           "m" (_sse_double_sgn)); \
   /* 4: finish Im u33 / Im u21, then Im u12, u31, u13, u32, u23 */ \
   _ASM ("unpcklpd %%xmm6, %%xmm6 \n\t" \
         "unpcklpd %%xmm7, %%xmm7 \n\t" \
         "mulpd %%xmm2, %%xmm6 \n\t" \
         "mulpd %%xmm0, %%xmm7 \n\t" \
         "addpd %%xmm6, %%xmm5 \n\t" \
         "addpd %%xmm7, %%xmm4 \n\t" \
         "movsd %0, %%xmm6 \n\t" \
         "movsd %1, %%xmm7 \n\t" \
         "unpcklpd %%xmm6, %%xmm6 \n\t" \
         "unpcklpd %%xmm7, %%xmm7 \n\t" \
         "mulpd %%xmm1, %%xmm6 \n\t" \
         "mulpd %%xmm0, %%xmm7 \n\t" \
         "addpd %%xmm6, %%xmm3 \n\t" \
         "addpd %%xmm7, %%xmm5 \n\t" \
         "movsd %2, %%xmm0 \n\t"              /* xmm0 is free from here on */ \
         "movsd %3, %%xmm6 \n\t" \
         "movsd %4, %%xmm7 \n\t" \
         "unpcklpd %%xmm0, %%xmm0 \n\t" \
         "unpcklpd %%xmm6, %%xmm6 \n\t" \
         "unpcklpd %%xmm7, %%xmm7 \n\t" \
         "mulpd %%xmm2, %%xmm0 \n\t" \
         "mulpd %%xmm1, %%xmm6 \n\t" \
         "mulpd %%xmm2, %%xmm7 \n\t" \
         "addpd %%xmm0, %%xmm3 \n\t" \
         "addpd %%xmm6, %%xmm5 \n\t" \
         "addpd %%xmm7, %%xmm4 " \
         : /* no outputs */ \
         : "m" ((u).c12.imag()), \
           "m" ((u).c31.imag()), \
           "m" ((u).c13.imag()), \
           "m" ((u).c32.imag()), \
           "m" ((u).c23.imag())); }
01031
01032
01033
01034
01035
01036
01037
01038
01039
/*
 * Conjugate-transposed complex 3x3 matrix times 3-vector, double precision.
 *
 * Input : xmm0, xmm1, xmm2 hold vector components 1, 2, 3, each read as one
 *         complex number (low double = re, high double = im).
 * Output: xmm3 = conj(u11)*v1 + conj(u21)*v2 + conj(u31)*v3
 *         xmm4 = conj(u12)*v1 + conj(u22)*v2 + conj(u32)*v3
 *         xmm5 = conj(u13)*v1 + conj(u23)*v2 + conj(u33)*v3
 *
 * Same scheme as the forward product with the matrix indices transposed;
 * the imaginary parts are applied to -i*v (negate the low double by xorpd
 * with _sse_double_sgn first, then swap with shufpd $0x1), which yields the
 * complex conjugate of each entry.
 *
 * xmm0..xmm2 are overwritten, xmm6/xmm7 are scratch.  Values are passed
 * between the four asm statements in registers (xmm6/xmm7 loaded at the end
 * of statement 3 are consumed at the start of statement 4) and no clobbers
 * are declared, so the statements must stay adjacent.
 */
#define _sse_double_su3_inverse_multiply(u) { \
   /* 1: real parts of u11, u21, u12, u32, u13 into the low double */ \
   _ASM ("movsd %0, %%xmm3 \n\t" \
         "movsd %1, %%xmm6 \n\t" \
         "movsd %2, %%xmm4 \n\t" \
         "movsd %3, %%xmm7 \n\t" \
         "movsd %4, %%xmm5 " \
         : /* no outputs */ \
         : "m" ((u).c11.real()), \
           "m" ((u).c21.real()), \
           "m" ((u).c12.real()), \
           "m" ((u).c32.real()), \
           "m" ((u).c13.real())); \
   /* 2: accumulate every Re(u_ji) * v_j term */ \
   _ASM ("unpcklpd %%xmm3, %%xmm3 \n\t" \
         "unpcklpd %%xmm6, %%xmm6 \n\t" \
         "unpcklpd %%xmm4, %%xmm4 \n\t" \
         "mulpd %%xmm0, %%xmm3 \n\t" \
         "unpcklpd %%xmm7, %%xmm7 \n\t" \
         "mulpd %%xmm1, %%xmm6 \n\t" \
         "unpcklpd %%xmm5, %%xmm5 \n\t" \
         "mulpd %%xmm0, %%xmm4 \n\t" \
         "addpd %%xmm6, %%xmm3 \n\t" \
         "mulpd %%xmm2, %%xmm7 \n\t" \
         "mulpd %%xmm0, %%xmm5 \n\t" \
         "addpd %%xmm7, %%xmm4 \n\t" \
         "movsd %0, %%xmm6 \n\t"              /* Re u23 */ \
         "movsd %1, %%xmm7 \n\t"              /* Re u31 */ \
         "unpcklpd %%xmm6, %%xmm6 \n\t" \
         "unpcklpd %%xmm7, %%xmm7 \n\t" \
         "mulpd %%xmm1, %%xmm6 \n\t" \
         "mulpd %%xmm2, %%xmm7 \n\t" \
         "addpd %%xmm6, %%xmm5 \n\t" \
         "addpd %%xmm7, %%xmm3 \n\t" \
         "movsd %2, %%xmm6 \n\t"              /* Re u22 */ \
         "movsd %3, %%xmm7 \n\t"              /* Re u33 */ \
         "unpcklpd %%xmm6, %%xmm6 \n\t" \
         "unpcklpd %%xmm7, %%xmm7 \n\t" \
         "mulpd %%xmm1, %%xmm6 \n\t" \
         "mulpd %%xmm2, %%xmm7 \n\t" \
         "addpd %%xmm6, %%xmm4 \n\t" \
         "addpd %%xmm7, %%xmm5" \
         : /* no outputs */ \
         : "m" ((u).c23.real()), \
           "m" ((u).c31.real()), \
           "m" ((u).c22.real()), \
           "m" ((u).c33.real())); \
   /* 3: v_j -> -i*v_j, Im u11 and u22 terms; preload Im u33, Im u12 */ \
   _ASM ("movsd %0, %%xmm6 \n\t" \
         "movsd %1, %%xmm7 \n\t" \
         "xorpd %4, %%xmm0 \n\t"              /* negate the low double */ \
         "xorpd %4, %%xmm1 \n\t" \
         "xorpd %4, %%xmm2 \n\t" \
         "unpcklpd %%xmm6, %%xmm6 \n\t" \
         "unpcklpd %%xmm7, %%xmm7 \n\t" \
         "shufpd $0x1, %%xmm0, %%xmm0 \n\t"   /* swap re <-> im */ \
         "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \
         "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \
         "mulpd %%xmm0, %%xmm6 \n\t" \
         "mulpd %%xmm1, %%xmm7 \n\t" \
         "addpd %%xmm6, %%xmm3 \n\t" \
         "addpd %%xmm7, %%xmm4 \n\t" \
         "movsd %2, %%xmm6 \n\t"              /* Im u33, used by statement 4 */ \
         "movsd %3, %%xmm7 "                  /* Im u12, used by statement 4 */ \
         : /* no outputs */ \
         : "m" ((u).c11.imag()), \
           "m" ((u).c22.imag()), \
           "m" ((u).c33.imag()), \
           "m" ((u).c12.imag()), \
           "m" (_sse_double_sgn)); \
   /* 4: finish Im u33 / Im u12, then Im u21, u13, u31, u23, u32 */ \
   _ASM ("unpcklpd %%xmm6, %%xmm6 \n\t" \
         "unpcklpd %%xmm7, %%xmm7 \n\t" \
         "mulpd %%xmm2, %%xmm6 \n\t" \
         "mulpd %%xmm0, %%xmm7 \n\t" \
         "addpd %%xmm6, %%xmm5 \n\t" \
         "addpd %%xmm7, %%xmm4 \n\t" \
         "movsd %0, %%xmm6 \n\t" \
         "movsd %1, %%xmm7 \n\t" \
         "unpcklpd %%xmm6, %%xmm6 \n\t" \
         "unpcklpd %%xmm7, %%xmm7 \n\t" \
         "mulpd %%xmm1, %%xmm6 \n\t" \
         "mulpd %%xmm0, %%xmm7 \n\t" \
         "addpd %%xmm6, %%xmm3 \n\t" \
         "addpd %%xmm7, %%xmm5 \n\t" \
         "movsd %2, %%xmm0 \n\t"              /* xmm0 is free from here on */ \
         "movsd %3, %%xmm6 \n\t" \
         "movsd %4, %%xmm7 \n\t" \
         "unpcklpd %%xmm0, %%xmm0 \n\t" \
         "unpcklpd %%xmm6, %%xmm6 \n\t" \
         "unpcklpd %%xmm7, %%xmm7 \n\t" \
         "mulpd %%xmm2, %%xmm0 \n\t" \
         "mulpd %%xmm1, %%xmm6 \n\t" \
         "mulpd %%xmm2, %%xmm7 \n\t" \
         "addpd %%xmm0, %%xmm3 \n\t" \
         "addpd %%xmm6, %%xmm5 \n\t" \
         "addpd %%xmm7, %%xmm4 " \
         : /* no outputs */ \
         : "m" ((u).c21.imag()), \
           "m" ((u).c13.imag()), \
           "m" ((u).c31.imag()), \
           "m" ((u).c23.imag()), \
           "m" ((u).c32.imag())); }
01143
01144
01145
01146
01147
01148
01149
01150
01151
01152
01153
/*
 * Multiply xmm3, xmm4, xmm5 by i in place, reading each register as one
 * complex number (low double = re, high double = im): shufpd $0x1 swaps the
 * two doubles and the xorpd with _sse_double_sgn then negates the low one,
 * so (re, im) becomes (-im, re).
 */
#define _sse_double_vector_i_mul() \
_ASM ("shufpd $0x1, %%xmm3, %%xmm3 \n\t"   /* swap re <-> im */ \
      "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \
      "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \
      "xorpd %0, %%xmm3 \n\t"              /* negate the low double */ \
      "xorpd %0, %%xmm4 \n\t" \
      "xorpd %0, %%xmm5" \
      : /* no outputs */ \
      : "m" (_sse_double_sgn))
01164
01165
01166
01167
01168
/*
 * Multiply xmm3, xmm4, xmm5 by -i in place, reading each register as one
 * complex number (low double = re, high double = im): the xorpd with
 * _sse_double_sgn negates the low double and shufpd $0x1 then swaps the two
 * doubles, so (re, im) becomes (im, -re).
 */
#define _sse_double_vector_minus_i_mul() \
_ASM ("xorpd %0, %%xmm3 \n\t"              /* negate the low double */ \
      "xorpd %0, %%xmm4 \n\t" \
      "xorpd %0, %%xmm5 \n\t" \
      "shufpd $0x1, %%xmm3, %%xmm3 \n\t"   /* swap re <-> im */ \
      "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \
      "shufpd $0x1, %%xmm5, %%xmm5" \
      : /* no outputs */ \
      : "m" (_sse_double_sgn))
01179
01180
01181
01182
01183
01184
01185
01186
01187
01188
01189
01190
/*
 * Accumulate lane-wise sums of squares: reads the eight objects r[0]..r[7]
 * with movapd (each is treated as two packed doubles and must be 16-byte
 * aligned), squares every lane, adds the eight results together and adds
 * that to the two doubles stored at c:
 *
 *    c.low  += sum_k r[k].low^2      c.high += sum_k r[k].high^2
 *
 * r is used as *((r)+k), so it is presumably a pointer to 16-byte elements
 * -- confirm against the callers.  xmm0..xmm7 are all overwritten.
 *
 * c is read before it is written, so it is declared as a read-write ("+m")
 * operand.  With a write-only "=m" constraint the compiler is entitled to
 * assume the old contents of c are not needed and may drop or reorder the
 * store that produced them.
 */
#define _sse_double_add_norm_square_16(r,c) { \
   /* first four elements */ \
   _ASM ("movapd %0, %%xmm0 \n\t" \
         "movapd %1, %%xmm1 \n\t" \
         "movapd %2, %%xmm2 \n\t" \
         "movapd %3, %%xmm3" \
         : /* no outputs */ \
         : "m" (*((r))), \
           "m" (*((r)+1)), \
           "m" (*((r)+2)), \
           "m" (*((r)+3))); \
   /* last four elements, square everything, reduce into xmm7 */ \
   _ASM ("movapd %0, %%xmm4 \n\t" \
         "movapd %1, %%xmm5 \n\t" \
         "movapd %2, %%xmm6 \n\t" \
         "movapd %3, %%xmm7 \n\t" \
         "mulpd %%xmm0, %%xmm0 \n\t" \
         "mulpd %%xmm1, %%xmm1 \n\t" \
         "mulpd %%xmm2, %%xmm2 \n\t" \
         "mulpd %%xmm3, %%xmm3 \n\t" \
         "mulpd %%xmm4, %%xmm4 \n\t" \
         "mulpd %%xmm5, %%xmm5 \n\t" \
         "mulpd %%xmm6, %%xmm6 \n\t" \
         "mulpd %%xmm7, %%xmm7 \n\t" \
         "addpd %%xmm0, %%xmm1 \n\t" \
         "addpd %%xmm2, %%xmm3 \n\t" \
         "addpd %%xmm4, %%xmm5 \n\t" \
         "addpd %%xmm6, %%xmm7 \n\t" \
         "addpd %%xmm1, %%xmm3 \n\t" \
         "addpd %%xmm5, %%xmm7 \n\t" \
         "addpd %%xmm3, %%xmm7" \
         : /* no outputs */ \
         : "m" (*((r)+4)), \
           "m" (*((r)+5)), \
           "m" (*((r)+6)), \
           "m" (*((r)+7))); \
   /* c += xmm7 (read-modify-write of c) */ \
   _ASM ("movapd %0, %%xmm1 \n\t" \
         "addpd %%xmm1, %%xmm7 \n\t" \
         "movapd %%xmm7, %0" \
         : "+m" (c)); }
01232
01233
01234
01235
01236
/*
 * Accumulate lane-wise products: for k = 0..7 multiplies r[k] by s[k] lane
 * by lane (each element is treated as two packed doubles and must be
 * 16-byte aligned), adds the eight products together and adds that to the
 * two doubles stored at c:
 *
 *    c.low  += sum_k r[k].low  * s[k].low
 *    c.high += sum_k r[k].high * s[k].high
 *
 * r and s are used as *((r)+k) / *((s)+k), so they are presumably pointers
 * to 16-byte elements -- confirm against the callers.  xmm0..xmm7 are all
 * overwritten.
 *
 * c is read before it is written, so it is declared as a read-write ("+m")
 * operand.  With a write-only "=m" constraint the compiler is entitled to
 * assume the old contents of c are not needed and may drop or reorder the
 * store that produced them.
 */
#define _sse_double_add_real_scalar_product_16(r,s,c) { \
   /* r[0..3] */ \
   _ASM ("movapd %0, %%xmm0 \n\t" \
         "movapd %1, %%xmm1 \n\t" \
         "movapd %2, %%xmm2 \n\t" \
         "movapd %3, %%xmm3 \n\t" \
         : /* no outputs */ \
         : "m" (*((r))), \
           "m" (*((r)+1)), \
           "m" (*((r)+2)), \
           "m" (*((r)+3))); \
   /* times s[0..3] */ \
   _ASM ("mulpd %0, %%xmm0 \n\t" \
         "mulpd %1, %%xmm1 \n\t" \
         "mulpd %2, %%xmm2 \n\t" \
         "mulpd %3, %%xmm3 \n\t" \
         : /* no outputs */ \
         : "m" (*((s))), \
           "m" (*((s)+1)), \
           "m" (*((s)+2)), \
           "m" (*((s)+3))); \
   /* r[4..7] */ \
   _ASM ("movapd %0, %%xmm4 \n\t" \
         "movapd %1, %%xmm5 \n\t" \
         "movapd %2, %%xmm6 \n\t" \
         "movapd %3, %%xmm7 \n\t" \
         : /* no outputs */ \
         : "m" (*((r)+4)), \
           "m" (*((r)+5)), \
           "m" (*((r)+6)), \
           "m" (*((r)+7))); \
   /* times s[4..7] */ \
   _ASM ("mulpd %0, %%xmm4 \n\t" \
         "mulpd %1, %%xmm5 \n\t" \
         "mulpd %2, %%xmm6 \n\t" \
         "mulpd %3, %%xmm7 \n\t" \
         : /* no outputs */ \
         : "m" (*((s)+4)), \
           "m" (*((s)+5)), \
           "m" (*((s)+6)), \
           "m" (*((s)+7))); \
   /* reduce into xmm7, then c += xmm7 (read-modify-write of c) */ \
   _ASM ("addpd %%xmm0, %%xmm1 \n\t" \
         "addpd %%xmm2, %%xmm3 \n\t" \
         "addpd %%xmm4, %%xmm5 \n\t" \
         "addpd %%xmm6, %%xmm7 \n\t" \
         "addpd %%xmm1, %%xmm3 \n\t" \
         "addpd %%xmm5, %%xmm7 \n\t" \
         "addpd %%xmm3, %%xmm7 \n\t" \
         "movapd %0, %%xmm1 \n\t" \
         "addpd %%xmm1, %%xmm7 \n\t" \
         "movapd %%xmm7, %0 \n\t" \
         : "+m" (c)); }
01290
/*
 * _sse_double_add_imag_scalar_product_16(r,s,c)
 *
 * Like the real scalar product, but the two 64-bit halves of every r
 * operand are swapped (shufpd $0x1) before the multiplication:
 *
 *     c.lo += r[0].hi*s[0].lo + ... + r[7].hi*s[7].lo
 *     c.hi += r[0].lo*s[0].hi + ... + r[7].lo*s[7].hi
 *
 * NOTE(review): if each operand stores a complex number as (re, im), the
 * difference c.hi - c.lo is the imaginary part of conj(r)*s summed over the
 * eight elements; the subtraction is not done here -- confirm how callers
 * combine the two lanes.
 *
 * r, s : pointers; *(r+k) and *(s+k), k = 0..7, are used as movapd/mulpd
 *        memory operands and therefore must be 16-byte aligned
 *        (presumably _sse_double * -- confirm at the call sites).
 * c    : 16-byte aligned lvalue holding the running sum; it is read and
 *        then overwritten.
 *
 * c is bound with "+m" (read-write); "=m" would mark its previous value as
 * dead and let the compiler drop an earlier initialisation.  Registers
 * written by each statement are listed as clobbers.
 * NOTE(review): partial results are still passed between the asm statements
 * in xmm0-xmm7, which relies on the compiler emitting no SSE code between
 * consecutive volatile asm statements.
 */
#define _sse_double_add_imag_scalar_product_16(r,s,c) { \
/* xmm0..xmm3 = r[0..3] */ \
_ASM ("movapd %0, %%xmm0 \n\t" \
      "movapd %1, %%xmm1 \n\t" \
      "movapd %2, %%xmm2 \n\t" \
      "movapd %3, %%xmm3" \
      : \
      : \
      "m" (*((r))), \
      "m" (*((r)+1)), \
      "m" (*((r)+2)), \
      "m" (*((r)+3)) \
      : \
      "xmm0", "xmm1", "xmm2", "xmm3"); \
/* swap halves of xmm0..xmm3, then multiply by s[0..3] */ \
_ASM ("shufpd $0x1, %%xmm0, %%xmm0 \n\t" \
      "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \
      "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \
      "shufpd $0x1, %%xmm3, %%xmm3 \n\t" \
      "mulpd %0, %%xmm0 \n\t" \
      "mulpd %1, %%xmm1 \n\t" \
      "mulpd %2, %%xmm2 \n\t" \
      "mulpd %3, %%xmm3" \
      : \
      : \
      "m" (*((s))), \
      "m" (*((s)+1)), \
      "m" (*((s)+2)), \
      "m" (*((s)+3)) \
      : \
      "xmm0", "xmm1", "xmm2", "xmm3"); \
/* xmm4..xmm7 = r[4..7] */ \
_ASM ("movapd %0, %%xmm4 \n\t" \
      "movapd %1, %%xmm5 \n\t" \
      "movapd %2, %%xmm6 \n\t" \
      "movapd %3, %%xmm7" \
      : \
      : \
      "m" (*((r)+4)), \
      "m" (*((r)+5)), \
      "m" (*((r)+6)), \
      "m" (*((r)+7)) \
      : \
      "xmm4", "xmm5", "xmm6", "xmm7"); \
/* swap halves of xmm4..xmm7, then multiply by s[4..7] */ \
_ASM ("shufpd $0x1, %%xmm4, %%xmm4 \n\t" \
      "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \
      "shufpd $0x1, %%xmm6, %%xmm6 \n\t" \
      "shufpd $0x1, %%xmm7, %%xmm7 \n\t" \
      "mulpd %0, %%xmm4 \n\t" \
      "mulpd %1, %%xmm5 \n\t" \
      "mulpd %2, %%xmm6 \n\t" \
      "mulpd %3, %%xmm7" \
      : \
      : \
      "m" (*((s)+4)), \
      "m" (*((s)+5)), \
      "m" (*((s)+6)), \
      "m" (*((s)+7)) \
      : \
      "xmm4", "xmm5", "xmm6", "xmm7"); \
/* pairwise reduction of the eight products into xmm7, then c += xmm7 */ \
_ASM ("addpd %%xmm0, %%xmm1 \n\t" \
      "addpd %%xmm2, %%xmm3 \n\t" \
      "addpd %%xmm4, %%xmm5 \n\t" \
      "addpd %%xmm6, %%xmm7 \n\t" \
      "addpd %%xmm1, %%xmm3 \n\t" \
      "addpd %%xmm5, %%xmm7 \n\t" \
      "addpd %%xmm3, %%xmm7 \n\t" \
      "movapd %0, %%xmm1 \n\t" \
      "addpd %%xmm1, %%xmm7 \n\t" \
      "movapd %%xmm7, %0" \
      : \
      "+m" (c) \
      : \
      : \
      "xmm1", "xmm3", "xmm5", "xmm7"); }
01352
/*
 * _sse_double_hermitian_su3(r,s)
 *
 * Writes to r the transposed, sign-flipped image of the nine 16-byte
 * elements of s:
 *
 *     r[0]=~s[0]  r[1]=~s[3]  r[2]=~s[6]
 *     r[3]=~s[1]  r[4]=~s[4]  r[5]=~s[7]
 *     r[6]=~s[2]  r[7]=~s[5]  r[8]=~s[8]
 *
 * where ~x is x with the sign bit of its high double toggled (xorpd with
 * _sse_double_sgn2).  If each element stores a complex number as (re, im)
 * and the 3x3 matrix is indexed as 3*row+col, this is the Hermitian
 * conjugate of s.
 *
 * r, s : pointers; *(r+k) and *(s+k), k = 0..8, are movapd memory operands
 *        and must be 16-byte aligned (presumably _sse_double * -- confirm
 *        at the call sites).
 *
 * Each off-diagonal pair is loaded completely before either half is
 * stored, so r == s (in-place use) is supported; partially overlapping
 * r and s are not.
 *
 * Only xmm0-xmm2 are used, and they are listed as clobbers wherever they
 * are written.  The macro ends at the closing brace (no trailing line
 * continuation), so the line following the definition is never spliced
 * into it.
 * NOTE(review): the diagonal values are passed from the first asm statement
 * to the second in xmm0-xmm2; this relies on the compiler emitting no SSE
 * code between consecutive volatile asm statements.
 */
#define _sse_double_hermitian_su3(r,s) { \
/* diagonal entries 0, 4, 8: flip sign of the high half, same index */ \
_ASM ("movapd %0, %%xmm0 \n\t" \
      "xorpd %3, %%xmm0 \n\t" \
      "movapd %1, %%xmm1 \n\t" \
      "xorpd %3, %%xmm1 \n\t" \
      "movapd %2, %%xmm2 \n\t" \
      "xorpd %3, %%xmm2" \
      : \
      : \
      "m" (*((s))), \
      "m" (*((s)+4)), \
      "m" (*((s)+8)), \
      "m" (_sse_double_sgn2) \
      : \
      "xmm0", "xmm1", "xmm2"); \
_ASM ("movapd %%xmm0, %0 \n\t" \
      "movapd %%xmm1, %1 \n\t" \
      "movapd %%xmm2, %2" \
      : \
      "=m" (*((r))), \
      "=m" (*((r)+4)), \
      "=m" (*((r)+8))); \
/* off-diagonal pair 1 <-> 3 */ \
_ASM ("movapd %2, %%xmm0 \n\t" \
      "movapd %3, %%xmm1 \n\t" \
      "xorpd %4, %%xmm0 \n\t" \
      "xorpd %4, %%xmm1 \n\t" \
      "movapd %%xmm0, %0 \n\t" \
      "movapd %%xmm1, %1" \
      : \
      "=m" (*((r)+3)), \
      "=m" (*((r)+1)) \
      : \
      "m" (*((s)+1)), \
      "m" (*((s)+3)), \
      "m" (_sse_double_sgn2) \
      : \
      "xmm0", "xmm1"); \
/* off-diagonal pair 2 <-> 6 */ \
_ASM ("movapd %2, %%xmm0 \n\t" \
      "movapd %3, %%xmm1 \n\t" \
      "xorpd %4, %%xmm0 \n\t" \
      "xorpd %4, %%xmm1 \n\t" \
      "movapd %%xmm0, %0 \n\t" \
      "movapd %%xmm1, %1" \
      : \
      "=m" (*((r)+6)), \
      "=m" (*((r)+2)) \
      : \
      "m" (*((s)+2)), \
      "m" (*((s)+6)), \
      "m" (_sse_double_sgn2) \
      : \
      "xmm0", "xmm1"); \
/* off-diagonal pair 5 <-> 7 */ \
_ASM ("movapd %2, %%xmm0 \n\t" \
      "movapd %3, %%xmm1 \n\t" \
      "xorpd %4, %%xmm0 \n\t" \
      "xorpd %4, %%xmm1 \n\t" \
      "movapd %%xmm0, %0 \n\t" \
      "movapd %%xmm1, %1" \
      : \
      "=m" (*((r)+7)), \
      "=m" (*((r)+5)) \
      : \
      "m" (*((s)+5)), \
      "m" (*((s)+7)), \
      "m" (_sse_double_sgn2) \
      : \
      "xmm0", "xmm1"); }
01411
01412
01413
01414
01415
/*
 * _sse_double_copy_16(r,s)
 *
 * Copies eight 16-byte operands: r[k] = s[k] for k = 0..7.
 *
 * r, s : pointers; *(r+k) and *(s+k) are movapd memory operands and must
 *        be 16-byte aligned (presumably _sse_double * -- confirm at the
 *        call sites).
 *
 * All eight source operands are loaded into xmm0-xmm7 before the first
 * store.  The load statements list the registers they overwrite as
 * clobbers so the compiler does not keep live values in them.
 * NOTE(review): the data travels from the load statements to the store
 * statements in xmm0-xmm7; this relies on the compiler emitting no SSE code
 * between consecutive volatile asm statements.
 */
#define _sse_double_copy_16(r,s) { \
/* xmm0..xmm3 = s[0..3] */ \
_ASM ("movapd %0, %%xmm0 \n\t" \
      "movapd %1, %%xmm1 \n\t" \
      "movapd %2, %%xmm2 \n\t" \
      "movapd %3, %%xmm3" \
      : \
      : \
      "m" (*((s))), \
      "m" (*((s)+1)), \
      "m" (*((s)+2)), \
      "m" (*((s)+3)) \
      : \
      "xmm0", "xmm1", "xmm2", "xmm3"); \
/* xmm4..xmm7 = s[4..7] */ \
_ASM ("movapd %0, %%xmm4 \n\t" \
      "movapd %1, %%xmm5 \n\t" \
      "movapd %2, %%xmm6 \n\t" \
      "movapd %3, %%xmm7" \
      : \
      : \
      "m" (*((s)+4)), \
      "m" (*((s)+5)), \
      "m" (*((s)+6)), \
      "m" (*((s)+7)) \
      : \
      "xmm4", "xmm5", "xmm6", "xmm7"); \
/* r[0..3] = xmm0..xmm3 */ \
_ASM ("movapd %%xmm0, %0 \n\t" \
      "movapd %%xmm1, %1 \n\t" \
      "movapd %%xmm2, %2 \n\t" \
      "movapd %%xmm3, %3" \
      : \
      "=m" (*((r))), \
      "=m" (*((r)+1)), \
      "=m" (*((r)+2)), \
      "=m" (*((r)+3))); \
/* r[4..7] = xmm4..xmm7 */ \
_ASM ("movapd %%xmm4, %0 \n\t" \
      "movapd %%xmm5, %1 \n\t" \
      "movapd %%xmm6, %2 \n\t" \
      "movapd %%xmm7, %3" \
      : \
      "=m" (*((r)+4)), \
      "=m" (*((r)+5)), \
      "=m" (*((r)+6)), \
      "=m" (*((r)+7))); }
01455
01456
01457
01458
01459
/*
 * _sse_double_add_16(r,s)
 *
 * Lane-wise in-place addition of eight packed-double operands:
 * r[k] = s[k] + r[k] for k = 0..7.
 *
 * r, s : pointers; *(r+k) and *(s+k) are movapd/addpd memory operands and
 *        must be 16-byte aligned (presumably _sse_double * -- confirm at
 *        the call sites).  s is only read.
 *
 * Statements that write xmm registers list them as clobbers so the
 * compiler does not keep live values in them.
 * NOTE(review): partial results are passed between the asm statements in
 * xmm0-xmm7; this relies on the compiler emitting no SSE code between
 * consecutive volatile asm statements.
 */
#define _sse_double_add_16(r,s) { \
/* xmm0..xmm3 = s[0..3] */ \
_ASM ("movapd %0, %%xmm0 \n\t" \
      "movapd %1, %%xmm1 \n\t" \
      "movapd %2, %%xmm2 \n\t" \
      "movapd %3, %%xmm3" \
      : \
      : \
      "m" (*((s))), \
      "m" (*((s)+1)), \
      "m" (*((s)+2)), \
      "m" (*((s)+3)) \
      : \
      "xmm0", "xmm1", "xmm2", "xmm3"); \
/* xmm4..xmm7 = s[4..7] */ \
_ASM ("movapd %0, %%xmm4 \n\t" \
      "movapd %1, %%xmm5 \n\t" \
      "movapd %2, %%xmm6 \n\t" \
      "movapd %3, %%xmm7" \
      : \
      : \
      "m" (*((s)+4)), \
      "m" (*((s)+5)), \
      "m" (*((s)+6)), \
      "m" (*((s)+7)) \
      : \
      "xmm4", "xmm5", "xmm6", "xmm7"); \
/* xmm0..xmm3 += r[0..3] */ \
_ASM ("addpd %0, %%xmm0 \n\t" \
      "addpd %1, %%xmm1 \n\t" \
      "addpd %2, %%xmm2 \n\t" \
      "addpd %3, %%xmm3" \
      : \
      : \
      "m" (*((r))), \
      "m" (*((r)+1)), \
      "m" (*((r)+2)), \
      "m" (*((r)+3)) \
      : \
      "xmm0", "xmm1", "xmm2", "xmm3"); \
/* xmm4..xmm7 += r[4..7] */ \
_ASM ("addpd %0, %%xmm4 \n\t" \
      "addpd %1, %%xmm5 \n\t" \
      "addpd %2, %%xmm6 \n\t" \
      "addpd %3, %%xmm7" \
      : \
      : \
      "m" (*((r)+4)), \
      "m" (*((r)+5)), \
      "m" (*((r)+6)), \
      "m" (*((r)+7)) \
      : \
      "xmm4", "xmm5", "xmm6", "xmm7"); \
/* r[0..3] = xmm0..xmm3 */ \
_ASM ("movapd %%xmm0, %0 \n\t" \
      "movapd %%xmm1, %1 \n\t" \
      "movapd %%xmm2, %2 \n\t" \
      "movapd %%xmm3, %3" \
      : \
      "=m" (*((r))), \
      "=m" (*((r)+1)), \
      "=m" (*((r)+2)), \
      "=m" (*((r)+3))); \
/* r[4..7] = xmm4..xmm7 */ \
_ASM ("movapd %%xmm4, %0 \n\t" \
      "movapd %%xmm5, %1 \n\t" \
      "movapd %%xmm6, %2 \n\t" \
      "movapd %%xmm7, %3" \
      : \
      "=m" (*((r)+4)), \
      "=m" (*((r)+5)), \
      "=m" (*((r)+6)), \
      "=m" (*((r)+7))); }
01519
01520
01521
01522
01523
/*
 * _sse_double_sub_16(r,s)
 *
 * Lane-wise in-place subtraction of eight packed-double operands.
 * As written, s[k] is loaded into a register and r[k] is subtracted from
 * it, so the value stored is
 *
 *     r[k] = s[k] - r[k]      for k = 0..7
 *
 * NOTE(review): this is the negative of "r -= s".  The behaviour is kept
 * unchanged here; confirm against the callers which sign is intended.
 *
 * r, s : pointers; *(r+k) and *(s+k) are movapd/subpd memory operands and
 *        must be 16-byte aligned (presumably _sse_double * -- confirm at
 *        the call sites).  s is only read.
 *
 * Statements that write xmm registers list them as clobbers so the
 * compiler does not keep live values in them.
 * NOTE(review): partial results are passed between the asm statements in
 * xmm0-xmm7; this relies on the compiler emitting no SSE code between
 * consecutive volatile asm statements.
 */
#define _sse_double_sub_16(r,s) { \
/* xmm0..xmm3 = s[0..3] */ \
_ASM ("movapd %0, %%xmm0 \n\t" \
      "movapd %1, %%xmm1 \n\t" \
      "movapd %2, %%xmm2 \n\t" \
      "movapd %3, %%xmm3" \
      : \
      : \
      "m" (*((s))), \
      "m" (*((s)+1)), \
      "m" (*((s)+2)), \
      "m" (*((s)+3)) \
      : \
      "xmm0", "xmm1", "xmm2", "xmm3"); \
/* xmm4..xmm7 = s[4..7] */ \
_ASM ("movapd %0, %%xmm4 \n\t" \
      "movapd %1, %%xmm5 \n\t" \
      "movapd %2, %%xmm6 \n\t" \
      "movapd %3, %%xmm7" \
      : \
      : \
      "m" (*((s)+4)), \
      "m" (*((s)+5)), \
      "m" (*((s)+6)), \
      "m" (*((s)+7)) \
      : \
      "xmm4", "xmm5", "xmm6", "xmm7"); \
/* xmm0..xmm3 -= r[0..3] */ \
_ASM ("subpd %0, %%xmm0 \n\t" \
      "subpd %1, %%xmm1 \n\t" \
      "subpd %2, %%xmm2 \n\t" \
      "subpd %3, %%xmm3" \
      : \
      : \
      "m" (*((r))), \
      "m" (*((r)+1)), \
      "m" (*((r)+2)), \
      "m" (*((r)+3)) \
      : \
      "xmm0", "xmm1", "xmm2", "xmm3"); \
/* xmm4..xmm7 -= r[4..7] */ \
_ASM ("subpd %0, %%xmm4 \n\t" \
      "subpd %1, %%xmm5 \n\t" \
      "subpd %2, %%xmm6 \n\t" \
      "subpd %3, %%xmm7" \
      : \
      : \
      "m" (*((r)+4)), \
      "m" (*((r)+5)), \
      "m" (*((r)+6)), \
      "m" (*((r)+7)) \
      : \
      "xmm4", "xmm5", "xmm6", "xmm7"); \
/* r[0..3] = xmm0..xmm3 */ \
_ASM ("movapd %%xmm0, %0 \n\t" \
      "movapd %%xmm1, %1 \n\t" \
      "movapd %%xmm2, %2 \n\t" \
      "movapd %%xmm3, %3" \
      : \
      "=m" (*((r))), \
      "=m" (*((r)+1)), \
      "=m" (*((r)+2)), \
      "=m" (*((r)+3))); \
/* r[4..7] = xmm4..xmm7 */ \
_ASM ("movapd %%xmm4, %0 \n\t" \
      "movapd %%xmm5, %1 \n\t" \
      "movapd %%xmm6, %2 \n\t" \
      "movapd %%xmm7, %3" \
      : \
      "=m" (*((r)+4)), \
      "=m" (*((r)+5)), \
      "=m" (*((r)+6)), \
      "=m" (*((r)+7))); }
01583
01584
01585
01586
01587
/*
 * _sse_double_add_multiply_16(r,c,s)
 *
 * Lane-wise scaled accumulation over eight packed-double operands:
 *
 *     r[k].lo = s[k].lo * c.lo + r[k].lo
 *     r[k].hi = s[k].hi * c.hi + r[k].hi        for k = 0..7
 *
 * r, s : pointers; *(r+k) and *(s+k) are movapd/addpd memory operands and
 *        must be 16-byte aligned (presumably _sse_double * -- confirm at
 *        the call sites).  s is only read.
 * c    : 16-byte aligned object used directly as a mulpd memory operand;
 *        to scale by a single real number both halves must hold that
 *        number.  c is only read.
 *
 * Statements that write xmm registers list them as clobbers so the
 * compiler does not keep live values in them.
 * NOTE(review): partial results are passed between the asm statements in
 * xmm0-xmm7; this relies on the compiler emitting no SSE code between
 * consecutive volatile asm statements.
 */
#define _sse_double_add_multiply_16(r,c,s) { \
/* xmm0..xmm3 = s[0..3] */ \
_ASM ("movapd %0, %%xmm0 \n\t" \
      "movapd %1, %%xmm1 \n\t" \
      "movapd %2, %%xmm2 \n\t" \
      "movapd %3, %%xmm3" \
      : \
      : \
      "m" (*((s))), \
      "m" (*((s)+1)), \
      "m" (*((s)+2)), \
      "m" (*((s)+3)) \
      : \
      "xmm0", "xmm1", "xmm2", "xmm3"); \
/* xmm4..xmm7 = s[4..7], then scale all eight registers by c */ \
_ASM ("movapd %0, %%xmm4 \n\t" \
      "movapd %1, %%xmm5 \n\t" \
      "movapd %2, %%xmm6 \n\t" \
      "movapd %3, %%xmm7 \n\t" \
      "mulpd %4, %%xmm0 \n\t" \
      "mulpd %4, %%xmm1 \n\t" \
      "mulpd %4, %%xmm2 \n\t" \
      "mulpd %4, %%xmm3 \n\t" \
      "mulpd %4, %%xmm4 \n\t" \
      "mulpd %4, %%xmm5 \n\t" \
      "mulpd %4, %%xmm6 \n\t" \
      "mulpd %4, %%xmm7" \
      : \
      : \
      "m" (*((s)+4)), \
      "m" (*((s)+5)), \
      "m" (*((s)+6)), \
      "m" (*((s)+7)), \
      "m" (c) \
      : \
      "xmm0", "xmm1", "xmm2", "xmm3", \
      "xmm4", "xmm5", "xmm6", "xmm7"); \
/* xmm0..xmm3 += r[0..3] */ \
_ASM ("addpd %0, %%xmm0 \n\t" \
      "addpd %1, %%xmm1 \n\t" \
      "addpd %2, %%xmm2 \n\t" \
      "addpd %3, %%xmm3" \
      : \
      : \
      "m" (*((r))), \
      "m" (*((r)+1)), \
      "m" (*((r)+2)), \
      "m" (*((r)+3)) \
      : \
      "xmm0", "xmm1", "xmm2", "xmm3"); \
/* xmm4..xmm7 += r[4..7] */ \
_ASM ("addpd %0, %%xmm4 \n\t" \
      "addpd %1, %%xmm5 \n\t" \
      "addpd %2, %%xmm6 \n\t" \
      "addpd %3, %%xmm7" \
      : \
      : \
      "m" (*((r)+4)), \
      "m" (*((r)+5)), \
      "m" (*((r)+6)), \
      "m" (*((r)+7)) \
      : \
      "xmm4", "xmm5", "xmm6", "xmm7"); \
/* r[0..3] = xmm0..xmm3 */ \
_ASM ("movapd %%xmm0, %0 \n\t" \
      "movapd %%xmm1, %1 \n\t" \
      "movapd %%xmm2, %2 \n\t" \
      "movapd %%xmm3, %3" \
      : \
      "=m" (*((r))), \
      "=m" (*((r)+1)), \
      "=m" (*((r)+2)), \
      "=m" (*((r)+3))); \
/* r[4..7] = xmm4..xmm7 */ \
_ASM ("movapd %%xmm4, %0 \n\t" \
      "movapd %%xmm5, %1 \n\t" \
      "movapd %%xmm6, %2 \n\t" \
      "movapd %%xmm7, %3" \
      : \
      "=m" (*((r)+4)), \
      "=m" (*((r)+5)), \
      "=m" (*((r)+6)), \
      "=m" (*((r)+7))); }
01656
/*
 * _sse_double_multiply_16(r,c,s)
 *
 * Lane-wise scaling of eight packed-double operands:
 *
 *     r[k].lo = s[k].lo * c.lo
 *     r[k].hi = s[k].hi * c.hi        for k = 0..7
 *
 * r, s : pointers; *(r+k) and *(s+k) are movapd memory operands and must
 *        be 16-byte aligned (presumably _sse_double * -- confirm at the
 *        call sites).  s is only read; the previous contents of r are
 *        not used.
 * c    : 16-byte aligned object used directly as a mulpd memory operand;
 *        to scale by a single real number both halves must hold that
 *        number.  c is only read.
 *
 * Statements that write xmm registers list them as clobbers so the
 * compiler does not keep live values in them.
 * NOTE(review): partial results are passed between the asm statements in
 * xmm0-xmm7; this relies on the compiler emitting no SSE code between
 * consecutive volatile asm statements.
 */
#define _sse_double_multiply_16(r,c,s) { \
/* xmm0..xmm3 = s[0..3] */ \
_ASM ("movapd %0, %%xmm0 \n\t" \
      "movapd %1, %%xmm1 \n\t" \
      "movapd %2, %%xmm2 \n\t" \
      "movapd %3, %%xmm3" \
      : \
      : \
      "m" (*((s))), \
      "m" (*((s)+1)), \
      "m" (*((s)+2)), \
      "m" (*((s)+3)) \
      : \
      "xmm0", "xmm1", "xmm2", "xmm3"); \
/* xmm4..xmm7 = s[4..7] */ \
_ASM ("movapd %0, %%xmm4 \n\t" \
      "movapd %1, %%xmm5 \n\t" \
      "movapd %2, %%xmm6 \n\t" \
      "movapd %3, %%xmm7" \
      : \
      : \
      "m" (*((s)+4)), \
      "m" (*((s)+5)), \
      "m" (*((s)+6)), \
      "m" (*((s)+7)) \
      : \
      "xmm4", "xmm5", "xmm6", "xmm7"); \
/* scale all eight registers by c */ \
_ASM ("mulpd %0, %%xmm0 \n\t" \
      "mulpd %0, %%xmm1 \n\t" \
      "mulpd %0, %%xmm2 \n\t" \
      "mulpd %0, %%xmm3 \n\t" \
      "mulpd %0, %%xmm4 \n\t" \
      "mulpd %0, %%xmm5 \n\t" \
      "mulpd %0, %%xmm6 \n\t" \
      "mulpd %0, %%xmm7" \
      : \
      : \
      "m" (c) \
      : \
      "xmm0", "xmm1", "xmm2", "xmm3", \
      "xmm4", "xmm5", "xmm6", "xmm7"); \
/* r[0..3] = xmm0..xmm3 */ \
_ASM ("movapd %%xmm0, %0 \n\t" \
      "movapd %%xmm1, %1 \n\t" \
      "movapd %%xmm2, %2 \n\t" \
      "movapd %%xmm3, %3" \
      : \
      "=m" (*((r))), \
      "=m" (*((r)+1)), \
      "=m" (*((r)+2)), \
      "=m" (*((r)+3))); \
/* r[4..7] = xmm4..xmm7 */ \
_ASM ("movapd %%xmm4, %0 \n\t" \
      "movapd %%xmm5, %1 \n\t" \
      "movapd %%xmm6, %2 \n\t" \
      "movapd %%xmm7, %3" \
      : \
      "=m" (*((r)+4)), \
      "=m" (*((r)+5)), \
      "=m" (*((r)+6)), \
      "=m" (*((r)+7))); }
01707
01708
/*
 * Report an error when the address in var has any of the bits in base set.
 *
 * var  : address to test.
 * base : bit mask of the low address bits that must be zero, e.g. 0xf to
 *        require 16-byte alignment.
 *
 * The pointer is converted to unsigned long rather than unsigned int: on
 * targets with 64-bit pointers the narrower cast loses the upper address
 * bits (and is rejected outright by C++ compilers).  unsigned long matches
 * the pointer width on ILP32 and LP64 targets and is the type of base.
 *
 * Calls error() with a fixed message on failure; otherwise does nothing.
 */
static void _sse_check_alignment(void* var, unsigned long base) {
  unsigned long addr = (unsigned long) var;
  /* same condition as addr != (addr & ~base): some masked bit is set */
  if ((addr & base) != 0) {
    error("_sse_check_alignment()\nVariable not aligned properly");
  }
}
01715