00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #if defined(SSE2)
00014
/// SSE2-specific kernel class for the staggered action (the class name
/// suggests the "asqtad" variant). Its only member is the static method
/// mul_Q, which has two compile-time variants selected by
/// USE_DOUBLE_PRECISION.
00029 class StaggeredAsqtadActionSSE2 {
00030 public:
/// Fills chi_out from psi_in, the links stored in U_in and, when available,
/// the links stored in U_in.long_links.
///
/// The work is done in two passes over the site range
/// [start_index(ME,parity), stop_index(ME,parity)) of U_in's lattice:
///   1. a pass using U_in and the nearest neighbours (one hop through the
///      up/dw tables) which also adds the 2*mass term and writes chi_out;
///   2. a pass using U_in.long_links and the third neighbours (three hops
///      in the same direction) which reads chi_out back and updates it.
///      This pass is skipped when the long-link pointer is 0.
///
/// @param chi_out output field, written at every site of the range
/// @param psi_in  input field; also supplies eta(x,mu) used to pick add/sub
/// @param U_in    gauge field; supplies links, long_links and the lattice
/// @param coeff   must contain "mass" (error() is called otherwise); may
///                contain "sign", which defaults to 1
/// @param parity  forwarded to start_index/stop_index (default EVENODD)
///
/// NOTE(review): the _sse_* macros are defined outside this file section;
/// they presumably communicate through implicit SSE registers, so the
/// statement order below must not be changed without checking them.
/// NOTE(review): the scratch objects fact1 and r are function-level statics
/// and are therefore shared by all calls — presumably not safe to call from
/// several threads at once; confirm.
00031 static void mul_Q(staggered_field &chi_out,
00032 staggered_field &psi_in,
00033 gauge_field &U_in,
00034 coefficients &coeff,
00035 int parity=EVENODD) {
00036 // Reinterpret the raw field storage as arrays of SSE vector/matrix types.
00037 _sse_su3_vector* chi=(_sse_su3_vector*) chi_out.physical_address();
00038 _sse_su3_vector* psi=(_sse_su3_vector*) psi_in.physical_address();
00039 // U holds 4 link matrices per site (indexed as U+4*site+mu below); LL likewise.
00040 _sse_su3* U=(_sse_su3*) U_in.physical_address();
00041 _sse_su3* LL=(_sse_su3*) U_in.long_links.physical_address();
00042 long** iup=U_in.lattice().up;
00043 long** idw=U_in.lattice().dw;
00044 // iup[i][mu] / idw[i][mu]: presumably forward/backward neighbour of site i — confirm.
00045 long start=U_in.lattice().start_index(ME,parity);
00046 long stop =U_in.lattice().stop_index(ME,parity);
00047 // Read the coefficients: "mass" is mandatory, "sign" is optional (default 1).
00048 mdp_real two_mass;
00049 int sign;
00050 if(coeff.has_key("mass")) two_mass=2.0*coeff["mass"];
00051 else error("coefficient mass undefined");
00052 if(coeff.has_key("sign")) sign=(int) coeff["sign"];
00053 else sign=1;
00054 // x is built on psi_in's lattice but indexed with U_in's site indices — assumes both fields share a lattice; TODO confirm.
00055 site x(psi_in.lattice());
00056
00057 #if defined(USE_DOUBLE_PRECISION)
00058 // ---- double precision variant ----
00059 static _sse_double fact1 ALIGN16;
00060 static _sse_su3_vector r ALIGN16;
00061 _sse_su3 *up, *um;
00062 _sse_su3_vector *s,*sp,*sm,*sn;
00063 long ix,iy,iz;
00064 int sign0, sign1, sign2, sign3;
00065 // Both lanes of fact1 carry 2*mass; it scales psi[ix] in the first pass.
00066 fact1.c1=1.0f*two_mass;
00067 fact1.c2=1.0f*two_mass;
00068 // Prime the forward-neighbour and link pointers for the first site.
00069 sp=psi+iup[start][0];
00070 up=U+4*start;
00071
00072 // First pass: ordinary links U, nearest neighbours, plus the mass term.
00073 // sp/up always point at the forward neighbour/link of the hop about to be processed.
00074 for(ix=start; ix<stop; ix++) {
00075
00076 s=psi+ix;
00077 // signN is used below only as a true/false flag choosing add vs. sub.
00078 x.idx=ix;
00079 if(sign>0) {
00080 sign0=(int) (psi_in.eta(x,0)+1);
00081 sign1=(int) (psi_in.eta(x,1)+1);
00082 sign2=(int) (psi_in.eta(x,2)+1);
00083 sign3=(int) (psi_in.eta(x,3)+1);
00084 } else {
00085 sign0=(int) (1.0-psi_in.eta(x,0));
00086 sign1=(int) (1.0-psi_in.eta(x,1));
00087 sign2=(int) (1.0-psi_in.eta(x,2));
00088 sign3=(int) (1.0-psi_in.eta(x,3));
00089 }
00090 // NOTE(review): presumably eta is +1 or -1, making each flag 2 or 0 — confirm.
00091 // Direction 0: forward hop is combined with fact1*psi[ix] (add if sign0, else sub),
00092 // then the backward hop is combined with the opposite operation.
00093 iy=idw[ix][0];
00094 sm=psi+iy;
00095 _sse_float_prefetch_spinor(sm);
00096 um=(_sse_su3*) U+iy*4;
00097 _sse_float_prefetch_su3(um);
00098
00099 _sse_double_load(*sp);
00100 _sse_double_su3_multiply(*up);
00101 _sse_double_load(*s);
00102 _sse_double_vector_mul(fact1);
00103 if(sign0) _sse_double_vector_add();
00104 else _sse_double_vector_sub();
00105 _sse_double_store(r);
00106 // Advance to the direction-1 link and forward neighbour while finishing direction 0.
00107 up++;
00108 sp=psi+iup[ix][1];
00109 _sse_float_prefetch_spinor(sp);
00110
00111 _sse_double_load(*sm);
00112 _sse_double_su3_inverse_multiply(*um);
00113 _sse_double_load(r);
00114 if(sign0) _sse_double_vector_sub();
00115 else _sse_double_vector_add();
00116 _sse_double_store(r);
00117
00118
00119 // Direction 1: same pattern, accumulating into r.
00120
00121 iy=idw[ix][1];
00122 sm=psi+iy;
00123 _sse_float_prefetch_spinor(sm);
00124 um=U+iy*4+1;
00125 _sse_float_prefetch_su3(um);
00126
00127 _sse_double_load(*sp);
00128 _sse_double_su3_multiply(*up);
00129 _sse_double_load(r);
00130 if(sign1) _sse_double_vector_add();
00131 else _sse_double_vector_sub();
00132 _sse_double_store(r);
00133
00134 up++;
00135 sp=psi+iup[ix][2];
00136 _sse_float_prefetch_spinor(sp);
00137
00138 _sse_double_load(*sm);
00139 _sse_double_su3_inverse_multiply(*um);
00140 _sse_double_load(r);
00141 if(sign1) _sse_double_vector_sub();
00142 else _sse_double_vector_add();
00143 _sse_double_store(r);
00144
00145 // Direction 2.
00146
00147 iy=idw[ix][2];
00148 sm=psi+iy;
00149 _sse_float_prefetch_spinor(sm);
00150 um=U+iy*4+2;
00151 _sse_float_prefetch_su3(um);
00152
00153 _sse_double_load(*sp);
00154 _sse_double_su3_multiply(*up);
00155 _sse_double_load(r);
00156 if(sign2) _sse_double_vector_add();
00157 else _sse_double_vector_sub();
00158 _sse_double_store(r);
00159
00160 up++;
00161 sp=psi+iup[ix][3];
00162 _sse_float_prefetch_spinor(sp);
00163
00164 _sse_double_load(*sm);
00165 _sse_double_su3_inverse_multiply(*um);
00166 _sse_double_load(r);
00167 if(sign2) _sse_double_vector_sub();
00168 else _sse_double_vector_add();
00169 _sse_double_store(r);
00170
00171 // Direction 3: the final result for this site goes to chi[ix] instead of r.
00172
00173 sn=chi+ix;
00174 _sse_float_prefetch_spinor(sn);
00175
00176 iy=idw[ix][3];
00177 sm=psi+iy;
00178 _sse_float_prefetch_spinor(sm);
00179 um=U+iy*4+3;
00180 _sse_float_prefetch_su3(um);
00181
00182 _sse_double_load(*sp);
00183 _sse_double_su3_multiply(*up);
00184 _sse_double_load(r);
00185 if(sign3) _sse_double_vector_add();
00186 else _sse_double_vector_sub();
00187 _sse_double_store(r);
00188 // Prepare sp/up for site ix+1; skipped on the last site so iup[stop] is never read.
00189 iz=ix+1;
00190 if(iz<stop) {
00191 sp=psi+iup[iz][0];
00192 _sse_float_prefetch_spinor(sp);
00193 up++;
00194 }
00195 _sse_double_load(*sm);
00196 _sse_double_su3_inverse_multiply(*um);
00197 _sse_double_load(r);
00198 if(sign3) _sse_double_vector_sub();
00199 else _sse_double_vector_add();
00200 _sse_double_store(*sn);
00201
00202 }
00203 // No long links stored: the result of the first pass is final.
00204 if(LL==0) return;
00205
00206 // Second pass: long links LL and third neighbours (three hops in one direction),
00207 // accumulated on top of the chi values written by the first pass.
00208 sp=psi+iup[iup[iup[start][0]][0]][0];
00209 up=LL+4*start;
00210
00211 for(ix=start; ix<stop; ix++) {
00212
00213 sn=chi+ix;
00214 _sse_float_prefetch_spinor(sn);
00215 // Same add/sub flags as in the first pass.
00216 x.idx=ix;
00217 if(sign>0) {
00218 sign0=(int) (psi_in.eta(x,0)+1);
00219 sign1=(int) (psi_in.eta(x,1)+1);
00220 sign2=(int) (psi_in.eta(x,2)+1);
00221 sign3=(int) (psi_in.eta(x,3)+1);
00222 } else {
00223 sign0=(int) (1.0-psi_in.eta(x,0));
00224 sign1=(int) (1.0-psi_in.eta(x,1));
00225 sign2=(int) (1.0-psi_in.eta(x,2));
00226 sign3=(int) (1.0-psi_in.eta(x,3));
00227 }
00228
00229 // Direction 0: starts from chi[ix] (loaded via *sn) rather than from the mass term.
00230
00231 iy=idw[idw[idw[ix][0]][0]][0];
00232
00233 sm=psi+iy;
00234 _sse_float_prefetch_spinor(sm);
00235 um=(_sse_su3*) LL+iy*4;
00236 _sse_float_prefetch_su3(um);
00237
00238 _sse_double_load(*sp);
00239 _sse_double_su3_multiply(*up);
00240 _sse_double_load(*sn);
00241 if(sign0) _sse_double_vector_add();
00242 else _sse_double_vector_sub();
00243 _sse_double_store(r);
00244
00245 up++;
00246 sp=psi+iup[iup[iup[ix][1]][1]][1];
00247 _sse_float_prefetch_spinor(sp);
00248
00249 _sse_double_load(*sm);
00250 _sse_double_su3_inverse_multiply(*um);
00251 _sse_double_load(r);
00252 if(sign0) _sse_double_vector_sub();
00253 else _sse_double_vector_add();
00254 _sse_double_store(r);
00255
00256
00257 // Direction 1.
00258
00259 iy=idw[idw[idw[ix][1]][1]][1];
00260
00261 sm=psi+iy;
00262 _sse_float_prefetch_spinor(sm);
00263 um=LL+iy*4+1;
00264 _sse_float_prefetch_su3(um);
00265
00266 _sse_double_load(*sp);
00267 _sse_double_su3_multiply(*up);
00268 _sse_double_load(r);
00269 if(sign1) _sse_double_vector_add();
00270 else _sse_double_vector_sub();
00271 _sse_double_store(r);
00272
00273 up++;
00274 sp=psi+iup[iup[iup[ix][2]][2]][2];
00275
00276 _sse_float_prefetch_spinor(sp);
00277
00278 _sse_double_load(*sm);
00279 _sse_double_su3_inverse_multiply(*um);
00280 _sse_double_load(r);
00281 if(sign1) _sse_double_vector_sub();
00282 else _sse_double_vector_add();
00283 _sse_double_store(r);
00284
00285 // Direction 2.
00286
00287 iy=idw[idw[idw[ix][2]][2]][2];
00288
00289 sm=psi+iy;
00290 _sse_float_prefetch_spinor(sm);
00291 um=LL+iy*4+2;
00292 _sse_float_prefetch_su3(um);
00293
00294 _sse_double_load(*sp);
00295 _sse_double_su3_multiply(*up);
00296 _sse_double_load(r);
00297 if(sign2) _sse_double_vector_add();
00298 else _sse_double_vector_sub();
00299 _sse_double_store(r);
00300
00301 up++;
00302 sp=psi+iup[iup[iup[ix][3]][3]][3];
00303 _sse_float_prefetch_spinor(sp);
00304
00305 _sse_double_load(*sm);
00306 _sse_double_su3_inverse_multiply(*um);
00307 _sse_double_load(r);
00308 if(sign2) _sse_double_vector_sub();
00309 else _sse_double_vector_add();
00310 _sse_double_store(r);
00311
00312 // Direction 3: the final value is written back to chi[ix].
00313
00314 _sse_float_prefetch_spinor(sn);
00315
00316 iy=idw[idw[idw[ix][3]][3]][3];
00317
00318 sm=psi+iy;
00319 _sse_float_prefetch_spinor(sm);
00320 um=LL+iy*4+3;
00321 _sse_float_prefetch_su3(um);
00322
00323 _sse_double_load(*sp);
00324 _sse_double_su3_multiply(*up);
00325 _sse_double_load(r);
00326 if(sign3) _sse_double_vector_add();
00327 else _sse_double_vector_sub();
00328 _sse_double_store(r);
00329 // Prepare sp/up for site ix+1; skipped on the last site of the range.
00330 iz=ix+1;
00331 if(iz<stop) {
00332 sp=psi+iup[iup[iup[iz][0]][0]][0];
00333 _sse_float_prefetch_spinor(sp);
00334 up++;
00335 }
00336 _sse_double_load(*sm);
00337 _sse_double_su3_inverse_multiply(*um);
00338 _sse_double_load(r);
00339 if(sign3) _sse_double_vector_sub();
00340 else _sse_double_vector_add();
00341 _sse_double_store(*sn);
00342
00343 }
00344
00345 #else
00346 // ---- single precision variant: same two passes using the _sse_float_* macros ----
00347 static _sse_float fact1 ALIGN16;
00348 static _sse_vector r ALIGN16;
00349 _sse_su3 *up, *um;
00350 _sse_su3_vector *s,*sp,*sm,*sn;
00351 _sse_su3_vector dump;
00352 long ix,iy,iz;
00353 int *peta;
00354 int sign0, sign1, sign2, sign3;
00355 // peta above is declared but never used in this function.
00356 _sse_check_alignment(&fact1, 0xf);
00357 _sse_check_alignment(&r, 0xf);
00358 // All four lanes of fact1 carry 2*mass.
00359 fact1.c1=1.0f*two_mass;
00360 fact1.c2=1.0f*two_mass;
00361 fact1.c3=1.0f*two_mass;
00362 fact1.c4=1.0f*two_mass;
00363 // dump is zeroed and passed as the second operand of every pair load/store below.
00364 dump.c1.real()=0.0f;
00365 dump.c1.imag()=0.0f;
00366 dump.c2.real()=0.0f;
00367 dump.c2.imag()=0.0f;
00368 dump.c3.real()=0.0f;
00369 dump.c3.imag()=0.0f;
00370 // NOTE(review): dump is presumably a dummy partner because the float macros handle two vectors at once — confirm.
00371 sp=psi+iup[start][0];
00372 up=U+4*start;
00373
00374 // First pass: ordinary links U, nearest neighbours, plus the mass term.
00375
00376 for(ix=start; ix<stop; ix++) {
00377
00378 s=psi+ix;
00379
00380 x.idx=ix;
00381 // signN is used below only as a true/false flag choosing add vs. sub.
00382 if(sign>0) {
00383 sign0=(int) (psi_in.eta(x,0)+1);
00384 sign1=(int) (psi_in.eta(x,1)+1);
00385 sign2=(int) (psi_in.eta(x,2)+1);
00386 sign3=(int) (psi_in.eta(x,3)+1);
00387 } else {
00388 sign0=(int) (1.0-psi_in.eta(x,0));
00389 sign1=(int) (1.0-psi_in.eta(x,1));
00390 sign2=(int) (1.0-psi_in.eta(x,2));
00391 sign3=(int) (1.0-psi_in.eta(x,3));
00392 }
00393
00394 // Direction 0: forward hop combined with fact1*psi[ix], then the backward hop
00395 // with the opposite operation.
00396 iy=idw[ix][0];
00397 sm=psi+iy;
00398 _sse_float_prefetch_spinor(sm);
00399 um=(_sse_su3*) U+iy*4;
00400 _sse_float_prefetch_su3(um);
00401
00402 _sse_float_pair_load(*sp,dump);
00403 _sse_float_su3_multiply(*up);
00404 _sse_float_pair_load(*s,dump);
00405 _sse_float_vector_mul(fact1);
00406 if(sign0) _sse_float_vector_add();
00407 else _sse_float_vector_sub();
00408 _sse_float_vector_store(r);
00409
00410 up++;
00411 sp=psi+iup[ix][1];
00412 _sse_float_prefetch_spinor(sp);
00413
00414 _sse_float_pair_load(*sm,dump);
00415 _sse_float_su3_inverse_multiply(*um);
00416 _sse_float_vector_load(r);
00417 if(sign0) _sse_float_vector_sub();
00418 else _sse_float_vector_add();
00419 _sse_float_vector_store(r);
00420
00421
00422 // Direction 1.
00423
00424 iy=idw[ix][1];
00425 sm=psi+iy;
00426 _sse_float_prefetch_spinor(sm);
00427 um=U+iy*4+1;
00428 _sse_float_prefetch_su3(um);
00429
00430 _sse_float_pair_load(*sp,dump);
00431 _sse_float_su3_multiply(*up);
00432 _sse_float_vector_load(r);
00433 if(sign1) _sse_float_vector_add();
00434 else _sse_float_vector_sub();
00435 _sse_float_vector_store(r);
00436
00437 up++;
00438 sp=psi+iup[ix][2];
00439 _sse_float_prefetch_spinor(sp);
00440
00441 _sse_float_pair_load(*sm,dump);
00442 _sse_float_su3_inverse_multiply(*um);
00443 _sse_float_vector_load(r);
00444 if(sign1) _sse_float_vector_sub();
00445 else _sse_float_vector_add();
00446 _sse_float_vector_store(r);
00447
00448 // Direction 2.
00449
00450 iy=idw[ix][2];
00451 sm=psi+iy;
00452 _sse_float_prefetch_spinor(sm);
00453 um=U+iy*4+2;
00454 _sse_float_prefetch_su3(um);
00455
00456 _sse_float_pair_load(*sp,dump);
00457 _sse_float_su3_multiply(*up);
00458 _sse_float_vector_load(r);
00459 if(sign2) _sse_float_vector_add();
00460 else _sse_float_vector_sub();
00461 _sse_float_vector_store(r);
00462
00463 up++;
00464 sp=psi+iup[ix][3];
00465 _sse_float_prefetch_spinor(sp);
00466
00467 _sse_float_pair_load(*sm,dump);
00468 _sse_float_su3_inverse_multiply(*um);
00469 _sse_float_vector_load(r);
00470 if(sign2) _sse_float_vector_sub();
00471 else _sse_float_vector_add();
00472 _sse_float_vector_store(r);
00473
00474 // Direction 3: the final result for this site goes to chi[ix].
00475
00476 sn=chi+ix;
00477 _sse_float_prefetch_spinor(sn);
00478
00479 iy=idw[ix][3];
00480 sm=psi+iy;
00481 _sse_float_prefetch_spinor(sm);
00482 um=U+iy*4+3;
00483 _sse_float_prefetch_su3(um);
00484
00485
00486 _sse_float_pair_load(*sp,dump);
00487 _sse_float_su3_multiply(*up);
00488 _sse_float_vector_load(r);
00489 if(sign3) _sse_float_vector_add();
00490 else _sse_float_vector_sub();
00491 _sse_float_vector_store(r);
00492 // Prepare sp/up for site ix+1; skipped on the last site so iup[stop] is never read.
00493 iz=ix+1;
00494 if(iz<stop) {
00495 sp=psi+iup[iz][0];
00496 _sse_float_prefetch_spinor(sp);
00497 up++;
00498 }
00499
00500 _sse_float_pair_load(*sm,dump);
00501 _sse_float_su3_inverse_multiply(*um);
00502 _sse_float_vector_load(r);
00503 if(sign3) _sse_float_vector_sub();
00504 else _sse_float_vector_add();
00505 _sse_float_pair_store(*sn,dump);
00506
00507 }
00508 // No long links stored: the result of the first pass is final.
00509 if(LL==0) return;
00510
00511 // Second pass: long links LL and third neighbours, accumulated on top of chi.
00512
00513 sp=psi+iup[iup[iup[start][0]][0]][0];
00514 up=LL+4*start;
00515
00516 for(ix=start; ix<stop; ix++) {
00517
00518 sn=chi+ix;
00519 _sse_float_prefetch_spinor(sn);
00520
00521 x.idx=ix;
00522 if(sign>0) {
00523 sign0=(int) (psi_in.eta(x,0)+1);
00524 sign1=(int) (psi_in.eta(x,1)+1);
00525 sign2=(int) (psi_in.eta(x,2)+1);
00526 sign3=(int) (psi_in.eta(x,3)+1);
00527 } else {
00528 sign0=(int) (1.0-psi_in.eta(x,0));
00529 sign1=(int) (1.0-psi_in.eta(x,1));
00530 sign2=(int) (1.0-psi_in.eta(x,2));
00531 sign3=(int) (1.0-psi_in.eta(x,3));
00532 }
00533
00534 // Direction 0: starts from chi[ix] (loaded via *sn) rather than from the mass term.
00535
00536 iy=idw[idw[idw[ix][0]][0]][0];
00537
00538 sm=psi+iy;
00539 _sse_float_prefetch_spinor(sm);
00540 um=(_sse_su3*) LL+iy*4;
00541 _sse_float_prefetch_su3(um);
00542
00543 _sse_float_pair_load(*sp,dump);
00544 _sse_float_su3_multiply(*up);
00545 _sse_float_pair_load(*sn,dump);
00546 if(sign0) _sse_float_vector_add();
00547 else _sse_float_vector_sub();
00548 _sse_float_vector_store(r);
00549
00550 up++;
00551 sp=psi+iup[iup[iup[ix][1]][1]][1];
00552 _sse_float_prefetch_spinor(sp);
00553
00554 _sse_float_pair_load(*sm,dump);
00555 _sse_float_su3_inverse_multiply(*um);
00556 _sse_float_vector_load(r);
00557 if(sign0) _sse_float_vector_sub();
00558 else _sse_float_vector_add();
00559 _sse_float_vector_store(r);
00560
00561
00562 // Direction 1.
00563
00564 iy=idw[idw[idw[ix][1]][1]][1];
00565
00566 sm=psi+iy;
00567 _sse_float_prefetch_spinor(sm);
00568 um=LL+iy*4+1;
00569 _sse_float_prefetch_su3(um);
00570
00571 _sse_float_pair_load(*sp,dump);
00572 _sse_float_su3_multiply(*up);
00573 _sse_float_vector_load(r);
00574 if(sign1) _sse_float_vector_add();
00575 else _sse_float_vector_sub();
00576 _sse_float_vector_store(r);
00577
00578 up++;
00579 sp=psi+iup[iup[iup[ix][2]][2]][2];
00580 _sse_float_prefetch_spinor(sp);
00581
00582 _sse_float_pair_load(*sm,dump);
00583 _sse_float_su3_inverse_multiply(*um);
00584 _sse_float_vector_load(r);
00585 if(sign1) _sse_float_vector_sub();
00586 else _sse_float_vector_add();
00587 _sse_float_vector_store(r);
00588
00589 // Direction 2.
00590
00591 iy=idw[idw[idw[ix][2]][2]][2];
00592 sm=psi+iy;
00593 _sse_float_prefetch_spinor(sm);
00594 um=LL+iy*4+2;
00595 _sse_float_prefetch_su3(um);
00596
00597 _sse_float_pair_load(*sp,dump);
00598 _sse_float_su3_multiply(*up);
00599 _sse_float_vector_load(r);
00600 if(sign2) _sse_float_vector_add();
00601 else _sse_float_vector_sub();
00602 _sse_float_vector_store(r);
00603
00604 up++;
00605 sp=psi+iup[iup[iup[ix][3]][3]][3];
00606 _sse_float_prefetch_spinor(sp);
00607
00608 _sse_float_pair_load(*sm,dump);
00609 _sse_float_su3_inverse_multiply(*um);
00610 _sse_float_vector_load(r);
00611 if(sign2) _sse_float_vector_sub();
00612 else _sse_float_vector_add();
00613 _sse_float_vector_store(r);
00614
00615 // Direction 3: the final value is written back to chi[ix].
00616
00617 _sse_float_prefetch_spinor(sn);
00618
00619 iy=idw[idw[idw[ix][3]][3]][3];
00620
00621 sm=psi+iy;
00622 _sse_float_prefetch_spinor(sm);
00623 um=LL+iy*4+3;
00624 _sse_float_prefetch_su3(um);
00625
00626 _sse_float_pair_load(*sp,dump);
00627 _sse_float_su3_multiply(*up);
00628 _sse_float_vector_load(r);
00629 if(sign3) _sse_float_vector_add();
00630 else _sse_float_vector_sub();
00631 _sse_float_vector_store(r);
00632 // Prepare sp/up for site ix+1; skipped on the last site of the range.
00633 iz=ix+1;
00634 if(iz<stop) {
00635 sp=psi+iup[iup[iup[iz][0]][0]][0];
00636 _sse_float_prefetch_spinor(sp);
00637 up++;
00638 }
00639 _sse_float_pair_load(*sm,dump);
00640 _sse_float_su3_inverse_multiply(*um);
00641 _sse_float_vector_load(r);
00642 if(sign3) _sse_float_vector_sub();
00643 else _sse_float_vector_add();
00644 _sse_float_pair_store(*sn,dump);
00645
00646 }
00647 #endif
00648 }
00649 };
00650
00651 #endif // if defined(SSE2)
00652
00653