Main Page | Class Hierarchy | Class List | File List | Class Members | File Members

fermiqcd_staggered_actions_sse2.h

Go to the documentation of this file.
00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 #if defined(SSE2)
00014 
00029 class StaggeredAsqtadActionSSE2 {
        // SSE2 code path for the staggered "Asqtad" action (going by the class
        // name and the "Naik term" banners below). The class exposes a single
        // static method, mul_Q.
        // NOTE(review): every _sse_* macro used here is defined outside this
        // file; comments below describe them by name only - confirm against
        // their definitions before relying on the details.
00030  public:
        // mul_Q: fills chi_out from psi_in for the site range
        // [start_index(ME,parity), stop_index(ME,parity)) of U_in.lattice().
        //
        //   chi_out  destination field, written through physical_address()
        //   psi_in   source field, read through physical_address(); also
        //            provides the per-site, per-direction value eta(x,mu)
        //   U_in     gauge field; supplies the links, the up/dw neighbour
        //            tables and (optionally) long_links
        //   coeff    must contain "mass" (error() is called otherwise);
        //            "sign" is optional and defaults to 1
        //   parity   forwarded to start_index/stop_index, default EVENODD
        //
        // The work is done in two passes over the same site range:
        //   pass 1  combines fact1 (= 2*mass) times psi at the site with the
        //           nearest-neighbour terms built from U, and stores the
        //           result in chi;
        //   pass 2  (only when the long_links address is non-zero) reloads
        //           chi and accumulates the third-neighbour terms built from
        //           the long links.
        // The body is selected at compile time by USE_DOUBLE_PRECISION; the
        // two variants follow the same structure.
00031   static void mul_Q(staggered_field &chi_out,
00032                     staggered_field &psi_in,
00033                     gauge_field &U_in,
00034                     coefficients &coeff, 
00035                     int parity=EVENODD) {
00036     
          // Raw views of the field storage, reinterpreted as SSE vector types.
00037     _sse_su3_vector* chi=(_sse_su3_vector*) chi_out.physical_address();
00038     _sse_su3_vector* psi=(_sse_su3_vector*) psi_in.physical_address();
00039     
          // U and LL are indexed below as [4*site + mu], i.e. four matrices
          // per site. LL may be 0, in which case pass 2 is skipped.
00040     _sse_su3*  U=(_sse_su3*) U_in.physical_address();
00041     _sse_su3*  LL=(_sse_su3*) U_in.long_links.physical_address();
          // iup[i][mu] / idw[i][mu]: site indices used as the neighbour of
          // site i in direction mu (presumably forward / backward - confirm
          // against the lattice class).
00042     long** iup=U_in.lattice().up;
00043     long** idw=U_in.lattice().dw;
00044     
00045     long   start=U_in.lattice().start_index(ME,parity); 
00046     long   stop =U_in.lattice().stop_index(ME,parity);  
00047     
00048     mdp_real two_mass;
00049     int sign;
          // NOTE(review): two_mass stays uninitialised if error() returns;
          // presumably error() aborts - confirm.
00050     if(coeff.has_key("mass")) two_mass=2.0*coeff["mass"];
00051     else error("coefficient mass undefined");
00052     if(coeff.has_key("sign")) sign=(int) coeff["sign"];
00053     else sign=1;
00054 
          // Site object used only to query psi_in.eta(x,mu); x.idx is set to
          // the current site index inside each loop.
00055     site x(psi_in.lattice());
00056     
00057 #if defined(USE_DOUBLE_PRECISION) 
00058 
        // ---------------- double-precision variant ----------------
        // fact1 and r are static scratch objects shared by all calls.
        // NOTE(review): static scratch makes the function non-reentrant -
        // confirm it is never called concurrently.
00059   static _sse_double fact1 ALIGN16;
00060   static _sse_su3_vector r ALIGN16;
00061   _sse_su3 *up, *um;   
00062   _sse_su3_vector *s,*sp,*sm,*sn;
00063   long ix,iy,iz;
00064   int sign0, sign1, sign2, sign3;
00065 
        // Both lanes of fact1 hold 2*mass.
00066   fact1.c1=1.0f*two_mass;
00067   fact1.c2=1.0f*two_mass;
00068 
        // Prime the pipelined pointers for the first site: sp -> psi at the
        // up-neighbour in direction 0, up -> first of the site's four links.
        // Inside the loop both are advanced one step ahead of their use.
00069   sp=psi+iup[start][0];
00070   up=U+4*start;
00071 
00072   /************************ loop over all lattice sites ***************/
00073   
00074   for(ix=start; ix<stop; ix++) {
00075 
00076     s=psi+ix;
00077     
00078     x.idx=ix;
          // signN is used purely as a flag: non-zero means "add the up term,
          // subtract the dw term" for direction N, zero means the opposite.
          // A negative coeff "sign" flips the flag.
          // (assumes eta returns +1 or -1, giving 2 or 0 - TODO confirm)
00079     if(sign>0) {
00080       sign0=(int) (psi_in.eta(x,0)+1);
00081       sign1=(int) (psi_in.eta(x,1)+1);
00082       sign2=(int) (psi_in.eta(x,2)+1);
00083       sign3=(int) (psi_in.eta(x,3)+1);
00084     } else {
00085       sign0=(int) (1.0-psi_in.eta(x,0));
00086       sign1=(int) (1.0-psi_in.eta(x,1));
00087       sign2=(int) (1.0-psi_in.eta(x,2));
00088       sign3=(int) (1.0-psi_in.eta(x,3));
00089     }
00090 
00091     /**************** mu =0 *****************/
00092 
          // dw-neighbour data for this direction: psi there (sm) and the link
          // stored at that neighbour (um). Both are prefetched before use.
          // NOTE(review): the _sse_float_prefetch_* macros are used in the
          // double branch too; presumably prefetching does not depend on
          // precision - confirm.
00093     iy=idw[ix][0];
00094     sm=psi+iy;
00095     _sse_float_prefetch_spinor(sm);
00096     um=(_sse_su3*) U+iy*4;
00097     _sse_float_prefetch_su3(um);
00098 
          // Up term: (*up) applied to *sp, combined with fact1 times *s (the
          // mass term, included only in this first step); partial sum -> r.
00099     _sse_double_load(*sp);
00100     _sse_double_su3_multiply(*up);
00101     _sse_double_load(*s);
00102     _sse_double_vector_mul(fact1);
00103     if(sign0) _sse_double_vector_add();
00104     else       _sse_double_vector_sub();
00105     _sse_double_store(r);
00106     
          // Advance to the next link and fetch the next direction's
          // up-neighbour early so the prefetch overlaps the dw term below.
00107     up++;
00108     sp=psi+iup[ix][1];
00109     _sse_float_prefetch_spinor(sp);
00110     
          // Dw term: "inverse multiply" of *um on *sm, combined with r using
          // the opposite operation to the up term.
00111     _sse_double_load(*sm);
00112     _sse_double_su3_inverse_multiply(*um);
00113     _sse_double_load(r);
00114     if(sign0) _sse_double_vector_sub();
00115     else       _sse_double_vector_add();
00116     _sse_double_store(r);
00117 
00118 
00119     /**************** mu =1 *****************/
00120 
          // Same pattern as mu=0 without the mass term: r is reloaded and
          // updated with the up and dw terms of this direction.
00121     iy=idw[ix][1];
00122     sm=psi+iy;
00123     _sse_float_prefetch_spinor(sm);
00124     um=U+iy*4+1;
00125     _sse_float_prefetch_su3(um);
00126 
00127     _sse_double_load(*sp);
00128     _sse_double_su3_multiply(*up);
00129     _sse_double_load(r);
00130     if(sign1) _sse_double_vector_add();
00131     else       _sse_double_vector_sub();
00132     _sse_double_store(r);
00133 
00134     up++;
00135     sp=psi+iup[ix][2];
00136     _sse_float_prefetch_spinor(sp);
00137 
00138     _sse_double_load(*sm);
00139     _sse_double_su3_inverse_multiply(*um);
00140     _sse_double_load(r);
00141     if(sign1) _sse_double_vector_sub();
00142     else       _sse_double_vector_add();
00143     _sse_double_store(r);
00144 
00145     /**************** mu = 2 *****************/
00146 
00147     iy=idw[ix][2];
00148     sm=psi+iy;
00149     _sse_float_prefetch_spinor(sm);
00150     um=U+iy*4+2;
00151     _sse_float_prefetch_su3(um);
00152 
00153     _sse_double_load(*sp);
00154     _sse_double_su3_multiply(*up);
00155     _sse_double_load(r);
00156     if(sign2) _sse_double_vector_add();
00157     else       _sse_double_vector_sub();
00158     _sse_double_store(r);
00159 
00160     up++;
00161     sp=psi+iup[ix][3];
00162     _sse_float_prefetch_spinor(sp);
00163 
00164     _sse_double_load(*sm);
00165     _sse_double_su3_inverse_multiply(*um);
00166     _sse_double_load(r);
00167     if(sign2) _sse_double_vector_sub();
00168     else       _sse_double_vector_add();
00169     _sse_double_store(r);
00170  
00171     /**************** mu =3 *****************/
00172 
          // sn is the output slot for this site; prefetch it before the final
          // store at the end of this direction.
00173     sn=chi+ix;
00174     _sse_float_prefetch_spinor(sn);
00175 
00176     iy=idw[ix][3];
00177     sm=psi+iy;
00178     _sse_float_prefetch_spinor(sm);
00179     um=U+iy*4+3;
00180     _sse_float_prefetch_su3(um);
00181 
00182     _sse_double_load(*sp);
00183     _sse_double_su3_multiply(*up);
00184     _sse_double_load(r);
00185     if(sign3) _sse_double_vector_add();
00186     else       _sse_double_vector_sub();
00187     _sse_double_store(r);
00188 
          // Prime sp/up for direction 0 of the next site. On the last site
          // they are left alone, so iup[stop] is never read.
00189     iz=ix+1;
00190     if(iz<stop) {
00191       sp=psi+iup[iz][0];
00192       _sse_float_prefetch_spinor(sp);
00193       up++;
00194     }
          // Last dw term; the finished sum goes to chi instead of r.
00195     _sse_double_load(*sm);
00196     _sse_double_su3_inverse_multiply(*um);
00197     _sse_double_load(r);
00198     if(sign3) _sse_double_vector_sub();
00199     else       _sse_double_vector_add();
00200     _sse_double_store(*sn);
00201 
00202   }
00203   
        // No long links attached to the gauge field: pass 1 is the full result.
00204   if(LL==0) return;
00205 
00206   /************************ loop over all lattice sites for Naik term ***************/
00207 
        // Pass 2 mirrors pass 1 with two differences: neighbours are reached
        // by applying iup/idw three times in the same direction, and the
        // matrices come from LL instead of U.
00208   sp=psi+iup[iup[iup[start][0]][0]][0];
00209   up=LL+4*start;
00210   
00211   for(ix=start; ix<stop; ix++) {
00212     
00213     sn=chi+ix;
00214     _sse_float_prefetch_spinor(sn);
00215     
00216    x.idx=ix;
00217     if(sign>0) {
00218       sign0=(int) (psi_in.eta(x,0)+1);
00219       sign1=(int) (psi_in.eta(x,1)+1);
00220       sign2=(int) (psi_in.eta(x,2)+1);
00221       sign3=(int) (psi_in.eta(x,3)+1);
00222     } else {
00223       sign0=(int) (1.0-psi_in.eta(x,0));
00224       sign1=(int) (1.0-psi_in.eta(x,1));
00225       sign2=(int) (1.0-psi_in.eta(x,2));
00226       sign3=(int) (1.0-psi_in.eta(x,3));
00227     }
00228 
00229     /**************** mu =0 *****************/
00230 
00231     iy=idw[idw[idw[ix][0]][0]][0];
00232     
00233     sm=psi+iy;
00234     _sse_float_prefetch_spinor(sm);
00235     um=(_sse_su3*) LL+iy*4;
00236     _sse_float_prefetch_su3(um);
00237     
          // Start from the value pass 1 stored in chi (*sn) rather than from
          // the mass term, then accumulate in r as before.
00238     _sse_double_load(*sp);
00239     _sse_double_su3_multiply(*up);
00240     _sse_double_load(*sn);
00241     if(sign0) _sse_double_vector_add();
00242     else      _sse_double_vector_sub();
00243     _sse_double_store(r);
00244     
00245     up++;
00246       sp=psi+iup[iup[iup[ix][1]][1]][1];  
00247     _sse_float_prefetch_spinor(sp);
00248     
00249     _sse_double_load(*sm);
00250     _sse_double_su3_inverse_multiply(*um);
00251     _sse_double_load(r);
00252     if(sign0) _sse_double_vector_sub();
00253     else       _sse_double_vector_add();
00254     _sse_double_store(r);
00255 
00256 
00257     /**************** mu =1 *****************/
00258 
00259     iy=idw[idw[idw[ix][1]][1]][1];
00260     
00261     sm=psi+iy;
00262     _sse_float_prefetch_spinor(sm);
00263     um=LL+iy*4+1;
00264     _sse_float_prefetch_su3(um);
00265 
00266     _sse_double_load(*sp);
00267     _sse_double_su3_multiply(*up);
00268     _sse_double_load(r);
00269     if(sign1) _sse_double_vector_add();
00270     else       _sse_double_vector_sub();
00271     _sse_double_store(r);
00272 
00273     up++;
00274     sp=psi+iup[iup[iup[ix][2]][2]][2];
00275     
00276     _sse_float_prefetch_spinor(sp);
00277 
00278     _sse_double_load(*sm);
00279     _sse_double_su3_inverse_multiply(*um);
00280     _sse_double_load(r);
00281     if(sign1) _sse_double_vector_sub();
00282     else       _sse_double_vector_add();
00283     _sse_double_store(r);
00284 
00285     /**************** mu = 2 *****************/
00286 
00287     iy=idw[idw[idw[ix][2]][2]][2];
00288    
00289     sm=psi+iy;
00290     _sse_float_prefetch_spinor(sm);
00291     um=LL+iy*4+2;
00292     _sse_float_prefetch_su3(um);
00293 
00294     _sse_double_load(*sp);
00295     _sse_double_su3_multiply(*up);
00296     _sse_double_load(r);
00297     if(sign2) _sse_double_vector_add();
00298     else       _sse_double_vector_sub();
00299     _sse_double_store(r);
00300 
00301     up++;
00302     sp=psi+iup[iup[iup[ix][3]][3]][3];   
00303     _sse_float_prefetch_spinor(sp);
00304 
00305     _sse_double_load(*sm);
00306     _sse_double_su3_inverse_multiply(*um);
00307     _sse_double_load(r);
00308     if(sign2) _sse_double_vector_sub();
00309     else       _sse_double_vector_add();
00310     _sse_double_store(r);
00311  
00312     /**************** mu =3 *****************/
00313 
00314     _sse_float_prefetch_spinor(sn);
00315 
00316     iy=idw[idw[idw[ix][3]][3]][3];
00317   
00318     sm=psi+iy;
00319     _sse_float_prefetch_spinor(sm);
00320     um=LL+iy*4+3;
00321     _sse_float_prefetch_su3(um);
00322 
00323     _sse_double_load(*sp);
00324     _sse_double_su3_multiply(*up);
00325     _sse_double_load(r);
00326     if(sign3) _sse_double_vector_add();
00327     else       _sse_double_vector_sub();
00328     _sse_double_store(r);
00329 
          // Prime sp/up for the next site (skipped on the last one), then
          // write the final sum back to chi.
00330     iz=ix+1;
00331     if(iz<stop) {
00332       sp=psi+iup[iup[iup[iz][0]][0]][0];
00333       _sse_float_prefetch_spinor(sp);
00334       up++;
00335     }
00336     _sse_double_load(*sm);
00337     _sse_double_su3_inverse_multiply(*um);
00338     _sse_double_load(r);
00339     if(sign3) _sse_double_vector_sub();
00340     else       _sse_double_vector_add();
00341     _sse_double_store(*sn);
00342 
00343   }
00344 
00345 #else
00346 
        // ---------------- single-precision variant ----------------
        // Same two-pass structure as the double-precision variant, using the
        // _sse_float_* macros and the pair_load / pair_store forms.
        // NOTE(review): static scratch (fact1, r) makes the function
        // non-reentrant - confirm it is never called concurrently.
00347   static _sse_float fact1 ALIGN16;
00348   static _sse_vector r   ALIGN16;
00349   _sse_su3 *up, *um;   
00350   _sse_su3_vector *s,*sp,*sm,*sn;
        // dump is zeroed below and passed as the second operand of every
        // pair_load / pair_store (presumably the macros handle two vectors at
        // once and dump fills the unused slot - TODO confirm).
00351   _sse_su3_vector dump;
00352   long ix,iy,iz;
        // peta is declared but never referenced in this function.
00353   int *peta;
00354   int sign0, sign1, sign2, sign3;
00355 
00356   _sse_check_alignment(&fact1, 0xf);
00357   _sse_check_alignment(&r, 0xf);
00358 
        // All four lanes of fact1 hold 2*mass.
00359   fact1.c1=1.0f*two_mass;
00360   fact1.c2=1.0f*two_mass;
00361   fact1.c3=1.0f*two_mass;
00362   fact1.c4=1.0f*two_mass;
00363 
        // NOTE(review): assigning through real()/imag() requires accessors
        // that return a modifiable reference; std::complex in current
        // standard libraries does not provide that - confirm the type used.
00364   dump.c1.real()=0.0f;
00365   dump.c1.imag()=0.0f;
00366   dump.c2.real()=0.0f;
00367   dump.c2.imag()=0.0f;
00368   dump.c3.real()=0.0f;
00369   dump.c3.imag()=0.0f;
00370 
        // Prime the pipelined pointers for the first site (see pass 1 of the
        // double-precision variant for the scheme).
00371   sp=psi+iup[start][0];
00372   up=U+4*start;
00373 
00374   /************************ loop over all lattice sites ***************/
00375   
00376   for(ix=start; ix<stop; ix++) {
00377     
00378     s=psi+ix;
00379 
00380     x.idx=ix;
00381              
          // signN non-zero: add the up term / subtract the dw term of
          // direction N; zero: the opposite. A negative coeff "sign" flips it.
00382     if(sign>0) {
00383       sign0=(int) (psi_in.eta(x,0)+1);
00384       sign1=(int) (psi_in.eta(x,1)+1);
00385       sign2=(int) (psi_in.eta(x,2)+1);
00386       sign3=(int) (psi_in.eta(x,3)+1);
00387     } else {
00388       sign0=(int) (1.0-psi_in.eta(x,0));
00389       sign1=(int) (1.0-psi_in.eta(x,1));
00390       sign2=(int) (1.0-psi_in.eta(x,2));
00391       sign3=(int) (1.0-psi_in.eta(x,3));
00392     }
00393 
00394     /**************** mu =0 *****************/
00395 
00396     iy=idw[ix][0];
00397     sm=psi+iy;
00398     _sse_float_prefetch_spinor(sm);
00399     um=(_sse_su3*) U+iy*4;
00400     _sse_float_prefetch_su3(um);
00401     
          // Up term plus the mass term (fact1 times *s); partial sum -> r.
00402     _sse_float_pair_load(*sp,dump);
00403     _sse_float_su3_multiply(*up);
00404     _sse_float_pair_load(*s,dump);
00405     _sse_float_vector_mul(fact1);
00406     if(sign0) _sse_float_vector_add();
00407     else       _sse_float_vector_sub();
00408     _sse_float_vector_store(r);
00409     
00410     up++;
00411     sp=psi+iup[ix][1];
00412     _sse_float_prefetch_spinor(sp);
00413     
          // Dw term, combined with r using the opposite operation.
00414     _sse_float_pair_load(*sm,dump);
00415     _sse_float_su3_inverse_multiply(*um);
00416     _sse_float_vector_load(r);
00417     if(sign0) _sse_float_vector_sub();
00418     else       _sse_float_vector_add();
00419     _sse_float_vector_store(r);
00420 
00421 
00422     /**************** mu =1 *****************/
00423 
00424     iy=idw[ix][1];
00425     sm=psi+iy;
00426     _sse_float_prefetch_spinor(sm);
00427     um=U+iy*4+1;
00428     _sse_float_prefetch_su3(um);
00429 
00430     _sse_float_pair_load(*sp,dump);
00431     _sse_float_su3_multiply(*up);
00432     _sse_float_vector_load(r);
00433     if(sign1) _sse_float_vector_add();
00434     else       _sse_float_vector_sub();
00435     _sse_float_vector_store(r);
00436 
00437     up++;
00438     sp=psi+iup[ix][2];
00439     _sse_float_prefetch_spinor(sp);
00440 
00441     _sse_float_pair_load(*sm,dump);
00442     _sse_float_su3_inverse_multiply(*um);
00443     _sse_float_vector_load(r);
00444     if(sign1) _sse_float_vector_sub();
00445     else       _sse_float_vector_add();
00446     _sse_float_vector_store(r);
00447 
00448     /**************** mu = 2 *****************/
00449 
00450     iy=idw[ix][2];
00451     sm=psi+iy;
00452     _sse_float_prefetch_spinor(sm);
00453     um=U+iy*4+2;
00454     _sse_float_prefetch_su3(um);
00455 
00456     _sse_float_pair_load(*sp,dump);
00457     _sse_float_su3_multiply(*up);
00458     _sse_float_vector_load(r);
00459     if(sign2) _sse_float_vector_add();
00460     else       _sse_float_vector_sub();
00461     _sse_float_vector_store(r);
00462 
00463     up++;
00464     sp=psi+iup[ix][3];
00465     _sse_float_prefetch_spinor(sp);
00466 
00467     _sse_float_pair_load(*sm,dump);
00468     _sse_float_su3_inverse_multiply(*um);
00469     _sse_float_vector_load(r);
00470     if(sign2) _sse_float_vector_sub();
00471     else       _sse_float_vector_add();
00472     _sse_float_vector_store(r);
00473  
00474     /**************** mu =3 *****************/
00475 
00476     sn=chi+ix;
00477     _sse_float_prefetch_spinor(sn);
00478 
00479     iy=idw[ix][3];
00480     sm=psi+iy;
00481     _sse_float_prefetch_spinor(sm);
00482     um=U+iy*4+3;
00483     _sse_float_prefetch_su3(um);
00484 
00485 
00486     _sse_float_pair_load(*sp,dump);
00487     _sse_float_su3_multiply(*up);
00488     _sse_float_vector_load(r);
00489     if(sign3) _sse_float_vector_add();
00490     else       _sse_float_vector_sub();
00491     _sse_float_vector_store(r);
00492 
          // Prime sp/up for the next site unless this is the last one.
00493     iz=ix+1;
00494     if(iz<stop) {
00495       sp=psi+iup[iz][0];
00496       _sse_float_prefetch_spinor(sp);
00497       up++;
00498     }
00499 
          // Last dw term; the finished sum is written to chi through
          // pair_store, with dump as the second operand.
00500     _sse_float_pair_load(*sm,dump);
00501     _sse_float_su3_inverse_multiply(*um);
00502     _sse_float_vector_load(r);
00503     if(sign3) _sse_float_vector_sub();
00504     else       _sse_float_vector_add();
00505     _sse_float_pair_store(*sn,dump);
00506 
00507   }
00508   
        // No long links attached to the gauge field: pass 1 is the full result.
00509   if(LL==0) return;
00510 
00511   /************************ loop over all lattice sites for Naik term ***************/
00512 
        // Pass 2: same scheme with neighbours three applications of iup/idw
        // away and matrices taken from LL.
00513   sp=psi+iup[iup[iup[start][0]][0]][0];
00514   up=LL+4*start;
00515   
00516   for(ix=start; ix<stop; ix++) {
00517     
00518     sn=chi+ix;
00519     _sse_float_prefetch_spinor(sn);
00520     
00521     x.idx=ix;
00522     if(sign>0) {
00523       sign0=(int) (psi_in.eta(x,0)+1);
00524       sign1=(int) (psi_in.eta(x,1)+1);
00525       sign2=(int) (psi_in.eta(x,2)+1);
00526       sign3=(int) (psi_in.eta(x,3)+1);
00527     } else {
00528       sign0=(int) (1.0-psi_in.eta(x,0));
00529       sign1=(int) (1.0-psi_in.eta(x,1));
00530       sign2=(int) (1.0-psi_in.eta(x,2));
00531       sign3=(int) (1.0-psi_in.eta(x,3));
00532     }   
00533 
00534     /**************** mu =0 *****************/
00535 
00536     iy=idw[idw[idw[ix][0]][0]][0];
00537 
00538     sm=psi+iy;
00539     _sse_float_prefetch_spinor(sm);
00540     um=(_sse_su3*) LL+iy*4;
00541     _sse_float_prefetch_su3(um);
00542     
          // Start from the value pass 1 stored in chi (*sn).
00543     _sse_float_pair_load(*sp,dump);
00544     _sse_float_su3_multiply(*up);
00545     _sse_float_pair_load(*sn,dump);
00546     if(sign0) _sse_float_vector_add();
00547     else       _sse_float_vector_sub();
00548     _sse_float_vector_store(r);
00549     
00550     up++; 
00551     sp=psi+iup[iup[iup[ix][1]][1]][1];
00552     _sse_float_prefetch_spinor(sp);
00553     
00554     _sse_float_pair_load(*sm,dump);
00555     _sse_float_su3_inverse_multiply(*um);
00556     _sse_float_vector_load(r);
00557     if(sign0) _sse_float_vector_sub();
00558     else       _sse_float_vector_add();
00559     _sse_float_vector_store(r);
00560 
00561 
00562     /**************** mu =1 *****************/
00563 
00564     iy=idw[idw[idw[ix][1]][1]][1];
00565 
00566     sm=psi+iy;
00567     _sse_float_prefetch_spinor(sm);
00568     um=LL+iy*4+1;
00569     _sse_float_prefetch_su3(um);
00570 
00571     _sse_float_pair_load(*sp,dump);
00572     _sse_float_su3_multiply(*up);
00573     _sse_float_vector_load(r);
00574     if(sign1) _sse_float_vector_add();
00575     else       _sse_float_vector_sub();
00576     _sse_float_vector_store(r);
00577 
00578     up++;
00579     sp=psi+iup[iup[iup[ix][2]][2]][2];
00580     _sse_float_prefetch_spinor(sp);
00581 
00582     _sse_float_pair_load(*sm,dump);
00583     _sse_float_su3_inverse_multiply(*um);
00584     _sse_float_vector_load(r);
00585     if(sign1) _sse_float_vector_sub();
00586     else       _sse_float_vector_add();
00587     _sse_float_vector_store(r);
00588 
00589     /**************** mu = 2 *****************/
00590 
00591     iy=idw[idw[idw[ix][2]][2]][2];
00592     sm=psi+iy;
00593     _sse_float_prefetch_spinor(sm);
00594     um=LL+iy*4+2;
00595     _sse_float_prefetch_su3(um);
00596 
00597     _sse_float_pair_load(*sp,dump);
00598     _sse_float_su3_multiply(*up);
00599     _sse_float_vector_load(r);
00600     if(sign2) _sse_float_vector_add();
00601     else       _sse_float_vector_sub();
00602     _sse_float_vector_store(r);
00603 
00604     up++;
00605     sp=psi+iup[iup[iup[ix][3]][3]][3];
00606     _sse_float_prefetch_spinor(sp);
00607 
00608     _sse_float_pair_load(*sm,dump);
00609     _sse_float_su3_inverse_multiply(*um);
00610     _sse_float_vector_load(r);
00611     if(sign2) _sse_float_vector_sub();
00612     else       _sse_float_vector_add();
00613     _sse_float_vector_store(r);
00614  
00615     /**************** mu =3 *****************/
00616 
00617     _sse_float_prefetch_spinor(sn);
00618 
00619     iy=idw[idw[idw[ix][3]][3]][3];
00620 
00621     sm=psi+iy;
00622     _sse_float_prefetch_spinor(sm);
00623     um=LL+iy*4+3;
00624     _sse_float_prefetch_su3(um);
00625 
00626     _sse_float_pair_load(*sp,dump);
00627     _sse_float_su3_multiply(*up);
00628     _sse_float_vector_load(r);
00629     if(sign3) _sse_float_vector_add();
00630     else       _sse_float_vector_sub();
00631     _sse_float_vector_store(r);
00632 
          // Prime sp/up for the next site (skipped on the last one), then
          // write the final sum back to chi.
00633     iz=ix+1;
00634     if(iz<stop) {
00635       sp=psi+iup[iup[iup[iz][0]][0]][0];
00636       _sse_float_prefetch_spinor(sp);
00637       up++;
00638     }
00639     _sse_float_pair_load(*sm,dump);
00640     _sse_float_su3_inverse_multiply(*um);
00641     _sse_float_vector_load(r);
00642     if(sign3) _sse_float_vector_sub();
00643     else       _sse_float_vector_add();
00644     _sse_float_pair_store(*sn,dump);
00645 
00646   }
00647 #endif
00648   }
00649 };
00650 
00651 #endif // if defined(SSE2)
00652 
00653 

Generated on Sun Feb 27 15:12:19 2005 by  doxygen 1.4.1