Main Page | Class Hierarchy | Class List | File List | Class Members | File Members

fermiqcd_fermi_actions_sse2.h

Go to the documentation of this file.
00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 #if defined(SSE2)
00012 
00034 class FermiCloverActionSSE2 {
00035  public:
00036   static void mul_Q(fermi_field &chi_out, 
00037                     fermi_field &psi_in, 
00038                     gauge_field &U_in, 
00039                     coefficients &coeff,
00040                     int parity=EVENODD) { 
00041     
00042     register int   ndim=psi_in.lattice().ndim;
00043     register int   nspin=psi_in.nspin;
00044     register int   nc=psi_in.nc;
00045     register mdp_real kappa_t=0;
00046     register mdp_real kappa_s=0;
00047     register mdp_real r_t;
00048     register mdp_real r_s;
00049     register mdp_real cSW;
00050     register mdp_real c_E;
00051     register mdp_real c_B;
00052     register int sign;
00053 
00054     if(coeff.has_key("kappa")) kappa_s=kappa_t=coeff["kappa"];
00055     if(coeff.has_key("kappa_t")) kappa_t=coeff["kappa_t"];
00056     if(coeff.has_key("kappa_s")) kappa_s=coeff["kappa_s"];
00057     if(kappa_t==0) error("FermiCloverActionSSE2\nkappa_t=0 or undeclared");
00058     if(kappa_s==0) error("FermiCloverActionSSE2\nkappa_s=0 or undeclared");
00059     if(coeff.has_key("r_t")) r_t=coeff["r_t"];       else r_t=1;
00060     if(coeff.has_key("r_s")) r_s=coeff["r_s"];       else r_s=1;
00061     if(coeff.has_key("c_{sw}")) cSW=coeff["c_{sw}"]; else cSW=0;
00062     if(coeff.has_key("c_E")) c_E=coeff["c_E"];       else c_E=1;
00063     if(coeff.has_key("c_B")) c_B=coeff["c_B"];       else c_B=1;
00064     if(coeff.has_key("sign")) sign=(int) coeff["sign"];   else sign=+1;
00065     if(parity!=EVENODD) error("FermiCloverActionSSE2\nparity must be EVENODD here");
00066 
00067 #if !defined(USE_DOUBLE_PRECISION)
00068 
00069     _sse_spinor *chi=(_sse_spinor*) chi_out.physical_address();
00070     _sse_check_alignment((void*) chi, 0xf);
00071     _sse_spinor *psi=(_sse_spinor*) psi_in.physical_address();
00072     _sse_check_alignment((void*) psi, 0xf);
00073     _sse_su3    *U=(_sse_su3*) U_in.physical_address();
00074     _sse_check_alignment((void*) U, 0xf);
00075     _sse_su3    *uem=0;
00076     if(cSW!=0.0) {
00077       uem=(_sse_su3*) U_in.em.physical_address();
00078       _sse_check_alignment((void*) uem, 0xf);
00079     }
00080     long   **iup=U_in.lattice().up;
00081     long   **idw=U_in.lattice().dw;
00082     long   start=U_in.lattice().start_index(ME,0); // even
00083     long   stop =U_in.lattice().stop_index(ME,1);  // odd
00084     
00085     _sse_float fact1 ALIGN16;
00086     _sse_float fact2 ALIGN16;
00087     _sse_float fact3 ALIGN16;
00088     _sse_float fact4 ALIGN16;
00089     _sse_float fact5 ALIGN16;
00090     _sse_float fact6 ALIGN16;
00091     _sse_vector r12_1  ALIGN16;
00092     _sse_vector r34_1  ALIGN16;
00093     _sse_vector r12_2  ALIGN16;
00094     _sse_vector r34_2  ALIGN16;
00095     _sse_vector r0  ALIGN16;
00096     long ix1,iy1,iy2,iz1;
00097     float rho;
00098     _sse_su3 *up1,*um1,*um2;   
00099     _sse_spinor *s1,*sp1,*sp2,*sm1,*sm2,*sn1;
00100     
00101     if(sign!=1) exit(1);
00102 
00103     if((stop-start)%2 !=0) 
00104       error("FermiCloverActionSSE2\nProblem with parallelization: odd # of sites on process!");
00105     
00106     if(r_t!=1.0) 
00107       error("FermiCloverActionSSE2\nr_t!=1 not compatible with SSE2\n");
00108 
00109     _sse_check_alignment((void*) &fact1, 0xf);
00110     _sse_check_alignment((void*) &fact2, 0xf);
00111     _sse_check_alignment((void*) &fact3, 0xf);
00112     _sse_check_alignment((void*) &fact4, 0xf);
00113     _sse_check_alignment((void*) &fact5, 0xf);
00114     _sse_check_alignment((void*) &fact6, 0xf);
00115     _sse_check_alignment((void*) &r12_1, 0xf);
00116     _sse_check_alignment((void*) &r34_1, 0xf);
00117     _sse_check_alignment((void*) &r12_2, 0xf);
00118     _sse_check_alignment((void*) &r34_2, 0xf);
00119     
00120     /* empty vector */
00121     
00122     r0.c1.c1=r0.c1.c2=r0.c1.c3=r0.c1.c4=0;
00123     r0.c2.c1=r0.c2.c2=r0.c2.c3=r0.c2.c4=0;
00124     r0.c3.c1=r0.c3.c2=r0.c3.c3=r0.c3.c4=0;
00125     
00126     rho=-1.0f/kappa_s;
00127     
00128     /* coefficient of (1 +- Gamma[i]) */
00129     
00130     fact1.c1=rho;
00131     fact1.c2=rho;
00132     fact1.c3=rho;
00133     fact1.c4=rho;
00134     
00135     fact2.c1=-1.0f*kappa_s;
00136     fact2.c2=fact2.c1;
00137     fact2.c3=fact2.c1;
00138     fact2.c4=fact2.c1;
00139     
00140     /* coefficient of rho/2*(1 +- Gamma[0]) */
00141     
00142     fact3.c1=(1.0f+r_t)*kappa_t/kappa_s;
00143     fact3.c2=fact3.c1;
00144     fact3.c3=fact3.c1;
00145     fact3.c4=fact3.c1;
00146     
00147     fact4.c1=-1.0f;
00148     fact4.c2=-1.0f;
00149     fact4.c3=-1.0f;
00150     fact4.c4=-1.0f;
00151     
00152     /* coefficient of Sigma[i][j]*U.em(x,i,j) */
00153     
00154     fact5.c1=1.0f*kappa_s*cSW*c_B;
00155     fact5.c2=fact5.c1;
00156     fact5.c3=fact5.c1;
00157     fact5.c4=fact5.c1;
00158 
00159     /* coefficient of fact5*Sigma[0][i]*U.em(x,0,i) */
00160     
00161     fact6.c1=1.0f*c_E/c_B;
00162     fact6.c2=fact6.c1;
00163     fact6.c3=fact6.c1;
00164     fact6.c4=fact6.c1;
00165     
00166     sp1=(_sse_spinor*) &psi[iup[start][0]];
00167     sp2=(_sse_spinor*) &psi[iup[start+1][0]];
00168     up1=(_sse_su3*) U+4*start;
00169     
00170     /************************ loop over all lattice sites ***************/
00171     
00172     
00173     for (ix1=start; ix1<stop; ix1+=2) {   
00174 
00175 
00176       s1=psi+ix1;
00177       _sse_float_prefetch_spinor(s1);
00178 
00179 
00180       /******************************* direction +0 ***********************/
00181       
00182       iy1=idw[ix1][0];
00183       iy2=idw[ix1+1][0];
00184       sm1=psi+iy1;
00185       sm2=psi+iy2;
00186       _sse_float_prefetch_spinor(sm1);
00187       _sse_float_prefetch_spinor(sm2);
00188       
00189       _sse_float_pair_load((*sp1).c3,(*sp1).c4);
00190       _sse_float_vector_mul(fact3);
00191 
00192       _sse_float_su3_multiply((*up1));
00193 
00194       _sse_float_pair_load((*s1).c1,(*s1).c2);
00195       _sse_float_vector_mul(fact1);      
00196       _sse_float_vector_store(r12_1);
00197       
00198       _sse_float_pair_load((*s1).c3,(*s1).c4);
00199       _sse_float_vector_mul(fact1);      
00200       _sse_float_vector_add();
00201       _sse_float_vector_store(r34_1);
00202       
00203       um1=U+iy1*4;
00204       _sse_float_prefetch_su3(um1);
00205       um2=U+iy2*4;
00206       _sse_float_prefetch_su3(um2);
00207       
00208       _sse_float_pair_load((*(sp2)).c3,(*(sp2)).c4);
00209       _sse_float_vector_mul(fact3);
00210       
00211       _sse_float_su3_multiply((*(up1+4)));
00212       
00213       _sse_float_pair_load((*(s1+1)).c1,(*(s1+1)).c2);
00214       _sse_float_vector_mul(fact1);      
00215       _sse_float_vector_store(r12_2);
00216     
00217     _sse_float_pair_load((*(s1+1)).c3,(*(s1+1)).c4);
00218     _sse_float_vector_mul(fact1);      
00219     _sse_float_vector_add();
00220     _sse_float_vector_store(r34_2);
00221 
00222     
00223     /******************************* direction -0 ***********************/
00224 
00225     sp1=psi+iup[ix1][1];
00226     sp2=psi+iup[ix1+1][1];
00227     _sse_float_prefetch_spinor(sp1);
00228     _sse_float_prefetch_spinor(sp2);
00229         
00230     _sse_float_pair_load((*sm1).c1,(*sm1).c2);
00231     _sse_float_vector_mul(fact3);      
00232     
00233     _sse_float_su3_inverse_multiply((*um1));
00234     
00235     _sse_float_vector_load(r12_1);
00236     _sse_float_vector_add();
00237     _sse_float_vector_store(r12_1);
00238     
00239     up1++;
00240     _sse_float_prefetch_su3(up1);      
00241 
00242     _sse_float_pair_load((*(sm2)).c1,(*(sm2)).c2);
00243     _sse_float_vector_mul(fact3);      
00244     
00245     _sse_float_su3_inverse_multiply((*(um2)));
00246     
00247     _sse_float_vector_load(r12_2);
00248     _sse_float_vector_add();
00249     _sse_float_vector_store(r12_2);
00250       
00251     /******************************* direction +1 ***********************/
00252     
00253     iy1=idw[ix1][1];
00254     iy2=idw[ix1+1][1];
00255     sm1=psi+iy1;
00256     sm2=psi+iy2;
00257     _sse_float_prefetch_spinor(sm1);
00258     _sse_float_prefetch_spinor(sm2);
00259     
00260     _sse_float_pair_load((*sp1).c1,(*sp1).c2);
00261     _sse_float_pair_load_up((*sp1).c4,(*sp1).c3);
00262     _sse_float_vector_sub();
00263     
00264     _sse_float_su3_multiply((*up1));
00265     
00266     _sse_float_vector_load(r12_1);
00267     _sse_float_vector_add();
00268     _sse_float_vector_store(r12_1);
00269     
00270     _sse_float_vector_load(r34_1);
00271     _sse_float_vector_xch();
00272     _sse_float_vector_sub();
00273     _sse_float_vector_store(r34_1);
00274     
00275     um1=U+iy1*4+1;
00276     _sse_float_prefetch_su3(um1);      
00277     um2=U+iy2*4+1;
00278     _sse_float_prefetch_su3(um2);      
00279     
00280     _sse_float_pair_load((*(sp2)).c1,(*(sp2)).c2);
00281     _sse_float_pair_load_up((*(sp2)).c4,(*(sp2)).c3);
00282     _sse_float_vector_sub();
00283     
00284     _sse_float_su3_multiply((*(up1+4)));
00285     
00286     _sse_float_vector_load(r12_2);
00287     _sse_float_vector_add();
00288     _sse_float_vector_store(r12_2);
00289     
00290     _sse_float_vector_load(r34_2);
00291     _sse_float_vector_xch();
00292     _sse_float_vector_sub();
00293     _sse_float_vector_store(r34_2);
00294     
00295     /******************************* direction -1 ***********************/
00296     
00297     sp1=psi+iup[ix1][2];
00298     sp2=psi+iup[ix1+1][2];
00299     _sse_float_prefetch_spinor(sp1);
00300     _sse_float_prefetch_spinor(sp2);
00301     
00302     _sse_float_pair_load((*sm1).c1,(*sm1).c2);
00303     _sse_float_pair_load_up((*sm1).c4,(*sm1).c3);
00304     _sse_float_vector_add();
00305     
00306     _sse_float_su3_inverse_multiply((*um1));
00307     
00308     _sse_float_vector_load(r12_1);
00309     _sse_float_vector_add();
00310     _sse_float_vector_store(r12_1);      
00311 
00312     _sse_float_vector_load(r34_1);
00313     _sse_float_vector_xch();
00314     _sse_float_vector_add();
00315     _sse_float_vector_store(r34_1);      
00316     
00317     up1++;
00318     _sse_float_prefetch_su3(up1);      
00319     
00320     _sse_float_pair_load((*(sm2)).c1,(*(sm2)).c2);
00321     _sse_float_pair_load_up((*(sm2)).c4,(*(sm2)).c3);
00322     _sse_float_vector_add();
00323     
00324     _sse_float_su3_inverse_multiply((*(um2)));
00325     
00326     _sse_float_vector_load(r12_2);
00327     _sse_float_vector_add();
00328     _sse_float_vector_store(r12_2);      
00329     
00330     _sse_float_vector_load(r34_2);
00331     _sse_float_vector_xch();
00332     _sse_float_vector_add();
00333     _sse_float_vector_store(r34_2); 
00334     
00335     /******************************* direction +2 ***********************/
00336     
00337     iy1=idw[ix1][2];
00338     iy2=idw[ix1+1][2];
00339     sm1=psi+iy1;
00340     sm2=psi+iy2;
00341     _sse_float_prefetch_spinor(sm1);
00342     _sse_float_prefetch_spinor(sm2);
00343 
00344     _sse_float_pair_load((*sp1).c1,(*sp1).c2);
00345     _sse_float_pair_load_up((*sp1).c4,(*sp1).c3);
00346     _sse_float_vector_i_addsub();
00347     
00348     _sse_float_su3_multiply((*up1));
00349     
00350     _sse_float_vector_load(r12_1);
00351     _sse_float_vector_add();
00352     _sse_float_vector_store(r12_1);       
00353     
00354     _sse_float_vector_load(r34_1);
00355     _sse_float_vector_xch();
00356     _sse_float_vector_i_addsub();
00357     _sse_float_vector_store(r34_1);       
00358     
00359 
00360     um1=U+iy1*4+2;
00361     um2=U+iy2*4+2;     
00362     _sse_float_prefetch_su3(um1);
00363     _sse_float_prefetch_su3(um2);
00364     
00365     _sse_float_pair_load((*(sp2)).c1,(*(sp2)).c2);
00366     _sse_float_pair_load_up((*(sp2)).c4,(*(sp2)).c3);
00367     _sse_float_vector_i_addsub();
00368     
00369     _sse_float_su3_multiply((*(up1+4)));
00370     
00371     _sse_float_vector_load(r12_2);
00372     _sse_float_vector_add();
00373     _sse_float_vector_store(r12_2);       
00374     
00375     _sse_float_vector_load(r34_2);
00376     _sse_float_vector_xch();
00377     _sse_float_vector_i_addsub();
00378     _sse_float_vector_store(r34_2); 
00379     
00380     /******************************* direction -2 ***********************/
00381     
00382     sp1=psi+iup[ix1][3];
00383     sp2=psi+iup[ix1+1][3];
00384     _sse_float_prefetch_spinor(sp1);
00385     _sse_float_prefetch_spinor(sp2);
00386 
00387     _sse_float_pair_load((*sm1).c1,(*sm1).c2);
00388     _sse_float_pair_load_up((*sm1).c4,(*sm1).c3);
00389     _sse_float_vector_i_subadd();      
00390     
00391     _sse_float_su3_inverse_multiply((*um1));
00392     
00393     _sse_float_vector_load(r12_1);
00394     _sse_float_vector_add();
00395     _sse_float_vector_store(r12_1);
00396     
00397     _sse_float_vector_load(r34_1);
00398     _sse_float_vector_xch();
00399     _sse_float_vector_i_subadd();
00400     _sse_float_vector_store(r34_1);
00401     
00402     up1++;
00403     _sse_float_prefetch_su3(up1);
00404     
00405     _sse_float_pair_load((*(sm2)).c1,(*(sm2)).c2);
00406     _sse_float_pair_load_up((*(sm2)).c4,(*(sm2)).c3);
00407     _sse_float_vector_i_subadd();      
00408     
00409     _sse_float_su3_inverse_multiply((*(um2)));
00410     
00411     _sse_float_vector_load(r12_2);
00412     _sse_float_vector_add();
00413     _sse_float_vector_store(r12_2);
00414     
00415     _sse_float_vector_load(r34_2);
00416     _sse_float_vector_xch();
00417     _sse_float_vector_i_subadd();
00418     _sse_float_vector_store(r34_2);
00419     
00420     /******************************* direction +3 ***********************/
00421 
00422     iy1=idw[ix1][3];
00423     iy2=idw[ix1+1][3];
00424     sm1=psi+iy1;
00425     sm2=psi+iy2;
00426     _sse_float_prefetch_spinor(sm1);
00427     _sse_float_prefetch_spinor(sm2);
00428 
00429     _sse_float_pair_load((*sp1).c1,(*sp1).c2);
00430     _sse_float_pair_load_up((*sp1).c3,(*sp1).c4);
00431     _sse_float_vector_subadd();
00432     
00433     _sse_float_su3_multiply((*up1));
00434       
00435     _sse_float_vector_load(r12_1);
00436     _sse_float_vector_add();
00437     _sse_float_vector_store(r12_1);
00438     
00439     _sse_float_vector_load(r34_1);
00440     _sse_float_vector_subadd();
00441     _sse_float_vector_store(r34_1);      
00442     
00443     um1=U+iy1*4+3;
00444     _sse_float_prefetch_su3(um1);
00445     um2=U+iy2*4+3;     
00446     _sse_float_prefetch_su3(um2);
00447 
00448     _sse_float_pair_load((*sp2).c1,(*sp2).c2);
00449     _sse_float_pair_load_up((*sp2).c3,(*sp2).c4);
00450     _sse_float_vector_subadd();
00451     
00452     _sse_float_su3_multiply((*(up1+4)));
00453     
00454     _sse_float_vector_load(r12_2);
00455     _sse_float_vector_add();
00456     _sse_float_vector_store(r12_2);
00457     
00458     _sse_float_vector_load(r34_2);
00459     _sse_float_vector_subadd();
00460     _sse_float_vector_store(r34_2); 
00461     
00462 /******************************* direction -3 ***********************/
00463 
00464     sn1=(_sse_spinor*) &chi[ix1];      
00465     _sse_float_prefetch_spinor(sn1);
00466     
00467     iz1=ix1+2;
00468     if (iz1<stop) {
00469       sp1=(_sse_spinor*) &psi[iup[iz1][0]];
00470       sp2=(_sse_spinor*) &psi[iup[iz1+1][0]];
00471       _sse_float_prefetch_spinor(sp1);
00472       _sse_float_prefetch_spinor(sp2);  
00473     }
00474 
00475     _sse_float_pair_load((*sm1).c1,(*sm1).c2);
00476     _sse_float_pair_load_up((*sm1).c3,(*sm1).c4);
00477     _sse_float_vector_addsub();
00478     
00479     _sse_float_su3_inverse_multiply((*um1));
00480         
00481     _sse_float_vector_load(r12_1);
00482     _sse_float_vector_add();
00483     _sse_float_vector_mul(fact2);
00484     _sse_float_pair_store((*sn1).c1,(*sn1).c2);
00485     
00486     _sse_float_vector_load(r34_1);
00487     _sse_float_vector_addsub();
00488     _sse_float_vector_mul(fact2);
00489     _sse_float_pair_store((*sn1).c3,(*sn1).c4);      
00490     
00491     up1=U+iz1*4;
00492     _sse_float_prefetch_su3(up1);
00493 
00494     
00495     _sse_float_pair_load((*sm2).c1,(*sm2).c2);
00496     _sse_float_pair_load_up((*sm2).c3,(*sm2).c4);
00497     _sse_float_vector_addsub();
00498     
00499     _sse_float_su3_inverse_multiply((*um2));
00500     
00501     _sse_float_vector_load(r12_2);
00502     _sse_float_vector_add();
00503     _sse_float_vector_mul(fact2);
00504     _sse_float_pair_store((*(sn1+1)).c1,(*(sn1+1)).c2);
00505     
00506     _sse_float_vector_load(r34_2);
00507     _sse_float_vector_addsub();
00508     _sse_float_vector_mul(fact2);
00509     _sse_float_pair_store((*(sn1+1)).c3,(*(sn1+1)).c4); 
00510     
00511     /******************************** end of loop ***********************/
00512   }
00513 
00514   if(cSW==0) return;
00515 
00516   /* 
00517      everything here must be in agreement with gauge_field::ordered_index()
00518   */
00519     
00520   /*********** loop over all lattice sites for clover term *************/
00521     
00522   um1=uem+6*start;
00523   
00524   for (ix1=start; ix1<stop; ix1+=2) {   
00525     s1=psi+ix1;
00526     _sse_float_prefetch_spinor(s1);
00527 
00528     //************************* mu=0, nu=1 ***********************
00529       
00530       _sse_float_prefetch_su3(um1+1);
00531       _sse_float_prefetch_su3(um1+7);
00532       
00533       _sse_float_pair_load((*s1).c4,(*s1).c3);
00534       _sse_float_vector_mul(fact4);
00535       _sse_float_su3_multiply((*um1));
00536       // set this to zero 
00537       _sse_float_vector_load(r0);
00538       _sse_float_vector_add();
00539       _sse_float_vector_store(r12_1);
00540 
00541       _sse_float_pair_load((*s1).c2,(*s1).c1);
00542       _sse_float_su3_multiply((*um1));    
00543       _sse_float_vector_load(r0);
00544       _sse_float_vector_add();
00545       _sse_float_vector_store(r34_1);
00546       
00547       _sse_float_pair_load((*(s1+1)).c4,(*(s1+1)).c3);
00548       _sse_float_vector_mul(fact4);
00549       _sse_float_su3_multiply((*(um1+6)));    
00550       // set this to zero 
00551       _sse_float_vector_load(r0);
00552       _sse_float_vector_add();
00553       _sse_float_vector_store(r12_2);
00554 
00555       _sse_float_pair_load((*(s1+1)).c2,(*(s1+1)).c1);
00556       _sse_float_su3_multiply((*(um1+6)));    
00557       // set this to zero 
00558       _sse_float_vector_load(r0);
00559       _sse_float_vector_add();
00560       _sse_float_vector_store(r34_2);
00561       
00562       um1++;
00563       
00564       //************************* mu=0, nu=2 ***********************
00565       
00566       _sse_float_prefetch_su3(um1+1);
00567       _sse_float_prefetch_su3(um1+7);
00568       
00569       _sse_float_pair_load((*s1).c4,(*s1).c3);
00570       _sse_float_su3_multiply((*um1));
00571       _sse_float_vector_load(r12_1);
00572       _sse_float_vector_i_addsub();
00573       _sse_float_vector_store(r12_1);
00574       
00575       _sse_float_pair_load((*s1).c2,(*s1).c1);
00576       _sse_float_su3_multiply((*um1));
00577       _sse_float_vector_load(r34_1);
00578       _sse_float_vector_i_subadd();
00579       _sse_float_vector_store(r34_1);
00580       
00581       _sse_float_pair_load((*(s1+1)).c4,(*(s1+1)).c3);
00582       _sse_float_su3_multiply((*(um1+6)));
00583       _sse_float_vector_load(r12_2);
00584       _sse_float_vector_i_addsub();
00585       _sse_float_vector_store(r12_2);
00586       
00587       _sse_float_pair_load((*(s1+1)).c2,(*(s1+1)).c1);
00588       _sse_float_su3_multiply((*(um1+6)));
00589       _sse_float_vector_load(r34_2);
00590       _sse_float_vector_i_subadd();
00591       _sse_float_vector_store(r34_2);
00592       
00593       um1++;
00594       
00595       //************************* mu=0, nu=3 ***********************
00596       
00597       _sse_float_prefetch_su3(um1+1);
00598       _sse_float_prefetch_su3(um1+7);
00599       
00600       _sse_float_pair_load((*s1).c3,(*s1).c4);
00601       _sse_float_su3_multiply((*um1));
00602       _sse_float_vector_load(r12_1);
00603       _sse_float_vector_subadd();
00604       _sse_float_vector_mul(fact6);
00605       _sse_float_vector_store(r12_1);
00606       
00607       _sse_float_pair_load((*s1).c1,(*s1).c2);
00608       _sse_float_su3_multiply((*um1));
00609       _sse_float_vector_load(r34_1);
00610       _sse_float_vector_addsub();
00611       _sse_float_vector_mul(fact6);
00612       _sse_float_vector_store(r34_1);
00613       
00614       _sse_float_pair_load((*(s1+1)).c3,(*(s1+1)).c4);
00615       _sse_float_su3_multiply((*(um1+6)));
00616       _sse_float_vector_load(r12_2);
00617       _sse_float_vector_subadd();
00618       _sse_float_vector_mul(fact6);
00619       _sse_float_vector_store(r12_2);
00620       
00621       _sse_float_pair_load((*(s1+1)).c1,(*(s1+1)).c2);
00622       _sse_float_su3_multiply((*(um1+6)));
00623       _sse_float_vector_load(r34_2);
00624       _sse_float_vector_addsub();
00625       _sse_float_vector_mul(fact6);
00626       _sse_float_vector_store(r34_2);
00627       
00628       um1++;
00629       
00630       //************************* mu=1, nu=2 ***********************
00631       
00632       _sse_float_prefetch_su3(um1+1);
00633       _sse_float_prefetch_su3(um1+7);
00634       
00635       _sse_float_pair_load((*s1).c1,(*s1).c2);
00636       _sse_float_su3_multiply((*um1));
00637       _sse_float_vector_load(r12_1);
00638       _sse_float_vector_i_subadd();
00639       _sse_float_vector_store(r12_1);
00640       
00641       _sse_float_pair_load((*s1).c3,(*s1).c4);
00642       _sse_float_su3_multiply((*um1));
00643       _sse_float_vector_load(r34_1);
00644       _sse_float_vector_i_subadd();
00645       _sse_float_vector_store(r34_1);
00646       
00647       _sse_float_pair_load((*(s1+1)).c1,(*(s1+1)).c2);
00648       _sse_float_su3_multiply((*(um1+6)));
00649       _sse_float_vector_load(r12_2);
00650       _sse_float_vector_i_subadd();
00651       _sse_float_vector_store(r12_2);
00652       
00653       _sse_float_pair_load((*(s1+1)).c3,(*(s1+1)).c4);
00654       _sse_float_su3_multiply((*(um1+6)));
00655       _sse_float_vector_load(r34_2);
00656       _sse_float_vector_i_subadd();
00657       _sse_float_vector_store(r34_2);
00658       
00659       um1++;
00660       
00661       //************************* mu=1, nu=3 ***********************
00662       
00663       _sse_float_prefetch_su3(um1+1);
00664       _sse_float_prefetch_su3(um1+7);
00665       
00666       _sse_float_pair_load((*s1).c2,(*s1).c1);
00667       _sse_float_su3_multiply((*um1));
00668       _sse_float_vector_load(r12_1);
00669       _sse_float_vector_addsub();
00670       _sse_float_vector_store(r12_1);
00671       
00672       _sse_float_pair_load((*s1).c4,(*s1).c3);
00673       _sse_float_su3_multiply((*um1));
00674       _sse_float_vector_load(r34_1);
00675       _sse_float_vector_addsub();
00676       _sse_float_vector_store(r34_1);
00677       
00678       _sse_float_pair_load((*(s1+1)).c2,(*(s1+1)).c1);
00679       _sse_float_su3_multiply((*(um1+6)));
00680       _sse_float_vector_load(r12_2);
00681       _sse_float_vector_addsub();
00682       _sse_float_vector_store(r12_2);
00683       
00684       _sse_float_pair_load((*(s1+1)).c4,(*(s1+1)).c3);
00685       _sse_float_su3_multiply((*(um1+6)));
00686       _sse_float_vector_load(r34_2);
00687       _sse_float_vector_addsub();
00688       _sse_float_vector_store(r34_2);
00689       
00690       um1++;
00691       
00692       //************************* mu=2, nu=3 ***********************
00693       
00694       sn1=(_sse_spinor*) &chi[ix1];      
00695       _sse_float_prefetch_spinor(sn1);
00696       
00697       _sse_float_pair_load((*s1).c2,(*s1).c1);
00698       _sse_float_su3_multiply((*um1));
00699       _sse_float_vector_load(r12_1);
00700       _sse_float_vector_i_sub();
00701       _sse_float_vector_store(r12_1);
00702       
00703       _sse_float_pair_load((*s1).c4,(*s1).c3);
00704       _sse_float_su3_multiply((*um1));
00705       _sse_float_vector_load(r34_1);
00706       _sse_float_vector_i_sub();
00707       _sse_float_vector_store(r34_1);
00708       
00709       _sse_float_pair_load((*(s1+1)).c2,(*(s1+1)).c1);
00710       _sse_float_su3_multiply((*(um1+6)));
00711       _sse_float_vector_load(r12_2);
00712       _sse_float_vector_i_sub();
00713       _sse_float_vector_store(r12_2);
00714       
00715       _sse_float_pair_load((*(s1+1)).c4,(*(s1+1)).c3);
00716       _sse_float_su3_multiply((*(um1+6)));
00717       _sse_float_vector_load(r34_2);
00718       _sse_float_vector_i_sub();
00719       _sse_float_vector_store(r34_2);
00720       
00721       um1+=7;
00722       if(ix1<stop-1) {
00723         _sse_float_prefetch_su3(um1);
00724         _sse_float_prefetch_su3(um1+6);
00725       }
00726 
00727 
00728       _sse_float_pair_load_up((*sn1).c1,(*sn1).c2);
00729       _sse_float_vector_load(r12_1);
00730       _sse_float_vector_mul(fact5);
00731       _sse_float_vector_add();
00732       _sse_float_pair_store((*sn1).c1,(*sn1).c2);
00733 
00734       _sse_float_pair_load_up((*sn1).c3,(*sn1).c4);
00735       _sse_float_vector_load(r34_1);
00736       _sse_float_vector_mul(fact5);
00737       _sse_float_vector_add();
00738       _sse_float_pair_store((*sn1).c3,(*sn1).c4);
00739 
00740       _sse_float_pair_load_up((*(sn1+1)).c1,(*(sn1+1)).c2);
00741       _sse_float_vector_load(r12_2);
00742       _sse_float_vector_mul(fact5);
00743       _sse_float_vector_add();
00744       _sse_float_pair_store((*(sn1+1)).c1,(*(sn1+1)).c2);
00745 
00746       _sse_float_pair_load_up((*(sn1+1)).c3,(*(sn1+1)).c4);
00747       _sse_float_vector_load(r34_2);
00748       _sse_float_vector_mul(fact5);
00749       _sse_float_vector_add();
00750       _sse_float_pair_store((*(sn1+1)).c3,(*(sn1+1)).c4);
00751 
00752 
00753       //************** end of loop ***********************
00754   }
00755 
00756 
00757 #else
00758 
00759   _sse_spinor *chi=(_sse_spinor*) chi_out.physical_address();
00760   _sse_spinor *psi=(_sse_spinor*) psi_in.physical_address();
00761   _sse_su3    *U=(_sse_su3*) U_in.physical_address();
00762   _sse_su3    *uem=(_sse_su3*) U_in.em.physical_address();
00763   long   **iup=U_in.lattice().up;
00764   long   **idw=U_in.lattice().dw;
00765   long   start=U_in.lattice().start_index(ME,0); // even
00766   long   stop =U_in.lattice().stop_index(ME,1);  // odd
00767   
00768   _sse_double fact1 ALIGN16;
00769   _sse_double fact2 ALIGN16;
00770   _sse_double fact3 ALIGN16;
00771   _sse_double fact4 ALIGN16;
00772   _sse_double fact5 ALIGN16;
00773   _sse_double fact6 ALIGN16;
00774   _sse_spinor rs ALIGN16;
00775   _sse_spinor r0 ALIGN16;
00776   long ix,iy,iz;
00777   double rho;
00778   _sse_su3 *up, *um;   
00779   _sse_spinor *s,*sp,*sm,*sn;
00780 
00781   if(sign!=1) exit(1);
00782   if((stop-start)%2 !=0)
00783     error("FermiCloverActionSSE2\nProblem with parallelization: odd # of sites on process!");
00784 
00785   if(r_t!=1.0) 
00786     error("FermiCloverActionSSE2\nr_t!=1 not compatible with SSE2\n");
00787 
00788   _sse_check_alignment((void*) &fact1, 0xf);
00789   _sse_check_alignment((void*) &fact2, 0xf);
00790   _sse_check_alignment((void*) &fact3, 0xf);
00791   _sse_check_alignment((void*) &fact4, 0xf);
00792   _sse_check_alignment((void*) &fact5, 0xf);
00793   _sse_check_alignment((void*) &fact6, 0xf);
00794   _sse_check_alignment((void*) &rs, 0xf);
00795   _sse_check_alignment((void*) &r0, 0xf);
00796 
00797   r0.c1.c1.real()=r0.c1.c2.real()=r0.c1.c3.real()=0;
00798   r0.c2.c1.real()=r0.c2.c2.real()=r0.c2.c3.real()=0;
00799   r0.c3.c1.real()=r0.c3.c2.real()=r0.c3.c3.real()=0;
00800   r0.c4.c1.real()=r0.c4.c2.real()=r0.c4.c3.real()=0;
00801 
00802   rho=-1.0/kappa_s;
00803 
00804   fact1.c1=rho;
00805   fact1.c2=rho;
00806   
00807   fact2.c1=-1.0*kappa_s;
00808   fact2.c2=fact2.c1;
00809 
00810   fact3.c1=(1.0+r_t)*kappa_t/kappa_s;
00811   fact3.c2=fact3.c1;
00812 
00813   fact4.c1=-1.0;
00814   fact4.c2=-1.0;
00815 
00816   fact5.c1=1.0*kappa_s*cSW*c_E;
00817   fact5.c2=fact5.c1;
00818 
00819   fact6.c1=1.0*c_E/c_B;
00820   fact6.c2=fact6.c1;
00821 
00822   sp=(_sse_spinor*) &psi[iup[start][0]];
00823   up=(_sse_su3*) U+4*start;
00824 
00825    /************************ loop over all lattice sites ***************/
00826 
00827   for (ix=start; ix<stop; ix++) {   
00828     s=psi+ix;
00829     _sse_double_prefetch_spinor(s);
00830     
00831     /******************************* direction +0 ***********************/
00832 
00833     iy=idw[ix][0];
00834     sm=psi+iy;
00835     _sse_double_prefetch_spinor(sm);
00836 
00837     _sse_double_load((*s).c1);
00838     _sse_double_vector_mul(fact1);      
00839     _sse_double_store(rs.c1);
00840     _sse_double_load((*s).c2);
00841     _sse_double_vector_mul(fact1);      
00842     _sse_double_store(rs.c2);
00843 
00844     um=U+iy*4;
00845     _sse_double_prefetch_su3(um);
00846 
00847     _sse_double_load((*sp).c3);
00848     _sse_double_vector_mul(fact3);
00849     _sse_double_su3_multiply((*up));
00850     _sse_double_load((*s).c3);
00851     _sse_double_vector_mul(fact1);      
00852     _sse_double_vector_add();
00853     _sse_double_store(rs.c3);
00854 
00855     _sse_double_load((*sp).c4);
00856     _sse_double_vector_mul(fact3);
00857     _sse_double_su3_multiply((*up));
00858     _sse_double_load((*s).c4);
00859     _sse_double_vector_mul(fact1);      
00860     _sse_double_vector_add();
00861     _sse_double_store(rs.c4);
00862    
00863     
00864     /******************************* direction -0 ***********************/
00865 
00866     sp=psi+iup[ix][1];
00867     _sse_double_prefetch_spinor(sp);
00868     up++;
00869     _sse_double_prefetch_su3(up);      
00870         
00871     _sse_double_load((*sm).c1);
00872     _sse_double_vector_mul(fact3);          
00873     _sse_double_su3_inverse_multiply((*um));
00874     _sse_double_load(rs.c1);
00875     _sse_double_vector_add();
00876     _sse_double_store(rs.c1);
00877 
00878     _sse_double_load((*sm).c2);
00879     _sse_double_vector_mul(fact3);          
00880     _sse_double_su3_inverse_multiply((*um));
00881     _sse_double_load(rs.c2);
00882     _sse_double_vector_add();
00883     _sse_double_store(rs.c2);
00884       
00885     /******************************* direction +1 ***********************/
00886         
00887     iy=idw[ix][1];
00888     sm=psi+iy;
00889     _sse_double_prefetch_spinor(sm);
00890     um=U+iy*4+1;
00891     _sse_double_prefetch_su3(um);      
00892 
00893     _sse_double_load((*sp).c1);
00894     _sse_double_load_up((*sp).c4);
00895     _sse_double_vector_sub();
00896     _sse_double_su3_multiply((*up));
00897     _sse_double_load(rs.c1);
00898     _sse_double_vector_add();
00899     _sse_double_store(rs.c1);
00900     _sse_double_load(rs.c4);
00901     _sse_double_vector_sub();
00902     _sse_double_store(rs.c4);
00903 
00904     _sse_double_load((*sp).c2);
00905     _sse_double_load_up((*sp).c3);
00906     _sse_double_vector_sub();
00907     _sse_double_su3_multiply((*up));
00908     _sse_double_load(rs.c2);
00909     _sse_double_vector_add();
00910     _sse_double_store(rs.c2);
00911     _sse_double_load(rs.c3);
00912     _sse_double_vector_sub();
00913     _sse_double_store(rs.c3);
00914     
00915     
00916     /******************************* direction -1 ***********************/
00917     
00918     sp=psi+iup[ix][2];
00919     _sse_double_prefetch_spinor(sp);
00920     up++;
00921     _sse_double_prefetch_su3(up);      
00922 
00923     _sse_double_load((*sm).c1);
00924     _sse_double_load_up((*sm).c4);
00925     _sse_double_vector_add();
00926     _sse_double_su3_inverse_multiply((*um));
00927     _sse_double_load(rs.c1);
00928     _sse_double_vector_add();
00929     _sse_double_store(rs.c1);      
00930     _sse_double_load(rs.c4);
00931     _sse_double_vector_add();
00932     _sse_double_store(rs.c4);      
00933   
00934     _sse_double_load((*sm).c2);
00935     _sse_double_load_up((*sm).c3);
00936     _sse_double_vector_add();
00937     _sse_double_su3_inverse_multiply((*um));
00938     _sse_double_load(rs.c2);
00939     _sse_double_vector_add();
00940     _sse_double_store(rs.c2);      
00941     _sse_double_load(rs.c3);
00942     _sse_double_vector_add();
00943     _sse_double_store(rs.c3);      
00944 
00945     /******************************* direction +2 ***********************/
00946     
00947     iy=idw[ix][2];
00948     sm=psi+iy;
00949     _sse_double_prefetch_spinor(sm);
00950     um=U+iy*4+2;
00951     _sse_double_prefetch_su3(um);
00952 
00953     _sse_double_load((*sp).c1);
00954     _sse_double_load_up((*sp).c4);
00955     _sse_double_vector_i_mul(); _sse_double_vector_add();
00956     _sse_double_su3_multiply((*up));
00957     _sse_double_load(rs.c1);
00958     _sse_double_vector_add();
00959     _sse_double_store(rs.c1);       
00960     _sse_double_load(rs.c4);
00961     _sse_double_vector_i_mul(); _sse_double_vector_sub();
00962     _sse_double_store(rs.c4);       
00963     
00964     _sse_double_load((*sp).c2);
00965     _sse_double_load_up((*sp).c3);
00966     _sse_double_vector_i_mul(); _sse_double_vector_sub();
00967     _sse_double_su3_multiply((*up));
00968     _sse_double_load(rs.c2);
00969     _sse_double_vector_add();
00970     _sse_double_store(rs.c2);       
00971     _sse_double_load(rs.c3);
00972     _sse_double_vector_i_mul(); _sse_double_vector_add();
00973     _sse_double_store(rs.c3); 
00974 
00975     
00976     /******************************* direction -2 ***********************/
00977     
00978     sp=psi+iup[ix][3];
00979     _sse_double_prefetch_spinor(sp);
00980     up++;
00981     _sse_double_prefetch_su3(up);
00982 
00983     _sse_double_load((*sm).c1);
00984     _sse_double_load_up((*sm).c4);
00985     _sse_double_vector_i_mul(); _sse_double_vector_sub();      
00986     _sse_double_su3_inverse_multiply((*um));
00987     _sse_double_load(rs.c1);
00988     _sse_double_vector_add();
00989     _sse_double_store(rs.c1);
00990     _sse_double_load(rs.c4);
00991     _sse_double_vector_i_mul(); _sse_double_vector_add();
00992     _sse_double_store(rs.c4);
00993     
00994     _sse_double_load((*sm).c2);
00995     _sse_double_load_up((*sm).c3);
00996     _sse_double_vector_i_mul(); _sse_double_vector_add();      
00997     _sse_double_su3_inverse_multiply((*um));
00998     _sse_double_load(rs.c2);
00999     _sse_double_vector_add();
01000     _sse_double_store(rs.c2);
01001     _sse_double_load(rs.c3);
01002     _sse_double_vector_i_mul(); _sse_double_vector_sub();
01003     _sse_double_store(rs.c3);
01004     
01005     /******************************* direction +3 ***********************/
01006 
01007     iy=idw[ix][3];
01008     sm=psi+iy;
01009     _sse_double_prefetch_spinor(sm);
01010     um=U+iy*4+3;
01011     _sse_double_prefetch_su3(um);
01012 
01013     _sse_double_load((*sp).c1);
01014     _sse_double_load_up((*sp).c3);
01015     _sse_double_vector_sub();
01016     _sse_double_su3_multiply((*up));
01017     _sse_double_load(rs.c1);
01018     _sse_double_vector_add();
01019     _sse_double_store(rs.c1);
01020     _sse_double_load(rs.c3);
01021     _sse_double_vector_sub();
01022     _sse_double_store(rs.c3);      
01023     
01024     _sse_double_load((*sp).c2);
01025     _sse_double_load_up((*sp).c4);
01026     _sse_double_vector_add();
01027     _sse_double_su3_multiply((*up));
01028     _sse_double_load(rs.c2);
01029     _sse_double_vector_add();
01030     _sse_double_store(rs.c2);
01031     _sse_double_load(rs.c4);
01032     _sse_double_vector_add();
01033     _sse_double_store(rs.c4);
01034     
01035     /******************************* direction -3 ***********************/
01036 
01037     sn=chi+ix;      
01038     _sse_double_prefetch_spinor(sn);
01039     
01040     iz=ix+1;
01041     if (iz<stop) {
01042       sp=psi+iup[iz][0];
01043       _sse_double_prefetch_spinor(sp);
01044       up=U+iz*4;
01045       _sse_double_prefetch_su3(up);
01046     }
01047 
01048     _sse_double_load((*sm).c1);
01049     _sse_double_load_up((*sm).c3);
01050     _sse_double_vector_add();
01051     _sse_double_su3_inverse_multiply((*um));
01052     _sse_double_load(rs.c1);
01053     _sse_double_vector_add();
01054     _sse_double_vector_mul(fact2);
01055     _sse_double_store((*sn).c1);
01056     _sse_double_load(rs.c3);
01057     _sse_double_vector_add();
01058     _sse_double_vector_mul(fact2);
01059     _sse_double_store((*sn).c3);
01060     
01061     _sse_double_load((*sm).c2);
01062     _sse_double_load_up((*sm).c4);
01063     _sse_double_vector_sub(); 
01064     _sse_double_su3_inverse_multiply((*um));
01065     _sse_double_load(rs.c2);
01066     _sse_double_vector_add();
01067     _sse_double_vector_mul(fact2);
01068     _sse_double_store((*sn).c2);
01069     _sse_double_load(rs.c4);
01070     _sse_double_vector_sub();
01071     _sse_double_vector_mul(fact2);
01072     _sse_double_store((*sn).c4);      
01073     
01074     /******************************** end of loop ***********************/    
01075   }
01076 
01077   if(cSW==0) return;
01078 
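01079   /* The clover loop below reads six matrices per site through um (starting at
01080      uem+6*start), in the order (mu,nu)=(0,1),(0,2),(0,3),(1,2),(1,3),(2,3);
01081      this layout must be in agreement with gauge_field::ordered_index(). */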
01082     
01083   /*********** loop over all lattice sites for clover term *************/
01084     
01085   um=uem+6*start;
01086 
01087   for (ix=start; ix<stop; ix++) {   
01088     s=psi+ix;
01089     _sse_double_prefetch_spinor(s);
01090     
01091     /************************** mu=0, nu=1 ***********************/
01092 
01093       
01094       _sse_double_prefetch_su3(um+1);
01095       
01096       _sse_double_load((*s).c4);
01097       _sse_double_vector_mul(fact4);
01098       _sse_double_su3_multiply((*um));
01099       /* set this to zero */
01100       _sse_double_load(r0.c1);
01101       _sse_double_vector_add();
01102       _sse_double_store(rs.c1);
01103 
01104       _sse_double_load((*s).c3);
01105       _sse_double_vector_mul(fact4);
01106       _sse_double_su3_multiply((*um));
01107       /* set this to zero */
01108       _sse_double_load(r0.c2);
01109       _sse_double_vector_add();
01110       _sse_double_store(rs.c2);
01111 
01112       _sse_double_load((*s).c2);
01113       _sse_double_su3_multiply((*um));    
01114       _sse_double_load(r0.c3);
01115       _sse_double_vector_add();
01116       _sse_double_store(rs.c3);
01117       
01118       _sse_double_load((*s).c1);
01119       _sse_double_su3_multiply((*um));    
01120       _sse_double_load(r0.c4);
01121       _sse_double_vector_add();
01122       _sse_double_store(rs.c4);
01123     
01124       um++;
01125       
01126       /************************** mu=0, nu=2 ***********************/
01127       
01128       _sse_double_prefetch_su3(um+1);
01129       
01130       _sse_double_load((*s).c4);
01131       _sse_double_su3_multiply((*um));
01132       _sse_double_load(rs.c1);
01133       _sse_double_vector_i_mul(); _sse_double_vector_add();
01134       _sse_double_store(rs.c1);
01135       
01136       _sse_double_load((*s).c3);
01137       _sse_double_su3_multiply((*um));
01138       _sse_double_load(rs.c2);
01139       _sse_double_vector_i_mul(); _sse_double_vector_sub();
01140       _sse_double_store(rs.c2);
01141       
01142       _sse_double_load((*s).c2);
01143       _sse_double_su3_multiply((*um));
01144       _sse_double_load(rs.c3);
01145       _sse_double_vector_i_mul(); _sse_double_vector_sub();
01146       _sse_double_store(rs.c3);
01147       
01148       _sse_double_load((*s).c1);
01149       _sse_double_su3_multiply((*um));
01150       _sse_double_load(rs.c4);
01151       _sse_double_vector_i_mul(); _sse_double_vector_add();
01152       _sse_double_store(rs.c4); 
01153       
01154       um++;
01155       
01156       /************************** mu=0, nu=3 ***********************/
01157       
01158       _sse_double_prefetch_su3(um+1);
01159       
01160       _sse_double_load((*s).c3);
01161       _sse_double_su3_multiply((*um));
01162       _sse_double_load(rs.c1);
01163       _sse_double_vector_sub();
01164       _sse_double_vector_mul(fact6);
01165       _sse_double_store(rs.c1);
01166       
01167       _sse_double_load((*s).c4);
01168       _sse_double_su3_multiply((*um));
01169       _sse_double_load(rs.c2);
01170       _sse_double_vector_add();
01171       _sse_double_vector_mul(fact6);
01172       _sse_double_store(rs.c2);
01173       
01174       _sse_double_load((*s).c1);
01175       _sse_double_su3_multiply((*um));
01176       _sse_double_load(rs.c3);
01177       _sse_double_vector_add();
01178       _sse_double_vector_mul(fact6);
01179       _sse_double_store(rs.c3);
01180           
01181       _sse_double_load((*s).c2);
01182       _sse_double_su3_multiply((*um));
01183       _sse_double_load(rs.c4);
01184       _sse_double_vector_sub();
01185       _sse_double_vector_mul(fact6);
01186       _sse_double_store(rs.c4);
01187           
01188       um++;
01189       
01190       /************************** mu=1, nu=2 ***********************/
01191       
01192       _sse_double_prefetch_su3(um+1);
01193       
01194       _sse_double_load((*s).c1);
01195       _sse_double_su3_multiply((*um));
01196       _sse_double_load(rs.c1);
01197       _sse_double_vector_i_mul(); _sse_double_vector_sub();
01198       _sse_double_store(rs.c1);
01199       
01200       _sse_double_load((*s).c2);
01201       _sse_double_su3_multiply((*um));
01202       _sse_double_load(rs.c2);
01203       _sse_double_vector_i_mul(); _sse_double_vector_add();
01204       _sse_double_store(rs.c2);
01205       
01206       _sse_double_load((*s).c3);
01207       _sse_double_su3_multiply((*um));
01208       _sse_double_load(rs.c3);
01209       _sse_double_vector_i_mul(); _sse_double_vector_sub();
01210       _sse_double_store(rs.c3);
01211 
01212       _sse_double_load((*s).c4);
01213       _sse_double_su3_multiply((*um));
01214       _sse_double_load(rs.c4);
01215       _sse_double_vector_i_mul(); _sse_double_vector_add();
01216       _sse_double_store(rs.c4);
01217       
01218       um++;
01219       
01220       /************************** mu=1, nu=3 ***********************/
01221       
01222       _sse_double_prefetch_su3(um+1);
01223       
01224       _sse_double_load((*s).c2);
01225       _sse_double_su3_multiply((*um));
01226       _sse_double_load(rs.c1);
01227       _sse_double_vector_add();
01228       _sse_double_store(rs.c1);
01229       
01230       _sse_double_load((*s).c1);
01231       _sse_double_su3_multiply((*um));
01232       _sse_double_load(rs.c2);
01233       _sse_double_vector_sub();
01234       _sse_double_store(rs.c2);
01235       
01236       _sse_double_load((*s).c4);
01237       _sse_double_su3_multiply((*um));
01238       _sse_double_load(rs.c3);
01239       _sse_double_vector_add();
01240       _sse_double_store(rs.c3);
01241       
01242       _sse_double_load((*s).c3);
01243       _sse_double_su3_multiply((*um));
01244       _sse_double_load(rs.c4);
01245       _sse_double_vector_sub();
01246       _sse_double_store(rs.c4);
01247       
01248       um++;
01249       
01250       /************************** mu=2, nu=3 ***********************/
01251 
01252       sn=(_sse_spinor*) &chi[ix];      
01253       _sse_double_prefetch_spinor(sn);
01254       
01255       _sse_double_load((*s).c2);
01256       _sse_double_su3_multiply((*um));
01257       _sse_double_load(rs.c1);
01258       _sse_double_vector_i_mul(); _sse_double_vector_sub();
01259       _sse_double_store(rs.c1);
01260       
01261       _sse_double_load((*s).c1);
01262       _sse_double_su3_multiply((*um));
01263       _sse_double_load(rs.c2);
01264       _sse_double_vector_i_mul(); _sse_double_vector_sub();
01265       _sse_double_store(rs.c2);
01266       
01267       _sse_double_load((*s).c4);
01268       _sse_double_su3_multiply((*um));
01269       _sse_double_load(rs.c3);
01270       _sse_double_vector_i_mul(); _sse_double_vector_sub();
01271       _sse_double_store(rs.c3);
01272       
01273       _sse_double_load((*s).c3);
01274       _sse_double_su3_multiply((*um));
01275       _sse_double_load(rs.c4);
01276       _sse_double_vector_i_mul(); _sse_double_vector_sub();
01277       _sse_double_store(rs.c4);
01278 
01279       um++;
01280       if(ix<stop) {
01281         _sse_double_prefetch_su3(um);
01282       }
01283 
01284       _sse_double_load_up((*sn).c1);
01285       _sse_double_load(rs.c1);
01286       _sse_double_vector_mul(fact5);
01287       _sse_double_vector_add();
01288       _sse_double_store((*sn).c1);
01289 
01290       _sse_double_load_up((*sn).c2);
01291       _sse_double_load(rs.c2);
01292       _sse_double_vector_mul(fact5);
01293       _sse_double_vector_add();
01294       _sse_double_store((*sn).c2);
01295 
01296       _sse_double_load_up((*sn).c3);
01297       _sse_double_load(rs.c3);
01298       _sse_double_vector_mul(fact5);
01299       _sse_double_vector_add();
01300       _sse_double_store((*sn).c3);
01301 
01302       _sse_double_load_up((*sn).c4);
01303       _sse_double_load(rs.c4);
01304       _sse_double_vector_mul(fact5);
01305       _sse_double_vector_add();
01306       _sse_double_store((*sn).c4);
01307       
01308       /*************** end of loop ***********************/
01309   }
01310 #endif // if defined(USE_DOUBLE_PRECISION)
01311  
01312   }
01313 };
01314 
01315 #endif // if defined(SSE2)
01316 
01317 

Generated on Sun Feb 27 15:12:18 2005 by  doxygen 1.4.1