00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 bool mdp_write_double_as_float(FILE *fp,
00014 void* data,
00015 long psize,
00016 long header_size,
00017 long position,
00018 const mdp_lattice &lattice) {
00019 double *p=(double*) data;
00020 float *q=(float*) malloc(psize/2);
00021 for(uint i=0; i<psize/sizeof(double); i++) q[i]=p[i];
00022 if(fseek(fp, position*psize/2+header_size, SEEK_SET) ||
00023 fwrite(q, psize/2,1, fp)!=1) return false;
00024 free(q);
00025 return true;
00026 }
00027
00028 bool mdp_read_double_as_float(FILE *fp,
00029 void* data,
00030 long psize,
00031 long header_size,
00032 long position,
00033 const mdp_lattice &lattice) {
00034 double *p=(double*) data;
00035 float *q=(float*) malloc(psize/2);
00036 if(fseek(fp, position*psize/2+header_size, SEEK_SET) ||
00037 fread(q, psize/2,1, fp)!=1) return false;
00038 for(uint i=0; i<psize/sizeof(double); i++) p[i]=q[i];
00039 free(q);
00040 return true;
00041 }
00042
00043 bool mdp_write_float_as_double(FILE *fp,
00044 void* data,
00045 long psize,
00046 long header_size,
00047 long position,
00048 const mdp_lattice &lattice) {
00049 float *p=(float*) data;
00050 double *q=(double*) malloc(psize*2);
00051 for(uint i=0; i<psize/sizeof(float); i++) q[i]=p[i];
00052 if(fseek(fp, position*psize*2+header_size, SEEK_SET) ||
00053 fwrite(q, psize*2,1, fp)!=1) return false;
00054 free(q);
00055 return true;
00056 }
00057
00058 bool mdp_read_float_as_double(FILE *fp,
00059 void* data,
00060 long psize,
00061 long header_size,
00062 long position,
00063 const mdp_lattice &lattice) {
00064 float *p=(float*) data;
00065 double *q=(double*) malloc(psize*2);
00066 if(fseek(fp, position*psize*2+header_size, SEEK_SET) ||
00067 fread(q, psize*2,1, fp)!=1) return false;
00068 for(uint i=0; i<psize/sizeof(float); i++) p[i]=q[i];
00069 free(q);
00070 return true;
00071 }
00072
00085 class mdp_complex_field : public mdp_field<mdp_complex> {
00086 public:
00087 mdp_complex_field() {}
00088 mdp_complex_field(mdp_lattice& lattice, int n=1) {
00089 allocate_field(lattice,n);
00090 }
00091 inline void operator= (const mdp_complex_field &psi) {
00092 if(&lattice()==0) return;
00093 if(&lattice() != &psi.lattice() ||
00094 size!=psi.size ||
00095 field_components!=psi.field_components)
00096 error("mdp_field: operator=() impatible fields");
00097
00098 long i=0;
00099
00100 #if defined(SSE2) && defined(USE_DOUBLE_PRECISION) && !defined(NO_SSE2_LINALG)
00101 _sse_double* r=(_sse_double*) m;
00102 _sse_double* s=(_sse_double*) psi.m;
00103
00104 for(; i<size-7; i+=8, r+=8, s+=8) {
00105 _sse_double_prefetch_16(s+8);
00106 _sse_double_copy_16(r,s);
00107 }
00108 #endif
00109
00110 for(; i<size; i++) m[i]=psi.m[i];
00111 }
00112
00113 inline void operator*=(const mdp_complex alpha) {
00114 long i_min=physical_local_start(EVENODD);
00115 long i_max=physical_local_stop(EVENODD);
00116 long i=i_min;
00117 for(; i<i_max; i++) m[i]*=alpha;
00118 }
00119 inline void operator/=(const mdp_complex alpha) {
00120 (*this)*=(1.0/alpha);
00121 }
00122
00123 inline void operator*=(const mdp_real alpha) {
00124 long i_min=physical_local_start(EVENODD);
00125 long i_max=physical_local_stop(EVENODD);
00126 long i=i_min;
00127
00128 if(alpha==1) return;
00129
00130 #if defined(SSE2) && defined(USE_DOUBLE_PRECISION) && !defined(NO_SSE2_LINALG)
00131 static _sse_double c ALIGN16;
00132 _sse_double* r=(_sse_double*) physical_address(i_min);
00133
00134 _sse_check_alignment(&c,0xf);
00135
00136 c.c1=c.c2=alpha;
00137 for(; i<i_max-7; i+=8, r+=8) {
00138 _sse_double_prefetch_16(r+8);
00139 _sse_double_multiply_16(r,c,r);
00140 }
00141 #endif
00142
00143 for(; i<i_max; i++) m[i]*=alpha;
00144 }
00145
00146 inline void operator/=(const mdp_real alpha) {
00147 (*this)*=(1.0/alpha);
00148 }
00149
00150
00151 inline void operator+=(mdp_complex_field &psi) {
00152 long i_min=psi.physical_local_start(EVENODD);
00153 long i_max=psi.physical_local_stop(EVENODD);
00154 long i=i_min;
00155
00156 #if defined(SSE2) && defined(USE_DOUBLE_PRECISION) && !defined(NO_SSE2_LINALG)
00157 _sse_double* r=(_sse_double*) physical_address(i_min);
00158 _sse_double* s=(_sse_double*) psi.physical_address(i_min);
00159
00160 for(; i<i_max-7; i+=8, r+=8, s+=8) {
00161 _sse_double_prefetch_16(s+8);
00162 _sse_double_add_16(r,s);
00163 }
00164 #endif
00165
00166 for(; i<i_max; i++) m[i]+=psi.m[i];
00167 }
00168
00169 inline void operator-=(mdp_complex_field &psi) {
00170 long i_min=psi.physical_local_start(EVENODD);
00171 long i_max=psi.physical_local_stop(EVENODD);
00172 long i=i_min;
00173
00174 #if defined(SSE2) && defined(USE_DOUBLE_PRECISION) && !defined(NO_SSE2_LINALG)
00175 _sse_double* r=(_sse_double*) physical_address(i_min);
00176 _sse_double* s=(_sse_double*) psi.physical_address(i_min);
00177
00178 for(; i<i_max-7; i+=8, r+=8, s+=8) {
00179 _sse_double_prefetch_16(s+8);
00180 _sse_double_sub_16(r,s);
00181 }
00182 #endif
00183
00184 for(; i<i_max; i++) m[i]-=psi.m[i];
00185 }
00186
00187
00188 inline friend mdp_real norm_square(mdp_complex_field &psi,
00189 int parity=EVENODD) {
00190 double n2=0;
00191 long i_min=psi.physical_local_start(parity);
00192 long i_max=psi.physical_local_stop(parity);
00193 long i=i_min;
00194
00195 #if defined(SSE2) && defined(USE_DOUBLE_PRECISION) && !defined(NO_SSE2_LINALG)
00196 static _sse_double c ALIGN16;
00197 _sse_double* r=(_sse_double*) psi.physical_address(i_min);
00198
00199 _sse_check_alignment(&c,0xf);
00200
00201 c.c1=c.c2=0;
00202 for(; i<i_max-7; i+=8, r+=8) {
00203 _sse_double_prefetch_16(r+8);
00204 _sse_double_add_norm_square_16(r,c);
00205 }
00206 n2+=c.c1+c.c2;
00207 #endif
00208
00209 for(; i<i_max; i++)
00210 n2+=abs2(psi[i]);
00211 mpi.add(n2);
00212 return n2;
00213 }
00214
00215 inline friend mdp_complex scalar_product(mdp_complex_field &psi,
00216 mdp_complex_field &chi,
00217 int parity=EVENODD) {
00218 mdp_complex n2=0;
00219 long i_min=psi.physical_local_start(parity);
00220 long i_max=psi.physical_local_stop(parity);
00221 long i=i_min;
00222
00223 #if defined(SSE2) && defined(USE_DOUBLE_PRECISION) && !defined(NO_SSE2_LINALG)
00224 static _sse_double c,d ALIGN16;
00225 _sse_double* r=(_sse_double*) psi.physical_address(i_min);
00226 _sse_double* s=(_sse_double*) chi.physical_address(i_min);
00227
00228 _sse_check_alignment(&c,0xf);
00229
00230 c.c1=c.c2=0;
00231 d.c1=d.c2=0;
00232 for(; i<i_max-7; i+=8, r+=8, s+=8) {
00233 _sse_double_prefetch_16(r+8);
00234 _sse_double_prefetch_16(s+8);
00235 _sse_double_add_real_scalar_product_16(r,s,c);
00236 _sse_double_add_imag_scalar_product_16(r,s,d);
00237 }
00238 n2+=mdp_complex(c.c1+c.c2,d.c2-d.c1);
00239 #endif
00240
00241 for(;i<i_max; i++)
00242 n2+=conj(psi[i])*chi[i];
00243 mpi.add(n2);
00244
00245 return n2;
00246 }
00247
00248 inline friend mdp_real real_scalar_product(mdp_complex_field &psi,
00249 mdp_complex_field &chi,
00250 int parity=EVENODD) {
00251
00252 double n2=0;
00253 long i_min=psi.physical_local_start(parity);
00254 long i_max=psi.physical_local_stop(parity);
00255 long i=i_min;
00256
00257 #if defined(SSE2) && defined(USE_DOUBLE_PRECISION) && !defined(NO_SSE2_LINALG)
00258 static _sse_double c ALIGN16;
00259 _sse_double* r=(_sse_double*) psi.physical_address(i_min);
00260 _sse_double* s=(_sse_double*) chi.physical_address(i_min);
00261
00262 _sse_check_alignment(&c,0xf);
00263
00264 c.c1=c.c2=0;
00265 for(; i<i_max-7; i+=8, r+=8, s+=8) {
00266 _sse_double_prefetch_16(r+8);
00267 _sse_double_prefetch_16(s+8);
00268 _sse_double_add_real_scalar_product_16(r,s,c);
00269 }
00270 n2+=c.c1+c.c2;
00271 #endif
00272
00273 for(;i<i_max; i++)
00274 n2+=
00275 real(chi[i])*real(psi[i])+
00276 imag(chi[i])*imag(psi[i]);
00277
00278 mpi.add(n2);
00279 return n2;
00280 }
00281
00282 inline friend mdp_real imag_scalar_product(mdp_complex_field &psi,
00283 mdp_complex_field &chi,
00284 int parity=EVENODD) {
00285 double n2=0;
00286 long i_min=psi.physical_local_start(parity);
00287 long i_max=psi.physical_local_stop(parity);
00288 long i=i_min;
00289
00290 #if defined(SSE2) && defined(USE_DOUBLE_PRECISION) && !defined(NO_SSE2_LINALG)
00291 static _sse_double c ALIGN16;
00292 _sse_double* r=(_sse_double*) psi.physical_address(i_min);
00293 _sse_double* s=(_sse_double*) chi.physical_address(i_min);
00294
00295 _sse_check_alignment(&c,0xf);
00296
00297 c.c1=c.c2=0;
00298 for(; i<i_max-7; i+=8, r+=8, s+=8) {
00299 _sse_double_prefetch_16(r+8);
00300 _sse_double_prefetch_16(s+8);
00301 _sse_double_add_imag_scalar_product_16(r,s,c);
00302 }
00303 n2+=c.c2-c.c1;
00304 #endif
00305
00306 for(;i<i_max; i++)
00307 n2+=
00308 real(psi[i])*imag(chi[i])+
00309 imag(psi[i])*real(chi[i]);
00310 mpi.add(n2);
00311 return n2;
00312 }
00313
00314 inline friend void mdp_add_scaled_field(mdp_complex_field &psi,
00315 mdp_real alpha,
00316 mdp_complex_field &chi,
00317 int parity=EVENODD) {
00318 long i_min=psi.physical_local_start(parity);
00319 long i_max=psi.physical_local_stop(parity);
00320 long i=i_min;
00321
00322 #if defined(SSE2) && defined(USE_DOUBLE_PRECISION) && !defined(NO_SSE2_LINALG)
00323 static _sse_double c ALIGN16;
00324 _sse_double* r=(_sse_double*) psi.physical_address(i_min);
00325 _sse_double* s=(_sse_double*) chi.physical_address(i_min);
00326
00327 _sse_check_alignment(&c,0xf);
00328
00329 c.c1=c.c2=alpha;
00330 for(i=0; i<i_max-7; i+=8, r+=8, s+=8) {
00331 _sse_double_prefetch_16(r+8);
00332 _sse_double_prefetch_16(s+8);
00333 _sse_double_add_multiply_16(r,c,s);
00334 }
00335
00336 #endif
00337
00338 for(;i<i_max; i++)
00339 psi[i]+=alpha*chi[i];
00340 }
00341
00342 inline friend void mdp_add_scaled_field(mdp_complex_field &psi,
00343 mdp_complex alpha,
00344 mdp_complex_field &chi,
00345 int parity=EVENODD) {
00346 long i_min=psi.physical_local_start(parity);
00347 long i_max=psi.physical_local_stop(parity);
00348 long i=i_min;
00349
00350 for(;i<i_max; i++)
00351 psi[i]+=alpha*chi[i];
00352 }
00353
00354 inline friend mdp_complex operator* (mdp_complex_field &psi,
00355 mdp_complex_field &chi) {
00356 return scalar_product(psi, chi);
00357 }
00358
00359 inline friend mdp_real relative_residue(mdp_complex_field& p,
00360 mdp_complex_field& q,
00361 int parity=EVENODD) {
00362 register double residue=0, num=0, den=0;
00363 long i_min=p.physical_local_start(parity);
00364 register long i_max=q.physical_local_stop(parity);
00365 register long i=i_min;
00366
00367 for(;i<i_max;) {
00368 num+=abs2(p[i]);
00369 den+=abs2(q[i]);
00370 if(++i % p.size_per_site()==0) {
00371 residue+=(den==0)?1.0:(num/den);
00372 num=den=0;
00373 }
00374 }
00375 mpi.add(residue);
00376 return sqrt(residue/p.lattice().global_volume());
00377 }
00378
00379 bool save_as_float(string filename,
00380 int processIO=0,
00381 long max_buffer_size=1024,
00382 bool load_header=true,
00383 long skip_bytes=0) {
00384 #if defined(USE_DOUBLE_PRECISION)
00385 header.bytes_per_site/=2;
00386 save(filename,processIO, max_buffer_size,load_header,skip_bytes,
00387 mdp_write_double_as_float);
00388 header.bytes_per_site*=2;
00389 #else
00390 save(filename,processIO, max_buffer_size,load_header,skip_bytes,0);
00391 #endif
00392 return true;
00393 }
00394
00395 bool load_as_float(string filename,
00396 int processIO=0,
00397 long max_buffer_size=1024,
00398 bool load_header=true,
00399 long skip_bytes=0) {
00400
00401 #if defined(USE_DOUBLE_PRECISION)
00402 header.bytes_per_site/=2;
00403 load(filename,processIO, max_buffer_size,load_header,skip_bytes,
00404 mdp_read_double_as_float, true);
00405 header.bytes_per_site*=2;
00406 #else
00407 load(filename,processIO, max_buffer_size,load_header,skip_bytes,0,true);
00408 #endif
00409 return true;
00410 }
00411
00412
00413 bool load_as_double(string filename,
00414 int processIO=0,
00415 long max_buffer_size=1024,
00416 bool load_header=true,
00417 long skip_bytes=0) {
00418 #if !defined(USE_DOUBLE_PRECISION)
00419 header.bytes_per_site*=2;
00420 load(filename,processIO, max_buffer_size,load_header,skip_bytes,
00421 mdp_read_float_as_double, true);
00422 header.bytes_per_site/=2;
00423 #else
00424 load(filename,processIO, max_buffer_size,load_header,skip_bytes,0,true);
00425 #endif
00426 return true;
00427 }
00428
00429 bool save_as_double(string filename,
00430 int processIO=0,
00431 long max_buffer_size=1024,
00432 bool load_header=true,
00433 long skip_bytes=0) {
00434 #if !defined(USE_DOUBLE_PRECISION)
00435 header.bytes_per_site*=2;
00436 save(filename,processIO, max_buffer_size,load_header,skip_bytes,
00437 mdp_write_float_as_double);
00438 header.bytes_per_site/=2;
00439 #else
00440 save(filename,processIO, max_buffer_size,load_header,skip_bytes,0);
00441 #endif
00442 return true;
00443 }
00444 };
00445
00446