00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <config.h>
00013 #include <util/gjp.h>
00014 #include <comms/scu.h>
00015 #include <comms/glb.h>
00016 #include <util/lattice.h>
00017 #include <util/dirac_op.h>
00018 #include <util/vector.h>
00019 #include <stdio.h>
00020 #include <stdlib.h>
00021
00022 #include <fcntl.h>
00023
00024 #include <math.h>
00025 CPS_START_NAMESPACE
00026
00027 #undef num //temperary test for uc_l and uc_nl
00028
00029
00030 #define fat_link
00031 #define naik
00032 #define staple5
00033 #define staple7
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046 typedef struct gauge_agg{
00047 int src;
00048 int dest;
00049 IFloat mat[18];
00050 } gauge_agg ;
00051
00052 void dirac_sum2_acc_cpp(int s, long chi, long tmpfrm, long b);
00053 void dirac_cmv_cpp( int sites, long chi, long u,long a, long tmpfrm);
00054 void dirac_cmv_jcw_agg_cpp( int sites, long chi, long u,long a, long tmpfrm);
00055 void dirac_sum2_cpp(int s, long chi, long tmpfrm, long b);
00056 void copy_buffer_cpp(int n, long src, long dest, long ptable);
00057
00058 extern "C" void copy_buffer(int n, long src, long dest, long ptable);
00059
00060 enum{VECT_LEN=6, VECT_LEN2=6, MATRIX_SIZE=18, SITE_LEN=72, NUM_DIR=8};
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070 static int size[4];
00071 static int split;
00072 static int coord[4];
00073 static int coord_nn[4];
00074 static int coord_knn[4];
00075 static int vol;
00076 static int non_local_chi_3;
00077 static int non_local_chi;
00078 static int local_chi;
00079 static int local_chi_3;
00080 static int buffer_order[3][4];
00081 static int buffer_flush[3][8];
00082 static int nflush;
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093 static gauge_agg * uc_l_agg[2];
00094 static gauge_agg * uc_nl_agg[2];
00095 static IFloat * uc_l[2];
00096 static IFloat * uc_nl[2];
00097
00098
00099
00100
00101
00102
00103 static IFloat *tmpfrm;
00104
00105 static IFloat *Tbuffer[3][2];
00106
00107
00108
00109
00110 static int intreg[18] ;
00111 static IFloat dreg[18] ;
00112 static int * ToffsetP[3][2];
00113 static int * ToffsetM[3][2];
00114 static int countP[3][2];
00115 static int countM[3][2];
00116
00117
00118
00119
00120 static IFloat * chi_off_node_total;
00121 static IFloat * chi_off_node[3][8];
00122 static IFloat * chi_off_node_p[3][8];
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135 static IFloat ** chi_l[2];
00136 static IFloat ** chi_nl[2];
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147 #if 0
00148 SCUDirArgIR * SCUarg[2*NUM_DIR];
00149 SCUDirArgIR SCUargIR[2*NUM_DIR];
00150 SCUDirArgMulti * SCUmulti;
00151 SCUDirArgMulti SCUmultiIR;
00152
00153 SCUDirArgIR * SCUarg_1[2*NUM_DIR];
00154 SCUDirArgIR SCUarg_1IR[2*NUM_DIR];
00155 SCUDirArgMulti * SCUmulti_1;
00156 SCUDirArgMulti SCUmulti_1IR;
00157
00158 SCUDMAInst *SCUDMAarg_p[NUM_DIR*4];
00159 SCUDMAInst SCUDMAarg[NUM_DIR*4];
00160
00161 SCUDirArgIR * SCUarg_2[2*NUM_DIR];
00162 SCUDirArgIR SCUarg_2IR[2*NUM_DIR];
00163 SCUDirArgMulti * SCUmulti_2;
00164 SCUDirArgMulti SCUmulti_2IR;
00165 #endif
00166
00167
00168
00169
00170
00171
00172 static int Xoffset[3][NUM_DIR];
00173
00174
00175
00176
00177
00178
00179 static int SetCoord( int sg );
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190 static int CoordNN( int nn );
00191
00192 static int CoordkNN( int nn, int k );
00193
00194
00195
00196
00197
00198 static int LexGauge( int * c );
00199
00200
00201
00202
00203
00204 static int LexVector( int * c );
00205
00206
00207
00208
00209
00210
00211 static int LexSurface( int * cc, int surface );
00212
00213 static IFloat * gauge_field_addr;
00214
00215 static int blklen[NUM_DIR/2];
00216 static int numblk[NUM_DIR/2];
00217 static int stride[NUM_DIR/2];
00218
00219 int k;
00220
00221
00222
00223
00224 extern "C"
00225 void asqtad_dirac_init(Fasqtad * lat )
00226 {
00227 gauge_field_addr = ( IFloat * ) lat->GaugeField();
00228 int r,c,i,j,m,n,nn;
00229 int local_count[2];
00230 int non_local_count[2];
00231 int local_count_3[2];
00232 int non_local_count_3[3][2];
00233 int off_node;
00234 int x[NUM_DIR/2];
00235 char buf[200];
00236 int fd;
00237 char *cname = "DiracOpAsqtad";
00238 char *fname = "dirac_init(const void *gauge)";
00239
00240
00241
00242
00243
00244
00245
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258 int sg, sc;
00259
00260
00261
00262
00263
00264 int odd;
00265
00266
00267
00268
00269
00270
00271
00272
00273 size[0] = GJP.TnodeSites();
00274 size[1] = GJP.XnodeSites();
00275 size[2] = GJP.YnodeSites();
00276 size[3] = GJP.ZnodeSites();
00277 split = ( (size[0]>2 &&size[1]>2 && size[2]>2 && size[3] >2 ) ? 0 : 1 );
00278
00279
00280 vol = size[0] * size[1] * size[2] * size[3];
00281
00282 non_local_chi = 2*(size[0]*size[1]*size[2] + size[1]*size[2]*size[3]+
00283 size[2]*size[3]*size[0] + size[3]*size[0]*size[1]);
00284
00285 non_local_chi_3 = 0;
00286 for(i=0;i<4;i++){
00287 if(size[i]<4)
00288 non_local_chi_3 += 4*vol/size[i];
00289 else
00290 non_local_chi_3 += 6*vol/size[i];
00291 }
00292
00293
00294 local_chi = NUM_DIR*vol - non_local_chi;
00295 local_chi_3 = NUM_DIR*vol - non_local_chi_3;
00296 nflush = vol/8;
00297 tmpfrm = (IFloat*)smalloc(NUM_DIR*2*vol/2 * VECT_LEN2 * sizeof(IFloat),
00298 "tmpfrm",fname,cname);
00299
00300
00301
00302
00303
00304
00305
00306 chi_off_node_total = (IFloat*)
00307 smalloc(3*non_local_chi*VECT_LEN*sizeof(IFloat)/2,
00308 "chi_off_node_total",fname,cname);
00309 for ( j= 0; j < 3; j++ ){
00310 chi_off_node[j][0] = &(chi_off_node_total[ j*non_local_chi* VECT_LEN/2 ] );
00311 chi_off_node_p[j][0] = (IFloat *)(sizeof (IFloat)*j*non_local_chi* VECT_LEN/2 );
00312 for ( i = 1; i < NUM_DIR; i++ ){
00313 chi_off_node[j][i] = chi_off_node[j][i-1]+vol/(2*size[i%4])*VECT_LEN;
00314 chi_off_node_p[j][i] = chi_off_node_p[j][i-1]+vol/(2*size[i%4])*VECT_LEN;
00315 }
00316 }
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326 #ifndef SIMUL
00327 for ( i = 0; i < 2; i++ ){
00328 chi_l[i] = (IFloat**)smalloc((2*(local_chi+local_chi_3)/2)*sizeof(IFloat*),
00329 "chi_l[i]", fname, cname);
00330 chi_nl[i] = (IFloat**)
00331 smalloc((2*(non_local_chi+non_local_chi_3)/2)*sizeof(IFloat*),
00332 "chi_l[i]", fname, cname);
00333 }
00334 #endif
00335
00336 for ( i = 0; i < 2; i++){
00337 local_count[i] = 0;
00338 non_local_count[i] = 0;
00339 local_count_3[i] = 0;
00340 for( n = 0; n < 3; n++ ) non_local_count_3[n][i] = 0;
00341
00342 }
00343
00344
00345 #ifndef SIMUL
00346
00347
00348
00349 for ( n = 0; n < NUM_DIR; n++ ) {
00350
00351
00352
00353
00354 for (x[3] = 0; x[3] < size[3]; x[3]++){
00355 for (x[2] = 0; x[2] < size[2]; x[2]++){
00356 for (x[1] = 0; x[1] < size[1]; x[1]++){
00357 for (x[0] = 0; x[0] < size[0]; x[0]++){
00358
00359 for (i = 0; i < 4 ; i++) coord[i] = x[i];
00360
00361 odd = ( coord[0] + coord[1] + coord[2] + coord[3] ) % 2;
00362
00363 sg = coord[1] + size[1] * ( coord[2] + size[2] * ( coord[3] + size[3] * coord[0] ));
00364 m = (2* NUM_DIR + 1) * (sg/2);
00365
00366 if ( CoordNN( n ) ) {
00367
00368
00369
00370
00371
00372
00373
00374 *( chi_nl[ odd ] + 2 * non_local_count[ odd ] )
00375 = ( IFloat * ) ( VECT_LEN * ( LexVector( coord_nn ) / 2 ) * sizeof(IFloat));
00376
00377
00378 *( chi_nl[ odd ] + 2*non_local_count[ odd ] +1 ) =
00379 ( IFloat * ) ( VECT_LEN2 * (LexVector( coord ) / 2 * 2*NUM_DIR+n ) * sizeof(IFloat));
00380 non_local_count[odd]++;
00381 }
00382 }
00383 }
00384 }
00385 }
00386 }
00387
00388
00389
00390 for ( n = 0; n < NUM_DIR; n++ ) {
00391
00392
00393
00394
00395 for (x[3] = 0; x[3] < size[3]; x[3]++){
00396 for (x[2] = 0; x[2] < size[2]; x[2]++){
00397 for (x[1] = 0; x[1] < size[1]; x[1]++){
00398 for (x[0] = 0; x[0] < size[0]; x[0]++){
00399
00400 for (i = 0; i < 4 ; i++) coord[i] = x[i];
00401
00402 odd = ( coord[0] + coord[1] + coord[2] + coord[3] ) % 2;
00403
00404 sg = coord[1] + size[1] * ( coord[2] + size[2] * ( coord[3] + size[3] * coord[0] ));
00405 m = (2* NUM_DIR + 1) * (sg/2);
00406
00407 if ( !CoordNN( n ) ) {
00408 #ifndef SIMUL
00409
00410 *( chi_l[ odd ] + 2 * local_count[ odd ] )
00411 = ( IFloat * ) ( VECT_LEN * ( LexVector( coord_nn ) / 2 ) * sizeof(IFloat));
00412
00413 *( chi_l[ odd ] + 2 * local_count[ odd ] + 1) =
00414 ( IFloat * ) ( VECT_LEN2 * (LexVector( coord ) / 2 *2*NUM_DIR+n) * sizeof(IFloat));
00415 #endif
00416 local_count[odd]++;
00417 }
00418 }
00419 }
00420 }
00421 }
00422 }
00423
00424
00425
00426 for ( n = 0; n < NUM_DIR; n++ ) {
00427
00428
00429
00430
00431 for (x[3] = 0; x[3] < size[3]; x[3]++){
00432 for (x[2] = 0; x[2] < size[2]; x[2]++){
00433 for (x[1] = 0; x[1] < size[1]; x[1]++){
00434 for (x[0] = 0; x[0] < size[0]; x[0]++){
00435
00436 for (i = 0; i < 4 ; i++) coord[i] = x[i];
00437
00438 odd = ( coord[0] + coord[1] + coord[2] + coord[3] ) % 2;
00439
00440 sg = coord[1] + size[1] * ( coord[2] + size[2] * ( coord[3] + size[3] * coord[0] ));
00441 m = (2* NUM_DIR + 1) * (sg/2);
00442
00443
00444
00445
00446
00447 if ( CoordkNN( n, 3 ) ) {
00448
00449 for (j=0; j<3; j++){
00450 if((coord_knn[n%4]==j&&n<4)||(coord_knn[n%4]==(size[n%4]-1-j)&& n>3)) {
00451 *( chi_nl[ odd ] + 2 * non_local_count_3[j][odd]+(j+1)*non_local_chi )
00452 = ( IFloat * ) ( VECT_LEN * ( LexVector( coord_knn ) / 2 )
00453 * sizeof(IFloat));
00454 *( chi_nl[odd] + 2 * non_local_count_3[j][odd] + (j+1)*non_local_chi+1 ) = ( IFloat * ) ( VECT_LEN2 * (LexVector( coord ) / 2*2*NUM_DIR+n+8 ) * sizeof(IFloat));
00455 non_local_count_3[j][odd]++;
00456 }
00457 }
00458
00459
00460 }
00461 }
00462 }
00463 }
00464 }
00465 }
00466
00467
00468
00469 for ( n = 0; n < NUM_DIR; n++ ) {
00470
00471
00472
00473
00474 for (x[3] = 0; x[3] < size[3]; x[3]++){
00475 for (x[2] = 0; x[2] < size[2]; x[2]++){
00476 for (x[1] = 0; x[1] < size[1]; x[1]++){
00477 for (x[0] = 0; x[0] < size[0]; x[0]++){
00478
00479 for (i = 0; i < 4 ; i++) coord[i] = x[i];
00480
00481 odd = ( coord[0] + coord[1] + coord[2] + coord[3] ) % 2;
00482
00483 sg = coord[1] + size[1] * ( coord[2] + size[2] * ( coord[3] + size[3] * coord[0] ));
00484 m = (2* NUM_DIR + 1) * (sg/2);
00485
00486 if ( !CoordkNN( n, 3 ) ) {
00487 #ifndef SIMUL
00488
00489 *( chi_l[ odd ] + 2 * local_count_3[ odd ] + local_chi )
00490 = ( IFloat * ) ( VECT_LEN * ( LexVector( coord_knn ) / 2 )
00491 * sizeof(IFloat));
00492
00493 *( chi_l[ odd ] + 2 * local_count_3[ odd ] + local_chi + 1) =
00494 ( IFloat * ) ( VECT_LEN2 * (LexVector( coord ) / 2 *2*NUM_DIR+n+8) * sizeof(IFloat));
00495 #endif
00496 local_count_3[odd]++;
00497 }
00498
00499
00500 }
00501 }
00502 }
00503 }
00504 }
00505 #endif
00506 FILE *fp;
00507
00508 #ifndef SIMUL
00509 #if 0
00510 fp=Fopen("chi_l.h","w");
00511 for(j=0;j<2;j++){
00512 Fprintf(fp,"IFloat * chi_l%d[] LOCATE(\"edramtransient\") = {\n",j);
00513 Fprintf(fp," (IFloat *) %d",*(chi_l[j]));
00514 for(i=1;i< 2*((local_chi+local_chi_3)/2);i++){
00515 Fprintf(fp,",\n (IFloat *) %d",*(chi_l[j]+i));
00516 }
00517 Fprintf(fp,"\n};\n");
00518 }
00519 Fclose(fp);
00520
00521 fp=Fopen("chi_nl.h","w");
00522 Fprintf(fp,"testing\n");
00523 for(j=0;j<2;j++){
00524 Fprintf(fp,"IFloat * chi_nl%d[] LOCATE(\"edramtransient\") = {\n",j);
00525 Fprintf(fp," (IFloat *) %d",*(chi_nl[j]));
00526 for(i=1;i< 2*((non_local_chi+ non_local_chi_3)/2);i++){
00527 Fprintf(fp,",\n (IFloat *) %d",*(chi_nl[j]+i));
00528 }
00529 Fprintf(fp,"\n};\n");
00530 }
00531
00532 Fclose(fp);
00533 #endif
00534
00535 #endif
00536
00537
00538
00539
00540
00541
00542
00543
00544
00545
00546
00547 blklen[0] = VECT_LEN * sizeof(IFloat) * size[1] * size[2] * size[3] / 2;
00548 blklen[1] = VECT_LEN * sizeof(IFloat) * size[0] / 2;
00549 blklen[2] = VECT_LEN * sizeof(IFloat) * size[0] * size[1] / 2;
00550 blklen[3] = VECT_LEN * sizeof(IFloat) * size[0] * size[1] * size[2] / 2;
00551
00552 numblk[0] = 1;
00553 numblk[1] = size[2] * size[3];
00554 numblk[2] = size[3];
00555 numblk[3] = 1;
00556
00557 stride[0] = 0;
00558 stride[1] = (VECT_LEN * size[0] * ( size[1] - 1 ) / 2)*sizeof(IFloat);
00559 stride[2] = (VECT_LEN * size[0] * size[1] * ( size[2] - 1 ) / 2)* sizeof(IFloat) ;
00560 stride[3] = 0;
00561
00562
00563
00564
00565
00566
00567
00568 for ( k = 0; k < 3; k++ ) {
00569 for ( i = 0; i < 2; i++ ) {
00570 #ifndef SIMUL_TBUF
00571 Tbuffer[k][i] = (IFloat*)
00572 smalloc(size[1]*size[2]*size[3]*VECT_LEN*sizeof(IFloat)/2,
00573 "Tbuffer[k][i]", fname, cname);
00574 ToffsetP[k][i] = (int*) smalloc(size[1]*size[2]*size[3]*sizeof(int)/2,
00575 "ToffsetP[k][i]", fname, cname);
00576 ToffsetM[k][i] = (int*) smalloc(size[1]*size[2]*size[3]*sizeof(int)/2,
00577 "ToffsetM[k][i]", fname, cname);
00578 #endif
00579 countP[k][i] = 0;
00580 countM[k][i] = 0;
00581 }
00582 }
00583 for ( sg = 0; sg < vol; sg++ ) {
00584
00585 odd = SetCoord( sg );
00586 sc = LexVector( coord );
00587
00588 for ( int j = 0; j < 3; j++ ) {
00589 if ( coord[0] == j ) {
00590 *( ToffsetM[j][ odd ] + countM[j][ odd ] ) = VECT_LEN * ( sc / 2 );
00591 countM[j][ odd ]++;
00592 }
00593 if ( coord[0] == size[0] - 1 -j ) {
00594 *( ToffsetP[j][ odd ] + countP[j][ odd ] ) = VECT_LEN * ( sc / 2 );
00595 countP[j][ odd ]++;
00596 }
00597 }
00598 }
00599
00600 int vol3 = (size[1] * size[2] * size[3])/2;
00601
00602
00603
00604
00605
00606
00607 for ( i = 0; i < NUM_DIR; i++ ) {
00608 j = i % (NUM_DIR/2);
00609 #if 0
00610 SCUarg[i + NUM_DIR] = &(SCUargIR[i+NUM_DIR]);
00611 SCUarg[i + NUM_DIR] ->Init(chi_off_node[2][i], scudir[i], SCU_REC,
00612 VECT_LEN * sizeof(IFloat) * vol / ( 2 * size[j] ), 1, 0, IR_7);
00613 SCUarg[i + NUM_DIR] ->Assert();
00614
00615 SCUDMAarg_p[(i+NUM_DIR)*2] = &(SCUDMAarg[(i+NUM_DIR)*2]);
00616 SCUDMAarg_p[(i+NUM_DIR)*2] ->Init(chi_off_node[0][i],
00617 VECT_LEN * sizeof(IFloat) * vol / ( 2 * size[j] ), 1, 0);
00618 SCUDMAarg_p[(i+NUM_DIR)*2+1] = &(SCUDMAarg[(i+NUM_DIR)*2+1]);
00619 SCUDMAarg_p[(i+NUM_DIR)*2+1] ->Init(chi_off_node[1][i],
00620 VECT_LEN * sizeof(IFloat) * vol / ( 2 * size[j] ), 1, 0);
00621
00622 if( split ){
00623 SCUarg_1[i + NUM_DIR] = &(SCUarg_1IR[i+NUM_DIR]);
00624 SCUarg_1[i + NUM_DIR] ->Init(scudir[i],SCU_REC, &SCUDMAarg_p[(i+NUM_DIR)*2],1, IR_5);
00625 SCUarg_1[i + NUM_DIR] ->Assert();
00626 SCUarg_2[i + NUM_DIR] = &(SCUarg_2IR[i+NUM_DIR]);
00627 SCUarg_2[i + NUM_DIR] ->Init(scudir[i],SCU_REC, &SCUDMAarg_p[(i+NUM_DIR)*2+1],1, IR_6);
00628 SCUarg_2[i + NUM_DIR] ->Assert();
00629 } else {
00630 SCUarg_1[i + NUM_DIR] = &(SCUarg_1IR[i+NUM_DIR]);
00631 SCUarg_1[i + NUM_DIR] ->Init(scudir[i],SCU_REC, &SCUDMAarg_p[(i+NUM_DIR)*2],2, IR_5);
00632 SCUarg_1[i + NUM_DIR] ->Assert();
00633 }
00634
00635
00636
00637 buffer_flush[0][i] = VECT_LEN * sizeof(IFloat) * vol/ (12288 * size[j]);
00638 buffer_flush[1][i] = VECT_LEN * sizeof(IFloat) * vol/ (12288 * size[j]);
00639 buffer_flush[2][i] = VECT_LEN * sizeof(IFloat) * vol/ (12288 * size[j]);
00640
00641
00642 if ((i == 0) || ( i == 4)){
00643 SCUarg[i] = &(SCUargIR[i]);
00644 if (size[j] >4)
00645 SCUarg[i] ->Init (Tbuffer[2][(4 - i)/4], scudir[i], SCU_SEND,
00646 blklen[j], numblk[j], stride[j], IR_7 );
00647 else if (size[j] >2)
00648 SCUarg[i] ->Init (Tbuffer[1][i/4], scudir[i], SCU_SEND,
00649 blklen[j], numblk[j], stride[j], IR_7 );
00650 else
00651 SCUarg[i] ->Init (chi_off_node[0][4-i], scudir[i], SCU_SEND,
00652 blklen[j], numblk[j], stride[j], IR_7 );
00653 SCUarg[i] ->Assert();
00654
00655 SCUDMAarg_p[i*2] = &(SCUDMAarg[i*2]);
00656 SCUDMAarg_p[i*2] ->Init(Tbuffer[0][(4 - i)/4],
00657 blklen[j], numblk[j], stride[j]);
00658 SCUDMAarg_p[i*2+1] = &(SCUDMAarg[i*2+1]);
00659
00660 if(size[j]>2)
00661 SCUDMAarg_p[i*2+1] ->Init(Tbuffer[1][(4 - i)/4],
00662 blklen[j], numblk[j], stride[j]);
00663 else
00664 SCUDMAarg_p[i*2+1] ->Init(Tbuffer[0][i/4],
00665 blklen[j], numblk[j], stride[j]);
00666 if( split ){
00667 SCUarg_1[i] = &(SCUarg_1IR[i]);
00668 SCUarg_1[i] ->Init(scudir[i],SCU_SEND,&SCUDMAarg_p[i*2],1,IR_5);
00669 SCUarg_1[i] ->Assert();
00670 SCUarg_2[i] = &(SCUarg_2IR[i]);
00671 SCUarg_2[i] ->Init(scudir[i],SCU_SEND,&SCUDMAarg_p[i*2+1],1,IR_6);
00672 SCUarg_2[i] ->Assert();
00673 } else {
00674 SCUarg_1[i] = &(SCUarg_1IR[i]);
00675 SCUarg_1[i] ->Init(scudir[i],SCU_SEND,&SCUDMAarg_p[i*2],2,IR_5);
00676 SCUarg_1[i] ->Assert();
00677 }
00678
00679 }
00680 else{
00681 SCUarg[i] = &(SCUargIR[i]);
00682 if(size[j] >2)
00683 SCUarg[i] ->Init(( void * ) 0, scudir[i], SCU_SEND,
00684 blklen[j], numblk[j], stride[j], IR_7 );
00685 else
00686 SCUarg[i] ->Init(chi_off_node[0][(i+4)%NUM_DIR], scudir[i], SCU_SEND,
00687 VECT_LEN * sizeof(IFloat) * vol / ( 2 * size[j]),1,0, IR_7 );
00688 SCUarg[i] ->Assert();
00689
00690 SCUDMAarg_p[i*2] = &(SCUDMAarg[i*2]);
00691 SCUDMAarg_p[i*2] ->Init((void *)0,
00692 blklen[j], numblk[j], stride[j]);
00693 SCUDMAarg_p[i*2+1] = &(SCUDMAarg[i*2+1]);
00694 SCUDMAarg_p[i*2+1] ->Init((void *)0,
00695 blklen[j], numblk[j], stride[j]);
00696 if( split ){
00697 SCUarg_1[i] = &(SCUarg_1IR[i]);
00698 SCUarg_1[i] -> Init(scudir[i],SCU_SEND,&SCUDMAarg_p[i*2],1,IR_5);
00699 SCUarg_1[i] ->Assert();
00700 SCUarg_2[i] = &(SCUarg_2IR[i]);
00701 SCUarg_2[i] -> Init(scudir[i],SCU_SEND,&SCUDMAarg_p[i*2+1],1,IR_6);
00702 SCUarg_2[i] ->Assert();
00703 } else {
00704 SCUarg_1[i] = &(SCUarg_1IR[i]);
00705 SCUarg_1[i] -> Init(scudir[i],SCU_SEND,&SCUDMAarg_p[i*2],2,IR_5);
00706 SCUarg_1[i] ->Assert();
00707 }
00708 }
00709 #endif
00710 }
00711
00712
00713 #if 0
00714 SCUmulti = &SCUmultiIR;
00715 SCUmulti->Init(SCUarg, 2*NUM_DIR);
00716 SCUmulti_1 = &SCUmulti_1IR;
00717 SCUmulti_1->Init(SCUarg_1, 2*NUM_DIR);
00718 if( split ){
00719 SCUmulti_2 = &SCUmulti_2IR;
00720 SCUmulti_2->Init(SCUarg_2, 2*NUM_DIR);
00721 }
00722 #endif
00723
00724
00725
00726
00727
00728
00729 for ( k = 0; k < 3; k++ ) {
00730 Xoffset[k][0] = 0;
00731 Xoffset[k][1] = VECT_LEN * size[0] * (size[1] - 1-k) / 2;
00732 Xoffset[k][2] = VECT_LEN * size[0] * size[1] * (size[2] - 1-k) / 2;
00733 Xoffset[k][3] = VECT_LEN * size[0] * size[1] * size[2] * (size[3]-1-k) / 2;
00734 Xoffset[k][4] = 0;
00735 Xoffset[k][5] = VECT_LEN * size[0] *k /2;
00736 Xoffset[k][6] = VECT_LEN * size[0] * size[1]*k /2;
00737 Xoffset[k][7] = VECT_LEN * size[0] * size[1] * size[2]*k /2;
00738 }
00739 }
00740
00741 extern "C"
00742 void asqtad_destroy_dirac_buf()
00743 {
00744 char *cname = "";
00745 char *fname = "asqtad_destroy_dirac_buf()";
00746 int i,k;
00747
00748 for ( i = 0; i < 2; i++ ) {
00749 for ( k = 0; k < 3; k++ ) {
00750 #ifndef SIMUL_TBUF
00751 sfree(Tbuffer[k][i],"Tbuffer[k][i]",fname,cname);
00752 sfree(ToffsetP[k][i],"ToffsetP[k][i]",fname,cname);
00753 sfree(ToffsetM[k][i],"ToffsetM[k][i]",fname,cname);
00754 #endif
00755 }
00756 #ifndef SIMUL
00757 sfree(chi_nl[i],"chi_nl[i]",fname,cname);
00758 sfree(chi_l[i],"chi_l[i]",fname,cname);
00759 #endif
00760 }
00761
00762 sfree(chi_off_node_total,"chi_off_node_total",fname,cname);
00763 for ( i = 0; i < NUM_DIR; i++ ) {
00764 #if 0
00765 delete SCUarg[i];
00766 delete SCUarg[i+8];
00767 delete SCUarg_1[i];
00768 delete SCUarg_1[i+8];
00769 for(k=0;k<4;k++)
00770 delete SCUDMAarg_p[i*4+k];
00771 #endif
00772 }
00773 sfree(tmpfrm,"tmpfrm",fname,cname);
00774 }
00775
00776
00777
00778
00779
00780 static int SetCoord( int sg )
00781 {
00782 coord[1] = sg % size[1];
00783 coord[2] = ((int)( sg / size[1] )) % size[2];
00784 coord[3] = ((int)( sg / ( size[1] * size[2] ) )) % size[3];
00785 coord[0] = ((int)( sg / ( size[1] * size[2] * size[3] ) )) % size[0];
00786
00787 return ( coord[0] + coord[1] + coord[2] + coord[3] ) % 2;
00788 }
00789
00790
00791
00792
00793
00794
00795 void TransfP(int off_node, int nflush_g,IFloat * v,IFloat * mtmp
00796
00797 , int n ){
00798 getPlusData(mtmp,v,18,(n+1)%4);
00799 }
00800
00801 void TransfM(int off_node, int nflush_g,IFloat * v,IFloat * mtmp, int n ){
00802 getMinusData(mtmp,v,18,(n+1)%4);
00803 }
00804 void DaggerM (IFloat * w_t1,IFloat * v){
00805 int i, j, c, r;
00806 for ( r = 0; r < 3; r++ ) {
00807 for ( c = 0; c < 3; c++ ) {
00808 i = 6*r + 2*c;
00809 j = 6*c + 2*r;
00810 w_t1[i] = v[j];
00811 w_t1[i+1] = -v[j+1];
00812 }
00813 }
00814 }
00815
00816 void Parallel( IFloat * InMatrix,IFloat *OutMatrix , int *coord, int dir_1, int dir_2, IFloat *v, IFloat * u,int multi_flag ,int tranfs_flag )
00817 {
00818 int nn, nflush_g=1, off_node;
00819 IFloat stp3_1[18], mtmp[18] ;
00820 if ( dir_1 <4){
00821
00822 nn = ( dir_1 - 1 + 4 ) % 4;
00823 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
00824 DaggerM(stp3_1, v);
00825 } else{
00826
00827 nn = ( dir_1 - 1 + 4 ) % 4;
00828 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
00829 for (int i=0; i<18; i++){
00830 stp3_1[i] = v[i];
00831 }
00832 }
00833
00834 if ( multi_flag )
00835 mDotMEqual( OutMatrix, stp3_1, InMatrix ) ;
00836 else {
00837 for (int i=0; i<18; i++){
00838 OutMatrix[i] = stp3_1[i];
00839 }
00840 }
00841
00842
00843 if ( tranfs_flag){
00844 if(dir_2<4){
00845
00846 off_node = CoordNN( dir_2 );
00847 coord[ dir_2]= coord_nn[ dir_2];
00848
00849 TransfM( off_node, nflush_g, OutMatrix, mtmp,dir_2%4 );
00850 }else {
00851 dir_2= dir_2%4;
00852 off_node = CoordNN( dir_2+4 );
00853 coord[ dir_2]= coord_nn[ dir_2];
00854
00855 TransfP( off_node, nflush_g, OutMatrix, mtmp,dir_2%4 );
00856
00857 }
00858 }
00859
00860 }
00861
00862
00863 void Staple3_PP( IFloat * InMatrix,IFloat *OutMatrix , int *coord, int n,
00864 int nu, IFloat *v, IFloat * u, int nflush_g,int sum_flag )
00865 {
00866 int nn, i, off_node;
00867 IFloat stp3_0[18], stp3_2[18], stp3_4[18], mtmp[18];
00868
00869 CoordNN( n );
00870 coord[n]= coord_nn[n];
00871
00872
00873 Parallel( InMatrix,stp3_0, coord, nu+4, nu, v, u, 1, 1 );
00874
00875
00876 off_node = CoordNN( n+4 );
00877 coord[n]= coord_nn[n];
00878 TransfP( off_node, nflush_g, stp3_0, mtmp, n);
00879
00880
00881 Parallel( stp3_0,stp3_2, coord, n, nu+4, v, u, 1, 1 );
00882
00883 Parallel( stp3_2,stp3_4, coord, nu, 0, v, u, 1, 0 );
00884
00885 for ( i = 0; i < 18; i++ ) {
00886 OutMatrix[i] = stp3_4[i];
00887 }
00888
00889 }
00890
00891 void Staple3_PN( IFloat * InMatrix,IFloat *OutMatrix , int *coord, int n,
00892 int nu, IFloat *v, IFloat * u, int nflush_g,int sum_flag )
00893 {
00894 int nn, i, off_node;
00895 IFloat stp3_0[18], stp3_2[18], stp3_4[18], mtmp[18];
00896
00897
00898 CoordNN( n );
00899 coord[n]= coord_nn[n];
00900 CoordNN( nu+4 );
00901 coord[nu]= coord_nn[nu];
00902
00903
00904 Parallel( InMatrix,stp3_0, coord, nu, n+4, v, u, 1, 1 );
00905
00906
00907 Parallel(stp3_0 ,stp3_2, coord, n, 0, v, u, 1, 0 );
00908
00909
00910 Parallel(stp3_2 ,stp3_4, coord, nu+4, nu, v, u, 1, 1 );
00911
00912 for ( i = 0; i < 18; i++ ) {
00913 OutMatrix[i] = stp3_4[i];
00914 }
00915
00916
00917
00918 }
00919
00920
00921 void Staple3_NP( IFloat * InMatrix,IFloat *OutMatrix , int *coord, int n,int nu, IFloat *v, IFloat * u, int nflush_g,int sum_flag )
00922 {
00923 int nn, i, off_node;
00924 IFloat stp3_0[18], stp3_2[18], stp3_4[18], mtmp[18];
00925
00926
00927 CoordNN( n+4 );
00928 coord[n]= coord_nn[n];
00929
00930
00931 Parallel( InMatrix,stp3_0, coord, nu+4, nu, v, u, 1, 1 );
00932
00933
00934 Parallel( stp3_0,stp3_2, coord, n+4, n, v, u, 1, 1 );
00935
00936
00937 CoordNN( nu+4 );
00938 coord[nu]= coord_nn[nu];
00939 TransfP( off_node, nflush_g, stp3_2 , mtmp, nu );
00940
00941
00942 Parallel( stp3_2,stp3_4, coord, nu, 0, v, u, 1, 0 );
00943
00944
00945 for ( i = 0; i < 18; i++ ) {
00946
00947 OutMatrix[i] = stp3_4[i];
00948 }
00949
00950
00951
00952 }
00953
00954 void Staple3_NN( IFloat * InMatrix,IFloat *OutMatrix , int *coord, int n,int nu, IFloat *v, IFloat * u, int nflush_g, int sum_flag )
00955 {
00956
00957 int nn, i, off_node;
00958 IFloat stp3_0[18], stp3_2[18], stp3_4[18], mtmp[18];
00959
00960
00961 CoordNN( n+4 );
00962 coord[n]= coord_nn[n];
00963 CoordNN( nu+4 );
00964 coord[nu]= coord_nn[nu];
00965
00966
00967 Parallel( InMatrix,stp3_0, coord, nu, 0, v, u, 1, 0 );
00968
00969
00970
00971 Parallel(stp3_0 ,stp3_2, coord, n+4, n, v, u, 1, 1 );
00972
00973
00974 Parallel(stp3_2 ,stp3_4, coord, nu+4, nu, v, u, 1, 1 );
00975
00976 for ( i = 0; i < 18; i++ ) {
00977
00978 OutMatrix[i] = stp3_4[i];
00979
00980 }
00981
00982
00983 }
00984
00985
00986
00987
00988
00989 void Staple5_PP( IFloat * InMatrix,IFloat *OutMatrix , int *coord, int n,
00990 int nu,int ro, IFloat *v, IFloat * u, int nflush_g )
00991 {
00992 int nn, i, off_node;
00993 IFloat stp5_0[18], stp5_1[18], stp5_2[18],
00994 stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
00995
00996 CoordNN( n );
00997 coord[n]= coord_nn[n];
00998 nn = ( ro - 1 + 4 ) % 4;
00999 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01000 mDotMEqual( stp5_0, v, InMatrix ) ;
01001
01002 off_node = CoordNN( ro );
01003 coord[ro]= coord_nn[ro];
01004 TransfM( off_node, nflush_g, stp5_0, mtmp, ro);
01005
01006 CoordNN( n+4 );
01007 coord[n]= coord_nn[n];
01008
01009
01011 if( nu<4)
01012 Staple3_PP(stp5_0,stp5_4, coord, n,nu, v, u, nflush_g,1 );
01013 else
01014 Staple3_PN(stp5_0,stp5_4, coord, n,nu%4, v, u, nflush_g,1 );
01015
01016
01017
01018
01019 off_node = CoordNN( ro +4 );
01020 coord[ro]= coord_nn[ro];
01021 TransfP( off_node, nflush_g, stp5_4, mtmp, ro);
01022
01024 nn = ( ro - 1 + 4 ) % 4;
01025 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01026 DaggerM(stp5_1,v);
01027 mDotMEqual( OutMatrix, stp5_1, stp5_4) ;
01028
01029
01030 }
01031
01032
01033 void Staple5_PN( IFloat * InMatrix,IFloat *OutMatrix , int *coord, int n,
01034 int nu,int ro, IFloat *v, IFloat * u, int nflush_g )
01035 {
01036
01037 int nn, i, off_node;
01038 IFloat stp5_0[18], stp5_1[18], stp5_2[18],
01039 stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
01040
01041
01042
01043
01044 CoordNN( n );
01045 coord[n]= coord_nn[n];
01046 CoordNN( ro+4 );
01047 coord[ro]= coord_nn[ro];
01048
01049 nn = ( ro - 1 + 4 ) % 4;
01050 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01051 DaggerM(stp5_1, v);
01052 mDotMEqual( stp5_0, stp5_1 , InMatrix ) ;
01053
01054
01055
01056
01057
01058
01059
01060
01061 CoordNN( n+4 );
01062 coord[n]= coord_nn[n];
01063
01064
01065
01066 if( nu<4)
01067 Staple3_PP(stp5_0,stp5_4, coord, n,nu, v, u, nflush_g,1 );
01068 else
01069 Staple3_PN(stp5_0,stp5_4, coord, n,nu%4, v, u, nflush_g,1 );
01070
01071
01072
01073 nn = ( ro - 1 + 4 ) % 4;
01074 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01075 mDotMEqual( OutMatrix, v, stp5_4) ;
01076
01077
01078 off_node = CoordNN( ro );
01079 coord[ro]= coord_nn[ro];
01080 TransfM( off_node, nflush_g, OutMatrix, mtmp, ro);
01081
01082 }
01083
01084 void Staple5_NP( IFloat * InMatrix,IFloat *OutMatrix , int *coord, int n,
01085 int nu,int ro, IFloat *v, IFloat * u, int nflush_g )
01086 {
01087 int nn, i, off_node;
01088 IFloat stp5_0[18], stp5_1[18], stp5_2[18],
01089 stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
01090
01091
01092
01093 CoordNN( n+4 );
01094 coord[n]= coord_nn[n];
01095 nn = ( ro - 1 + 4 ) % 4;
01096 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01097 mDotMEqual( stp5_0, v, InMatrix ) ;
01098
01099 off_node = CoordNN( ro );
01100 coord[ro]= coord_nn[ro];
01101 TransfM( off_node, nflush_g, stp5_0, mtmp, ro);
01102
01103 CoordNN( n );
01104 coord[n]= coord_nn[n];
01105
01106
01108 if( nu<4)
01109 Staple3_NP(stp5_0,stp5_4, coord, n,nu, v, u, nflush_g,1 );
01110 else
01111 Staple3_NN(stp5_0,stp5_4, coord, n,nu%4, v, u, nflush_g,1 );
01112
01113
01114
01115
01116 off_node = CoordNN( ro +4 );
01117 coord[ro]= coord_nn[ro];
01118 TransfP( off_node, nflush_g, stp5_4, mtmp, ro);
01119
01121 nn = ( ro - 1 + 4 ) % 4;
01122 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01123 DaggerM(stp5_1,v);
01124 mDotMEqual(OutMatrix, stp5_1, stp5_4) ;
01125
01126
01127
01128 }
01129 void Staple5_NN( IFloat * InMatrix,IFloat *OutMatrix , int *coord, int n,
01130 int nu,int ro, IFloat *v, IFloat * u, int nflush_g )
01131 {
01132 int nn, i, off_node;
01133 IFloat stp5_0[18], stp5_1[18], stp5_2[18],
01134 stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
01135
01136
01137
01138
01139 CoordNN( n+4 );
01140 coord[n]= coord_nn[n];
01141 CoordNN( ro+4 );
01142 coord[ro]= coord_nn[ro];
01143
01144 nn = ( ro - 1 + 4 ) % 4;
01145 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01146 DaggerM(stp5_1, v);
01147 mDotMEqual( stp5_0,stp5_1 , InMatrix ) ;
01148
01149 CoordNN( n );
01150 coord[n]= coord_nn[n];
01151
01152
01153
01154 if( nu<4)
01155 Staple3_NP(stp5_0,stp5_4, coord, n,nu, v, u, nflush_g,1 );
01156 else
01157 Staple3_NN(stp5_0,stp5_4, coord, n,nu%4, v, u, nflush_g,1 );
01158
01159
01160
01161 nn = ( ro - 1 + 4 ) % 4;
01162 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01163 mDotMEqual( OutMatrix,v, stp5_4) ;
01164
01165
01166 off_node = CoordNN( ro );
01167 coord[ro]= coord_nn[ro];
01168 TransfM( off_node, nflush_g, OutMatrix, mtmp, ro);
01169
01170
01171 }
01172
01173
01174 #ifdef staple7
01175
01176
01177
01178
01179
01180
01181 void Staple7_PP( IFloat * InMatrix,IFloat *OutMatrix , int *coord, int n,
01182 int nu,int ro,int de, IFloat *v, IFloat * u, int nflush_g )
01183 {
01184 int nn, i, off_node;
01185 IFloat stp5_0[18], stp5_1[18], stp5_2[18],
01186 stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
01187
01188 CoordNN( n );
01189 coord[n]= coord_nn[n];
01190 nn = ( de - 1 + 4 ) % 4;
01191 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01192 mDotMEqual( stp5_0, v, InMatrix ) ;
01193
01194 off_node = CoordNN( de );
01195 coord[de]= coord_nn[de];
01196 TransfM( off_node, nflush_g, stp5_0, mtmp, de);
01197
01198 CoordNN( n+4 );
01199 coord[n]= coord_nn[n];
01200
01201
01203 if( ro<4)
01204 Staple5_PP(stp5_0,stp5_4, coord, n,nu,ro, v, u, nflush_g );
01205 else
01206 Staple5_PN(stp5_0,stp5_4, coord, n,nu,ro%4, v, u, nflush_g );
01207
01208
01209
01210
01211 off_node = CoordNN( de +4 );
01212 coord[de]= coord_nn[de];
01213 TransfP( off_node, nflush_g, stp5_4, mtmp, de);
01214
01215
01216 nn = ( de - 1 + 4 ) % 4;
01217 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01218 DaggerM(stp5_1,v);
01219 mDotMEqual( OutMatrix, stp5_1, stp5_4) ;
01220
01221
01222 }
01223
01224
01225 void Staple7_PN( IFloat * InMatrix,IFloat *OutMatrix , int *coord, int n,
01226 int nu,int ro,int de, IFloat *v, IFloat * u, int nflush_g )
01227 {
01228
01229 int nn, i, off_node;
01230 IFloat stp5_0[18], stp5_1[18], stp5_2[18],
01231 stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
01232
01233
01234
01235
01236 CoordNN( n );
01237 coord[n]= coord_nn[n];
01238 CoordNN( de+4 );
01239 coord[de]= coord_nn[de];
01240
01241 nn = ( de - 1 + 4 ) % 4;
01242 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01243 DaggerM(stp5_1, v);
01244 mDotMEqual( stp5_0, stp5_1 , InMatrix ) ;
01245
01246 CoordNN( n+4 );
01247 coord[n]= coord_nn[n];
01248
01249
01250
01251 if( ro<4)
01252 Staple5_PP(stp5_0,stp5_4, coord, n,nu,ro, v, u, nflush_g);
01253
01254 else
01255 Staple5_PN(stp5_0,stp5_4, coord, n,nu,ro%4, v, u, nflush_g );
01256
01257
01258
01259 nn = ( de - 1 + 4 ) % 4;
01260 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01261 mDotMEqual( OutMatrix, v, stp5_4) ;
01262
01263
01264 off_node = CoordNN( de );
01265 coord[de]= coord_nn[de];
01266 TransfM( off_node, nflush_g, OutMatrix, mtmp, de);
01267
01268 }
01269
01270
01271 void Staple7_NP( IFloat * InMatrix,IFloat *OutMatrix , int *coord, int n,
01272 int nu,int ro,int de, IFloat *v, IFloat * u, int nflush_g )
01273 {
01274
01275
01276 int nn, i, off_node;
01277 IFloat stp5_0[18], stp5_1[18], stp5_2[18],
01278 stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
01279
01280
01281
01282
01283
01284
01285
01286
01287
01288
01289
01290
01291
01292
01293
01294
01295
01296
01297
01298
01299
01300
01301
01302
01303
01304
01305
01306
01307
01308
01310
01311
01312
01313
01314
01315
01316
01317
01318
01319 CoordNN( n+4 );
01320 coord[n]= coord_nn[n];
01321 nn = ( de - 1 + 4 ) % 4;
01322 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01323 mDotMEqual( stp5_0, v, InMatrix ) ;
01324
01325 off_node = CoordNN( de );
01326 coord[de]= coord_nn[de];
01327 TransfM( off_node, nflush_g, stp5_0, mtmp, de);
01328
01329 CoordNN( n );
01330 coord[n]= coord_nn[n];
01331
01332
01333
01334 if( ro<4)
01335 Staple5_NP(stp5_0,stp5_4, coord, n,nu,ro, v, u, nflush_g);
01336 else
01337 Staple5_NN(stp5_0,stp5_4, coord, n,nu, ro%4, v, u, nflush_g );
01338
01339
01340
01341
01342 off_node = CoordNN( de +4 );
01343 coord[de]= coord_nn[de];
01344 TransfP( off_node, nflush_g, stp5_4, mtmp, de);
01345
01346
01347 nn = ( de - 1 + 4 ) % 4;
01348 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01349 DaggerM(stp5_1,v);
01350 mDotMEqual(OutMatrix, stp5_1, stp5_4) ;
01351
01352
01353
01354 }
01355
01356
01357 void Staple7_NN( IFloat * InMatrix,IFloat *OutMatrix , int *coord, int n,
01358 int nu,int ro,int de, IFloat *v, IFloat * u, int nflush_g )
01359 {
01360 int nn, i, off_node;
01361 IFloat stp5_0[18], stp5_1[18], stp5_2[18],
01362 stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
01363
01364
01365
01366
01367 CoordNN( n+4 );
01368 coord[n]= coord_nn[n];
01369 CoordNN( de+4 );
01370 coord[de]= coord_nn[de];
01371
01372 nn = ( de - 1 + 4 ) % 4;
01373 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01374 DaggerM(stp5_1, v);
01375 mDotMEqual( stp5_0,stp5_1 , InMatrix ) ;
01376
01377 CoordNN( n );
01378 coord[n]= coord_nn[n];
01379
01380
01381
01382 if( ro<4)
01383 Staple5_NP(stp5_0,stp5_4, coord, n,nu,ro, v, u, nflush_g );
01384 else
01385 Staple5_NN(stp5_0,stp5_4, coord, n,nu,ro%4, v, u, nflush_g );
01386
01387
01388
01389 nn = ( de - 1 + 4 ) % 4;
01390 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01391 mDotMEqual( OutMatrix,v, stp5_4) ;
01392
01393
01394 off_node = CoordNN( de );
01395 coord[de]= coord_nn[de];
01396 TransfM( off_node, nflush_g, OutMatrix, mtmp, de);
01397
01398
01399
01400 }
01401
01402 #endif //staple7
01403 extern "C"
01404 void asqtad_dirac_init_g()
01405 {
01406
01407 IFloat * u = gauge_field_addr;
01408 int c,i,j,m,n,r;
01409 int off_node;
01410 int local_count[2];
01411 int nflush_g = 2;
01412 int non_local_count[2];
01413 int local_count_3[2];
01414 int non_local_count_3[3][2];
01415 int x[NUM_DIR/2];
01416 char *cname = "";
01417 char *fname = "asqtad_dirac_init_g()";
01418
01419
01420
01421
01422
01423
01424 IFloat c1 = GJP.KS_coeff();
01425 IFloat c2 = GJP.Naik_coeff();
01426 IFloat c3 = -GJP.staple3_coeff();
01427 IFloat c5 = GJP.staple5_coeff();
01428 IFloat c7 = -GJP.staple7_coeff();
01429 IFloat c6 = GJP.Lepage_coeff();
01430
01431
01432
01433
01434
01435
01436
01437
01438
01439
01440
01441
01442
01443
01444
01445
01446 int sg;
01447
01448
01449
01450
01451
01452 int odd;
01453
01454
01455
01456
01457
01458
01459
01460
01461 int nn;
01462
01463
01464
01465
01466
01467
01468
01469 IFloat * v;
01470 IFloat * w, wp1[18];
01471 IFloat * w3;
01472
01473 IFloat w_t1[18], w_t2[18], w_t3[18];
01474 IFloat mtmp[18] ;
01475
01476
01477
01478
01479
01480 VRB.Func(cname,fname);
01481 #if 0
01482 size[0] = GJP.TnodeSites();
01483 size[1] = GJP.XnodeSites();
01484 size[2] = GJP.YnodeSites();
01485 size[3] = GJP.ZnodeSites();
01486
01487 vol = size[0] * size[1] * size[2] * size[3];
01488
01489 non_local_chi = 2*(size[0]*size[1]*size[2] + size[1]*size[2]*size[3]+
01490 size[2]*size[3]*size[0] + size[3]*size[0]*size[1]);
01491
01492 non_local_chi_3 = 6*(size[0]*size[1]*size[2] + size[1]*size[2]*size[3]+
01493 size[2]*size[3]*size[0] + size[3]*size[0]*size[1]);
01494
01495 local_chi = NUM_DIR*vol - non_local_chi;
01496 local_chi_3 = NUM_DIR*vol - non_local_chi_3;
01497 #endif
01498
01499 #ifdef SIMUL_U
01500 uc_l[0] = uc_l0;
01501 uc_l[1] = uc_l1;
01502 uc_nl[0] = uc_nl0;
01503 uc_nl[1] = uc_nl1;
01504 #endif
01505
01506
01507
01508
01509 for ( i = 0; i < 2; i++ ){
01510
01511 #ifndef SIMUL_U
01512 uc_l[i] = (IFloat*)
01513 smalloc(MATRIX_SIZE*((local_chi+local_chi_3)/2)*sizeof(IFloat),
01514 "uc_l[i]", fname, cname);
01515 for(int j=0;j<MATRIX_SIZE*(local_chi+local_chi_3)/2;j++)
01516 uc_l[i][j]=0.;
01517 #endif
01518 uc_l_agg[i] = (gauge_agg*)
01519 smalloc(((local_chi+local_chi_3)/2)*sizeof(gauge_agg),
01520 "uc_l_agg[i]", fname, cname);
01521
01522
01523 #ifndef SIMUL_U
01524 uc_nl[i] = (IFloat*)
01525 smalloc(MATRIX_SIZE*((non_local_chi+non_local_chi_3)/2) * sizeof(IFloat),
01526 "uc_nl[i]", fname, cname);
01527 for(int j=0;j<MATRIX_SIZE*(non_local_chi+non_local_chi_3)/2;j++)
01528 uc_nl[i][j]=0.;
01529 #endif
01530 uc_nl_agg[i] = (gauge_agg*)
01531 smalloc(((non_local_chi+non_local_chi_3)/2)*sizeof(gauge_agg),
01532 "uc_nl_agg[i]", fname, cname);
01533
01534 }
01535
01536 #ifndef SIMUL_U
01537
01538
01539
01540 for ( i = 0; i < 2; i++){
01541 local_count[i] = 0;
01542 non_local_count[i] = 0;
01543 local_count_3[i]=0;
01544 for (j=0; j<3; j++) non_local_count_3[j][i] = 0;
01545
01546
01547 }
01548
01549
01550
01551
01552
01553 for ( n = 0; n < NUM_DIR/2; n++ ) {
01554
01555
01556
01557
01558
01559
01560 for (x[3] = 0; x[3] < size[3]; x[3]++){
01561 for (x[2] = 0; x[2] < size[2]; x[2]++){
01562 for (x[1] = 0; x[1] < size[1]; x[1]++){
01563 for (x[0] = 0; x[0] < size[0]; x[0]++){
01564
01565 for (i = 0; i < 4 ; i++) coord[i] = x[i];
01566 odd = ( coord[0] + coord[1] + coord[2] + coord[3] ) % 2;
01567 sg = coord[1] + size[1] * ( coord[2] + size[2] * ( coord[3] + size[3] * coord[0] ));
01568
01569
01570
01571
01572 nn = ( n - 1 + 4 ) % 4;
01573 v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01574
01575
01576 if ( CoordNN( n ) ) {
01577 w = uc_nl[odd] + MATRIX_SIZE * non_local_count[odd];
01578 for ( r = 0; r < 3; r++ ) {
01579 for ( c = 0; c < 3; c++ ) {
01580 i = 6*r + 2*c;
01581 j = 6*c + 2*r;
01582 w[i] = v[j];
01583 w[i+1] = -v[j+1];
01584 }
01585 }
01586 non_local_count[odd]++;
01587 }
01588 else {
01589 w = uc_l[odd] + MATRIX_SIZE * local_count[odd];
01590 for ( r = 0; r < 3; r++ ) {
01591 for ( c = 0; c < 3; c++ ) {
01592 i = 6*r + 2*c;
01593 j = 6*c + 2*r;
01594 w[i] = v[j];
01595 w[i+1] = -v[j+1];
01596 }
01597 }
01598 local_count[odd]++;
01599 }
01600
01601 for (i = 0; i < 4 ; i++) coord[i] = x[i];
01602 CoordNN( n );
01603 coord[n]=coord_nn[n];
01604 CoordNN( n );
01605 coord[n]=coord_nn[n];
01606
01607
01608 nn = ( n - 1 + 4 ) % 4;
01609 v = u + SITE_LEN * (LexGauge( coord )) + MATRIX_SIZE * nn;
01610 DaggerM( w_t2, v);
01611
01612 off_node= CoordNN( n+4 );
01613 TransfP( off_node, nflush_g, w_t2, mtmp, n);
01614
01615
01616 coord[n]=coord_nn[n];
01617 nn = ( n - 1 + 4 ) % 4;
01618 v = u + SITE_LEN * (LexGauge( coord )) + MATRIX_SIZE * nn;
01619 DaggerM( w_t1, v);
01620
01621 mDotMEqual( w_t3, w_t1, w_t2 ) ;
01622
01623 off_node= CoordNN( n+4 );
01624 coord[n]=coord_nn[n];
01625 TransfP( off_node, nflush_g, w_t3, mtmp, n);
01626
01627
01628
01629
01630
01631 for (i = 0; i < 4 ; i++) coord[i] = x[i];
01632 if ( CoordkNN (n, 3)) {
01633
01634 for (j=0; j<3; j++){
01635
01636 if ((coord_knn[n%4]==j)) {
01637 w3 = uc_nl[odd]+MATRIX_SIZE*(non_local_count_3[j][odd]+(j+1)* non_local_chi/2);
01638 non_local_count_3[j][odd]++;
01639 }
01640 }
01641
01642 }
01643 else {
01644 w3 = uc_l[odd] + MATRIX_SIZE * (local_chi/2 + local_count_3[odd]);
01645 local_count_3[odd]++;
01646 }
01647
01648
01649
01650
01651 mDotMEqual(w3 , w , w_t3 ) ;
01652
01653 for ( i = 0; i < 18; i++ ) {
01654 w3[i] = c2 * w3[i];
01655
01656 }
01657
01658
01659
01660
01661
01662 int nu;
01663 IFloat stp3_0[18], stp3_2[18], stp3_4[18], stp3_5[18];
01664 IFloat stp5_0[18], stp5_1[18], stp5_2[18], stp5_3[18] , stp5_4[18], stp5_5[18];
01665 IFloat UnitMatrix[18], stp5_f[18], stp3_f[18] ;
01666 IFloat stp7_0[18], stp7_f[18], stp7_2[18];
01667 IFloat stp6_f[18];
01668
01669 for ( i = 0; i < 18; i++ ) {
01670 stp3_5[i] = 0.;
01671 stp5_f[i] = 0.;
01672 stp6_f[i] = 0.;
01673 stp3_f[i] = 0.;
01674 stp7_f[i] = 0.;
01675 UnitMatrix[i] = 0.;
01676 }
01677
01678
01679 UnitMatrix[0]=1.;
01680 UnitMatrix[8]=1.;
01681 UnitMatrix[16]=1.;
01682
01683 for (nu=0; nu < 8 ; nu++){
01684 if (nu%4 != n){
01685 for (i = 0; i < 4 ; i++) coord[i] = x[i];
01686 if (nu<4){
01687 Staple3_PP (UnitMatrix,stp3_4, coord, n,nu, v,u,nflush_g,1 );
01688 Staple5_PP (UnitMatrix,stp5_2, coord, n,nu, nu, v,u,nflush_g );
01689 }
01690 else{
01691 Staple3_PN (UnitMatrix,stp3_4, coord, n,nu%4, v,u,nflush_g,1 );
01692 Staple5_PN (UnitMatrix,stp5_2, coord, n,nu,nu%4, v,u,nflush_g );
01693 }
01694
01695 for ( i = 0; i < 18; i++ ) {
01696 stp3_5[i] += stp3_4[i];
01697 stp6_f[i] += stp5_2[i];
01698 }
01699 }}
01700
01701
01702
01703
01704
01705 for (int ro=0; ro < 4 ; ro++){
01706 if (ro%4 != n)
01707 {
01708 for (nu=0; nu < 8 ; nu++){
01709 if ( ( (nu%4)!= n )&&( (nu%4)!=(ro%4)) )
01710 {
01711 for (i = 0; i < 4 ; i++) coord[i] = x[i];
01712
01713 Staple5_PP (UnitMatrix ,stp5_2 , coord, n, nu, ro, v, u, nflush_g );
01714
01715 for ( i = 0; i < 18; i++ ) {
01716 stp5_f[i] += stp5_2[i];}
01717 }
01718 }}}
01719
01720 for (int ro=0; ro < 4 ; ro++){
01721 if (ro != n)
01722 {
01723 for (nu=0; nu < 8 ; nu++){
01724 if ( ( (nu%4)!= n )&&( (nu%4)!=(ro%4)) )
01725 {
01726 for (i = 0; i < 4 ; i++) coord[i] = x[i];
01727
01728 Staple5_PN(UnitMatrix ,stp5_2 , coord, n, nu, ro, v, u, nflush_g );
01729
01730 for ( i = 0; i < 18; i++ ) {
01731 stp5_f[i] += stp5_2[i];}
01732 }
01733 }
01734 }
01735 }
01736
01737
01738
01739 for (int de=0; de < 4 ;de++){
01740 if (de%4 != n ){
01741
01742 for (int ro=0; ro < 8 ; ro++){
01743 if (ro%4 != n && ro%4 != de%4)
01744 {
01745 for (nu=0; nu < 8 ; nu++){
01746 if (nu%4 != n && nu%4 != ro%4 && nu%4 != de%4)
01747 {
01748 for (i = 0; i < 4 ; i++) coord[i] = x[i];
01749
01750 Staple7_PP (UnitMatrix ,stp7_2 , coord, n, nu, ro,de, v, u, nflush_g );
01751
01752 for ( i = 0; i < 18; i++ ) {
01753 stp7_f[i] += stp7_2[i];}
01754 }
01755 }}} }}
01756
01757
01758 for (int de=0; de < 4 ;de++){
01759 if (de%4 != n){
01760
01761 for (int ro=0; ro < 8 ; ro++){
01762 if (ro%4 != n && ro%4 != de%4)
01763 {
01764 for (nu=0; nu < 8 ; nu++){
01765 if (nu%4 != n && nu%4 != ro%4 && nu%4 != de%4)
01766 {
01767 for (i = 0; i < 4 ; i++) coord[i] = x[i];
01768
01769 Staple7_PN (UnitMatrix ,stp7_2 , coord, n, nu, ro,de, v, u, nflush_g );
01770
01771 for ( i = 0; i < 18; i++ ) {
01772 stp7_f[i] += stp7_2[i];}
01773 }
01774 }}} }}
01775
01776
01777
01778
01779
01780
01781
01782
01783
01784
01785
01786
01787
01788
01789
01790 for ( i = 0; i < 18; i++ ) {
01791
01792 w[i]*= c1;
01793 w[i] = w[i] + c3 * stp3_5[i]+ c5 * stp5_f[i] + c6
01794 *stp6_f[i] + c7 * stp7_f[i] ;
01795
01796
01797
01798
01799 }
01800
01801
01802
01803 }
01804 }
01805 }
01806 }
01807 }
01808
01809
01810
01811
01812
01813
01814
01815 for ( n = 0; n < NUM_DIR/2; n++ ) {
01816
01817 for (x[3] = 0; x[3] < size[3]; x[3]++){
01818 for (x[2] = 0; x[2] < size[2]; x[2]++){
01819 for (x[1] = 0; x[1] < size[1]; x[1]++){
01820 for (x[0] = 0; x[0] < size[0]; x[0]++){
01821
01822 for (i = 0; i < 4 ; i++) coord[i] = x[i];
01823 odd = ( coord[0] + coord[1] + coord[2] + coord[3] ) % 2;
01824
01825 nn = ( n - 1 + 4 ) % 4;
01826
01827
01828
01829 off_node = CoordNN( n + 4 );
01830
01831 v = u + SITE_LEN * LexGauge( coord_nn ) + MATRIX_SIZE * nn;
01832
01833 if ( off_node ) {
01834 w = uc_nl[odd] + MATRIX_SIZE * non_local_count[odd];
01835 non_local_count[odd]++;
01836 TransfM( off_node, nflush_g, v, mtmp, n);
01837 v = mtmp;
01838 }
01839 else{
01840 w = uc_l[odd] + MATRIX_SIZE * local_count[odd];
01841 local_count[odd]++;
01842 }
01843
01844
01845 for ( i = 0; i < 18; i++ ) {
01846 w[i] = -v[i];
01847 }
01848
01849 for (i = 0; i < 4 ; i++) coord[i] = x[i];
01850 CoordNN( n + 4);
01851 coord[n]=coord_nn[n];
01852 CoordNN( n + 4);
01853 coord[n]=coord_nn[n];
01854 off_node=CoordNN( n + 4);
01855
01856
01857 v = u + SITE_LEN * LexGauge( coord_nn ) + MATRIX_SIZE * nn;
01858 for ( i = 0; i < 18; i++ ) {
01859 w_t2[i] = v[i];
01860 }
01861
01862 TransfM( off_node, nflush_g, w_t2, mtmp, n);
01863
01864 v = u + SITE_LEN * LexGauge( coord ) + MATRIX_SIZE * nn;
01865 for ( i = 0; i < 18; i++ ) {
01866 w_t1[i] = v[i];
01867 }
01868
01869 mDotMEqual(w_t3, w_t1, w_t2 ) ;
01870 off_node= CoordkNN( n,2 );
01871 TransfM( off_node, nflush_g, w_t3, mtmp, n);
01872
01873
01874
01875
01876 for (i = 0; i < 4 ; i++) coord[i] = x[i];
01877 if ( CoordkNN (n + 4, 3)) {
01878
01879 for (j=0; j<3; j++){
01880
01881 if ((coord_knn[n%4]==(size[n%4]-1-j))) {
01882 w3 = uc_nl[odd]+MATRIX_SIZE*(non_local_count_3[j][odd]+(j+1)* non_local_chi/2);
01883 non_local_count_3[j][odd]++;
01884 }
01885 }
01886
01887 }
01888 else {
01889 w3 = uc_l[odd] + MATRIX_SIZE * (local_count_3[odd] + local_chi/2);
01890 local_count_3[odd]++;
01891 }
01892
01893
01894 mDotMEqual( w3 , w , w_t3 ) ;
01895
01896 for ( i = 0; i < 18; i++ ) {
01897 w3[i] = c2 * w3[i];
01898
01899 }
01900
01901
01902 int nu;
01903 IFloat stp3_0[18],stp3_1[18], stp3_2[18], stp3_3[18], stp3_4[18], stp3_5[18];
01904 IFloat stp5_0[18], stp5_1[18], stp5_2[18], stp5_3[18] , stp5_4[18], stp5_5[18];
01905 IFloat UnitMatrix[18], stp5_f[18], stp3_f[18], stp7_2[18],stp7_f[18],stp6_f[18] ;
01906
01907
01908
01909
01910 for ( i = 0; i < 18; i++ ) {
01911 stp3_5[i] = 0;
01912 stp5_f[i] = 0;
01913 stp3_f[i] = 0;
01914 stp6_f[i] = 0;
01915 stp7_f[i] = 0;
01916 UnitMatrix[i] = 0;
01917
01918 }
01919
01920 UnitMatrix[0]=1;
01921 UnitMatrix[8]=1;
01922 UnitMatrix[16]=1;
01923
01924
01925 for (nu=0; nu < 8 ; nu++){
01926 if (nu%4 != n){
01927 for (i = 0; i < 4 ; i++) coord[i] = x[i];
01928 if (nu<4){
01929 Staple3_NP (UnitMatrix,stp3_4, coord, n,nu, v,u,nflush_g,1 );
01930 Staple5_NP (UnitMatrix,stp5_2, coord, n,nu, nu, v,u,nflush_g);
01931 }
01932 else {
01933 Staple3_NN (UnitMatrix,stp3_4, coord, n,nu%4, v,u,nflush_g,1 );
01934 Staple5_NN (UnitMatrix,stp5_2, coord, n,nu,nu%4, v,u,nflush_g);
01935 }
01936
01937 for ( i = 0; i < 18; i++ ) {
01938 stp3_5[i] += stp3_4[i];
01939 stp6_f[i] += stp5_2[i];
01940 }
01941 }}
01942
01943
01944 for (int ro=0; ro < 4 ; ro++){
01945 if (ro%4 != n)
01946 {
01947 for (nu=0; nu < 8 ; nu++){
01948 if (nu%4 != n && nu%4 != ro%4)
01949 {
01950 for (i = 0; i < 4 ; i++) coord[i] = x[i];
01951 Staple5_NP (UnitMatrix ,stp5_2 , coord, n, nu, ro, v, u, nflush_g );
01952
01953
01954 for ( i = 0; i < 18; i++ ) {
01955 stp5_f[i] += stp5_2[i]; }
01956
01957
01958 }
01959 }
01960
01961 }}
01962
01963
01964
01965 for (int ro=0; ro < 4 ; ro++){
01966 if (ro != n)
01967 {
01968 for (nu=0; nu < 8 ; nu++){
01969 if (nu%4 != n && nu%4 != ro%4)
01970 {
01971 for (i = 0; i < 4 ; i++) coord[i] = x[i];
01972
01973 Staple5_NN(UnitMatrix ,stp5_2 , coord, n, nu, ro, v, u, nflush_g );
01974
01975 for ( i = 0; i < 18; i++ ) {
01976 stp5_f[i] += stp5_2[i];}
01977
01978
01979 }
01980 }
01981 }
01982 }
01983
01984
01985
01986
01987 for (int de=0; de < 4 ;de++){
01988 if (de%4 != n){
01989
01990 for (int ro=0; ro < 8 ; ro++){
01991 if (ro%4 != n && ro%4 != de%4)
01992 {
01993 for (nu=0; nu < 8 ; nu++){
01994
01995 if (nu%4 != n && nu%4 != ro%4 && nu%4 != de%4)
01996 {
01997 for (i = 0; i < 4 ; i++) coord[i] = x[i];
01998
01999 Staple7_NP (UnitMatrix ,stp7_2 , coord, n, nu, ro,de, v, u, nflush_g );
02000
02001
02002
02003
02004
02005
02006
02007
02008
02009
02010
02011
02012
02013
02014
02015
02016
02017
02018
02019
02020
02021
02022
02023
02024
02025
02026
02027
02028
02029
02030
02031
02032
02033
02034
02035
02036
02037 for ( i = 0; i < 18; i++ ) {
02038 stp7_f[i] += stp7_2[i];}
02039 }
02040 }}} }}
02041
02042
02043
02044
02045 for (int de=0; de < 4 ;de++){
02046 if (de%4 != n){
02047
02048 for (int ro=0; ro < 8 ; ro++){
02049 if (ro%4 != n && ro%4 != de%4)
02050 {
02051
02052 for (nu=0; nu < 8 ; nu++){
02053 if (nu%4 != n && nu%4 != ro%4 && nu%4 != de%4)
02054 {
02055 for (i = 0; i < 4 ; i++) coord[i] = x[i];
02056
02057 Staple7_NN (UnitMatrix ,stp7_2 , coord, n, nu, ro,de, v, u, nflush_g );
02058
02059 for ( i = 0; i < 18; i++ ) {
02060 stp7_f[i] += stp7_2[i];}
02061 }
02062 }}} }}
02063
02064
02065
02066
02067
02068
02069
02070
02071
02072
02073 for ( i = 0; i < 18; i++ ) {
02074
02075 w[i]*= c1;
02076
02077 w[i] = w[i] - c3 * stp3_5[i]- c5 * stp5_f[i] - c6*stp6_f[i] - c7 * stp7_f[i] ;
02078
02079
02080
02081
02082
02083 }
02084
02085
02086
02087
02088
02089
02090
02091
02092
02093
02094
02095
02096
02097
02098
02099
02100
02101
02102
02103
02104
02105
02106
02107
02108
02109
02110
02111 }}} }
02112 }
02113
02114 #endif // SIMUL
02115
02116 int fd;
02117 char buf[200];
02118 FILE *fp;
02119 #ifndef SIMUL_U
02120 #if 0
02121 fp=Fopen("uc_l.h","w");
02122 for(j=0;j<2;j++){
02123 Fprintf(fp,"IFloat uc_l%d[] LOCATE(\"edramnormal\") = {\n",j);
02124 for(i=0;i< MATRIX_SIZE * ((local_chi+ local_chi_3)/2);i++){
02125 Fprintf(fp,"%0.4e, ",*(uc_l[j]+i));
02126 if( i%6 == 5){
02127 Fprintf(fp,"\n");
02128 }
02129 }
02130 Fprintf(fp,"\n};\n");
02131 }
02132 Fclose(fp);
02133 #endif
02134 #endif
02135
02136 #ifndef SIMUL_AGG
02137
02138 gauge_agg *temp = (gauge_agg *)
02139 smalloc(12*vol*sizeof(gauge_agg),"temp",fname,cname);
02140 int *num_ind = (int *)smalloc(6*vol*sizeof(int),"num_ind",fname,cname);
02141 int src;
02142 for(j=0;j<2;j++){
02143 for(i=0;i<vol;i++) num_ind[i]=0;
02144 for(i=0;i< ((local_chi+ local_chi_3)/2);i++){
02145 src = (unsigned long)chi_l[j][2*i];
02146 if (src%(VECT_LEN*sizeof(IFloat))!=0){
02147 ERR.General(cname,fname,"src = %d\n",src);
02148 exit(1);
02149 }
02150 src = src/(VECT_LEN*sizeof(IFloat));
02151 if(src > vol/2) {
02152 ERR.General(cname,fname,"src[%d](%d) > vol/2\n",i,src);
02153 exit(1);
02154 }
02155 temp[src*NUM_DIR*2+num_ind[src]].src = (unsigned long)chi_l[j][2*i];
02156 temp[src*NUM_DIR*2+num_ind[src]].dest= (unsigned long)chi_l[j][2*i+1];
02157 for(k=0;k<MATRIX_SIZE;k++){
02158 temp[src*NUM_DIR*2+num_ind[src]].mat[k] = uc_l[j][i*MATRIX_SIZE+k];
02159 }
02160 num_ind[src]++;
02161 }
02162 int index = 0;
02163 int div = 2;
02164 for( n = 0; n*div<NUM_DIR*2;n++)
02165 for(i=0;i<vol/2;i++){
02166 for(m=0;m<div;m++){
02167 if(num_ind[i] > n*div+m) {
02168 uc_l_agg[j][index].src = temp[i*NUM_DIR*2+n*div+m].src;
02169 uc_l_agg[j][index].dest = temp[i*NUM_DIR*2+n*div+m].dest;
02170 for(k=0;k<18;k++)
02171 uc_l_agg[j][index].mat[k] = temp[i*NUM_DIR*2+n*div+m].mat[k];
02172 index++;
02173 }
02174 }
02175 }
02176 }
02177
02178
02179 #if 0
02180 struct gauge_agg *agg_p;
02181 sprintf(buf,"uc_l_agg.h");
02182 fd = open(buf,O_CREAT|O_TRUNC|O_RDWR,00644);
02183 for(j=0;j<2;j++){
02184 sprintf(buf,"struct gauge_agg uc_l_agg%d[] LOCATE(\"edramtransient\") = {\n",j);
02185 write(fd,buf,strlen(buf));
02186 for(i=0;i< ((local_chi+ local_chi_3)/2);i++){
02187 agg_p = &(uc_l_agg[j][i]);
02188 sprintf(buf,"{%d,%d,{\n",agg_p->src,agg_p->dest);
02189 write(fd,buf,strlen(buf));
02190 for(k=0;k<18;k++){
02191 sprintf(buf,"%0.8e",agg_p->mat[k]);
02192 write(fd,buf,strlen(buf));
02193 if(k!=17){
02194 sprintf(buf,", ");
02195 write(fd,buf,strlen(buf));
02196 }
02197 if( k%6 == 5){
02198 sprintf(buf,"\n");
02199 write(fd,buf,strlen(buf));
02200 }
02201 }
02202 sprintf(buf,"}},\n");
02203 write(fd,buf,strlen(buf));
02204 }
02205 sprintf(buf,"\n};\n");
02206 write(fd,buf,strlen(buf));
02207 }
02208 close(fd);
02209 #endif
02210 #else
02211
02212 #if 0
02213 sprintf(buf,"uc_l_agg.h");
02214 fd = open(buf,O_CREAT|O_TRUNC|O_RDWR,00644);
02215 for(j=0;j<2;j++){
02216 sprintf(buf,"struct gauge_agg uc_l_agg%d[] LOCATE(\"edramtransient\") = {\n",j);
02217 write(fd,buf,strlen(buf));
02218 for(i=0;i< ((local_chi+ local_chi_3)/2);i++){
02219 sprintf(buf,"{%d,%d,{\n",*(chi_l[j]+2*i),*(chi_l[j]+2*i+1));
02220 write(fd,buf,strlen(buf));
02221 uc_l_agg[j][i].src = (unsigned long)chi_l[j][2*i];
02222 uc_l_agg[j][i].dest = (unsigned long)chi_l[j][2*i+1];
02223 for(k=0;k<18;k++){
02224 sprintf(buf,"%0.8e",*(uc_l[j]+i*18+k));
02225 write(fd,buf,strlen(buf));
02226 uc_l_agg[j][i].mat[k] = uc_l[j][i*18+k];
02227 if(k!=17){
02228 sprintf(buf,", ");
02229 write(fd,buf,strlen(buf));
02230 }
02231 if( k%6 == 5){
02232 sprintf(buf,"\n");
02233 write(fd,buf,strlen(buf));
02234 }
02235 }
02236 sprintf(buf,"}},\n");
02237 write(fd,buf,strlen(buf));
02238 }
02239 sprintf(buf,"\n};\n");
02240 write(fd,buf,strlen(buf));
02241 }
02242 close(fd);
02243 #endif
02244
02245 #endif //SIMUL_AGG
02246
02247 #ifndef SIMUL_U
02248 #if 0
02249 fp = Fopen("uc_nl.h","w");
02250 for(j=0;j<2;j++){
02251 Fprintf(fp,"IFloat uc_nl%d[] LOCATE(\"edramnormal\") = {\n",j);
02252 for(i=0;i< MATRIX_SIZE * ((non_local_chi+non_local_chi_3 )/2);i++){
02253 Fprintf(fp,"%0.4e, ",*(uc_nl[j]+i));
02254 if( i%6 == 5){
02255 Fprintf(fp,"\n");
02256 }
02257 }
02258 Fprintf(fp,"\n};\n");
02259 }
02260 Fclose(fp);
02261 #endif
02262 #endif
02263
02264 #ifndef SIMUL_AGG
02265
02266 #if 1
02267 int max = 12;
02268 for(j=0;j<2;j++){
02269 for(i=0;i<non_local_chi*3/2;i++) num_ind[i]=0;
02270 for(i=0;i< ((non_local_chi+ non_local_chi_3)/2);i++){
02271 src = (unsigned long)chi_nl[j][2*i];
02272 if (src%(VECT_LEN*sizeof(IFloat))!=0){
02273 ERR.General(cname,fname,"src = %d\n",src);
02274 exit(1);
02275 }
02276 src = src/(VECT_LEN*sizeof(IFloat));
02277 if(src > non_local_chi*3/2) {
02278 ERR.General(cname,fname,"src(%d) > non_local_chi*3\n",src);
02279 exit(1);
02280 }
02281 temp[src*max+num_ind[src]].src = (unsigned long)chi_nl[j][2*i];
02282 temp[src*max+num_ind[src]].dest= (unsigned long)chi_nl[j][2*i+1];
02283 for(k=0;k<18;k++)
02284 temp[src*max+num_ind[src]].mat[k] = uc_nl[j][i*18+k];
02285 num_ind[src]++;
02286 if(num_ind[src]>max){
02287 ERR.General(cname,fname,"num_ind[%d](%d) > %d \n",src, num_ind[src],max);
02288 exit(1);
02289 }
02290 }
02291
02292 int index = 0;
02293 int div = 12;
02294 for( n=0; n*div<max; n++)
02295 for(i=0;i<non_local_chi*3/2;i++){
02296 for(m=0;m<div;m++){
02297 if(num_ind[i] > n*div+m) {
02298 uc_nl_agg[j][index] = temp[i*max+n*div+m];
02299 index++;
02300 }
02301 }
02302 }
02303
02304 }
02305 sfree(temp,"temp",fname,cname);
02306 sfree(num_ind,"num_ind",fname,cname);
02307
02308 #if 0
02309 sprintf(buf,"uc_nl_agg.h");
02310 fd = open(buf,O_CREAT|O_TRUNC|O_RDWR,00644);
02311 for(j=0;j<2;j++){
02312 sprintf(buf,"struct gauge_agg uc_nl_agg%d[] LOCATE(\"edramtransient\") = {\n",j);
02313 write(fd,buf,strlen(buf));
02314 for(i=0;i< ((non_local_chi+ non_local_chi_3)/2);i++){
02315 agg_p = &(uc_nl_agg[j][i]);
02316 sprintf(buf,"{%d,%d,{\n",agg_p->src,agg_p->dest);
02317 write(fd,buf,strlen(buf));
02318 for(k=0;k<18;k++){
02319 sprintf(buf,"%0.8e",agg_p->mat[k]);
02320 write(fd,buf,strlen(buf));
02321 if(k!=17){
02322 sprintf(buf,", ");
02323 write(fd,buf,strlen(buf));
02324 }
02325 if( k%6 == 5){
02326 sprintf(buf,"\n");
02327 write(fd,buf,strlen(buf));
02328 }
02329 }
02330 sprintf(buf,"}},\n");
02331 write(fd,buf,strlen(buf));
02332 }
02333 sprintf(buf,"\n};\n");
02334 write(fd,buf,strlen(buf));
02335 }
02336 close(fd);
02337 #endif
02338 #else
02339 #if 0
02340 sprintf(buf,"uc_nl_agg.h");
02341 fd = open(buf,O_CREAT|O_TRUNC|O_RDWR,00644);
02342 for(j=0;j<2;j++){
02343 sprintf(buf,"struct gauge_agg uc_nl_agg%d[] LOCATE(\"edramtransient\") = {\n",j);
02344 write(fd,buf,strlen(buf));
02345 for(i=0;i< ((local_chi+ local_chi_3)/2);i++){
02346 sprintf(buf,"{%d,%d,{\n",*(chi_nl[j]+2*i),*(chi_nl[j]+2*i+1));
02347 uc_nl_agg[j][i].src = (unsigned long)chi_nl[j][2*i];
02348 uc_nl_agg[j][i].dest = (unsigned long)chi_nl[j][2*i+1];
02349 write(fd,buf,strlen(buf));
02350 for(k=0;k<18;k++){
02351 sprintf(buf,"%0.8e",*(uc_nl[j]+i*18+k));
02352 write(fd,buf,strlen(buf));
02353 uc_nl_agg[j][i].mat[k] = uc_nl[j][i*18+k];
02354 if(k!=17){
02355 sprintf(buf,", ");
02356 write(fd,buf,strlen(buf));
02357 }
02358 if( k%6 == 5){
02359 sprintf(buf,"\n");
02360 write(fd,buf,strlen(buf));
02361 }
02362 }
02363 sprintf(buf,"}},\n");
02364 write(fd,buf,strlen(buf));
02365 }
02366 sprintf(buf,"\n};\n");
02367 write(fd,buf,strlen(buf));
02368 }
02369 close(fd);
02370 #endif
02371 #endif
02372
02373 #endif // SIMUL_AGG
02374
02375 VRB.FuncEnd(cname,fname);
02376 }
02377
02378 extern "C"
02379 void asqtad_destroy_dirac_buf_g(void)
02380 {
02381 int i;
02382 char *cname = "";
02383 char *fname = "asqtad_destroy_dirac_buf_g()";
02384 VRB.Func(cname,fname);
02385 for ( i = 0; i < 2; i++){
02386 #ifndef SIMUL_U
02387 sfree(uc_l[i],"uc_l[i]",fname,cname);
02388 sfree(uc_nl[i],"uc_nl[i]",fname,cname);
02389 #endif
02390 sfree(uc_l_agg[i],"uc_l_agg[i]",fname,cname);
02391 sfree(uc_nl_agg[i],"uc_nl_agg[i]",fname,cname);
02392 }
02393 VRB.FuncEnd("",fname);
02394 }
02395
02396
02397
02398
02399
02400
02401
02402
02403
02404 static int CoordNN( int nn )
02405 {
02406 int i;
02407 int off_node = 0;
02408 int m;
02409
02410 for ( i = 0; i < 4; i++ )
02411 coord_nn[i] = coord[i];
02412
02413 m = nn % 4;
02414 for ( i = 0; i < 4; i++ ){
02415 coord_nn [ i ] = coord[i];
02416 }
02417
02418 if (nn<4 ) coord_nn[m]+=1;
02419 else coord_nn[m]-=1;
02420 while (coord_nn[m] < 0) {
02421 coord_nn[m] += size[m] ;
02422 }
02423 coord_nn[m] %= size[m] ;
02424 if (nn<4){ if (coord_nn[m] != (coord[m]+1)) off_node = 1;}
02425 else {if (coord_nn[m] != (coord[m]-1)) off_node = 1;}
02426 return off_node;
02427 }
02428
02429 static int CoordkNN( int nn, int k )
02430 {
02431 int i;
02432 int off_node = 0;
02433 int m;
02434
02435 for ( i = 0; i < 4; i++ )
02436 coord_knn[i] = coord[i];
02437
02438 m = nn % 4;
02439 for ( i = 0; i < 4; i++ ){
02440 coord_knn [ i ] = coord[i];
02441 }
02442
02443 if (nn<4 ) coord_knn[m]+=k;
02444 else coord_knn[m]-=k;
02445 while (coord_knn[m] < 0) {
02446 coord_knn[m] += size[m] ;
02447 }
02448 coord_knn[m] %= size[m] ;
02449
02450 if (nn<4 ){ if (coord_knn[m] != (coord[m]+k)) off_node = 1;}
02451 else {if (coord_knn[m] != (coord[m]-k)) off_node = 1;}
02452 return off_node;
02453 }
02454
02455
02456
02457
02458
02459
02460
02461
02462
02463
02464
02465
02466
02467
02468
02469
02470
02471
02472
02473
02474
02475
02476
02477
02478
02479
02480
02481
02482 static int LexGauge( int * c )
02483 {
02484 return c[1] + size[1] * ( c[2] + size[2] * ( c[3] + size[3] * c[0] ));
02485 }
02486
02487
02488
02489
02490
02491 static int LexVector( int * c )
02492 {
02493 return c[0] + size[0] * ( c[1] + size[1] * ( c[2] + size[2] * c[3] ));
02494 }
02495
02496
02497
02498
02499
02500
02501 static int LexSurface( int * cc, int surface )
02502 {
02503 int i;
02504 int s[4];
02505 int c[4];
02506
02507 for ( i = 0; i < 4; i++ ) {
02508 s[i] = size[i];
02509 c[i] = cc[i];
02510 }
02511
02512 s[ surface ] = 1;
02513 c[ surface ] = 0;
02514
02515 return c[0] + s[0] * ( c[1] + s[1] * ( c[2] + s[2] * c[3] ));
02516 }
02517
02518 extern "C" void dirac_comm_assert()
02519 {
02520 }
02521
02522
02523
02524
02525
02526
02527
02528
02529
02530
02531
02532
02533 extern "C"
02534 void asqtad_dirac(IFloat* b, IFloat* a, int a_odd, int add_flag)
02535 {
02536
02537 int i,j,k;
02538 long c = (long) a;
02539 int odd = 1 - a_odd;
02540
02541
02542
02543
02544 #if 0
02545 copy_buffer_cpp(countM[0][a_odd], (long)a, (long)Tbuffer[0][0], (long)ToffsetM[0][a_odd]);
02546 copy_buffer_cpp(countP[0][a_odd], (long)a, (long)Tbuffer[0][1], (long)ToffsetP[0][a_odd]);
02547 if(size[0]>2) {
02548 copy_buffer_cpp(countM[1][a_odd], (long)a, (long)Tbuffer[1][0], (long)ToffsetM[1][a_odd]);
02549 copy_buffer_cpp(countP[1][a_odd], (long)a, (long)Tbuffer[1][1], (long)ToffsetP[1][a_odd]);
02550 if(size[0]>4) {
02551 copy_buffer_cpp(countM[2][a_odd], (long)a, (long)Tbuffer[2][0], (long)ToffsetM[2][a_odd]);
02552 copy_buffer_cpp(countP[2][a_odd], (long)a, (long)Tbuffer[2][1], (long)ToffsetP[2][a_odd]);
02553 }
02554 }
02555 #endif
02556
02557
02558
02559 #if 0
02560 for(i=1;i<4;i++)
02561 if(size[i] >2 ){
02562 SCUarg[i]->Addr( a + Xoffset[2][i]);
02563 SCUarg[i+4]->Addr( a + Xoffset[2][i+4]);
02564 }
02565
02566 save_reg((long)intreg, (long)dreg);
02567
02568
02569 void *addr[2];
02570 if(split) {
02571 for(i=1;i<4;i++){
02572 addr[0] = (void *)(a+Xoffset[0][i]);
02573 addr[1] = (void *)(a+Xoffset[1][i]);
02574 SCUarg_1[i]->Addr( addr,1);
02575 SCUarg_2[i]->Addr( addr+1,1);
02576 addr[0] = (void *)(a+Xoffset[0][i+4]);
02577 addr[1] = (void *)(a+Xoffset[1][i+4]);
02578 SCUarg_1[i+4]->Addr( addr,1);
02579 SCUarg_2[i+4]->Addr( addr+1,1);
02580 }
02581 } else {
02582 for(i=1;i<4;i++){
02583 addr[0] = (void *)(a+Xoffset[0][i]);
02584 addr[1] = (void *)(a+Xoffset[1][i]);
02585 SCUarg_1[i]->Addr( addr,2);
02586 addr[0] = (void *)(a+Xoffset[0][i+4]);
02587 addr[1] = (void *)(a+Xoffset[1][i+4]);
02588 SCUarg_1[i+4]->Addr( addr,2);
02589 }
02590 }
02591
02592 SCUmulti_1->StartTrans();
02593
02594 save_reg((long)intreg, (long)dreg);
02595 #endif
02596
02597
02598
02599
02600 IFloat *fp0,*fp1, *uu;
02601 int *ch;
02602
02603
02604 dirac_cmv_jcw_agg_cpp( (local_chi + local_chi_3)/2, (long)0, (long)uc_l_agg[odd],
02605 (long)a, (long)tmpfrm);
02606
02607
02608
02609
02610
02611
02612
02613
02614
02615
02616
02617
02618 dirac_cmv_jcw_agg_cpp( non_local_chi_3/2, (long)0, (long)&(uc_nl_agg[odd][0]), (long)c, (long)tmpfrm);
02619
02620
02621
02622
02623
02624
02625
02626 dirac_cmv_jcw_agg_cpp( non_local_chi/2, (long)0, (long)&(uc_nl_agg[odd][non_local_chi_3/2]), (long)c, (long)tmpfrm);
02627
02628
02629
02630
02631
02632
02633
02634
02635 if ( add_flag == 0){
02636 dirac_sum2_cpp( vol/2, (long)0, (long)tmpfrm, (long)b);
02637 }
02638 else{
02639
02640 dirac_sum2_acc_cpp( vol/2, (long)0, (long)tmpfrm, (long)b);
02641
02642 }
02643
02644 }
02645
02646 CPS_END_NAMESPACE
02647
02648
02649
02650