Main Page | Modules | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | File Members | Related Pages

asqtad_dirac.C

Go to the documentation of this file.
00001 //-------------------------------------------------------------------
00002 //  $Id: asqtad_dirac.C,v 1.20 2008/02/08 18:35:07 chulwoo Exp $
00003 //
00004 //    12/21/02 HueyWen Lin, Chulwoo Jung
00005 //
00006 //   Asqtad Dirac operator for QCDOC. Communications and computations
00007 //   are overlapped.
00008 //   Uses many functions implemented by CJ for QCDOC
00009 //
00010 //
00011 //-------------------------------------------------------------------
00012 #include <config.h>
00013 #include <util/gjp.h>
00014 #include <comms/scu.h>
00015 #include <comms/glb.h>
00016 #include <util/lattice.h>
00017 #include <util/dirac_op.h>
00018 #include <util/vector.h>
00019 #include <stdio.h>
00020 #include <stdlib.h>
00021 //#include <unistd.h>
00022 #include <fcntl.h>
00023 //#include <time_cps.h>
00024 #include <math.h>
00025 CPS_START_NAMESPACE
00026 
00027 #undef num //temperary test for uc_l and uc_nl
00028 //~~~~~~~~~~~~~
00029 
00030 #define fat_link
00031 #define naik
00032 #define staple5
00033 #define staple7
00034 
00035 
00036 /*****************************************************************
00037  SIMUL switched on/off the hack CJ put in to help speed up the
00038 simulation. if SIMUL is undefined, program writes the temporary arraies
00039 for dirac operator. If SIMUL is defined, it will include (pre-calcuated)
00040 temporary arraies and skip the generation of arraies.
00041 ******************************************************************/
00042 /****************************************************************
00043 CPP is a switch for using C++ routine for dirac_cmv.
00044 *****************************************************************/
00045 
00046 typedef struct gauge_agg{
00047   int src;
00048   int dest;
00049   IFloat mat[18];
00050 } gauge_agg ;
00051 
00052 void dirac_sum2_acc_cpp(int s, long chi, long tmpfrm, long b);
00053 void dirac_cmv_cpp( int sites, long chi, long u,long a, long tmpfrm);
00054 void dirac_cmv_jcw_agg_cpp( int sites, long chi, long u,long a, long tmpfrm);
00055 void dirac_sum2_cpp(int s, long chi, long tmpfrm, long b);
00056 void copy_buffer_cpp(int n, long src, long dest, long ptable);
00057 
00058 extern "C" void copy_buffer(int n, long src, long dest, long ptable);
00059 
00060 enum{VECT_LEN=6, VECT_LEN2=6, MATRIX_SIZE=18, SITE_LEN=72, NUM_DIR=8};
00061 //---------------------------------------------------------------------
00062 //  Arrays for send direction, receive direction, lattice size per
00063 //  node, coordinate (for site where cluster of 8 matrices
00064 //  is being assembled) and the coordinates of a nearest neighbor site.
00065 //
00066 //  0 == T, 1 == X, 2 == Y, 3 == Z
00067 //--------------------------------------------------------------------
00068 
00069 
00070 static int size[4];
00071 static int split; 
00072 static int coord[4];
00073 static int coord_nn[4];
00074 static int coord_knn[4];
00075 static int vol;
00076 static int non_local_chi_3;
00077 static int non_local_chi;
00078 static int local_chi;
00079 static int local_chi_3;
00080 static int buffer_order[3][4]; // never used ??
00081 static int buffer_flush[3][8]; // never used ?? asign space in dirac_init but never be used after that.
00082 static int nflush;
00083 
00084 //---------------------------------------------------------------------
00085 //  uc_l[0] points to a cluster of matrices per even site for local 
00086 //  computations , arranged so that all parallel transport of spinors 
00087 //  is accomplished by the same matrix times vector function.
00088 //  The volume is devided by 2 areas: nn and 3rd nn part.
00089 //  uc_l[1] is the same for odd sites.
00090 //  uc_nl[0] is the same for even non-local sites.
00091 //  uc_nl[1] is the same for odd non-local sites.
00092 //---------------------------------------------------------------------
00093 static gauge_agg * uc_l_agg[2];
00094 static gauge_agg * uc_nl_agg[2];
00095 static IFloat * uc_l[2];
00096 static IFloat * uc_nl[2];
00097 
00098 //------------------------------------------------------------------
00099 // Allocate these arrays dynamicaly once the cache, noncached eDRAM
00100 // malloc are available (should be changed according to volume)
00101 //------------------------------------------------------------------
00102 
00103 static IFloat *tmpfrm;
00104 
00105 static IFloat *Tbuffer[3][2];
00106 
00107 //-------------------------------------------------------------------
00108 // end of stack based arrays which should be heap based
00109 //-------------------------------------------------------------------
00110 static int intreg[18] ;
00111 static IFloat dreg[18] ;
00112 static int * ToffsetP[3][2];
00113 static int * ToffsetM[3][2];
00114 static int countP[3][2];
00115 static int countM[3][2];
00116 //---------------------------------------------------------------------
00117 //  pointers to storage area for color vectors from tp, xp, yp, zp, tm,
00118 //  xm, ym, zm (p = plus, m = minus).  Indexed as 0-7
00119 //---------------------------------------------------------------------
00120 static IFloat * chi_off_node_total;
00121 static IFloat * chi_off_node[3][8];
00122 static IFloat * chi_off_node_p[3][8];
00123 //------------------------------------------------------------------
00124 //  pointer to array of pointers telling where color vectors are
00125 //  located for cluster arrangement of lattice (even and odd).
00126 //  chi_l[0] points to a list of local adresses for chi's needed 
00127 //  to get an even site result from application of D and
00128 //  to a temporary storage area where U_mu * chi's for each direction 
00129 //  are stored.
00130 //  the first half of storage size local_chi is for the nn,
00131 //   while the second half of storage size local_chi_3 is for the 3rd nn.
00132 //  chi_l[1] same for odd site.
00133 //  chi_nl[0] same for computations involving non-local spinors
00134 //------------------------------------------------------------------
00135 static IFloat ** chi_l[2];
00136 static IFloat ** chi_nl[2];
00137 
00138 //------------------------------------------------------------------
00139 //  Values for send and receive transfers. 0-7 correspond to
00140 //  tp, xp, yp, zp, tm, xm, ym, zm (p = plus, m = minus).
00141 //
00142 //  Rarg (for SCU receives) never changes, since it receives into
00143 //  the buffers for the specified direction.
00144 //  SCUarg[3] is set up for different Xoffset and Toffset.
00145 //------------------------------------------------------------------
00146 
00147 #if 0
00148 SCUDirArgIR * SCUarg[2*NUM_DIR];
00149 SCUDirArgIR SCUargIR[2*NUM_DIR];
00150 SCUDirArgMulti * SCUmulti;
00151 SCUDirArgMulti SCUmultiIR;
00152 
00153 SCUDirArgIR * SCUarg_1[2*NUM_DIR];
00154 SCUDirArgIR SCUarg_1IR[2*NUM_DIR];
00155 SCUDirArgMulti * SCUmulti_1;
00156 SCUDirArgMulti SCUmulti_1IR;
00157 
00158 SCUDMAInst *SCUDMAarg_p[NUM_DIR*4];
00159 SCUDMAInst SCUDMAarg[NUM_DIR*4];
00160 
00161 SCUDirArgIR * SCUarg_2[2*NUM_DIR];
00162 SCUDirArgIR SCUarg_2IR[2*NUM_DIR];
00163 SCUDirArgMulti * SCUmulti_2;
00164 SCUDirArgMulti SCUmulti_2IR;
00165 #endif
00166 
00167 //------------------------------------------------------------------
00168 //  The offset into the even or odd checkboard chi's and chi3's used for a send
00169 //  from a given face.
00170 //------------------------------------------------------------------
00171 
00172 static int Xoffset[3][NUM_DIR];
00173 
00174 //-------------------------------------------------------------------
00175 //  Given a lexical value for gauge fields, set the coordinates.
00176 //  Return 1 if odd, 0 if even
00177 //-------------------------------------------------------------------
00178 
00179 static int SetCoord( int sg );
00180 
00181 //---------------------------------------------------------------------
00182 //  Find nearest neighbor coordinate for coordinates given.  Nearest
00183 //  neighbor coordinates are placed in coord_nn, which are always
00184 //  on-node.  Function returns 0 if nearest neighbor is on-node,
00185 //  1 if off-node.
00186 //
00187 //  nn = 0 to 7.  tp, xp, yp, zp, tm, xm, ym, zm respectively.
00188 //---------------------------------------------------------------------
00189 
00190 static int CoordNN( int nn );
00191 
00192 static int CoordkNN( int nn, int k );
00193 
00194 //---------------------------------------------------------------------
00195 //  Return lexical value for links from coordinates c
00196 //---------------------------------------------------------------------
00197 
00198 static int LexGauge( int * c );
00199 
00200 //---------------------------------------------------------------------
00201 //  Return lexical value for vectors from coordinates c
00202 //---------------------------------------------------------------------
00203 
00204 static int LexVector( int * c );
00205 
00206 //-------------------------------------------------------------------
00207 //  Given a coordinate and a surface ( 0 = t, 1 = x, 2 = y, 3 = x )
00208 //  calculate the offset into the receive buffers (chi_off_node);
00209 //-------------------------------------------------------------------
00210 
00211 static int LexSurface( int * cc, int surface );
00212 
00213 static IFloat * gauge_field_addr;
00214 
00215 static  int blklen[NUM_DIR/2];
00216 static  int numblk[NUM_DIR/2];
00217 static  int stride[NUM_DIR/2];
00218 
00219 int k;
00220 //-------------------------------------------------------------------
00221 //  Called by the lattice constructor
00222 //  Fermion initializations: pointer tables 
00223 //-------------------------------------------------------------------
00224 extern "C"
00225 void asqtad_dirac_init(Fasqtad * lat )
00226 {
00227   gauge_field_addr = ( IFloat * ) lat->GaugeField();
00228   int r,c,i,j,m,n,nn;
00229   int local_count[2];
00230   int non_local_count[2];
00231   int local_count_3[2];
00232   int non_local_count_3[3][2];
00233   int off_node;
00234   int x[NUM_DIR/2];
00235   char buf[200];
00236   int fd;
00237   char *cname = "DiracOpAsqtad";
00238   char *fname = "dirac_init(const void *gauge)";
00239 
00240   //-------------------------------------------------------------------
00241   //  sg is a lexical index for (t,z,y,x) where x runs fastest. This is
00242   //  the gauge field order produced by convert for staggered fermions.
00243   //
00244   //    sg = x + L_x * ( y + L_y * ( z + L_z * t ) )
00245   //
00246   //  sc is a lexical index for (t,x,y,z) where t runs fastest.  The
00247   //  even and odd staggered color vectors are stored with indices
00248   //  running in this order, except that even sites come before odd
00249   //  sites.
00250   //  
00251   //    sc = t + L_t * ( x + L_x * ( y + L_y * z ) )
00252   //
00253   //  Similarly the color vectors are indexed by sc/2 for both even
00254   //  and odd blocks.  Even and odd blocks have a different base
00255   //  address.
00256   //-------------------------------------------------------------------
00257 
00258   int sg, sc;
00259 
00260   //-----------------------------------------------------------
00261   //  If t + x + y + z is odd, odd = 1.  Otherwise it is 0.
00262   //-----------------------------------------------------------
00263 
00264   int odd;
00265 
00266   //-----------------------------------------------------------
00267   //  The physics system storage order has vector indices as
00268   //  0-3, x,y,z,t.  Our vector indices run 0-3 as t,x,y,z.
00269   //  nn is used to hold physics system values for our index,
00270   //  given by n.
00271   //-----------------------------------------------------------
00272 
00273   size[0] = GJP.TnodeSites();
00274   size[1] = GJP.XnodeSites();
00275   size[2] = GJP.YnodeSites();
00276   size[3] = GJP.ZnodeSites();
00277   split =  (  (size[0]>2 &&size[1]>2 && size[2]>2 && size[3] >2 ) ? 0 : 1 ); 
00278 //  split =1;
00279 
00280   vol = size[0] * size[1] * size[2] * size[3];
00281 
00282   non_local_chi = 2*(size[0]*size[1]*size[2] + size[1]*size[2]*size[3]+
00283     size[2]*size[3]*size[0] + size[3]*size[0]*size[1]);
00284 
00285   non_local_chi_3 = 0;
00286   for(i=0;i<4;i++){
00287     if(size[i]<4)
00288       non_local_chi_3 += 4*vol/size[i];
00289     else
00290       non_local_chi_3 += 6*vol/size[i];
00291   }
00292 
00293 
00294   local_chi = NUM_DIR*vol - non_local_chi;
00295   local_chi_3 = NUM_DIR*vol - non_local_chi_3;
00296   nflush = vol/8;
00297   tmpfrm = (IFloat*)smalloc(NUM_DIR*2*vol/2 * VECT_LEN2 * sizeof(IFloat),
00298                             "tmpfrm",fname,cname);
00299 
00300 
00301   //-----------------------------------------------------------------
00302   //  Allocate 8 receive buffers for off-node vectors
00303   //-----------------------------------------------------------------
00304 
00305 
00306   chi_off_node_total = (IFloat*) 
00307     smalloc(3*non_local_chi*VECT_LEN*sizeof(IFloat)/2,
00308             "chi_off_node_total",fname,cname);
00309   for ( j= 0; j < 3; j++ ){
00310     chi_off_node[j][0] = &(chi_off_node_total[ j*non_local_chi* VECT_LEN/2 ] ); 
00311     chi_off_node_p[j][0] = (IFloat *)(sizeof (IFloat)*j*non_local_chi* VECT_LEN/2 );
00312     for ( i = 1; i < NUM_DIR; i++ ){
00313       chi_off_node[j][i] = chi_off_node[j][i-1]+vol/(2*size[i%4])*VECT_LEN;
00314       chi_off_node_p[j][i] = chi_off_node_p[j][i-1]+vol/(2*size[i%4])*VECT_LEN;
00315     }
00316   }
00317   //-----------------------------------------------------------------
00318   //  Space for storage of pointers to chi's.  2 pointers per site,
00319   //  but split into even and odd groups for the first part of the
00320   //  computation (parallel transport of spinors). 17 pointers per site
00321   //  to obtain the result of the application of the dirac operator
00322   //
00323   //  The size of chi_l and chi_nl has been doubled for additional U*Chi and UUU*chi
00324   //-----------------------------------------------------------------
00325 
00326 #ifndef SIMUL
00327   for ( i = 0; i < 2; i++ ){
00328     chi_l[i] = (IFloat**)smalloc((2*(local_chi+local_chi_3)/2)*sizeof(IFloat*),
00329                                  "chi_l[i]", fname, cname);
00330     chi_nl[i] = (IFloat**)
00331       smalloc((2*(non_local_chi+non_local_chi_3)/2)*sizeof(IFloat*),
00332               "chi_l[i]", fname, cname);
00333   }
00334 #endif
00335   
00336   for ( i = 0; i < 2; i++){
00337     local_count[i] = 0;
00338     non_local_count[i] = 0;
00339     local_count_3[i] = 0;
00340     for( n = 0; n < 3; n++ )  non_local_count_3[n][i] = 0;
00341 
00342   }
00343 
00344 
00345 #ifndef SIMUL
00346   //-----------------------------------------------------------------
00347   //  Loop over all directions
00348   //-----------------------------------------------------------------
00349   for ( n = 0; n < NUM_DIR; n++ ) {
00350 
00351     //-----------------------------------------------------------------
00352     //  Loop over all sites
00353     //-----------------------------------------------------------------
00354     for (x[3] = 0; x[3] < size[3]; x[3]++){
00355       for (x[2] = 0; x[2] < size[2]; x[2]++){
00356         for (x[1] = 0; x[1] < size[1]; x[1]++){
00357         for (x[0] = 0; x[0] < size[0]; x[0]++){  
00358 
00359             for (i = 0; i < 4 ; i++) coord[i] = x[i];
00360 
00361             odd = ( coord[0] + coord[1] + coord[2] + coord[3] ) % 2;
00362 
00363             sg = coord[1] + size[1] * ( coord[2] + size[2] * ( coord[3] + size[3] * coord[0] ));
00364             m = (2* NUM_DIR + 1) * (sg/2);
00365 
00366             if ( CoordNN( n ) ) {               //  off-node
00367               //----------------------------------------------------------
00368               // Assembly written for double precision only, multiplication
00369               // by sizeof(double) done to avoid a bitshift inside the
00370               // high performance code
00371               //----------------------------------------------------------
00372               //pointer to source field (offset in the receive buffer)
00373 
00374               *( chi_nl[ odd ]  +  2 * non_local_count[ odd ] )
00375                 = ( IFloat * ) ( VECT_LEN * ( LexVector( coord_nn ) / 2 ) * sizeof(IFloat));
00376 
00377               // pointer to temporary field where U*chi is stored
00378               *( chi_nl[ odd ] + 2*non_local_count[ odd ] +1 ) =
00379                                 ( IFloat * ) ( VECT_LEN2 * (LexVector( coord ) / 2 * 2*NUM_DIR+n )  * sizeof(IFloat));
00380               non_local_count[odd]++;
00381             }
00382         }
00383        }
00384       }
00385      }
00386     }
00387   //-----------------------------------------------------------------
00388   //  Loop over all directions
00389   //-----------------------------------------------------------------
00390   for ( n = 0; n < NUM_DIR; n++ ) {
00391 
00392     //-----------------------------------------------------------------
00393     //  Loop over all sites
00394     //-----------------------------------------------------------------
00395     for (x[3] = 0; x[3] < size[3]; x[3]++){
00396       for (x[2] = 0; x[2] < size[2]; x[2]++){
00397                         for (x[1] = 0; x[1] < size[1]; x[1]++){
00398                                 for (x[0] = 0; x[0] < size[0]; x[0]++){  
00399 
00400             for (i = 0; i < 4 ; i++) coord[i] = x[i];
00401 
00402             odd = ( coord[0] + coord[1] + coord[2] + coord[3] ) % 2;
00403 
00404             sg = coord[1] + size[1] * ( coord[2] + size[2] * ( coord[3] + size[3] * coord[0] ));
00405             m = (2* NUM_DIR + 1) * (sg/2);
00406 
00407             if ( !CoordNN( n ) ) {              //  off-node
00408 #ifndef SIMUL
00409               //pointer to source field
00410               *( chi_l[ odd ]  +  2 * local_count[ odd ] )
00411                 = ( IFloat * ) ( VECT_LEN * ( LexVector( coord_nn ) / 2 ) * sizeof(IFloat));
00412               // pointer to temporary field where U*chi is stored
00413               *( chi_l[ odd ]  +  2 * local_count[ odd ] + 1) =
00414                                         ( IFloat * ) ( VECT_LEN2 * (LexVector( coord ) / 2 *2*NUM_DIR+n) * sizeof(IFloat));
00415 #endif
00416               local_count[odd]++;
00417             }  //else-on_node case
00418         }
00419        }
00420       }
00421      }
00422     }
00423   //-----------------------------------------------------------------
00424   //  Loop over all directions
00425   //-----------------------------------------------------------------
00426   for ( n = 0; n < NUM_DIR; n++ ) {
00427 
00428     //-----------------------------------------------------------------
00429     //  Loop over all sites
00430     //-----------------------------------------------------------------
00431     for (x[3] = 0; x[3] < size[3]; x[3]++){
00432       for (x[2] = 0; x[2] < size[2]; x[2]++){
00433                         for (x[1] = 0; x[1] < size[1]; x[1]++){
00434                                 for (x[0] = 0; x[0] < size[0]; x[0]++){  
00435 
00436             for (i = 0; i < 4 ; i++) coord[i] = x[i];
00437 
00438             odd = ( coord[0] + coord[1] + coord[2] + coord[3] ) % 2;
00439 
00440             sg = coord[1] + size[1] * ( coord[2] + size[2] * ( coord[3] + size[3] * coord[0] ));
00441             m = (2* NUM_DIR + 1) * (sg/2);
00442 
00443 
00444 
00445    //******chi3*********
00446 
00447   if ( CoordkNN( n, 3 ) ) {             //  chi3_off-node
00448         
00449     for (j=0; j<3; j++){
00450        if((coord_knn[n%4]==j&&n<4)||(coord_knn[n%4]==(size[n%4]-1-j)&& n>3)) {
00451          *( chi_nl[ odd ] + 2 * non_local_count_3[j][odd]+(j+1)*non_local_chi )
00452         = ( IFloat * ) ( VECT_LEN * ( LexVector( coord_knn ) / 2 )
00453                                  * sizeof(IFloat));
00454         *( chi_nl[odd] + 2 * non_local_count_3[j][odd] + (j+1)*non_local_chi+1 ) = ( IFloat * ) ( VECT_LEN2 * (LexVector( coord ) / 2*2*NUM_DIR+n+8 )  * sizeof(IFloat));
00455         non_local_count_3[j][odd]++;
00456        }
00457     }
00458 
00459                                
00460           }  // end of chi3_off-node
00461         }
00462        }
00463       }
00464      }
00465     }
00466   //-----------------------------------------------------------------
00467   //  Loop over all directions
00468   //-----------------------------------------------------------------
00469   for ( n = 0; n < NUM_DIR; n++ ) {
00470 
00471     //-----------------------------------------------------------------
00472     //  Loop over all sites
00473     //-----------------------------------------------------------------
00474     for (x[3] = 0; x[3] < size[3]; x[3]++){
00475       for (x[2] = 0; x[2] < size[2]; x[2]++){
00476                         for (x[1] = 0; x[1] < size[1]; x[1]++){
00477                                 for (x[0] = 0; x[0] < size[0]; x[0]++){  
00478 
00479             for (i = 0; i < 4 ; i++) coord[i] = x[i];
00480 
00481             odd = ( coord[0] + coord[1] + coord[2] + coord[3] ) % 2;
00482 
00483             sg = coord[1] + size[1] * ( coord[2] + size[2] * ( coord[3] + size[3] * coord[0] ));
00484             m = (2* NUM_DIR + 1) * (sg/2);
00485 
00486   if ( !CoordkNN( n, 3 ) ) {            //  chi3_off-node
00487 #ifndef SIMUL
00488               //pointer to source field
00489               *( chi_l[ odd ]  +  2 * local_count_3[ odd ] + local_chi )
00490                 = ( IFloat * ) ( VECT_LEN * ( LexVector( coord_knn ) / 2 )
00491                                  * sizeof(IFloat));
00492               // pointer to temporary field where U*chi is stored
00493               *( chi_l[ odd ]  +  2 * local_count_3[ odd ] + local_chi + 1) =
00494                 ( IFloat * ) ( VECT_LEN2 * (LexVector( coord ) / 2 *2*NUM_DIR+n+8) * sizeof(IFloat));
00495 #endif
00496               local_count_3[odd]++;
00497             }  //else-on_node case
00498 
00499 
00500           } // for x[0] loop
00501         }
00502       }
00503     }// for x[3] loop
00504   }//for n loop
00505 #endif /* SIMUL */
00506   FILE *fp;
00507 
00508 #ifndef SIMUL
00509 #if 0
00510   fp=Fopen("chi_l.h","w");
00511   for(j=0;j<2;j++){
00512     Fprintf(fp,"IFloat * chi_l%d[] LOCATE(\"edramtransient\") = {\n",j);
00513     Fprintf(fp," (IFloat *) %d",*(chi_l[j]));
00514     for(i=1;i< 2*((local_chi+local_chi_3)/2);i++){
00515       Fprintf(fp,",\n (IFloat *) %d",*(chi_l[j]+i));
00516     }
00517     Fprintf(fp,"\n};\n");
00518   }
00519   Fclose(fp);
00520 
00521   fp=Fopen("chi_nl.h","w");
00522     Fprintf(fp,"testing\n");
00523   for(j=0;j<2;j++){
00524     Fprintf(fp,"IFloat * chi_nl%d[] LOCATE(\"edramtransient\") = {\n",j);
00525     Fprintf(fp," (IFloat *) %d",*(chi_nl[j]));
00526     for(i=1;i< 2*((non_local_chi+ non_local_chi_3)/2);i++){
00527       Fprintf(fp,",\n (IFloat *) %d",*(chi_nl[j]+i));
00528     }
00529     Fprintf(fp,"\n};\n");
00530   }
00531 
00532   Fclose(fp);
00533 #endif
00534 
00535 #endif /* SIMUL */
00536 
00537 
00538 
00539 
00540 
00541 
00542   //-------------------------------------------------------------------
00543   //  Set up SCU buffer parameters.  T direction is special, since
00544   //  the block-strided move will not work here.
00545   //-------------------------------------------------------------------
00546 
00547   blklen[0] = VECT_LEN * sizeof(IFloat) * size[1] * size[2] * size[3] / 2;
00548   blklen[1] = VECT_LEN * sizeof(IFloat) * size[0] / 2;
00549   blklen[2] = VECT_LEN * sizeof(IFloat) * size[0] * size[1] / 2;
00550   blklen[3] = VECT_LEN * sizeof(IFloat) * size[0] * size[1] * size[2] / 2;
00551 
00552   numblk[0] = 1;
00553   numblk[1] = size[2] * size[3];
00554   numblk[2] = size[3];
00555   numblk[3] = 1;
00556 
00557   stride[0] = 0;
00558   stride[1] = (VECT_LEN * size[0] * ( size[1] - 1 ) / 2)*sizeof(IFloat);
00559   stride[2] = (VECT_LEN * size[0] * size[1] * ( size[2] - 1 ) / 2)* sizeof(IFloat) ;
00560   stride[3] = 0;
00561 
00562   //-------------------------------------------------------------------
00563   //  Calculate offsets for T transfers done one word at a time.
00564   //  We have plus (P) transfers for both the even and odd
00565   //  checkerboards.  Same for minus (M) transfers.
00566   //-------------------------------------------------------------------
00567 
00568   for ( k = 0; k < 3; k++ ) {
00569   for ( i = 0; i < 2; i++ ) {
00570 #ifndef SIMUL_TBUF
00571     Tbuffer[k][i] = (IFloat*) 
00572       smalloc(size[1]*size[2]*size[3]*VECT_LEN*sizeof(IFloat)/2,
00573               "Tbuffer[k][i]", fname, cname);
00574     ToffsetP[k][i] = (int*) smalloc(size[1]*size[2]*size[3]*sizeof(int)/2,
00575                                     "ToffsetP[k][i]", fname, cname);
00576     ToffsetM[k][i] = (int*) smalloc(size[1]*size[2]*size[3]*sizeof(int)/2,
00577                                     "ToffsetM[k][i]", fname, cname);
00578 #endif
00579     countP[k][i] = 0;
00580     countM[k][i] = 0;
00581   }
00582  }
00583   for ( sg = 0; sg < vol; sg++ ) {
00584 
00585     odd = SetCoord( sg );
00586     sc = LexVector( coord );
00587 
00588     for ( int j = 0; j < 3; j++ ) {
00589       if ( coord[0] == j ) {
00590         *( ToffsetM[j][ odd ] + countM[j][ odd ] ) = VECT_LEN * ( sc / 2 );
00591         countM[j][ odd ]++;
00592       }
00593       if ( coord[0] == size[0] - 1 -j ) {
00594         *( ToffsetP[j][ odd ] + countP[j][ odd ] ) = VECT_LEN * ( sc / 2 );
00595         countP[j][ odd ]++;
00596       }
00597     }//end of j loop
00598   } //end of sg loop
00599   
00600   int vol3 = (size[1] * size[2] * size[3])/2;
00601 
00602   //-------------------------------------------------------------------
00603   //  Index i says data has been received from TP, XP, YP, ZP, TM, XM,
00604   //  YM, ZM
00605   //-------------------------------------------------------------------
00606 
00607   for ( i = 0; i < NUM_DIR; i++ ) {
00608     j = i % (NUM_DIR/2);
00609 #if 0
00610       SCUarg[i + NUM_DIR] = &(SCUargIR[i+NUM_DIR]);
00611       SCUarg[i + NUM_DIR] ->Init(chi_off_node[2][i], scudir[i], SCU_REC,
00612                     VECT_LEN * sizeof(IFloat) * vol / ( 2 * size[j] ), 1, 0, IR_7);
00613       SCUarg[i + NUM_DIR] ->Assert();
00614 
00615     SCUDMAarg_p[(i+NUM_DIR)*2]  = &(SCUDMAarg[(i+NUM_DIR)*2]);
00616     SCUDMAarg_p[(i+NUM_DIR)*2] ->Init(chi_off_node[0][i],
00617       VECT_LEN * sizeof(IFloat) * vol / ( 2 * size[j] ), 1, 0);
00618     SCUDMAarg_p[(i+NUM_DIR)*2+1]  = &(SCUDMAarg[(i+NUM_DIR)*2+1]);
00619     SCUDMAarg_p[(i+NUM_DIR)*2+1] ->Init(chi_off_node[1][i],
00620       VECT_LEN * sizeof(IFloat) * vol / ( 2 * size[j] ), 1, 0);
00621 
00622     if( split ){
00623       SCUarg_1[i + NUM_DIR] = &(SCUarg_1IR[i+NUM_DIR]);
00624       SCUarg_1[i + NUM_DIR] ->Init(scudir[i],SCU_REC, &SCUDMAarg_p[(i+NUM_DIR)*2],1, IR_5);
00625       SCUarg_1[i + NUM_DIR] ->Assert();
00626       SCUarg_2[i + NUM_DIR] = &(SCUarg_2IR[i+NUM_DIR]);
00627       SCUarg_2[i + NUM_DIR] ->Init(scudir[i],SCU_REC, &SCUDMAarg_p[(i+NUM_DIR)*2+1],1, IR_6);
00628       SCUarg_2[i + NUM_DIR] ->Assert();
00629     } else {
00630       SCUarg_1[i + NUM_DIR] = &(SCUarg_1IR[i+NUM_DIR]);
00631       SCUarg_1[i + NUM_DIR] ->Init(scudir[i],SCU_REC, &SCUDMAarg_p[(i+NUM_DIR)*2],2, IR_5);
00632       SCUarg_1[i + NUM_DIR] ->Assert();
00633     }
00634 
00635       // change the size of buffer_flush or add 2 more??
00636       // never used ??  // 12288 or 384?
00637       buffer_flush[0][i] = VECT_LEN * sizeof(IFloat) * vol/ (12288 * size[j]);
00638       buffer_flush[1][i] = VECT_LEN * sizeof(IFloat) * vol/ (12288 * size[j]);
00639       buffer_flush[2][i] = VECT_LEN * sizeof(IFloat) * vol/ (12288 * size[j]);
00640 
00641     //send arguments
00642     if ((i == 0) || ( i == 4)){
00643       SCUarg[i] = &(SCUargIR[i]);
00644       if (size[j] >4)
00645         SCUarg[i] ->Init (Tbuffer[2][(4 - i)/4], scudir[i], SCU_SEND,
00646                        blklen[j], numblk[j], stride[j], IR_7 );
00647       else if (size[j] >2) 
00648         SCUarg[i] ->Init (Tbuffer[1][i/4], scudir[i], SCU_SEND,
00649                        blklen[j], numblk[j], stride[j], IR_7 );
00650       else
00651         SCUarg[i] ->Init (chi_off_node[0][4-i], scudir[i], SCU_SEND,
00652                        blklen[j], numblk[j], stride[j], IR_7 );
00653       SCUarg[i] ->Assert();
00654 
00655       SCUDMAarg_p[i*2] = &(SCUDMAarg[i*2]);
00656       SCUDMAarg_p[i*2] ->Init(Tbuffer[0][(4 - i)/4], 
00657                        blklen[j], numblk[j], stride[j]);
00658       SCUDMAarg_p[i*2+1] = &(SCUDMAarg[i*2+1]);
00659 
00660       if(size[j]>2)
00661         SCUDMAarg_p[i*2+1] ->Init(Tbuffer[1][(4 - i)/4], 
00662                        blklen[j], numblk[j], stride[j]);
00663       else
00664         SCUDMAarg_p[i*2+1] ->Init(Tbuffer[0][i/4], 
00665                        blklen[j], numblk[j], stride[j]);
00666     if( split ){
00667       SCUarg_1[i] = &(SCUarg_1IR[i]);
00668       SCUarg_1[i] ->Init(scudir[i],SCU_SEND,&SCUDMAarg_p[i*2],1,IR_5);
00669       SCUarg_1[i] ->Assert();
00670       SCUarg_2[i] = &(SCUarg_2IR[i]);
00671       SCUarg_2[i] ->Init(scudir[i],SCU_SEND,&SCUDMAarg_p[i*2+1],1,IR_6);
00672       SCUarg_2[i] ->Assert();
00673     } else {
00674       SCUarg_1[i] = &(SCUarg_1IR[i]);
00675       SCUarg_1[i] ->Init(scudir[i],SCU_SEND,&SCUDMAarg_p[i*2],2,IR_5);
00676       SCUarg_1[i] ->Assert();
00677     }
00678 
00679     }
00680     else{
00681       SCUarg[i] = &(SCUargIR[i]);
00682       if(size[j] >2)
00683         SCUarg[i] ->Init(( void * ) 0, scudir[i], SCU_SEND,
00684                        blklen[j], numblk[j], stride[j], IR_7 );
00685       else
00686         SCUarg[i] ->Init(chi_off_node[0][(i+4)%NUM_DIR], scudir[i], SCU_SEND,
00687            VECT_LEN * sizeof(IFloat) * vol / ( 2 * size[j]),1,0, IR_7 );
00688       SCUarg[i] ->Assert();
00689 
00690       SCUDMAarg_p[i*2] = &(SCUDMAarg[i*2]);
00691       SCUDMAarg_p[i*2] ->Init((void *)0, 
00692                        blklen[j], numblk[j], stride[j]);
00693       SCUDMAarg_p[i*2+1] = &(SCUDMAarg[i*2+1]);
00694       SCUDMAarg_p[i*2+1] ->Init((void *)0, 
00695                        blklen[j], numblk[j], stride[j]);
00696       if( split ){
00697         SCUarg_1[i] = &(SCUarg_1IR[i]);
00698         SCUarg_1[i] -> Init(scudir[i],SCU_SEND,&SCUDMAarg_p[i*2],1,IR_5);
00699         SCUarg_1[i] ->Assert();
00700         SCUarg_2[i] = &(SCUarg_2IR[i]);
00701         SCUarg_2[i] -> Init(scudir[i],SCU_SEND,&SCUDMAarg_p[i*2+1],1,IR_6);
00702         SCUarg_2[i] ->Assert();
00703       } else {
00704         SCUarg_1[i] = &(SCUarg_1IR[i]);
00705         SCUarg_1[i] -> Init(scudir[i],SCU_SEND,&SCUDMAarg_p[i*2],2,IR_5);
00706         SCUarg_1[i] ->Assert();
00707       }
00708     }
00709 #endif
00710   }// end of NUM_DIR loop
00711 
00712 
00713 #if 0
00714   SCUmulti = &SCUmultiIR;
00715   SCUmulti->Init(SCUarg, 2*NUM_DIR);
00716   SCUmulti_1 = &SCUmulti_1IR;
00717   SCUmulti_1->Init(SCUarg_1, 2*NUM_DIR);
00718   if( split ){
00719     SCUmulti_2 = &SCUmulti_2IR;
00720     SCUmulti_2->Init(SCUarg_2, 2*NUM_DIR);
00721   }
00722 #endif
00723 
00724   //-------------------------------------------------------------------
00725   //  Need send offsets for various transfers.  The index for
00726   //  sends is TM, XM, YM, ZM, TP, XP, YP, ZP, since the
00727   //  transfers are indexed by the node data is received from.
00728   //-------------------------------------------------------------------
00729 for ( k = 0; k < 3; k++ ) {
00730   Xoffset[k][0] = 0;
00731   Xoffset[k][1] = VECT_LEN * size[0] * (size[1] - 1-k) / 2;
00732   Xoffset[k][2] = VECT_LEN * size[0] * size[1] * (size[2] - 1-k) / 2;
00733   Xoffset[k][3] = VECT_LEN * size[0] * size[1] * size[2] * (size[3]-1-k) / 2;
00734   Xoffset[k][4] = 0;
00735   Xoffset[k][5] = VECT_LEN * size[0] *k /2;
00736   Xoffset[k][6] = VECT_LEN * size[0] * size[1]*k /2;
00737   Xoffset[k][7] = VECT_LEN * size[0] * size[1] * size[2]*k /2;
00738 } // end of k loop
00739 }
00740 
00741 extern "C"
00742 void asqtad_destroy_dirac_buf()
00743 {
00744   char *cname = "";
00745   char *fname = "asqtad_destroy_dirac_buf()";
00746   int i,k;
00747 
00748   for ( i = 0; i < 2; i++ ) {
00749     for ( k = 0; k < 3; k++ ) {
00750 #ifndef SIMUL_TBUF
00751       sfree(Tbuffer[k][i],"Tbuffer[k][i]",fname,cname);
00752       sfree(ToffsetP[k][i],"ToffsetP[k][i]",fname,cname);
00753       sfree(ToffsetM[k][i],"ToffsetM[k][i]",fname,cname);
00754 #endif
00755     }
00756 #ifndef SIMUL
00757     sfree(chi_nl[i],"chi_nl[i]",fname,cname);
00758     sfree(chi_l[i],"chi_l[i]",fname,cname);
00759 #endif
00760   }
00761     
00762    sfree(chi_off_node_total,"chi_off_node_total",fname,cname);
00763   for ( i = 0; i < NUM_DIR; i++ ) {
00764 #if 0
00765     delete SCUarg[i];
00766     delete SCUarg[i+8];
00767     delete SCUarg_1[i];
00768     delete SCUarg_1[i+8];
00769     for(k=0;k<4;k++)
00770       delete SCUDMAarg_p[i*4+k];
00771 #endif
00772   }
00773   sfree(tmpfrm,"tmpfrm",fname,cname);
00774 }
00775 //-------------------------------------------------------------------
00776 //  Given a lexical value for gauge fields, set the coordinates.
00777 //  Return 1 if odd, 0 if even
00778 //-------------------------------------------------------------------
00779 
00780 static int SetCoord( int sg )
00781 {
00782   coord[1] =   sg % size[1];
00783   coord[2] = ((int)( sg / size[1] )) % size[2];
00784   coord[3] = ((int)( sg / ( size[1] * size[2] ) )) % size[3];
00785   coord[0] = ((int)( sg / ( size[1] * size[2] * size[3] ) )) % size[0];
00786 
00787   return ( coord[0] + coord[1] + coord[2] + coord[3] ) % 2;
00788 }
00789 
00790 //-------------------------------------------------------------------
00791 //  Prepare a copy of gauge fields and set up 
00792 //  related pointer for Dirac
00793 //-------------------------------------------------------------------
00794 
00795 void TransfP(int off_node, int nflush_g,IFloat * v,IFloat * mtmp
00796 
00797 , int n ){
00798         getPlusData(mtmp,v,18,(n+1)%4);
00799  }
00800 
00801 void TransfM(int off_node, int nflush_g,IFloat * v,IFloat * mtmp, int n ){
00802         getMinusData(mtmp,v,18,(n+1)%4);
00803  }
00804 void DaggerM (IFloat * w_t1,IFloat * v){
00805         int i, j, c, r;
00806         for ( r = 0; r < 3; r++ ) {
00807                         for ( c = 0; c < 3; c++ ) {
00808                   i = 6*r + 2*c;
00809                           j = 6*c + 2*r;
00810                           w_t1[i] = v[j];
00811                           w_t1[i+1] = -v[j+1];
00812            }
00813                }
00814  }
00815 
00816 void Parallel( IFloat * InMatrix,IFloat *OutMatrix  , int *coord, int dir_1, int dir_2, IFloat *v, IFloat * u,int multi_flag ,int tranfs_flag )
00817      {
00818  int nn, nflush_g=1, off_node;  
00819  IFloat stp3_1[18], mtmp[18] ;
00820  if ( dir_1 <4){      //dir_1 positive
00821     //U_mu(x+mu)
00822       nn = ( dir_1 - 1 + 4 ) % 4;
00823       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
00824       DaggerM(stp3_1, v); 
00825  } else{       //dir_1 negative
00826     //U_mu(x+mu)~
00827       nn = ( dir_1 - 1 + 4 ) % 4;
00828       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
00829       for (int i=0; i<18; i++){
00830         stp3_1[i] = v[i];
00831       }
00832  }//end of dir_1
00833  
00834 if ( multi_flag )
00835      mDotMEqual( OutMatrix, stp3_1, InMatrix ) ;
00836  else {
00837  for (int i=0; i<18; i++){
00838  OutMatrix[i] = stp3_1[i];
00839  }
00840  }
00841 
00842 
00843 if ( tranfs_flag){
00844   if(dir_2<4){ 
00845      
00846       off_node = CoordNN( dir_2 );
00847       coord[ dir_2]= coord_nn[ dir_2];// x+nu-> x
00848 
00849       TransfM( off_node,  nflush_g, OutMatrix, mtmp,dir_2%4  );//transfer OutMatrix to x     
00850   }else {
00851      dir_2= dir_2%4; 
00852       off_node = CoordNN( dir_2+4 );
00853       coord[ dir_2]= coord_nn[ dir_2];// x+nu-> x
00854 
00855       TransfP( off_node,  nflush_g, OutMatrix, mtmp,dir_2%4  );//transfer OutMatrix to x     
00856   
00857   }
00858 }
00859 
00860      } //Parallel
00861 
00862 
00863 void Staple3_PP( IFloat * InMatrix,IFloat *OutMatrix  , int *coord, int n,
00864                  int nu, IFloat *v, IFloat * u, int nflush_g,int sum_flag )
00865     { 
00866  int  nn, i, off_node;
00867 IFloat  stp3_0[18], stp3_2[18],  stp3_4[18], mtmp[18];
00868 
00869   CoordNN( n ); // 
00870   coord[n]= coord_nn[n];// x+mu
00871 
00872   //get U_nu(x+mu)~ and transfer it to  x+nu+mu
00873    Parallel( InMatrix,stp3_0, coord, nu+4, nu, v,  u, 1, 1 );
00874 
00875    // transfer U_nu(x+mu)~ to  x+nu
00876      off_node = CoordNN( n+4 );// x+mu +nu-> x+nu
00877      coord[n]= coord_nn[n];// x+nu
00878      TransfP( off_node,  nflush_g, stp3_0, mtmp,  n);//transfer stp3_0 to  x+nu
00879  
00880  //get U_mu(x+nu)& transfer U_mu(x+nu)*U_nu(x+mu)~ from x+nu to x site  
00881     Parallel( stp3_0,stp3_2, coord, n, nu+4, v,  u, 1, 1 );
00882 //get U_nu(x+mu) & mulyiply U_nu(x+mu)*U_mu(x+nu)*U_nu(x+mu)~
00883     Parallel( stp3_2,stp3_4, coord, nu, 0, v,  u, 1, 0 );
00884 
00885     for ( i = 0; i < 18; i++ ) {
00886                OutMatrix[i] = stp3_4[i];
00887            }
00888 
00889     } //Staple3_PP
00890 
00891 void Staple3_PN( IFloat * InMatrix,IFloat *OutMatrix  , int *coord, int n,
00892                  int nu, IFloat *v, IFloat * u, int nflush_g,int sum_flag ) 
00893 {
00894 int nn, i, off_node;
00895 IFloat  stp3_0[18], stp3_2[18],  stp3_4[18], mtmp[18];
00896 
00897     
00898      CoordNN( n );
00899      coord[n]= coord_nn[n];// x+mu
00900      CoordNN( nu+4 );
00901      coord[nu]= coord_nn[nu];// x+mu-nu
00902 
00903  //U_nu(x+mu) at x+mu-nu & transfer it to  x-nu
00904       Parallel( InMatrix,stp3_0, coord, nu, n+4, v,  u, 1, 1 );
00905 
00906 //U_mu(x-nu) *U_nu(x+mu)  at x-nu
00907        Parallel(stp3_0 ,stp3_2, coord, n, 0, v,  u, 1, 0 );
00908 
00909 //U_nu(x-nu)~ * U_mu(x-nu) *U_nu(x+mu) & transfer it to  x
00910         Parallel(stp3_2 ,stp3_4, coord, nu+4, nu, v,  u, 1, 1 );
00911 
00912     for ( i = 0; i < 18; i++ ) {
00913                OutMatrix[i] = stp3_4[i];
00914            }
00915 
00916  
00917      
00918 }//Staple3_PN
00919 
00920 
00921 void Staple3_NP( IFloat * InMatrix,IFloat *OutMatrix  , int *coord, int n,int nu, IFloat *v, IFloat * u, int nflush_g,int sum_flag )
00922     { 
00923  int  nn, i, off_node;
00924 IFloat  stp3_0[18], stp3_2[18],  stp3_4[18], mtmp[18];
00925   
00926 
00927   CoordNN( n+4 ); // 
00928   coord[n]= coord_nn[n];// x-mu
00929 
00930   //get U_nu(x-mu)~ and transfer it to  x+nu-mu
00931    Parallel( InMatrix,stp3_0, coord, nu+4, nu, v,  u, 1, 1 );
00932 
00933  //get U_mu(x-mu+nu)~& transfer U_mu(x-mu+nu~)*U_nu(x-mu)~ from x+nu-mu to x+nu site  
00934     Parallel( stp3_0,stp3_2, coord, n+4, n, v,  u, 1, 1 );
00935 
00936     //transfer U_mu(x-mu+nu~)*U_nu(x-mu)~ from x+nu to x site  
00937   CoordNN( nu+4 ); // 
00938   coord[nu]= coord_nn[nu];// x
00939  TransfP( off_node,  nflush_g, stp3_2 , mtmp, nu  );
00940   // TransfM( off_node,  nflush_g, stp3_2 , mtmp, nu  );
00941 //get U_nu(x) &  U_nu(x)*U_mu(x+nu-mu)*U_nu(x-mu)~
00942     Parallel( stp3_2,stp3_4, coord, nu, 0, v,  u, 1, 0 );
00943 
00944 
00945     for ( i = 0; i < 18; i++ ) {
00946    
00947                OutMatrix[i] = stp3_4[i];
00948            }
00949 
00950 
00951 
00952 }
00953 
00954 void Staple3_NN( IFloat * InMatrix,IFloat *OutMatrix  , int *coord, int n,int nu, IFloat *v, IFloat * u, int nflush_g, int sum_flag )
00955     { 
00956 
00957  int nn, i, off_node;
00958 IFloat  stp3_0[18], stp3_2[18],  stp3_4[18], mtmp[18];
00959  
00960      
00961      CoordNN( n+4 );
00962      coord[n]= coord_nn[n];// x-mu
00963      CoordNN( nu+4 );
00964      coord[nu]= coord_nn[nu];// x-mu-nu
00965 
00966  //*U_nu(x-mu-nu ) at x-mu-nu 
00967       Parallel( InMatrix,stp3_0, coord, nu, 0, v,  u, 1, 0 );
00968       // Parallel( InMatrix,stp3_0, coord, nu, 0, v,  u, 0, 0 );
00969 
00970 //U_mu(x-mu-nu)~ *U_nu(x-mu-nu )at x-mu-nu & transfer it to  x-nu
00971        Parallel(stp3_0 ,stp3_2, coord, n+4, n, v,  u, 1, 1 );
00972  
00973        // get U_nu(x-nu)~ & transfer U_nu(x-nu)~ * U_mu(x-mu-nu)~ *U_nu(x-mu-nu ) to  x
00974         Parallel(stp3_2 ,stp3_4, coord, nu+4, nu, v,  u, 1, 1 );
00975 
00976     for ( i = 0; i < 18; i++ ) {
00977 
00978                OutMatrix[i] = stp3_4[i];
00979            
00980            }
00981 
00982    
00983 }//Staple3_NN
00984 
00985 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00986 //first P refers to n dir positive, the second P refers to  ro dir positive 
00987 //while the nu direction could be taken from 0 to 7 
00988 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00989 void Staple5_PP( IFloat * InMatrix,IFloat *OutMatrix  , int *coord, int n,
00990                  int nu,int ro, IFloat *v, IFloat * u, int nflush_g )
00991  { 
00992     int  nn, i, off_node;
00993  IFloat stp5_0[18], stp5_1[18], stp5_2[18], 
00994    stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
00995      //U_ro(x+mu)~ and transfer it to  x+ro+mu
00996      CoordNN( n ); //
00997      coord[n]= coord_nn[n];// x+mu
00998       nn = ( ro - 1 + 4 ) % 4;
00999       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01000     mDotMEqual( stp5_0, v, InMatrix  ) ;    
01001 
01002      off_node = CoordNN( ro );// x+mu -> x+mu+ro
01003      coord[ro]= coord_nn[ro];// x+mu+ro
01004      TransfM( off_node,  nflush_g, stp5_0, mtmp,  ro); //transfer stp5_0 to  x+mu+ro
01005 
01006      CoordNN( n+4 ); // 
01007      coord[n]= coord_nn[n];//  x+mu+ro -> x+ro
01008  
01009  
01011  if( nu<4)
01012     Staple3_PP(stp5_0,stp5_4, coord, n,nu, v, u, nflush_g,1 );
01013 else    
01014     Staple3_PN(stp5_0,stp5_4, coord, n,nu%4, v, u, nflush_g,1 );
01015 
01016  
01017 
01018    //transfer it to  x 
01019      off_node = CoordNN( ro +4 );//  x+ro ->x
01020      coord[ro]= coord_nn[ro];//
01021      TransfP( off_node,  nflush_g, stp5_4, mtmp, ro);
01022 
01024       nn = ( ro - 1 + 4 ) % 4;
01025       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01026       DaggerM(stp5_1,v);
01027        mDotMEqual( OutMatrix, stp5_1,  stp5_4) ;
01028 
01029 
01030  }
01031 
01032 //n, ro 
01033 void Staple5_PN( IFloat * InMatrix,IFloat *OutMatrix  , int *coord, int n,
01034                  int nu,int ro, IFloat *v, IFloat * u, int nflush_g )
01035 {
01036 
01037     int  nn, i, off_node;
01038  IFloat stp5_0[18], stp5_1[18], stp5_2[18], 
01039    stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
01040 
01041   
01042 
01043     //U_ro(x+mu-ro) and transfer it to  x+mu-ro
01044      CoordNN( n ); //
01045      coord[n]= coord_nn[n];// x+mu
01046      CoordNN( ro+4 ); //
01047      coord[ro]= coord_nn[ro];// x+mu-ro
01048 
01049       nn = ( ro - 1 + 4 ) % 4;
01050       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01051       DaggerM(stp5_1, v);
01052       mDotMEqual( stp5_0, stp5_1 , InMatrix  ) ; 
01053  
01054       /*     //transfer stp5_0 to  x+mu-ro+nu
01055      off_node = CoordNN( nu  );//  x+mu-ro ->x+mu-ro+nu
01056      coord[nu]= coord_nn[nu];//
01057      TransfP( off_node,  nflush_g, stp5_0, mtmp, nu); 
01058 
01059      CoordNN( nu+4 ); // 
01060      coord[nu]= coord_nn[nu];//  x+mu-ro+nu -> x+mu-ro*/
01061      CoordNN( n+4 ); // 
01062      coord[n]= coord_nn[n];//  x+mu-ro -> x-ro
01063 
01064 
01065 
01066  if( nu<4)
01067     Staple3_PP(stp5_0,stp5_4, coord, n,nu, v, u, nflush_g,1 );
01068 else    
01069     Staple3_PN(stp5_0,stp5_4, coord, n,nu%4, v, u, nflush_g,1 );
01070 
01071 
01072       
01073       nn = ( ro - 1 + 4 ) % 4;
01074       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;     
01075       mDotMEqual( OutMatrix, v,  stp5_4) ;
01076 
01077    //transfer it to  x 
01078      off_node = CoordNN( ro  );//  x-ro ->x
01079      coord[ro]= coord_nn[ro];//
01080      TransfM( off_node,  nflush_g, OutMatrix, mtmp, ro);
01081 
01082 }
01083 
01084 void Staple5_NP( IFloat * InMatrix,IFloat *OutMatrix  , int *coord, int n,
01085                  int nu,int ro, IFloat *v, IFloat * u, int nflush_g )
01086  { 
01087     int  nn, i, off_node;
01088  IFloat stp5_0[18], stp5_1[18], stp5_2[18], 
01089    stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
01090  
01091 
01092      //U_ro(x-mu)~ and transfer it to  x+ro+mu
01093      CoordNN( n+4 ); //
01094      coord[n]= coord_nn[n];// x-mu
01095       nn = ( ro - 1 + 4 ) % 4;
01096       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01097       mDotMEqual( stp5_0, v, InMatrix  ) ; 
01098 
01099      off_node = CoordNN( ro );// x-mu -> x+mu+ro
01100      coord[ro]= coord_nn[ro];// x-mu+ro
01101      TransfM( off_node,  nflush_g, stp5_0, mtmp,  ro); //transfer stp5_0 to  x-mu+ro
01102 
01103      CoordNN( n ); // 
01104      coord[n]= coord_nn[n];//  x-mu+ro -> x+ro
01105  
01106  
01108  if( nu<4)
01109     Staple3_NP(stp5_0,stp5_4, coord, n,nu, v, u, nflush_g,1 );
01110 else    
01111     Staple3_NN(stp5_0,stp5_4, coord, n,nu%4, v, u, nflush_g,1 );
01112 
01113  
01114 
01115    //transfer it to  x 
01116      off_node = CoordNN( ro +4 );//  x+ro ->x
01117      coord[ro]= coord_nn[ro];//
01118      TransfP( off_node,  nflush_g, stp5_4, mtmp, ro);
01119 
01121       nn = ( ro - 1 + 4 ) % 4;
01122       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01123       DaggerM(stp5_1,v);
01124        mDotMEqual(OutMatrix, stp5_1,  stp5_4) ;
01125 
01126 
01127 
01128  }
01129 void Staple5_NN( IFloat * InMatrix,IFloat *OutMatrix  , int *coord, int n,
01130                  int nu,int ro, IFloat *v, IFloat * u, int nflush_g )
01131  { 
01132     int  nn, i, off_node;
01133  IFloat stp5_0[18], stp5_1[18], stp5_2[18], 
01134    stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
01135 
01136 
01137    
01138     //U_ro(x-mu-ro) and transfer it to  x+mu-ro
01139      CoordNN( n+4 ); //
01140      coord[n]= coord_nn[n];// x-mu
01141      CoordNN( ro+4 ); //
01142      coord[ro]= coord_nn[ro];// x-mu-ro
01143 
01144       nn = ( ro - 1 + 4 ) % 4;
01145       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01146       DaggerM(stp5_1, v);
01147       mDotMEqual( stp5_0,stp5_1 , InMatrix  ) ; 
01148 
01149      CoordNN( n ); // 
01150      coord[n]= coord_nn[n];//  x-mu-ro -> x-ro
01151 
01152 
01153 
01154  if( nu<4)
01155     Staple3_NP(stp5_0,stp5_4, coord, n,nu, v, u, nflush_g,1 );
01156 else    
01157     Staple3_NN(stp5_0,stp5_4, coord, n,nu%4, v, u, nflush_g,1 );
01158 
01159 
01160       
01161       nn = ( ro - 1 + 4 ) % 4;
01162       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;     
01163       mDotMEqual( OutMatrix,v,  stp5_4) ;
01164 
01165    //transfer it to  x 
01166      off_node = CoordNN( ro  );//  x-ro ->x
01167      coord[ro]= coord_nn[ro];//
01168      TransfM( off_node,  nflush_g, OutMatrix, mtmp, ro);
01169 
01170 
01171  }
01172 
01173 //#if 0
01174 #ifdef staple7
01175 
01176 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
01177 //first P refers to n dir positive, the second P refers to  de dir positive 
01178 //while the nu & ro direction could be taken from 0 to 7 
01179 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
01180 
01181 void Staple7_PP( IFloat * InMatrix,IFloat *OutMatrix  , int *coord, int n,
01182                  int nu,int ro,int de, IFloat *v, IFloat * u, int nflush_g )
01183  { 
01184     int  nn, i, off_node;
01185  IFloat stp5_0[18], stp5_1[18], stp5_2[18], 
01186    stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
01187      //U_ro(x+mu)~ and transfer it to  x+ro+mu
01188      CoordNN( n ); //
01189      coord[n]= coord_nn[n];// x+mu
01190       nn = ( de - 1 + 4 ) % 4;
01191       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01192     mDotMEqual( stp5_0, v, InMatrix  ) ;    
01193 
01194      off_node = CoordNN( de );// x+mu -> x+mu+ro
01195      coord[de]= coord_nn[de];// x+mu+ro
01196      TransfM( off_node,  nflush_g, stp5_0, mtmp,  de); //transfer stp5_0 to  x+mu+ro
01197 
01198      CoordNN( n+4 ); // 
01199      coord[n]= coord_nn[n];//  x+mu+de -> x+de
01200  
01201  
01203  if( ro<4)
01204     Staple5_PP(stp5_0,stp5_4, coord, n,nu,ro, v, u, nflush_g );
01205 else    
01206     Staple5_PN(stp5_0,stp5_4, coord, n,nu,ro%4, v, u, nflush_g );
01207 
01208  
01209 
01210    //transfer it to  x 
01211      off_node = CoordNN( de +4 );//  x+de ->x
01212      coord[de]= coord_nn[de];//
01213      TransfP( off_node,  nflush_g, stp5_4, mtmp, de);
01214 
01215    
01216       nn = ( de - 1 + 4 ) % 4;
01217       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01218       DaggerM(stp5_1,v);
01219        mDotMEqual( OutMatrix, stp5_1,  stp5_4) ;
01220 
01221 
01222  }
01223 
01224 
01225 void Staple7_PN( IFloat * InMatrix,IFloat *OutMatrix  , int *coord, int n,
01226                  int nu,int ro,int de, IFloat *v, IFloat * u, int nflush_g )
01227 {
01228 
01229     int  nn, i, off_node;
01230  IFloat stp5_0[18], stp5_1[18], stp5_2[18], 
01231    stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
01232 
01233   
01234 
01235     //U_de(x+mu-de) and transfer it to  x+mu-de
01236      CoordNN( n ); //
01237      coord[n]= coord_nn[n];// x+mu
01238      CoordNN( de+4 ); //
01239      coord[de]= coord_nn[de];// x+mu-de
01240 
01241       nn = ( de - 1 + 4 ) % 4;
01242       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01243       DaggerM(stp5_1, v);
01244       mDotMEqual( stp5_0, stp5_1 , InMatrix  ) ; 
01245  
01246      CoordNN( n+4 ); // 
01247      coord[n]= coord_nn[n];//  x+mu-de -> x-de
01248 
01249 
01250 
01251  if( ro<4)
01252     Staple5_PP(stp5_0,stp5_4, coord, n,nu,ro, v, u, nflush_g);
01253  
01254 else    
01255     Staple5_PN(stp5_0,stp5_4, coord, n,nu,ro%4, v, u, nflush_g );
01256 
01257 
01258       
01259       nn = ( de - 1 + 4 ) % 4;
01260       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;     
01261       mDotMEqual( OutMatrix, v,  stp5_4) ;
01262 
01263    //transfer it to  x 
01264      off_node = CoordNN( de  );//  x-de ->x
01265      coord[de]= coord_nn[de];//
01266      TransfM( off_node,  nflush_g, OutMatrix, mtmp, de);
01267 
01268 }
01269 
01270 
01271 void Staple7_NP( IFloat * InMatrix,IFloat *OutMatrix  , int *coord, int n,
01272                  int nu,int ro,int de, IFloat *v, IFloat * u, int nflush_g )
01273 {
01274 
01275  
01276     int  nn, i, off_node;
01277  IFloat stp5_0[18], stp5_1[18], stp5_2[18], 
01278    stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
01279 
01280 
01281  /*    //U_de(x-mu)~ and transfer it to  x+de+mu
01282      CoordNN( n+4 ); //
01283      coord[n]= coord_nn[n];// x-mu
01284       nn = ( de - 1 + 4 ) % 4;
01285       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01286       mDotMEqual( stp5_0, v, InMatrix  ) ; 
01287 
01288      off_node = CoordNN( de );// x-mu -> x+mu+de
01289      coord[ro]= coord_nn[de];// x-mu+de
01290      TransfM( off_node,  nflush_g, stp5_0, mtmp,  de); //transfer stp5_0 to  x-mu+ro
01291 
01292      CoordNN( n ); // 
01293      coord[n]= coord_nn[n];//  x-mu+de -> x+de
01294  
01295  
01296 
01297  if( ro<4)
01298     Staple5_NP(stp5_0,stp5_4, coord, n,nu, ro, v, u, nflush_g );
01299 else    
01300     Staple5_NN(stp5_0,stp5_4, coord, n,nu, ro%4, v, u, nflush_g );
01301 
01302  
01303 
01304    //transfer it to  x 
01305      off_node = CoordNN( de +4 );//  x+ro ->x
01306      coord[de]= coord_nn[de];//
01307      TransfP( off_node,  nflush_g, stp5_4, mtmp, de);
01308 
01310       nn = ( de - 1 + 4 ) % 4;
01311       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01312       DaggerM(stp5_1,v);
01313        mDotMEqual(OutMatrix, stp5_1,  stp5_4) ;
01314 
01315  */
01316  
01317 
01318      //U_de(x-mu)~ and transfer it to  x+de+mu
01319      CoordNN( n+4 ); //
01320      coord[n]= coord_nn[n];// x-mu
01321       nn = ( de - 1 + 4 ) % 4;
01322       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01323       mDotMEqual( stp5_0, v, InMatrix  ) ; 
01324 
01325      off_node = CoordNN( de );// x-mu -> x+mu+de
01326      coord[de]= coord_nn[de];// x-mu+de
01327      TransfM( off_node,  nflush_g, stp5_0, mtmp,  de); //transfer stp5_0 to  x-mu+de
01328 
01329      CoordNN( n ); // 
01330      coord[n]= coord_nn[n];//  x-mu+de -> x+de
01331  
01332  
01333  
01334  if( ro<4)
01335     Staple5_NP(stp5_0,stp5_4, coord, n,nu,ro, v, u, nflush_g);
01336 else    
01337     Staple5_NN(stp5_0,stp5_4, coord, n,nu, ro%4, v, u, nflush_g );
01338 
01339 
01340 
01341    //transfer it to  x 
01342      off_node = CoordNN( de +4 );//  x+de ->x
01343      coord[de]= coord_nn[de];//
01344      TransfP( off_node,  nflush_g, stp5_4, mtmp, de);
01345 
01346    
01347       nn = ( de - 1 + 4 ) % 4;
01348       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01349       DaggerM(stp5_1,v);
01350        mDotMEqual(OutMatrix, stp5_1,  stp5_4) ;
01351 
01352  
01353 
01354  }
01355 
01356 
01357 void Staple7_NN( IFloat * InMatrix,IFloat *OutMatrix  , int *coord, int n,
01358                  int nu,int ro,int de, IFloat *v, IFloat * u, int nflush_g )
01359   { 
01360     int  nn, i, off_node;
01361  IFloat stp5_0[18], stp5_1[18], stp5_2[18], 
01362    stp5_3[18] , stp5_4[18], stp5_5[18], mtmp[18];
01363 
01364 
01365    
01366     //U_de(x-mu-de) and transfer it to  x+mu-de
01367      CoordNN( n+4 ); //
01368      coord[n]= coord_nn[n];// x-mu
01369      CoordNN( de+4 ); //
01370      coord[de]= coord_nn[de];// x-mu-de
01371 
01372       nn = ( de - 1 + 4 ) % 4;
01373       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01374       DaggerM(stp5_1, v);
01375       mDotMEqual( stp5_0,stp5_1 , InMatrix  ) ; 
01376 
01377      CoordNN( n ); // 
01378      coord[n]= coord_nn[n];//  x-mu-de -> x-de
01379 
01380 
01381 
01382  if( ro<4)
01383     Staple5_NP(stp5_0,stp5_4, coord, n,nu,ro, v, u, nflush_g );
01384 else    
01385     Staple5_NN(stp5_0,stp5_4, coord, n,nu,ro%4, v, u, nflush_g );
01386 
01387 
01388       
01389       nn = ( de - 1 + 4 ) % 4;
01390       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;     
01391       mDotMEqual( OutMatrix,v,  stp5_4) ;
01392 
01393    //transfer it to  x 
01394      off_node = CoordNN( de  );//  x-de ->x
01395      coord[de]= coord_nn[de];//
01396      TransfM( off_node,  nflush_g, OutMatrix, mtmp, de);
01397 
01398 
01399   
01400 }
01401 
01402 #endif //staple7
01403 extern "C"
01404 void asqtad_dirac_init_g()
01405 {
01406 
01407   IFloat * u = gauge_field_addr;
01408   int c,i,j,m,n,r;
01409   int off_node;
01410   int local_count[2];
01411   int nflush_g = 2;
01412   int non_local_count[2];
01413   int local_count_3[2];
01414 int non_local_count_3[3][2];
01415   int x[NUM_DIR/2];
01416   char *cname = "";
01417   char *fname = "asqtad_dirac_init_g()";
01418 
01419   //--------------------------------------------------------------------
01420   // c1 -> one link; c2 -> 3-link; c3 -> 3-link staple; c5 -> 5-link staple;
01421   // c7 -> 7-link staple; c6 -> 5-link "straight" staple
01422   //--------------------------------------------------------------------
01423  
01424   IFloat c1 = GJP.KS_coeff();
01425   IFloat c2 = GJP.Naik_coeff();
01426   IFloat c3 = -GJP.staple3_coeff();
01427   IFloat c5 = GJP.staple5_coeff();
01428   IFloat c7 = -GJP.staple7_coeff();
01429   IFloat c6 = GJP.Lepage_coeff(); 
01430 
01431   //-------------------------------------------------------------------
01432   //  sg is a lexical index for (t,z,y,x) where x runs fastest. This is
01433   //  the gauge field order produced by convert for staggered fermions.
01434   //
01435   //    sg = x + L_x * ( y + L_y * ( z + L_z * t ) )
01436   //
01437   //  sc is a lexical index for (t,x,y,z) where t runs fastest.  The
01438   //  even and odd staggered color vectors are stored with indices
01439   //  running in this order, except that even sites come before odd
01440   //  sites.
01441   //
01442   //    sc = t + L_t * ( x + L_x * ( y + L_y * z ) )
01443   //
01444   //-------------------------------------------------------------------
01445 
01446   int sg;
01447 
01448   //-----------------------------------------------------------
01449   //  If t + x + y + z is odd, odd = 1.  Otherwise it is 0.
01450   //-----------------------------------------------------------
01451 
01452   int odd;
01453 
01454   //-----------------------------------------------------------
01455   //  The physics system storage order has vector indices as
01456   //  0-3, x,y,z,t.  Our vector indices run 0-3 as t,x,y,z.
01457   //  nn is used to hold physics system values for our index,
01458   //  given by n.
01459   //-----------------------------------------------------------
01460 
01461   int nn;
01462 
01463   //-----------------------------------------------------------
01464   //  Once all the index arithmetic is finished, v points to
01465   //  the initial gauge field matrix.  w points to where it should
01466   //  be stored.  same for w3 (for UUU matrix).
01467   //-----------------------------------------------------------
01468 
01469   IFloat * v;
01470   IFloat * w, wp1[18];
01471   IFloat * w3;
01472 
01473   IFloat  w_t1[18], w_t2[18], w_t3[18];
01474   IFloat mtmp[18] ;
01475 
01476   //-----------------------------------------------------------
01477   //  SCU transfer structure to get links from off node and a
01478   //  location where one link matrix can be stored.
01479   //-----------------------------------------------------------
01480   VRB.Func(cname,fname);
01481 #if 0
01482   size[0] = GJP.TnodeSites();
01483   size[1] = GJP.XnodeSites();
01484   size[2] = GJP.YnodeSites();
01485   size[3] = GJP.ZnodeSites();
01486 
01487   vol = size[0] * size[1] * size[2] * size[3];
01488 
01489   non_local_chi = 2*(size[0]*size[1]*size[2] + size[1]*size[2]*size[3]+
01490     size[2]*size[3]*size[0] + size[3]*size[0]*size[1]);
01491 
01492   non_local_chi_3 = 6*(size[0]*size[1]*size[2] + size[1]*size[2]*size[3]+
01493     size[2]*size[3]*size[0] + size[3]*size[0]*size[1]);
01494 
01495   local_chi = NUM_DIR*vol - non_local_chi;
01496   local_chi_3 = NUM_DIR*vol - non_local_chi_3;
01497 #endif
01498 
01499 #ifdef SIMUL_U
01500   uc_l[0] = uc_l0;
01501   uc_l[1] = uc_l1;
01502   uc_nl[0] = uc_nl0;
01503   uc_nl[1] = uc_nl1;
01504 #endif
01505 
01506   //-----------------------------------------------------------------
01507   //  Allocate space for two copies of the gauge fields on this node
01508   //-----------------------------------------------------------------
01509   for ( i = 0; i < 2; i++ ){
01510 
01511 #ifndef SIMUL_U
01512     uc_l[i]  = (IFloat*)
01513       smalloc(MATRIX_SIZE*((local_chi+local_chi_3)/2)*sizeof(IFloat),
01514               "uc_l[i]", fname, cname);
01515     for(int j=0;j<MATRIX_SIZE*(local_chi+local_chi_3)/2;j++)
01516         uc_l[i][j]=0.;
01517 #endif
01518     uc_l_agg[i]  = (gauge_agg*)
01519       smalloc(((local_chi+local_chi_3)/2)*sizeof(gauge_agg),
01520               "uc_l_agg[i]", fname, cname);
01521 
01522  
01523 #ifndef SIMUL_U
01524     uc_nl[i] = (IFloat*)
01525       smalloc(MATRIX_SIZE*((non_local_chi+non_local_chi_3)/2) * sizeof(IFloat),
01526               "uc_nl[i]", fname, cname);
01527     for(int j=0;j<MATRIX_SIZE*(non_local_chi+non_local_chi_3)/2;j++)
01528         uc_nl[i][j]=0.;
01529 #endif
01530     uc_nl_agg[i] = (gauge_agg*)
01531       smalloc(((non_local_chi+non_local_chi_3)/2)*sizeof(gauge_agg),
01532               "uc_nl_agg[i]", fname, cname);
01533 
01534   }
01535 
01536 #ifndef SIMUL_U
01537 
01538 
01539 
01540   for ( i = 0; i < 2; i++){
01541     local_count[i] = 0;
01542     non_local_count[i] = 0;
01543     local_count_3[i]=0;
01544    for (j=0; j<3; j++) non_local_count_3[j][i] = 0;
01545        
01546 
01547   }
01548 
01549   //-----------------------------------------------------------
01550   //  Copy links in POSTIVE directions.  These are all on-node
01551   //  Take Hermitian conjugate while doing this
01552   //-----------------------------------------------------------
01553   for ( n = 0; n < NUM_DIR/2; n++ ) {
01554 
01555   //-----------------------------------------------------------------
01556   //  Loop over all sites.  First rearrange gauge field for this
01557   //  site and then set up pointers to vector field
01558   //-----------------------------------------------------------------
01559 
01560     for (x[3] = 0; x[3] < size[3]; x[3]++){
01561       for (x[2] = 0; x[2] < size[2]; x[2]++){
01562                         for (x[1] = 0; x[1] < size[1]; x[1]++){
01563                                 for (x[0] = 0; x[0] < size[0]; x[0]++){
01564 
01565          for (i = 0; i < 4 ; i++) coord[i] = x[i];
01566             odd = ( coord[0] + coord[1] + coord[2] + coord[3] ) % 2;
01567             sg = coord[1] + size[1] * ( coord[2] + size[2] * ( coord[3] +  size[3] * coord[0] ));
01568 
01569     //---------------------------------------------------------------
01570      //U_mu(x) ~
01571      //---------------------------------------------------------------
01572        nn = ( n - 1 + 4 ) % 4;          //  nn is physics system order
01573        v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
01574             //v = u + SITE_LEN * sg + MATRIX_SIZE * nn;
01575        //storage the uc fields in the same order of chi's
01576       if ( CoordNN( n ) ) {             // chi(x+mu) off-node
01577         w = uc_nl[odd] + MATRIX_SIZE * non_local_count[odd];
01578         for ( r = 0; r < 3; r++ ) {
01579           for ( c = 0; c < 3; c++ ) {
01580             i = 6*r + 2*c;
01581             j = 6*c + 2*r;
01582             w[i] = v[j];
01583                 w[i+1] = -v[j+1];
01584                         }
01585         }
01586               non_local_count[odd]++;
01587           }
01588         else {
01589             w = uc_l[odd] + MATRIX_SIZE * local_count[odd];
01590           for ( r = 0; r < 3; r++ ) {
01591                         for ( c = 0; c < 3; c++ ) {
01592                   i = 6*r + 2*c;
01593                       j = 6*c + 2*r;
01594                       w[i] = v[j];
01595                   w[i+1] = -v[j+1];
01596                 }
01597           }
01598        local_count[odd]++;
01599       } // end of else
01600 
01601  for (i = 0; i < 4 ; i++) coord[i] = x[i];
01602  CoordNN( n );
01603  coord[n]=coord_nn[n];  // x+mu
01604  CoordNN( n );
01605  coord[n]=coord_nn[n]; // x+2m
01606 
01607  //U_mu(x+mu) ~
01608  nn =  ( n - 1 + 4 ) % 4;          
01609  v = u + SITE_LEN * (LexGauge( coord )) + MATRIX_SIZE * nn;
01610  DaggerM( w_t2, v);  
01611 
01612   off_node= CoordNN( n+4 ); // x+2mu -> x+mu
01613   TransfP( off_node,  nflush_g, w_t2, mtmp,  n);
01614 
01615  //U_mu(x+mu) ~
01616   coord[n]=coord_nn[n];
01617   nn =  ( n - 1 + 4 ) % 4;          
01618  v = u + SITE_LEN * (LexGauge( coord )) + MATRIX_SIZE * nn;
01619  DaggerM( w_t1, v); 
01620 
01621   mDotMEqual( w_t3, w_t1, w_t2  ) ; //on x+mu node
01622 
01623   off_node= CoordNN( n+4 ); // x+mu -> x
01624   coord[n]=coord_nn[n];
01625   TransfP( off_node,  nflush_g, w_t3, mtmp,  n);
01626      //---------------------------------------------------------------
01627     //U_mu(x) ~U_mu(x+mu) ~U_mu(x+2mu) ~   (1 transformation  max)
01628      //---------------------------------------------------------------
01629 
01630 
01631    for (i = 0; i < 4 ; i++) coord[i] = x[i];
01632    if ( CoordkNN (n, 3)) {         // 3rd nn off-node
01633   
01634    for (j=0; j<3; j++){
01635 
01636     if ((coord_knn[n%4]==j)) {
01637         w3 = uc_nl[odd]+MATRIX_SIZE*(non_local_count_3[j][odd]+(j+1)* non_local_chi/2);
01638         non_local_count_3[j][odd]++;
01639        }
01640    }
01641 
01642      }
01643    else {                      // 3rd nn on-node
01644     w3 = uc_l[odd] + MATRIX_SIZE * (local_chi/2 + local_count_3[odd]);
01645     local_count_3[odd]++;
01646    }
01647 
01648 
01649   
01650 
01651    mDotMEqual(w3 ,  w ,  w_t3  ) ;  
01652 
01653   for ( i = 0; i < 18; i++ ) {
01654     w3[i] = c2 * w3[i]; 
01655 
01656   }
01657    // mDotMEqual(w3 ,  w_t3 ,  w  ) ;  //wrong one
01658 
01659 
01660 
01661 
01662  int nu;
01663  IFloat  stp3_0[18],  stp3_2[18],  stp3_4[18], stp3_5[18];
01664  IFloat stp5_0[18], stp5_1[18], stp5_2[18], stp5_3[18] , stp5_4[18], stp5_5[18];
01665  IFloat UnitMatrix[18], stp5_f[18], stp3_f[18] ; 
01666  IFloat stp7_0[18], stp7_f[18], stp7_2[18];
01667  IFloat stp6_f[18];
01668 
01669    for ( i = 0; i < 18; i++ ) {
01670                stp3_5[i] = 0.;
01671                stp5_f[i] = 0.;
01672                stp6_f[i] = 0.;
01673                stp3_f[i] = 0.;
01674                stp7_f[i] = 0.;
01675                UnitMatrix[i] = 0.;
01676            }
01677 
01678 
01679     UnitMatrix[0]=1.;
01680     UnitMatrix[8]=1.;
01681     UnitMatrix[16]=1.;
01682 
01683 for (nu=0; nu < 8 ; nu++){
01684  if  (nu%4 != n){
01685     for (i = 0; i < 4 ; i++) coord[i] = x[i]; 
01686 if (nu<4){
01687       Staple3_PP (UnitMatrix,stp3_4, coord, n,nu, v,u,nflush_g,1 );
01688       Staple5_PP (UnitMatrix,stp5_2, coord, n,nu, nu, v,u,nflush_g );
01689 }
01690 else{ 
01691      Staple3_PN (UnitMatrix,stp3_4, coord, n,nu%4, v,u,nflush_g,1 );
01692      Staple5_PN (UnitMatrix,stp5_2, coord, n,nu,nu%4, v,u,nflush_g );
01693 }
01694 
01695          for ( i = 0; i < 18; i++ ) {
01696                stp3_5[i] += stp3_4[i];
01697                stp6_f[i] += stp5_2[i];
01698            }
01699  }}
01700 
01701 
01702 
01703 
01704 
01705 for (int ro=0; ro < 4 ; ro++){  //positive ro direction
01706 if  (ro%4 != n)
01707    {  
01708  for (nu=0; nu < 8 ; nu++){ // 8 direction
01709 if  ( ( (nu%4)!= n )&&( (nu%4)!=(ro%4)) )
01710    {  
01711      for (i = 0; i < 4 ; i++) coord[i] = x[i]; 
01712 
01713   Staple5_PP (UnitMatrix ,stp5_2 , coord, n, nu, ro, v, u, nflush_g  );
01714 
01715        for ( i = 0; i < 18; i++ ) {
01716          stp5_f[i] += stp5_2[i];}
01717 }
01718 }}}     
01719 
01720  for (int ro=0; ro < 4 ; ro++){ //negative ro direction
01721 if  (ro != n)
01722    {  
01723  for (nu=0; nu < 8 ; nu++){ //8 direction
01724 if  ( ( (nu%4)!= n )&&( (nu%4)!=(ro%4)) )
01725    {  
01726      for (i = 0; i < 4 ; i++) coord[i] = x[i]; 
01727 
01728      Staple5_PN(UnitMatrix ,stp5_2 , coord, n, nu, ro, v, u, nflush_g  );
01729 
01730        for ( i = 0; i < 18; i++ ) {
01731          stp5_f[i] += stp5_2[i];}
01732    }
01733  }
01734    }
01735  }
01736 
01737 
01738 
01739 for (int de=0; de < 4 ;de++){  //positive de direction
01740   if  (de%4 != n ){
01741 
01742 for (int ro=0; ro < 8 ; ro++){  //all ro direction
01743 if  (ro%4 != n && ro%4 != de%4)
01744    {  
01745  for (nu=0; nu < 8 ; nu++){ // 8 direction
01746 if  (nu%4 != n && nu%4 != ro%4 && nu%4 != de%4)
01747    {  
01748      for (i = 0; i < 4 ; i++) coord[i] = x[i]; 
01749 
01750      Staple7_PP (UnitMatrix ,stp7_2 , coord, n, nu, ro,de, v, u, nflush_g  );
01751  
01752        for ( i = 0; i < 18; i++ ) {
01753          stp7_f[i] += stp7_2[i];}
01754 }
01755  }}} }}
01756 
01757 
01758 for (int de=0; de < 4 ;de++){  //negative de direction
01759   if  (de%4 != n){
01760 
01761 for (int ro=0; ro < 8 ; ro++){  //all ro direction
01762 if  (ro%4 != n && ro%4 != de%4)
01763    {  
01764  for (nu=0; nu < 8 ; nu++){ // 8 direction
01765 if  (nu%4 != n && nu%4 != ro%4 && nu%4 != de%4)
01766    {  
01767      for (i = 0; i < 4 ; i++) coord[i] = x[i]; 
01768 
01769      Staple7_PN (UnitMatrix ,stp7_2 , coord, n, nu, ro,de, v, u, nflush_g  );
01770  
01771        for ( i = 0; i < 18; i++ ) {
01772          stp7_f[i] += stp7_2[i];}
01773 }
01774  }}} }}
01775 
01776 
01777 
01778 
01779 
01780 
01781 
01782 
01783 
01784 
01785 
01786  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
01787  // add the fat link term to one link
01788  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
01789  
01790     for ( i = 0; i < 18; i++ ) {
01791 
01792       w[i]*= c1;
01793       w[i] = w[i] + c3 * stp3_5[i]+ c5 * stp5_f[i] + c6
01794 *stp6_f[i] + c7 * stp7_f[i] ;
01795       /*      w[i]+=stp3_5[i];
01796         w[i]+=stp5_f[i];
01797          w[i]+=stp7_f[i];
01798 */
01799            }
01800 
01801 
01802 
01803         }//end of for x[0] loop
01804        }
01805       }
01806     } //end of for x[3] loop
01807   }//end of the n-loop (sum_postive mu _
01808 
01809  
01810 
01811    //-----------------------------------------------------------------
01812     //  Copy links in NEGATIVE directions.  Some are off-node
01813     //  Add an overall minus sign
01814     //-----------------------------------------------------------------
01815     for ( n = 0; n < NUM_DIR/2; n++ ) {
01816  
01817   for (x[3] = 0; x[3] < size[3]; x[3]++){
01818         for (x[2] = 0; x[2] < size[2]; x[2]++){
01819           for (x[1] = 0; x[1] < size[1]; x[1]++){
01820             for (x[0] = 0; x[0] < size[0]; x[0]++){
01821 
01822               for (i = 0; i < 4 ; i++) coord[i] = x[i];
01823               odd = ( coord[0] + coord[1] + coord[2] + coord[3] ) % 2;
01824 
01825               nn = ( n - 1 + 4 ) % 4;           //  nn is physics system order
01826         //---------------------------------------------------------------
01827               //U_mu (x-mu)
01828               //---------------------------------------------------------------
01829         off_node = CoordNN( n + 4 );
01830 
01831               v = u + SITE_LEN * LexGauge( coord_nn ) + MATRIX_SIZE * nn;
01832 
01833         if ( off_node ) {               //  off-node
01834           w = uc_nl[odd] + MATRIX_SIZE * non_local_count[odd];
01835            non_local_count[odd]++;
01836                                 TransfM( off_node,  nflush_g, v, mtmp,  n);
01837                                 v = mtmp;
01838               }
01839         else{
01840                   w = uc_l[odd] + MATRIX_SIZE * local_count[odd];
01841                   local_count[odd]++;
01842               }  //end of "else"
01843 
01844         //U_mu (x-mu) -> x  
01845  for ( i = 0; i < 18; i++ ) {
01846                 w[i] = -v[i];
01847            }
01848 
01849  for (i = 0; i < 4 ; i++) coord[i] = x[i];
01850  CoordNN( n + 4);
01851  coord[n]=coord_nn[n];  // x-mu
01852  CoordNN( n + 4);
01853  coord[n]=coord_nn[n];  // x-2mu
01854  off_node=CoordNN( n + 4); //coord_nn= x-3mu
01855 
01856  //U_mu (x-3mu) to x2mu
01857      v = u + SITE_LEN * LexGauge( coord_nn ) + MATRIX_SIZE * nn;
01858      for ( i = 0; i < 18; i++ ) {
01859                 w_t2[i] = v[i];
01860            }
01861 
01862      TransfM( off_node,  nflush_g, w_t2, mtmp,  n);
01863  //U_mu (x-2mu)
01864      v = u + SITE_LEN * LexGauge( coord ) + MATRIX_SIZE * nn;
01865      for ( i = 0; i < 18; i++ ) {
01866                 w_t1[i] = v[i];
01867            }
01868   // /U_mu (x-2mu)*U_mu (x-3mu) on x-2mu to x
01869     mDotMEqual(w_t3, w_t1, w_t2  ) ; //on x-2mu
01870     off_node= CoordkNN( n,2 ); // x-2mu -> x //U_mu (x-mu) transfered to x too 
01871     TransfM( off_node,  nflush_g, w_t3, mtmp,  n);
01872 
01873    //---------------------------------------------------------------
01874    //U_mu(x-mu)*U_mu(x-2mu)*U_mu(x-3mu)
01875    //---------------------------------------------------------------
01876    for (i = 0; i < 4 ; i++) coord[i] = x[i];
01877    if ( CoordkNN (n + 4, 3)) {      // 3rd nn off-node
01878 
01879    for (j=0; j<3; j++){
01880 
01881     if ((coord_knn[n%4]==(size[n%4]-1-j))) {
01882         w3 = uc_nl[odd]+MATRIX_SIZE*(non_local_count_3[j][odd]+(j+1)* non_local_chi/2);
01883         non_local_count_3[j][odd]++;
01884        }
01885    }
01886    
01887    }
01888    else {                     // 3rd nn om-node
01889     w3 = uc_l[odd] + MATRIX_SIZE * (local_count_3[odd] + local_chi/2);
01890     local_count_3[odd]++;
01891    }
01892 
01893    // mDotMEqual( mtmp,  w_t1,  w_t2) ;
01894    mDotMEqual( w3 ,  w ,  w_t3 ) ;
01895 
01896   for ( i = 0; i < 18; i++ ) {
01897     w3[i] = c2 * w3[i]; 
01898 
01899   }
01900 
01901 
01902  int nu;
01903  IFloat  stp3_0[18],stp3_1[18],  stp3_2[18], stp3_3[18], stp3_4[18], stp3_5[18];
01904 IFloat stp5_0[18], stp5_1[18], stp5_2[18], stp5_3[18] , stp5_4[18], stp5_5[18];
01905 IFloat UnitMatrix[18], stp5_f[18], stp3_f[18], stp7_2[18],stp7_f[18],stp6_f[18] ; 
01906 
01907 
01908 
01909 
01910    for ( i = 0; i < 18; i++ ) {
01911                stp3_5[i] = 0;
01912                stp5_f[i] = 0;
01913                stp3_f[i] = 0;
01914                stp6_f[i] = 0;
01915                stp7_f[i] = 0;
01916                UnitMatrix[i] = 0; 
01917    
01918            }
01919 
01920     UnitMatrix[0]=1;
01921     UnitMatrix[8]=1;
01922     UnitMatrix[16]=1;
01923 
01924 
01925 for (nu=0; nu < 8 ; nu++){
01926  if  (nu%4 != n){
01927     for (i = 0; i < 4 ; i++) coord[i] = x[i]; 
01928 if (nu<4){
01929       Staple3_NP (UnitMatrix,stp3_4, coord, n,nu, v,u,nflush_g,1 );
01930       Staple5_NP (UnitMatrix,stp5_2, coord, n,nu, nu, v,u,nflush_g);
01931 }
01932 else {
01933      Staple3_NN (UnitMatrix,stp3_4, coord, n,nu%4, v,u,nflush_g,1 );
01934      Staple5_NN (UnitMatrix,stp5_2, coord, n,nu,nu%4, v,u,nflush_g);
01935 }
01936 
01937          for ( i = 0; i < 18; i++ ) {
01938                stp3_5[i] += stp3_4[i];
01939                stp6_f[i] += stp5_2[i];
01940            }
01941  }}
01942 
01943 
01944  for (int ro=0; ro < 4 ; ro++){  //positive ro direction
01945 if  (ro%4 != n)
01946    {  
01947  for (nu=0; nu < 8 ; nu++){ // 8 direction
01948 if  (nu%4 != n && nu%4 != ro%4)
01949    {  
01950      for (i = 0; i < 4 ; i++) coord[i] = x[i]; 
01951  Staple5_NP (UnitMatrix ,stp5_2 , coord, n, nu, ro, v, u, nflush_g  );
01952 
01953  //summation
01954        for ( i = 0; i < 18; i++ ) {
01955           stp5_f[i] += stp5_2[i]; }
01956   
01957 
01958    }
01959  }
01960    
01961    }}   
01962                 
01963 
01964 
01965  for (int ro=0; ro < 4 ; ro++){ //nrgative ro direction
01966 if  (ro != n)
01967    {  
01968  for (nu=0; nu < 8 ; nu++){ //8 direction
01969 if  (nu%4 != n && nu%4 != ro%4)
01970    {  
01971      for (i = 0; i < 4 ; i++) coord[i] = x[i]; 
01972 
01973  Staple5_NN(UnitMatrix ,stp5_2 , coord, n, nu, ro, v, u, nflush_g  );
01974 
01975        for ( i = 0; i < 18; i++ ) {
01976          stp5_f[i] += stp5_2[i];}
01977 
01978 
01979    }
01980  }
01981    }
01982  }
01983 
01984 
01985             
01986 
01987 for (int de=0; de < 4 ;de++){  //positive de direction
01988   if  (de%4 != n){
01989 
01990 for (int ro=0; ro < 8 ; ro++){  //all ro direction
01991 if  (ro%4 != n && ro%4 != de%4)
01992    {  
01993       for (nu=0; nu < 8 ; nu++){ // 8 direction
01994         //for (nu=0; nu < 4 ; nu++){ // positive direction
01995 if  (nu%4 != n && nu%4 != ro%4 && nu%4 != de%4)
01996    {  
01997      for (i = 0; i < 4 ; i++) coord[i] = x[i]; 
01998 
01999         Staple7_NP (UnitMatrix ,stp7_2 , coord, n, nu, ro,de, v, u, nflush_g  );
02000 
02001      /*   //U_de(x-mu)~ and transfer it to  x+de+mu
02002      CoordNN( n+4 ); //
02003      coord[n]= coord_nn[n];// x-mu
02004       nn = ( de - 1 + 4 ) % 4;
02005       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
02006       mDotMEqual( stp5_0, v, UnitMatrix  ) ; 
02007 
02008      off_node = CoordNN( de );// x-mu -> x+mu+de
02009      coord[de]= coord_nn[de];// x-mu+de
02010      TransfM( off_node,  nflush_g, stp5_0, mtmp,  de); //transfer stp5_0 to  x-mu+de
02011 
02012      CoordNN( n ); // 
02013      coord[n]= coord_nn[n];//  x-mu+de -> x+de
02014  
02015  
02016  
02017  if( ro<4)
02018     Staple5_NP(stp5_0,stp5_4, coord, n,nu,ro, v, u, nflush_g);
02019 else    
02020     Staple5_NN(stp5_0,stp5_4, coord, n,nu, ro%4, v, u, nflush_g );
02021 
02022 
02023 
02024    //transfer it to  x 
02025      off_node = CoordNN( de +4 );//  x+de ->x
02026      coord[de]= coord_nn[de];//
02027      TransfP( off_node,  nflush_g, stp5_4, mtmp, de);
02028 
02029    
02030       nn = ( de - 1 + 4 ) % 4;
02031       v = u + SITE_LEN *(LexGauge( coord ))+ MATRIX_SIZE * nn;
02032       DaggerM(stp5_1,v);
02033        mDotMEqual(stp7_2, stp5_1,  stp5_4) ;
02034 
02035      */
02036  
02037        for ( i = 0; i < 18; i++ ) {
02038          stp7_f[i] += stp7_2[i];}
02039 }
02040  }}} }}
02041 
02042 
02043  
02044 
02045 for (int de=0; de < 4 ;de++){  //negative de direction
02046   if  (de%4 != n){
02047 
02048 for (int ro=0; ro < 8 ; ro++){  //all ro direction
02049 if  (ro%4 != n && ro%4 != de%4)
02050    {
02051      //for (nu=0; nu < 4 ; nu++){ // positive direction  
02052  for (nu=0; nu < 8 ; nu++){ // 8 direction
02053 if  (nu%4 != n && nu%4 != ro%4 && nu%4 != de%4)
02054    {  
02055      for (i = 0; i < 4 ; i++) coord[i] = x[i]; 
02056 
02057      Staple7_NN (UnitMatrix ,stp7_2 , coord, n, nu, ro,de, v, u, nflush_g  );
02058  
02059        for ( i = 0; i < 18; i++ ) {
02060          stp7_f[i] += stp7_2[i];}
02061 }
02062  }}} }}
02063 
02064 
02065 
02066 
02067 
02068 
02069  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
02070  // add the fat link term to one link
02071  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
02072  
02073     for ( i = 0; i < 18; i++ ) {
02074 
02075       w[i]*= c1;
02076       //w[i] = w[i] + c3 * stp3_5[i]+ c5 * stp5_f[i] + c7 * stp7_f[i] ;
02077       w[i] = w[i] - c3 * stp3_5[i]- c5 * stp5_f[i] - c6*stp6_f[i] - c7 * stp7_f[i] ; //test
02078       /*
02079       w[i]+=stp3_5[i];
02080         w[i]+=stp5_f[i];
02081          w[i]+=stp7_f[i];
02082 */
02083            }
02084 
02085 
02086 
02087 
02088 
02089 
02090 
02091 
02092 
02093 
02094 
02095  
02096 
02097  
02098 
02099 
02100 
02101 
02102 
02103 
02104 
02105 
02106 
02107 
02108 
02109 
02110 
02111      }}} }//end of all the x-for loop
02112    }//end of the for-n-loop //all negative direction
02113 
02114 #endif // SIMUL 
02115 
02116   int fd;
02117   char buf[200];
02118   FILE *fp;
02119 #ifndef SIMUL_U
02120 #if 0
02121   fp=Fopen("uc_l.h","w");
02122   for(j=0;j<2;j++){
02123     Fprintf(fp,"IFloat uc_l%d[] LOCATE(\"edramnormal\") = {\n",j);
02124     for(i=0;i< MATRIX_SIZE * ((local_chi+ local_chi_3)/2);i++){
02125       Fprintf(fp,"%0.4e, ",*(uc_l[j]+i));
02126       if( i%6 == 5){
02127         Fprintf(fp,"\n");
02128       }
02129     }
02130     Fprintf(fp,"\n};\n"); 
02131   }
02132   Fclose(fp);
02133 #endif
02134 #endif
02135 
02136 #ifndef SIMUL_AGG
02137 
02138   gauge_agg *temp = (gauge_agg *)
02139     smalloc(12*vol*sizeof(gauge_agg),"temp",fname,cname);
02140   int *num_ind = (int *)smalloc(6*vol*sizeof(int),"num_ind",fname,cname);
02141   int src;
02142   for(j=0;j<2;j++){
02143     for(i=0;i<vol;i++) num_ind[i]=0;
02144     for(i=0;i< ((local_chi+ local_chi_3)/2);i++){
02145       src = (unsigned long)chi_l[j][2*i];
02146       if (src%(VECT_LEN*sizeof(IFloat))!=0){
02147         ERR.General(cname,fname,"src = %d\n",src);
02148         exit(1);
02149       }
02150       src = src/(VECT_LEN*sizeof(IFloat));
02151       if(src > vol/2) {
02152         ERR.General(cname,fname,"src[%d](%d) > vol/2\n",i,src);
02153         exit(1);
02154       }
02155       temp[src*NUM_DIR*2+num_ind[src]].src = (unsigned long)chi_l[j][2*i];
02156       temp[src*NUM_DIR*2+num_ind[src]].dest= (unsigned long)chi_l[j][2*i+1];
02157       for(k=0;k<MATRIX_SIZE;k++){
02158         temp[src*NUM_DIR*2+num_ind[src]].mat[k] = uc_l[j][i*MATRIX_SIZE+k];
02159       }
02160       num_ind[src]++;
02161     }
02162     int index = 0;
02163     int div = 2;
02164     for( n = 0; n*div<NUM_DIR*2;n++)
02165     for(i=0;i<vol/2;i++){
02166       for(m=0;m<div;m++){
02167         if(num_ind[i] > n*div+m) {
02168            uc_l_agg[j][index].src = temp[i*NUM_DIR*2+n*div+m].src;
02169            uc_l_agg[j][index].dest = temp[i*NUM_DIR*2+n*div+m].dest;
02170            for(k=0;k<18;k++)
02171               uc_l_agg[j][index].mat[k] = temp[i*NUM_DIR*2+n*div+m].mat[k];
02172            index++;
02173         }
02174       }
02175     }
02176   }
02177 
02178 
02179 #if 0
02180   struct gauge_agg *agg_p;
02181   sprintf(buf,"uc_l_agg.h");
02182   fd = open(buf,O_CREAT|O_TRUNC|O_RDWR,00644);
02183   for(j=0;j<2;j++){
02184     sprintf(buf,"struct gauge_agg uc_l_agg%d[] LOCATE(\"edramtransient\") = {\n",j);
02185     write(fd,buf,strlen(buf));
02186     for(i=0;i< ((local_chi+ local_chi_3)/2);i++){
02187       agg_p = &(uc_l_agg[j][i]);
02188       sprintf(buf,"{%d,%d,{\n",agg_p->src,agg_p->dest);
02189       write(fd,buf,strlen(buf));
02190       for(k=0;k<18;k++){
02191         sprintf(buf,"%0.8e",agg_p->mat[k]);
02192         write(fd,buf,strlen(buf));
02193         if(k!=17){
02194         sprintf(buf,", ");
02195         write(fd,buf,strlen(buf));
02196         }
02197         if( k%6 == 5){
02198           sprintf(buf,"\n");
02199           write(fd,buf,strlen(buf));
02200         }
02201       }
02202       sprintf(buf,"}},\n");
02203       write(fd,buf,strlen(buf));
02204     }
02205     sprintf(buf,"\n};\n"); 
02206     write(fd,buf,strlen(buf));
02207   }
02208   close(fd);
02209 #endif
02210 #else
02211 
02212 #if 0
02213   sprintf(buf,"uc_l_agg.h");
02214   fd = open(buf,O_CREAT|O_TRUNC|O_RDWR,00644);
02215   for(j=0;j<2;j++){
02216     sprintf(buf,"struct gauge_agg uc_l_agg%d[] LOCATE(\"edramtransient\") = {\n",j);
02217     write(fd,buf,strlen(buf));
02218     for(i=0;i< ((local_chi+ local_chi_3)/2);i++){
02219       sprintf(buf,"{%d,%d,{\n",*(chi_l[j]+2*i),*(chi_l[j]+2*i+1));
02220       write(fd,buf,strlen(buf));
02221       uc_l_agg[j][i].src = (unsigned long)chi_l[j][2*i];
02222       uc_l_agg[j][i].dest = (unsigned long)chi_l[j][2*i+1];
02223       for(k=0;k<18;k++){
02224         sprintf(buf,"%0.8e",*(uc_l[j]+i*18+k));
02225         write(fd,buf,strlen(buf));
02226         uc_l_agg[j][i].mat[k] = uc_l[j][i*18+k];
02227         if(k!=17){
02228         sprintf(buf,", ");
02229         write(fd,buf,strlen(buf));
02230         }
02231         if( k%6 == 5){
02232           sprintf(buf,"\n");
02233           write(fd,buf,strlen(buf));
02234         }
02235       }
02236       sprintf(buf,"}},\n");
02237       write(fd,buf,strlen(buf));
02238     }
02239     sprintf(buf,"\n};\n"); 
02240     write(fd,buf,strlen(buf));
02241   }
02242   close(fd);
02243 #endif
02244 
02245 #endif //SIMUL_AGG
02246 
02247 #ifndef SIMUL_U
02248 #if 0
02249   fp = Fopen("uc_nl.h","w");
02250   for(j=0;j<2;j++){
02251     Fprintf(fp,"IFloat uc_nl%d[] LOCATE(\"edramnormal\") = {\n",j); 
02252     for(i=0;i< MATRIX_SIZE * ((non_local_chi+non_local_chi_3 )/2);i++){
02253       Fprintf(fp,"%0.4e, ",*(uc_nl[j]+i));
02254       if( i%6 == 5){
02255         Fprintf(fp,"\n");
02256       }
02257     }
02258     Fprintf(fp,"\n};\n"); 
02259   }
02260   Fclose(fp);
02261 #endif
02262 #endif
02263 
02264 #ifndef SIMUL_AGG
02265 
02266 #if 1
02267   int max = 12;
02268   for(j=0;j<2;j++){
02269     for(i=0;i<non_local_chi*3/2;i++) num_ind[i]=0;
02270     for(i=0;i< ((non_local_chi+ non_local_chi_3)/2);i++){
02271       src = (unsigned long)chi_nl[j][2*i];
02272       if (src%(VECT_LEN*sizeof(IFloat))!=0){
02273         ERR.General(cname,fname,"src = %d\n",src);
02274         exit(1);
02275       }
02276       src = src/(VECT_LEN*sizeof(IFloat));
02277       if(src > non_local_chi*3/2) {
02278         ERR.General(cname,fname,"src(%d) > non_local_chi*3\n",src);
02279         exit(1);
02280       }
02281       temp[src*max+num_ind[src]].src = (unsigned long)chi_nl[j][2*i];
02282       temp[src*max+num_ind[src]].dest= (unsigned long)chi_nl[j][2*i+1];
02283       for(k=0;k<18;k++)
02284         temp[src*max+num_ind[src]].mat[k] = uc_nl[j][i*18+k];
02285       num_ind[src]++;
02286       if(num_ind[src]>max){
02287         ERR.General(cname,fname,"num_ind[%d](%d) > %d \n",src, num_ind[src],max);
02288         exit(1);
02289       }
02290     }
02291 
02292     int index = 0;
02293     int div = 12;
02294     for( n=0; n*div<max; n++)
02295     for(i=0;i<non_local_chi*3/2;i++){
02296       for(m=0;m<div;m++){
02297         if(num_ind[i] > n*div+m) {
02298            uc_nl_agg[j][index] = temp[i*max+n*div+m];
02299            index++;
02300         }
02301       }
02302     }
02303 
02304   }
02305   sfree(temp,"temp",fname,cname);
02306   sfree(num_ind,"num_ind",fname,cname);
02307 
02308 #if 0
02309   sprintf(buf,"uc_nl_agg.h");
02310   fd = open(buf,O_CREAT|O_TRUNC|O_RDWR,00644);
02311   for(j=0;j<2;j++){
02312     sprintf(buf,"struct gauge_agg uc_nl_agg%d[] LOCATE(\"edramtransient\") = {\n",j);
02313     write(fd,buf,strlen(buf));
02314     for(i=0;i< ((non_local_chi+ non_local_chi_3)/2);i++){
02315       agg_p = &(uc_nl_agg[j][i]);
02316       sprintf(buf,"{%d,%d,{\n",agg_p->src,agg_p->dest);
02317       write(fd,buf,strlen(buf));
02318       for(k=0;k<18;k++){
02319         sprintf(buf,"%0.8e",agg_p->mat[k]);
02320         write(fd,buf,strlen(buf));
02321         if(k!=17){
02322         sprintf(buf,", ");
02323         write(fd,buf,strlen(buf));
02324         }
02325         if( k%6 == 5){
02326           sprintf(buf,"\n");
02327           write(fd,buf,strlen(buf));
02328         }
02329       }
02330       sprintf(buf,"}},\n");
02331       write(fd,buf,strlen(buf));
02332     }
02333     sprintf(buf,"\n};\n"); 
02334     write(fd,buf,strlen(buf));
02335   }
02336   close(fd);
02337 #endif
02338 #else
02339 #if 0
02340   sprintf(buf,"uc_nl_agg.h");
02341   fd = open(buf,O_CREAT|O_TRUNC|O_RDWR,00644);
02342   for(j=0;j<2;j++){
02343     sprintf(buf,"struct gauge_agg uc_nl_agg%d[] LOCATE(\"edramtransient\") = {\n",j);
02344     write(fd,buf,strlen(buf));
02345     for(i=0;i< ((local_chi+ local_chi_3)/2);i++){
02346       sprintf(buf,"{%d,%d,{\n",*(chi_nl[j]+2*i),*(chi_nl[j]+2*i+1));
02347       uc_nl_agg[j][i].src = (unsigned long)chi_nl[j][2*i];
02348       uc_nl_agg[j][i].dest = (unsigned long)chi_nl[j][2*i+1];
02349       write(fd,buf,strlen(buf));
02350       for(k=0;k<18;k++){
02351         sprintf(buf,"%0.8e",*(uc_nl[j]+i*18+k));
02352         write(fd,buf,strlen(buf));
02353         uc_nl_agg[j][i].mat[k] = uc_nl[j][i*18+k];
02354         if(k!=17){
02355         sprintf(buf,", ");
02356         write(fd,buf,strlen(buf));
02357         }
02358         if( k%6 == 5){
02359           sprintf(buf,"\n");
02360           write(fd,buf,strlen(buf));
02361         }
02362       }
02363       sprintf(buf,"}},\n");
02364       write(fd,buf,strlen(buf));
02365     }
02366     sprintf(buf,"\n};\n"); 
02367     write(fd,buf,strlen(buf));
02368   }
02369   close(fd);
02370 #endif
02371 #endif
02372 
02373 #endif // SIMUL_AGG
02374 
02375   VRB.FuncEnd(cname,fname);
02376 }
02377 
02378 extern "C"
02379 void asqtad_destroy_dirac_buf_g(void)
02380 {
02381   int i;
02382   char *cname = "";
02383   char *fname = "asqtad_destroy_dirac_buf_g()";
02384   VRB.Func(cname,fname);
02385   for ( i = 0; i < 2; i++){
02386 #ifndef SIMUL_U
02387     sfree(uc_l[i],"uc_l[i]",fname,cname);
02388     sfree(uc_nl[i],"uc_nl[i]",fname,cname);
02389 #endif
02390     sfree(uc_l_agg[i],"uc_l_agg[i]",fname,cname);
02391     sfree(uc_nl_agg[i],"uc_nl_agg[i]",fname,cname);
02392   }
02393   VRB.FuncEnd("",fname);
02394 }
02395 //---------------------------------------------------------------------
02396 //  Find nearest neighbor coordinate for coordinates given.  Nearest
02397 //  neighbor coordinates are placed in coord_nn, which are always
02398 //  on-node.  Function returns 0 if nearest neighbor is on-node,
02399 //  1 if off-node.
02400 //
02401 //  nn = 0 to 7.  tp, xp, yp, zp, tm, xm, ym, zm respectively.
02402 //---------------------------------------------------------------------
02403 
02404 static int CoordNN( int nn )
02405 {
02406   int i;
02407   int off_node = 0;
02408   int m;
02409 
02410   for ( i = 0; i < 4; i++ )
02411     coord_nn[i] = coord[i];
02412 
02413   m = nn % 4;
02414   for ( i = 0; i < 4; i++ ){
02415         coord_nn [ i ] = coord[i];
02416    }//end of i-loop
02417 
02418      if (nn<4 ) coord_nn[m]+=1;
02419      else coord_nn[m]-=1;
02420      while (coord_nn[m] < 0) {
02421         coord_nn[m] += size[m] ;
02422          }
02423        coord_nn[m] %= size[m] ;
02424     if (nn<4){ if (coord_nn[m] != (coord[m]+1)) off_node = 1;}
02425        else {if (coord_nn[m] != (coord[m]-1)) off_node = 1;}
02426   return off_node;
02427 }
02428 
02429 static int CoordkNN( int nn, int k )
02430 {
02431   int i;
02432   int off_node = 0;
02433   int m;
02434 
02435   for ( i = 0; i < 4; i++ )
02436     coord_knn[i] = coord[i];
02437 
02438   m = nn % 4;
02439   for ( i = 0; i < 4; i++ ){
02440         coord_knn [ i ] = coord[i];
02441    }//end of i-loop
02442 
02443      if (nn<4 ) coord_knn[m]+=k;
02444      else coord_knn[m]-=k;
02445      while (coord_knn[m] < 0) {
02446         coord_knn[m] += size[m] ;
02447          }
02448        coord_knn[m] %= size[m] ;
02449  
02450   if (nn<4 ){ if (coord_knn[m] != (coord[m]+k)) off_node = 1;}
02451        else {if (coord_knn[m] != (coord[m]-k)) off_node = 1;}
02452   return off_node;
02453 }
02454 
02455 /*static int Coord3NN( int nn )
02456 {
02457   int i;
02458   int off_node = 0;
02459   int m;
02460 
02461   m = nn % 4;
02462   for ( i = 0; i < 4; i++ ){
02463         coord_3nn [ i ] = coord[i];
02464    }//end of i-loop
02465 
02466    if (nn<4 ) coord_3nn[m]+=3;
02467      else coord_3nn[m]-=3;
02468 
02469      while (coord_3nn[m] < 0) {
02470          coord_3nn[m] += size[m] ;
02471          }
02472        coord_3nn[m] %= size[m] ;
02473 
02474     if (coord_3nn[m] != (coord[m]+3) ||coord_3nn[m] != (coord[m]-3) ) off_node = 1;
02475  return off_node;
02476 }
02477 */
02478 //---------------------------------------------------------------------
02479 //  Return lexical value for links from coordinates c
02480 //---------------------------------------------------------------------
02481 
02482 static int LexGauge( int * c )
02483 {
02484   return c[1] + size[1] * ( c[2] + size[2] * ( c[3] + size[3] * c[0] ));
02485 }
02486 
02487 //---------------------------------------------------------------------
02488 //  Return lexical value for vectors from coordinates c
02489 //---------------------------------------------------------------------
02490 
02491 static int LexVector( int * c )
02492 {
02493   return c[0] + size[0] * ( c[1] + size[1] * ( c[2] + size[2] * c[3] ));
02494 }
02495 
02496 //-------------------------------------------------------------------
02497 //  Given a coordinate and a surface ( 0 = t, 1 = x, 2 = y, 3 = x )
02498 //  calculate the offset into the receive buffers (s?) .
02499 //-------------------------------------------------------------------
02500 
02501 static int LexSurface( int * cc, int surface )
02502 {
02503   int i;
02504   int s[4];
02505   int c[4];
02506 
02507   for ( i = 0; i < 4; i++ ) {
02508     s[i] = size[i];
02509     c[i] = cc[i];
02510   }
02511 
02512   s[ surface ] = 1;
02513   c[ surface ] = 0;
02514 
02515   return c[0] + s[0] * ( c[1] + s[1] * ( c[2] + s[2] * c[3] ));
02516 }
02517 
02518 extern "C" void dirac_comm_assert()
02519 {
02520 }
02521 
02522 //-------------------------------------------------------------------
02523 //  add_flag = 0 :       b = D a
02524 //  add_flag = 1 :       b += D a
02525 //
02526 //  a_odd    = 1 :       b even;  a odd
02527 //  a_odd    = 0 :       b odd ;  b even
02528 //
02529 //   D a = \sum_u( U^dag_u(x) a(x+u) - U(x-u) a(x-u) )
02530 //-------------------------------------------------------------------
02531 
02532 
02533 extern "C"
02534 void asqtad_dirac(IFloat* b, IFloat* a, int a_odd, int add_flag)
02535 {
02536 //  int i,j,k,c = (unsigned long) chi_off_node_total;
02537   int i,j,k;
02538   long c = (long) a;
02539   int odd = 1 - a_odd;
02540   //-----------------------------------------------------------------
02541   //  Transfer chi's on faces.  
02542   //-----------------------------------------------------------------
02543 
02544 #if 0
02545   copy_buffer_cpp(countM[0][a_odd], (long)a, (long)Tbuffer[0][0], (long)ToffsetM[0][a_odd]);
02546   copy_buffer_cpp(countP[0][a_odd], (long)a, (long)Tbuffer[0][1], (long)ToffsetP[0][a_odd]);
02547   if(size[0]>2) {
02548   copy_buffer_cpp(countM[1][a_odd], (long)a, (long)Tbuffer[1][0], (long)ToffsetM[1][a_odd]);
02549   copy_buffer_cpp(countP[1][a_odd], (long)a, (long)Tbuffer[1][1], (long)ToffsetP[1][a_odd]);
02550   if(size[0]>4) {
02551   copy_buffer_cpp(countM[2][a_odd], (long)a, (long)Tbuffer[2][0], (long)ToffsetM[2][a_odd]);
02552   copy_buffer_cpp(countP[2][a_odd], (long)a, (long)Tbuffer[2][1], (long)ToffsetP[2][a_odd]);
02553   }
02554   }
02555 #endif
02556 
02557   //make sure spinor field is in main memory before starting transfers
02558 
02559 #if 0
02560   for(i=1;i<4;i++)
02561   if(size[i] >2 ){
02562     SCUarg[i]->Addr( a + Xoffset[2][i]);
02563     SCUarg[i+4]->Addr( a + Xoffset[2][i+4]);
02564   }
02565 
02566   save_reg((long)intreg, (long)dreg);
02567  //*************
02568 
02569   void *addr[2];
02570   if(split) {
02571     for(i=1;i<4;i++){
02572       addr[0] = (void *)(a+Xoffset[0][i]);
02573       addr[1] = (void *)(a+Xoffset[1][i]);
02574       SCUarg_1[i]->Addr( addr,1);
02575       SCUarg_2[i]->Addr( addr+1,1);
02576       addr[0] = (void *)(a+Xoffset[0][i+4]);
02577       addr[1] = (void *)(a+Xoffset[1][i+4]);
02578       SCUarg_1[i+4]->Addr( addr,1);
02579       SCUarg_2[i+4]->Addr( addr+1,1);
02580     }
02581   } else {
02582     for(i=1;i<4;i++){
02583       addr[0] = (void *)(a+Xoffset[0][i]);
02584       addr[1] = (void *)(a+Xoffset[1][i]);
02585       SCUarg_1[i]->Addr( addr,2);
02586       addr[0] = (void *)(a+Xoffset[0][i+4]);
02587       addr[1] = (void *)(a+Xoffset[1][i+4]);
02588       SCUarg_1[i+4]->Addr( addr,2);
02589     }
02590   }
02591 
02592   SCUmulti_1->StartTrans();
02593 
02594   save_reg((long)intreg, (long)dreg);
02595 #endif
02596 
02597   //-----------------------------------------------------------------
02598   //do first local computations
02599   //-----------------------------------------------------------------
02600   IFloat *fp0,*fp1, *uu;
02601   int *ch;
02602 
02603 
02604   dirac_cmv_jcw_agg_cpp( (local_chi + local_chi_3)/2, (long)0, (long)uc_l_agg[odd],
02605                (long)a, (long)tmpfrm);
02606 
02607 
02608   //-----------------------------------------------------------------
02609   // check to see if transfers are done and start another transfer
02610   //-----------------------------------------------------------------
02611 
02612 
02613 
02614   //-----------------------------------------------------------------
02615   //do the computations involving "chi" non-local spinors
02616   //-----------------------------------------------------------------
02617 
02618   dirac_cmv_jcw_agg_cpp( non_local_chi_3/2, (long)0, (long)&(uc_nl_agg[odd][0]), (long)c, (long)tmpfrm);
02619 
02620  
02621 
02622   //-----------------------------------------------------------------
02623   //do the computations involving chi3 non-local spinors
02624   //----------------------------------------------------------------
02625 
02626   dirac_cmv_jcw_agg_cpp( non_local_chi/2, (long)0, (long)&(uc_nl_agg[odd][non_local_chi_3/2]), (long)c, (long)tmpfrm);
02627 
02628   //printf ("the computations involving chi3 non-local spinors done \n");
02629   // check to see if transfers are done
02630   //-----------------------------------------------------------------
02631   //do the sum of 16 temporary vectors at each lattice site
02632   //              ^^^ change must be made in  dirac_sum**
02633   //-----------------------------------------------------------------
02634 
02635   if ( add_flag == 0){
02636     dirac_sum2_cpp( vol/2, (long)0, (long)tmpfrm, (long)b);
02637   }
02638   else{
02639 
02640     dirac_sum2_acc_cpp( vol/2, (long)0, (long)tmpfrm, (long)b);
02641 
02642   }
02643 
02644 }
02645 
02646 CPS_END_NAMESPACE
02647 
02648 
02649 
02650 

Generated on Sat Oct 10 14:11:24 2009 for Columbia Physics System by  doxygen 1.3.9.1