/****************************************************************************
 *
 * DFT++:  density functional package developed by
 *         the research group of Prof. Tomas Arias, MIT.
 *
 * Principal author: Sohrab Ismail-Beigi
 *
 * Modifications for MPI version: Kenneth P Esler,
 *                                Sohrab Ismail-Beigi, and
 *                                Tairan Wang.
 *
 * Modifications for LSD version: Jason A Cline
 *
 * Modifications for lattice/Pulay forces: Gabor Csanyi and
 *                                         Sohrab Ismail-Beigi
 *
 * Copyright (C) 1996-1998 The Massachusetts Institute of Technology (MIT).
 *
 ****************************************************************************/

/*
 *            Tairan Wang                        November 18, 1997
 * 
 * column_bundle.C -- implement the column_bundle class 
 *
 * The majority of the changes moving from Threaded code to MPI code 
 * is in the implementation of this class.
 *
 * The total number of bands is distributed across all the processors,
 * while each single band is contained entirely in one processor.
 *
 */

/* $Id: column_bundle.c,v 1.1.1.1 1999/11/10 01:30:17 tairan Exp $ */

#include <stdio.h>
#include <math.h>

#include "header.h"
#include "parallel.h"   // include MPI related info.

#include <sys/time.h>

//
// The init() function is the main initialization routine
// that allocates space and does other setup for the column_bundle
// class.  All constructors actually call this init() function.
//
// type must be either 0 (local) or 1 (distributed)
//
// On a serial or thread code, the two are equivalent.
// On an MPI code, local means the entire column_bundle is allocated
//         on the running node, whereas distributed means that the
//         column_bundle is distributed (column-wise) across the MPI nodes.
//
void
column_bundle::init(int nc,int len,Basis *b,int type)
{
  // Set up a column_bundle of nc columns with len scalars each and
  // allocate its storage (contents are left uninitialized).
  //   nc   : total number of columns (summed over all processes)
  //   len  : length of every column
  //   b    : Basis pointer stored in the object (may be NULL)
  //   type : 0 = local, 1 = distributed; anything else is fatal
  if ( type !=0 && type !=1 )
    die("type in init() is neither local nor distributed!!\n\n");

  // The object is local unless this is an MPI build AND a distributed
  // object was requested.
  distributed = 0;
#ifdef DFT_MPI
  if (type == 1) distributed = 1;
#endif

  // Record the sizes and Basis pointer, and clear everything else.
  tot_ncols = nc;
  col_length = len;
  col = NULL;
  Vscloc = NULL;
  k.v[0] = k.v[1] = k.v[2] = 0.0;
  basis = b;

  // Give the ownership bookkeeping well-defined values *before* any
  // early return: freemem() and the element-wise loops read my_ncols,
  // so it must never be left as garbage for an empty bundle.
  my_ncols = 0;
  start_ncol = 0;
  end_ncol = -1;   // same convention as start_ncol + my_ncols - 1

  // A zero-column object (typical null-constructor call) needs no
  // distribution and no memory: we're done.
  if (nc == 0)
    return;

  // Columns of zero length make no sense... bail out.
  if (len == 0)
    die("nc!=0(%d) and len==0 in column_bundle::init()!!\n\n",nc);

  // Work out which columns this process owns.
  if (distributed)
    distribute_cols();
  else
    {
      // local object: the process owns every column
      my_ncols = tot_ncols;
      start_ncol = 0;
      end_ncol = tot_ncols-1;
    }

  // Allocate the array of column headers...
  col = (scalar_column *)mymalloc(sizeof(scalar_column) * my_ncols,
                                  "col","column_bundle::init()");

  // ...and ONE big chunk holding all the data.  Each column header
  // points at its own col_length-sized slice of that chunk, so
  // col[0].c is the start of the whole allocation.
  scalar *p = (scalar *)mymalloc(sizeof(scalar) * col_length * my_ncols,
                                 "col[0]",
                                 "column_bundle::init()");
  for (int i=0; i < my_ncols; i++, p += col_length)
    {
      col[i].n = col_length;
      col[i].c = p;
    }
}
 
//
// This function is responsible for freeing up all the memory
// that a column_bundle has allocated.  The destructor should
// call this.
//
void
column_bundle::freemem(void)
{
  // Release the storage allocated by init().
  //
  // If col is NULL nothing is allocated (empty bundle, or freemem() was
  // already called), so there is nothing to do.  Checking col first also
  // keeps us from dereferencing col[0] through a NULL pointer.
  if (col == NULL)
    return;

  // All column data lives in a single allocation whose start is
  // col[0].c; it only exists if we own at least one column.
  if (my_ncols>0)
    myfree(col[0].c);

  // Now the array of column headers itself.
  myfree(col);
  col = NULL;
}

/////////////////////////////////////////////
// Generate my_ncols, start_ncol, end_ncol //
// from tot_ncols for a distributed        //
// column_bundle case                      //
// This must also work for the single node //
// case where it just distributed all      //
// columns to a single node (i.e. serial   //
// case).                                  //
// The distribution is as follows:         //
//    d = tot_ncols/N_Procs                //
//    r = tot_ncols%N_Procs                //
// the first r processes get d+1 columns   //
// and the rest get d columns              //
// and the order of distribution is just   //
// linear in the columns, e.g. for r >= 2  //
// columns 0..d go to process 0, columns   //
// d+1..2*d+1 go to process 1, etc.        //
/////////////////////////////////////////////
void column_bundle::distribute_cols()
{
  // Split tot_ncols into contiguous runs, one per process: the first
  // `extra` processes own base+1 columns each, the others own base.
  // Sets my_ncols, start_ncol and end_ncol for the calling process.
  const int me    = System::Get_procID();
  const int base  = tot_ncols / N_Procs;
  const int extra = tot_ncols % N_Procs;

  if (me < extra)
    {
      // one of the "big" processes
      my_ncols   = base + 1;
      start_ncol = me * (base + 1);
    }
  else
    {
      // skip the columns of all the big processes, then the
      // small ones that come before us
      my_ncols   = base;
      start_ncol = extra * (base + 1) + (me - extra) * base;
    }

  // index of the last owned column (inclusive)
  end_ncol = start_ncol + my_ncols - 1;
}

/////////////////////////////////////////////
// Get Processor ID from column number     //
// Namely, given a column_number, find the //
// processor number of its owner based on  //
// above distribution                      //
/////////////////////////////////////////////
int
column_bundle::ncol_to_ID(int col_index) const
{
  // Return the ID of the process that owns global column col_index
  // under the layout produced by distribute_cols(): the first r
  // processes hold d+1 columns each, the rest hold d columns each.
  // Dies if col_index is not a valid global column index.
  int d,r,id;

  // Reject bad indices up front.  Besides catching caller errors, this
  // guarantees d > 0 whenever the second branch below is taken (if
  // d == 0 then r == tot_ncols, so every valid index is < r*(d+1)),
  // which rules out a division by zero.
  if (col_index < 0 || col_index >= tot_ncols)
    die("In column_bundle::ncol_to_ID, col_index %d is outside [0,%d)\n",
        col_index,tot_ncols);

  d = tot_ncols / N_Procs;
  r = tot_ncols % N_Procs;

  // Column lies in the region owned by the first r processes (d+1 each)
  if (col_index < r*(d+1))
    id = col_index / (d+1);
  // otherwise it lies in the region of the processes with d columns;
  // process p >= r starts at column r + d*p.
  else
    id = (col_index - r) / d;

  // Should be impossible after the range check; kept as a safety net.
  if (id >= N_Procs)
    die("In column_bundle::ncol_to_ID, id >= N_Procs");

  return id;
}

///////////////////////////////////////////////////////////////////////////
// This function copies all the non-data and non-size members of one
// column_bundle into another.  This should be modified as new members are
// added or subtracted from column_bundle.  The copy constructor and the
// assignment operators should call this.  Does Y1 -> Y2.
///////////////////////////////////////////////////////////////////////////
void copy_innards_column_bundle(const column_bundle *Y1,column_bundle *Y2)
{
  // Copy the bookkeeping members of *Y1 into *Y2: the k-vector, the
  // Basis and Vscloc pointers (shallow copies) and the distributed
  // flag.  Sizes and column data are NOT touched.
  for (int axis = 0; axis < 3; axis++)
    Y2->k.v[axis] = Y1->k.v[axis];

  Y2->basis       = Y1->basis;
  Y2->Vscloc      = Y1->Vscloc;
  Y2->distributed = Y1->distributed;
}

////////////////////////////////////////////////
// Various constructors:
// these all assume type == 1 in init() above,
// i.e. a distributed column_bundle.
////////////////////////////////////////////////

// Null constructor for distributed object
column_bundle::column_bundle()
{
  // Empty bundle: no columns, no Basis, requested as distributed.
  const int want_distributed = 1;
  init(0, 0, NULL, want_distributed);
}

// Constructor given sizes of the column_bundle for distrib. case
column_bundle::column_bundle(int nc,int len)
{
  // nc columns of length len, no Basis, requested as distributed.
  const int want_distributed = 1;
  init(nc, len, NULL, want_distributed);
}

// Constructor given sizes of the column_bundle and Basis pointer for
// distrib. case
column_bundle::column_bundle(int nc,int len,Basis *b)
{
  // nc columns of length len attached to Basis b, requested as
  // distributed.
  const int want_distributed = 1;
  init(nc, len, b, want_distributed);
}

//////////////////////////////////////////////
// The following constructors take a string //
// argument (that must be "local")          //
// indicating that the column_bundle to     //
// be constructed should be local and not   //
// distributed.                             //
//////////////////////////////////////////////

// Null constructor for local case
column_bundle::column_bundle(const char* str)
{
  // Empty *local* bundle; str must be exactly "local".
  if (strcmp(str,"local") == 0)
    {
      init(0, 0, NULL, 0);
      return;
    }
  die("str!=local in column_bundle constructor(char *)\n\n");
}

// Constructor given sizes of the local column_bundle
column_bundle::column_bundle(int nc,int len,const char* str)
{
  // Local bundle of nc columns of length len, without a Basis;
  // str must be exactly "local".
  if (strcmp(str,"local") == 0)
    {
      init(nc, len, NULL, 0);
      return;
    }
  die("str!=local in column_bundle constructor(int,int,char *)\n\n");
}

// Constructor given sizes of the column_bundle and Basis pointer for
// local column_bundle
column_bundle::column_bundle(int nc,int len,Basis *b,const char* str)
{
  // Local bundle of nc columns of length len attached to Basis b;
  // str must be exactly "local".
  // The diagnostic names THIS overload's signature so that it can be
  // told apart from the (int,int,char *) constructor.
  if (strcmp(str,"local")!=0)
    die("str!=local in column_bundle constructor(int,int,Basis *,char *)\n\n");
  else
    init(nc,len,b,0);
}


////////////////////////////////////////////////
// Copy constructor for general case          //
// distributed or local                       //
////////////////////////////////////////////////
column_bundle::column_bundle(const column_bundle &Y)
{
  // Allocate storage with the same sizes and local/distributed type
  // as Y...
  init(Y.tot_ncols, Y.col_length, Y.basis, Y.distributed);

  // ...take over its bookkeeping members...
  copy_innards_column_bundle(&Y, this);

  // ...and duplicate the column data, one owned column at a time.
  for (int c = 0; c < my_ncols; c++)
    {
      scalar *src = Y.col[c].c;
      scalar *dst = col[c].c;
      for (int e = 0; e < col_length; e++)
        dst[e] = src[e];
    }
}

/////////////////////////////////////////////
// Destructor: just frees the memory used  //
/////////////////////////////////////////////
column_bundle::~column_bundle()
{
  // All cleanup lives in freemem().
  this->freemem();
}

//////////////////////////////////////
//                                  //
// Operator functions               //
//                                  //
//////////////////////////////////////

/* Assignment:  nonstandard in that it returns void.  To make it standard,
 * replace void -> column_bundle and uncomment the return *this; */
void column_bundle::operator=(const column_bundle &Y)
{
  // Copy Y's data and bookkeeping members into this bundle.  No
  // reallocation is done, so all three sizes have to agree already.
  if (tot_ncols != Y.tot_ncols)
    die("In column_bundle::operator=, tot_ncols != Y.tot_ncols\n");
  if (my_ncols != Y.my_ncols)
    die("In column_bundle::operator=, my_ncols != Y.my_ncols\n");
  if (col_length != Y.col_length)
    die("In column_bundle::operator=, col_length != Y.col_length\n");

  // column data first...
  for (int c = 0; c < my_ncols; c++)
    {
      scalar *src = Y.col[c].c;
      scalar *dst = col[c].c;
      for (int e = 0; e < col_length; e++)
        dst[e] = src[e];
    }

  // ...then everything that is neither data nor size.
  copy_innards_column_bundle(&Y, this);
  /* return *this; */
}

// Assignment of scalar:  all entries set to s
void
column_bundle::operator=(scalar s)
{
  // Set every locally owned entry to s.
  //
  // Deliberately NOT marked `inline`: the definition lives only in this
  // source file, and an inline function has to be defined in every
  // translation unit that uses it, so other files calling this operator
  // could fail to link.  (`register` was dropped too; it is no longer a
  // valid storage class in C++17.)
  for (int i=0; i < my_ncols; i++)
    for (int j=0; j < col_length; j++)
      col[i].c[j] = s;
}

/* Add two column_bundles */
column_bundle
operator+(const column_bundle &Y1,const column_bundle &Y2)
{
  // Return the element-wise sum Y1 + Y2 as a new bundle that inherits
  // Y1's type and bookkeeping members (it starts as a copy of Y1).
  //
  // All three sizes are checked, exactly as operator+= does:
  // my_ncols/col_length protect the loops below, and tot_ncols catches
  // bundles that only happen to have the same local share.
  if (Y1.tot_ncols != Y2.tot_ncols ||
      Y1.my_ncols != Y2.my_ncols ||
      Y1.col_length != Y2.col_length)
    die("Size mismatch in operator+ on column_bundles\n");

  column_bundle Ysum(Y1);

  for (int i=0; i < Y1.my_ncols; i++)
    for (int j=0; j < Y1.col_length; j++)
      Ysum.col[i].c[j] += Y2.col[i].c[j];
  return Ysum;
}


/* Accumulate sum of column_bundles */
void column_bundle::operator+=(const column_bundle &Y)
{
  // In-place element-wise accumulation: this += Y.
  // Every size has to match before any data is touched.
  if (tot_ncols != Y.tot_ncols)
    die("size mismatch of tot_ncols in column_bundle::operator+=\n\n");
  if (my_ncols != Y.my_ncols)
    die("size mismatch of my_ncols in column_bundle::operator+=\n\n");
  if (col_length != Y.col_length)
    die("size mismatch of col_length in column_bundle::operator+=\n\n");

  for (int c = 0; c < my_ncols; c++)
    {
      scalar *src = Y.col[c].c;
      scalar *dst = col[c].c;
      for (int e = 0; e < col_length; e++)
        dst[e] += src[e];
    }
}


/* Subtract two column_bundles */
column_bundle
operator-(const column_bundle &Y1,const column_bundle &Y2)
{
  // Return the element-wise difference Y1 - Y2 as a new bundle that
  // inherits Y1's type and bookkeeping members (it starts as a copy
  // of Y1).
  //
  // my_ncols has to be checked as well as tot_ncols: a local and a
  // distributed bundle can agree on tot_ncols while owning a different
  // number of columns on this process, and the loop below would then
  // read past the end of Y2's storage.
  if (Y1.tot_ncols != Y2.tot_ncols ||
      Y1.my_ncols != Y2.my_ncols ||
      Y1.col_length != Y2.col_length)
    die("Size mismatch in operator- on column_bundles\n");

  column_bundle Ydiff(Y1);

  for (int i=0; i < Y1.my_ncols; i++)
    for (int j=0; j < Y1.col_length; j++)
      Ydiff.col[i].c[j] -= Y2.col[i].c[j];
  return Ydiff;
}

/* Accumulate difference of column_bundles */
void
column_bundle::operator-=(const column_bundle &Y)
{
  // In-place element-wise subtraction: this -= Y.
  //
  // my_ncols is checked in addition to tot_ncols/col_length (as in
  // operator+=): bundles of different local/distributed type can share
  // tot_ncols yet own a different number of columns here, which would
  // make the loop below run off the end of Y's storage.
  if (tot_ncols != Y.tot_ncols ||
      my_ncols != Y.my_ncols ||
      col_length != Y.col_length)
    die("Size mismatch in operator-= on column_bundles\n");

  for (int i=0; i < my_ncols; i++)
    for (int j=0; j < col_length; j++)
      col[i].c[j] -= Y.col[i].c[j];
}

/* Scale a column_bundle Y by scalar s */
column_bundle operator*(scalar s,const column_bundle &Y)
{
  // s*Y: start from a copy of Y and scale that copy in place.
  column_bundle scaled(Y);
  scaled *= s;
  return scaled;
}

/* Scale a column_bundle Y by scalar s */
column_bundle operator*(const column_bundle &Y,scalar s)
{
  // Y*s: start from a copy of Y and scale that copy in place.
  column_bundle scaled(Y);
  scaled *= s;
  return scaled;
}

/* Scale a column_bundle Y scalar s in place */
void column_bundle::operator*=(scalar s)
{
  // Multiply every locally owned entry by s.
  for (int c = 0; c < my_ncols; c++)
    {
      scalar *entries = col[c].c;
      for (int e = 0; e < col_length; e++)
        entries[e] *= s;
    }
}


/////////////////////////////////////////////
//                                         //
//  MEMBER FUNCTIONS:                      //
//                                         //
/////////////////////////////////////////////
// zeroes out all the columns
void
column_bundle::zero_out(void)
{
  int i,j;

  for(i=0;i < my_ncols; i++)
    for(j=0; j < col_length; j++)
      col[i].c[j] = (scalar)0.0;
}

/* Negates all the entries in the column_bundle */
void
column_bundle::negate(void)
{
  register int i,j;

  for(i=0;i < my_ncols; i++)
    for(j=0; j < col_length; j++)
      {
#if defined SCALAR_IS_COMPLEX
	col[i].c[j].x = -col[i].c[j].x;
	col[i].c[j].y = -col[i].c[j].y;
#elif defined SCALAR_IS_REAL
	col[i].c[j] = -col[i].c[j];
#else
#error scalar is neither real nor complex!
#endif
      }
}


/* Fill the column_bundle with random gaussian distributed numbers
 * with zero mean and std. dev. = 1/(1+((0.5*G^2)/0.75)^6), where 0.5*G^2
 * is the kinetic energy of the basis function in question.
 * The reason for this strange formula is that we want high-energy
 * plane-waves to have lower initial weight than low-energy one.  The
 * cutoff of 0.75 is because following the Jones-scheme business for
 * the 2-atom Si cell, we want to keep the (0 0 0), (1 1 1), and (2 0 0)
 * vectors (units are 2*pi/a).  The energy of the (2 0 0) in Hartrees
 * and for a=5.43 Angstroms is 0.7499 (miracle!?!?), and the next
 * highest planewave is (2 2 0) with energy 1.4998.  So I just chose
 * some function which turns over at 0.75 relatively sharply. */
void
column_bundle::randomize(void)
{
  register int i,j;
  register real std,KE,t;
  vector3 kplusG;

  if (basis->nbasis != col_length)
    die("\ncolumn_bundle::randomize() nbasis != col_length!!!\n\n");

  for(j=0; j < basis->nbasis; j++)
    {
      kplusG.v[0] = (real)basis->Gx[j] + k.v[0];
      kplusG.v[1] = (real)basis->Gy[j] + k.v[1];
      kplusG.v[2] = (real)basis->Gz[j] + k.v[2];
      KE = 0.5*(kplusG*(basis->GGT*kplusG));
      t = KE/0.75;
      std = 1.0/(1.0+t*t*t*t*t*t);
      for(i=0;i < my_ncols; i++) 
	{
#if defined SCALAR_IS_COMPLEX
	  col[i].c[j].x = gauss(std);
	  col[i].c[j].y = gauss(std);
#elif defined SCALAR_IS_REAL
	  col[i].c[j] = gauss(std);
#else
#error scalar is neither real nor complex!
#endif
	}
    }
}

/* Fill column_bundle_array with random numbers */
void
randomize_column_bundle_array(int nbundles,column_bundle *Y)
{
  int i;

  for (i=0; i < nbundles; i++)
    Y[i].randomize();
}

// Write column_bundle in binary to file fname
//////////////////////////////////////////////////
// Processor identified by MPI_IO has regular   //
// IO functionalities. See MPI_Get_Attr for     //
// further details. For now, assume processor 0 //
// is this processor.                           //
//////////////////////////////////////////////////
void column_bundle::write(char *fname)
{
  // Open fname for writing, dump the bundle through write_stream()
  // and close the file again.
  FILE *out = dft_fopen(fname, "w");

  write_stream(out);
  dft_fclose(out);
}


/* Write column_bundle in binary to the already open IO stream...don't
 * close it! */
///////////////////////////////
// See comments for write    //
// above.                    //
///////////////////////////////
void
column_bundle::write_stream(FILE *fp)
{
  // Writes the raw column data (col_length scalars per column) to the
  // already-open stream fp; the stream is left open.
  //
  // MPI build: only the process whose ID equals System::Get_IOprocID()
  // calls fwrite().  It walks the process IDs in increasing order; for
  // its own ID it writes its own columns, for every other ID it first
  // receives that process's column count (tag 0) and then, if non-zero,
  // the column data (tag 1) into a temporary local bundle and writes it.
  // All other processes just send their count and data.  A barrier ends
  // the call on every process.
  //
  // NOTE(review): fwrite() return values are not checked, so a failed
  // or short write goes unnoticed.
  // NOTE(review): the data is shipped as SCALAR_SIZE MPI_DOUBLEs per
  // scalar -- presumably SCALAR_SIZE is the number of doubles in a
  // scalar; confirm against its definition.
  int i;

#ifdef DFT_MPI

  if (System::Get_procID() == System::Get_IOprocID() )
    {
      // Temporary holding places for MPI.
      int ncols, nproc;
      // One column larger than our own share so that a sender owning one
      // more column than we do still fits (see the CRITICAL note below).
      column_bundle data(my_ncols+1, col_length, basis, "local");
      MPI_Status status;
      // CRITICAL: data.col[i].c[j] must be contiguous, with j being
      // the inner index.
      // And my_ncols should NOT differ by more than one across processes.
      
      for ( nproc = 0; nproc < N_Procs; nproc++ ) {
	
	if ( nproc == System::Get_procID() ) {

	  // Our own turn: write our columns directly.
	  for (i=0; i < my_ncols; i++)
	    fwrite(col[i].c,sizeof(scalar),col_length,fp);
	  
	} else {
	  
	  // Receive the size of my_ncols into ncols.
	  MPI_Recv(&ncols, 1, MPI_INT, nproc, 0, MPI_COMM_WORLD, &status);

	  // Receive the actual data.
	  if (ncols > 0)
	    MPI_Recv(data.col[0].c, ncols*col_length*SCALAR_SIZE, MPI_DOUBLE, nproc, 1,
		     MPI_COMM_WORLD, &status);
	  
	  // Write out what was just received.
	  for (i=0; i < ncols; i++)
	    fwrite(data.col[i].c,sizeof(scalar),col_length,fp);
	  
	}
      }
      
    } else {

    // Send data to processor io_node for writing.
    // First the column count (tag 0)...
    MPI_Send(&my_ncols, 1, MPI_INT, System::Get_IOprocID(),
	     0, MPI_COMM_WORLD);

    // ...then, if we own any columns, the whole data block (tag 1).
    if (my_ncols > 0)
      MPI_Send(col[0].c, my_ncols*col_length*SCALAR_SIZE, MPI_DOUBLE, 
	       System::Get_IOprocID(), 1, MPI_COMM_WORLD);
  }

  MPI_Barrier(MPI_COMM_WORLD);  // just to be safe, synchronize.


#else // DFT_MPI

  // Non-MPI build: simply write every column in order.
  for (i=0; i < my_ncols; i++)
    fwrite(col[i].c,sizeof(scalar),col_length,fp);



#endif // DFT_MPI
}


/* Read column_bundle in binary from file fname */
//////////////////////////////////////////////////
// Processor identified by MPI_IO has regular   //
// IO functionalities. See MPI_Get_Attr for     //
// further details. For now, assume processor 0 //
// is this processor.                           //
//////////////////////////////////////////////////
void column_bundle::read(char *fname)
{
  // Open fname for reading, load the bundle through read_stream()
  // and close the file again.
  FILE *in = dft_fopen(fname, "r");

  read_stream(in);
  dft_fclose(in);
}


/* Read column_bundle in binary form from an already open IO stream...
 * don't close it! */
///////////////////////////////
// See comments for read     //
// above.                    //
///////////////////////////////
void
column_bundle::read_stream(FILE *fp)
{
  // Fills the column data (col_length scalars per column) from the
  // already-open stream fp; the stream is left open.
  //
  // MPI build: only the process whose ID equals System::Get_IOprocID()
  // calls fread().  It walks the process IDs in increasing order; for
  // its own ID it reads straight into its own columns, for every other
  // ID it receives that process's column count (tag 0), reads that many
  // columns from the file into a temporary local bundle and sends them
  // on (tag 1).  All other processes send their count and then receive
  // their data.  A barrier ends the call on every process.
  //
  // NOTE(review): fread() return values are not checked, so a short or
  // failed read silently leaves the affected columns unchanged.
  // NOTE(review): the data is shipped as SCALAR_SIZE MPI_DOUBLEs per
  // scalar -- presumably SCALAR_SIZE is the number of doubles in a
  // scalar; confirm against its definition.
  int i;

#ifdef DFT_MPI

  if (System::Get_procID() == System::Get_IOprocID() ) {

    // Temporary holding places for MPI.
    int ncols, nproc;
    // One column larger than our own share so that a receiver owning one
    // more column than we do still fits (see the CRITICAL note below).
    column_bundle data(my_ncols+1, col_length, basis, "local");
    MPI_Status status;
    // CRITICAL: data.col[i].c[j] must be contiguous, with j being
    // the inner index.
    // And my_ncols must NOT differ by more than 1 across processes.
    for ( nproc = 0; nproc < N_Procs; nproc++ ) {
      
      if ( nproc == System::Get_procID() ) {
	
	// Our own turn: read directly into our columns.
	for (i=0; i < my_ncols; i++)
	  fread(col[i].c,sizeof(scalar),col_length,fp);
	
      } else {
	// Learn how many columns process nproc owns (tag 0)...
	MPI_Recv(&ncols, 1, MPI_INT, nproc, 0, MPI_COMM_WORLD, &status);
	
	// ...read that many columns from the file...
	for (i=0; i < ncols; i++)
	  fread(data.col[i].c,sizeof(scalar),col_length,fp);
	
	// ...and pass them on as one block (tag 1).
	if (ncols > 0) 
	  MPI_Send(data.col[0].c, ncols*col_length*SCALAR_SIZE,
		   MPI_DOUBLE, nproc, 1, MPI_COMM_WORLD);
      }
    }
      
  } else {
    
    MPI_Status status;
    
    // Tell the IO process how many columns we own (tag 0).
    MPI_Send(&my_ncols, 1, MPI_INT, System::Get_IOprocID(),
	     0, MPI_COMM_WORLD);
    
    // Receive data from processor io_node.
      if (my_ncols > 0) 
	MPI_Recv(col[0].c, my_ncols*col_length*SCALAR_SIZE, MPI_DOUBLE, 
		 System::Get_IOprocID(), 1, MPI_COMM_WORLD, &status);
  }
  
  MPI_Barrier(MPI_COMM_WORLD);  // just to be safe, synchronize.

#else // DFT_MPI
  
  // Non-MPI build: simply read every column in order.
  for (i=0; i < my_ncols; i++)
    fread(col[i].c,sizeof(scalar),col_length,fp);



#endif // DFT_MPI
}




///////////////////////////////////
//                               //
// ----   Other Functions   ---- //
//                               //
///////////////////////////////////

/* Read/write an array of column_bundles from/to a file */
//////////////////////////////////////////////////
// Processor identified by MPI_IO has regular   //
// IO functionalities. See MPI_Get_Attr for     //
// further details. For now, assume processor 0 //
// is this processor.                           //
//////////////////////////////////////////////////
void
read_column_bundle_array(char *fname,int nbundles,column_bundle *Y)
{
  FILE *fp = NULL;
  int i;

  fp = dft_fopen(fname,"r");
  for (i=0; i < nbundles; i++)
    Y[i].read_stream(fp);
  dft_fclose(fp);
}


void
write_column_bundle_array(char *fname,int nbundles,column_bundle *Y)
{
  FILE *fp = NULL;
  int i;

  fp = dft_fopen(fname,"w");
  for (i=0; i < nbundles; i++)
    Y[i].write_stream(fp);
  dft_fclose(fp);
}


/* Allocate an array of column_bundles:  assumes they are distributed!! */
column_bundle *
alloc_column_bundle_array(int nbundles,int tot_ncols,Basis* basis)
{
  // Allocate an array of nbundles distributed column_bundles.  Bundle
  // i gets tot_ncols columns of length basis[i].nbasis and is attached
  // to &basis[i].  Release the result with free_column_bundle_array().
  int i;
  column_bundle *Y;

  // The array is raw memory from mymalloc(): no constructor runs, so
  // every element must be set up explicitly with init() below.
  // The labels follow the order used by the mymalloc() calls in
  // column_bundle::init(): first the thing being allocated, then the
  // routine doing the allocating.
  Y = (column_bundle *)mymalloc(sizeof(column_bundle)*nbundles,
                                "Y","alloc_column_bundle_array()");
  for (i=0; i < nbundles; i++)
    Y[i].init(tot_ncols,basis[i].nbasis,&(basis[i]),1);

  return Y;
}

/* Free the memory used by a column_bundle array */
void
free_column_bundle_array(int nbundles,column_bundle *Y)
{
  int i;

  for (i=0; i < nbundles; i++)
    Y[i].freemem();
  myfree(Y);
}



/* Sum of absolute squares of all elements in Y */
real abs2(const column_bundle &Y)
{
  // Sum of |z|^2 over all entries z of Y.  The locally owned entries
  // are summed first; for a distributed bundle in an MPI build the
  // partial sums are then added up across all processes.
  real total = 0.0;

  for (int c = 0; c < Y.my_ncols; c++)
    {
      scalar *entries = Y.col[c].c;
      for (int e = 0; e < Y.col_length; e++)
        {
          scalar z = entries[e];
#if defined SCALAR_IS_COMPLEX
          total += z.x*z.x + z.y*z.y;
#elif defined SCALAR_IS_REAL
          total += z*z;
#else
#error scalar is neither real nor complex!
#endif
        }
    }

  // Local bundles are complete on every process: nothing to reduce.
  if (!Y.distributed)
    return total;

#ifdef DFT_PROFILING
  timerOn(13);  // Turn on other MPI_Allreduce timer
#endif // DFT_PROFILING
#ifdef DFT_MPI
  real temp = 0.0;
  MPI_Allreduce ( &total, &temp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
  total = temp;
#endif // DFT_MPI
#ifdef DFT_PROFILING
  timerOff(13);  // Turn off other MPI_Allreduce timer
#endif // DFT_PROFILING

  return total;
}

/* Sum of absolute squares of all elements in array of column_bundles Y:
 * i.e. it is just the sum of abs2() on all nbundle elements of Y */
real abs2(int nbundles,column_bundle *Y)
{
  // Add up abs2(Y[b]) for b = 0 .. nbundles-1.
  real total = 0.0;

  int b = 0;
  while (b < nbundles)
    {
      total += abs2(Y[b]);
      ++b;
    }

  return total;
}

/* Take "dot-product" of two column_bundles:  sum the diagonals of Y1^Y2 */
scalar dot(const column_bundle &Y1,const column_bundle &Y2)
{
  // Sum over all entries of conjugate(Y1 entry) * (Y2 entry).  Both
  // bundles must have identical sizes and the same local/distributed
  // type.  For distributed bundles in an MPI build the per-process
  // partial sums are added up across all processes.
  if (Y1.tot_ncols != Y2.tot_ncols)
    die("Y1.tot_ncols != Y2.tot_ncols in dot_column_bundles\n");
  if (Y1.my_ncols != Y2.my_ncols)
    {
      dft_log(DFT_SILENCE,
              "Different distribution of the same size arrays!!\n");
      die("Y.my_ncols != Y2.my_ncols in dot_column_bundles\n");
    }
  if (Y1.col_length != Y2.col_length)
    die("Y1.col_length != Y2.col_length in dot_column_bundles\n");
  if (Y1.distributed != Y2.distributed)
    die("Y1, Y2 are not the same distributed type\n");

  scalar d;
  d = 0.0;

  for (int c = 0; c < Y1.my_ncols; c++)
    {
      scalar *a = Y1.col[c].c;
      scalar *b = Y2.col[c].c;
      for (int e = 0; e < Y1.col_length; e++)
        {
#if defined SCALAR_IS_COMPLEX
          /* d += conjugate(a[e])*b[e], written out by components */
          d.x += a[e].x*b[e].x +
                 a[e].y*b[e].y ;

          d.y += a[e].x*b[e].y -
                 a[e].y*b[e].x ;
#elif defined SCALAR_IS_REAL
          d += a[e]*b[e];
#else
#error scalar is neither real nor complex!
#endif
        }
    }

  // Local bundles are complete on every process: nothing to reduce.
  if ( !Y1.distributed )
    return d;

#ifdef DFT_PROFILING
  timerOn(13);  // Turn on other MPI_Allreduce timer
#endif // DFT_PROFILING

#ifdef DFT_MPI
  scalar temp = 0.0;

#if defined SCALAR_IS_COMPLEX
  MPI_Allreduce ( &d.x, &temp.x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
  MPI_Allreduce ( &d.y, &temp.y, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
#elif defined SCALAR_IS_REAL
  MPI_Allreduce ( &d, &temp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
#else
#error scalar is neither real nor complex!
#endif // SCALAR_IS_COMPLEX

  d = temp;
#endif // DFT_MPI

#ifdef DFT_PROFILING
  timerOff(13);  // Turn off other MPI_Allreduce timer
#endif // DFT_PROFILING

  return d;
}


/* Take "dot-product" of nbundles pairs of column_bundles: i.e.
 * do what dot() on two column_bundles does, but loop over the pairs
 * and sum the results. */
scalar dot(int nbundles,column_bundle *Y1,column_bundle *Y2)
{
  // Add up dot(Y1[b],Y2[b]) for b = 0 .. nbundles-1.
  scalar total;
  total = 0.0;

  int b = 0;
  while (b < nbundles)
    {
      total += dot(Y1[b],Y2[b]);
      ++b;
    }

  return total;
}

/* Does Yout += s * Yin */
void
scale_accumulate(scalar s,const column_bundle &Yin, column_bundle &Yout)
{
  // Yout += s * Yin, element by element, on the locally owned columns.
  //
  // The loops are driven by Yin's sizes, so Yout must own at least as
  // much storage; insist that the local shapes agree instead of
  // silently writing outside Yout.
  if (Yin.my_ncols != Yout.my_ncols)
    die("size mismatch of my_ncols in scale_accumulate()\n\n");
  if (Yin.col_length != Yout.col_length)
    die("size mismatch of col_length in scale_accumulate()\n\n");

  const int cl = Yin.col_length;
  for (int j=0; j < Yin.my_ncols; j++)
    for (int i=0; i < cl; i++)
      Yout.col[j].c[i] += s * Yin.col[j].c[i];
}

/* Does Yout[k] += s * Yin[k] */
void scale_accumulate(int nbundles,
                      scalar s, column_bundle* Yin, column_bundle* Yout)
{
  // Apply the single-bundle scale_accumulate() to each pair
  // (Yin[b], Yout[b]) for b = 0 .. nbundles-1.
  int b = 0;
  while (b < nbundles)
    {
      scale_accumulate(s, Yin[b], Yout[b]);
      ++b;
    }
}

/*
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 * The operators   matrix operator^(column_bundle &,column_bundle &)
 * and             column_bundle operator*(column_bundle &,matrix &)
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 */

matrix operator^(const column_bundle &Y1,const column_bundle &Y2)
{
  // Builds the Y1.tot_ncols x Y2.tot_ncols matrix Y1dY2 ("Y1dag Y2").
  // Four cases, by the local/distributed type of the operands:
  //   both distributed : handed to do_Y1dag_Y2_distributed()
  //   both local       : one Y1dagY2_block_matrix_mult() call, done
  //   mixed            : every process fills in its own block (offset
  //                      by start_ncol of the distributed operand) and,
  //                      in an MPI build, the blocks are then summed
  //                      over all processes.

#ifdef DFT_PROFILING
  timerOn(3); // turn on  Y1dag Y2  timer
#endif // DFT_PROFILING


  if (Y1.col_length != Y2.col_length)
    die("In operator ^ on column_bundles, Y1.col_length != Y2.col_length\n");

  int n1, n2, N;
  matrix Y1dY2(Y1.tot_ncols,Y2.tot_ncols);

  // Y1 is Nxn1, Y2 is Nxn2, and Y1dY2 is n1xn2.
  n1 = Y1.my_ncols;
  n2 = Y2.my_ncols;
  N  = Y1.col_length;

  // zero out Y1dY2
  Y1dY2.zero_out();

  // both Y1 and Y2 are distributed
  if ( (Y1.distributed == 1) && (Y2.distributed == 1) )
    {
      // use the code in dist_multiply.c by Ken Esler.
      do_Y1dag_Y2_distributed(Y1, Y2, Y1dY2);

#ifdef DFT_PROFILING
      timerOff(3); // turn off  Y1dag Y2  timer

      counterIncr(0); // incr distributed Y1dag local Y2 counter
      counterIncr(1,(n1+n2)*N/1.0e6);
#endif // DFT_PROFILING

      // done
      return Y1dY2;
    }

  // Y1 and Y2 are both local.
  else if ( (Y1.distributed == 0) && (Y2.distributed == 0) )
    {
      // do the multiply
      Y1dagY2_block_matrix_mult(Y1,Y2,Y1dY2,n1,n2,N,0,0,0,0);

#ifdef DFT_PROFILING
      timerOff(3); // turn off  Y1dag Y2  timer
      counterIncr(2); // incr local Y1dag Y2 counter
      counterIncr(3,(n1+n2)*N/1.0e6); // incr. flop counter
#endif // DFT_PROFILING

      // both are local, so we've got the final answer!  We're done.
      return Y1dY2;
    }

  // Y1 is local, Y2 is distributed
  else if ( Y1.distributed == 0 )
    {
      // At the end, this needs a global reduction (see below).
      // do the multiply with proper offset
      Y1dagY2_block_matrix_mult(Y1,Y2,Y1dY2,n1,n2,N,0,Y2.start_ncol,0,0);

#ifdef DFT_PROFILING
      counterIncr(4); // incr local Y1dag distributed Y2 counter
      counterIncr(5,(n1+n2)*N/1.0e6);
#endif // DFT_PROFILING
    }

  // Y1 is distributed, Y2 is local
  else if ( Y2.distributed == 0 )
    {
      // At the end, this needs a global reduction (see below).
      // do the multiply with proper offset
      Y1dagY2_block_matrix_mult(Y1,Y2,Y1dY2,n1,n2,N,Y1.start_ncol,0,0,0);

#ifdef DFT_PROFILING
      counterIncr(6); // incr distributed Y1dag local Y2 counter
      counterIncr(7,(n1+n2)*N/1.0e6);
#endif // DFT_PROFILING
    }

#ifdef DFT_MPI
  // If we are running MPI and have distributed column_bundles,
  // then we must do a global reduction of all the processors's
  // Y1dY2 matrices into result to get the final answer.
  matrix result(Y1.tot_ncols, Y2.tot_ncols);

  // zero out where the final answer is going
  result.zero_out();

  // do the global sum over all processors
#ifdef DFT_PROFILING
  timerOn(13);  // Turn on other MPI_Allreduce timer
#endif
  MPI_Allreduce ( &(Y1dY2.c[0]), &(result.c[0]),
                 Y1.tot_ncols * Y2.tot_ncols * SCALAR_SIZE,
                 MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
#ifdef DFT_PROFILING
  timerOff(13);  // Turn off other MPI_Allreduce timer
#endif // DFT_PROFILING

#else  // DFT_MPI
  // We aren't running MPI, then we don't do any reduction, and
  //  the final answer is just Y1dY2 and we'll just make result
  // point to it.
  matrix &result = Y1dY2;
#endif // DFT_MPI

#ifdef DFT_PROFILING
  // The two early-return paths above switch timer 3 off themselves;
  // the mixed local/distributed path ends here and must do so too,
  // otherwise the timer started at the top is never stopped.
  timerOff(3); // turn off  Y1dag Y2  timer
#endif // DFT_PROFILING

  return result;
}



/*
 * Multiply column_bundle by a matrix on the right:
 * YM = Y*M
 */
column_bundle operator*(const column_bundle &Y, const matrix &M)
{
  // Returns YM = Y*M: a bundle with M.nc columns of Y's column length
  // that shares Y's Basis, k-vector and Vscloc.
  if (Y.tot_ncols != M.nr)
    die("In column_bundle:operator*, Y.tot_ncols != M.nr\n");

  // Allocate the result with the SAME local/distributed type as Y.
  // copy_innards_column_bundle() below stamps Y's distributed flag onto
  // YM anyway, so allocating YM as "always distributed" would leave a
  // local Y (in an MPI build) with a result whose flag says "local"
  // while its storage only covers this process's share of the columns.
  // NOTE(review): if the intent was instead "result is always
  // distributed", the flag copy is what needs to change -- confirm.
  column_bundle YM;
  YM.init(M.nc,Y.col_length,Y.basis,Y.distributed);

  copy_innards_column_bundle(&Y,&YM);

  /* Call the version below!! */
  do_column_bundle_matrix_mult(Y,M,YM,0);

  return YM;
}

/*
 * Does YM = Y*M or YM += Y*M.
 */
void
do_column_bundle_matrix_mult(const column_bundle &Y,
			     const matrix &M,
			     column_bundle &YM,
			     int accum)
{
  // Computes YM = Y*M (accum == 0) or YM += Y*M (accum != 0).
  //   Y     : input bundle, Y.tot_ncols must equal M.nr
  //   M     : matrix applied on the right
  //   YM    : pre-allocated result, M.nc columns of Y's column length
  //   accum : 0 overwrites YM, non-zero accumulates into it
  // YM also receives Y's Basis, k-vector and Vscloc; its own
  // local/distributed type is kept.

#ifdef DFT_PROFILING
  if (accum)
    timerOn(5);  // turn on  YM += Y * M  timer
  else
    timerOn(4); // turn on  YM = Y * M  timer
#endif // DFT_PROFILING

  if (Y.tot_ncols != M.nr)
    die("In do_column_bundle_matrix_mult(), Y.tot_ncols != M.nr\n");
  if (Y.col_length != YM.col_length)
    die("In do_column_bundle_matrix_mult(), Y.col_length != YM.col_length\n");
  if (M.nc != YM.tot_ncols)
    die("In do_column_bundle_matrix_mult(), M.nc != YM.tot_ncols\n");

  // copy_innards_column_bundle() also copies the distributed flag.
  // That flag describes how YM's storage was allocated, so it must not
  // be replaced by Y's: save it and put it back.  Without this,
  // Y.distributed == YM.distributed always holds afterwards and the two
  // mixed local/distributed branches below can never be reached.
  const int YM_distributed = YM.distributed;
  copy_innards_column_bundle(&Y,&YM);
  YM.distributed = YM_distributed;

  int N     = Y.col_length;
  int ncols = YM.my_ncols;
  int nrows = Y.my_ncols;

  /* Zero out final result if not in accumulation mode */
  if (!accum)
    YM.zero_out();

  // both YM and Y are distributed
  if ( (Y.distributed == 1) && (YM.distributed == 1) )
    {
      // Use code in dist_multiply.c
      do_Y_M_mult_distributed(Y, M, YM, accum);

#ifdef DFT_PROFILING
      if (accum)
	{
	  counterIncr(10); // Incr YM += Y*M counter
	  counterIncr(11, (ncols+nrows)*N/1.0e6 );
	}
      else
	{
	  counterIncr(8); // Incr YM = Y*M counter
	  counterIncr(9, (ncols+nrows)*N/1.0e6 );  // incr. flop counter
	}
#endif // DFT_PROFILING
    }

  // Y and YM are both local.
  else if ( (Y.distributed == 0) && (YM.distributed == 0) )
    {
      Y_M_block_matrix_mult(Y,M,YM,N,nrows,ncols,0,0,0,accum);
#ifdef DFT_PROFILING
      counterIncr(12); // Incr other YM = Y*M counter
#endif // DFT_PROFILING
    }

  // Y is local, YM is distributed.
  else if ( Y.distributed == 0 )
    {
      Y_M_block_matrix_mult(Y,M,YM,N,nrows,ncols,0,YM.start_ncol,0,accum);
#ifdef DFT_PROFILING
      counterIncr(12); // Incr other YM = Y*M counter
#endif // DFT_PROFILING
    }

  // Y is distributed, YM is local.
  else if ( YM.distributed == 0 )
    {
#ifdef DFT_MPI
      // If in MPI mode, then we have to create a new column_bundle
      // to hold the local result before we sum up across processors.
      column_bundle Ytemp(YM.tot_ncols, YM.col_length, YM.basis, "local");
      copy_innards_column_bundle(&YM, &Ytemp);
      // If in accum mode, then the zeroth processor starts with YM
      // in its local result and accumulates into it.  All other proc.
      // and for all other cases start with zero in their local result.
      if (accum && System::Get_procID()==0)
	Ytemp = YM;
      else
	Ytemp.zero_out();
#else
      // Non MPI mode... just make Ytemp point to YM and zero it out
      // if not in accum mode.
      column_bundle &Ytemp = YM;
      if (!accum)
	Ytemp.zero_out();
#endif

      // do the multiply into Ytemp
      Y_M_block_matrix_mult(Y,M,Ytemp,N,nrows,ncols,Y.start_ncol,0,0,accum);

#ifdef DFT_PROFILING
      timerOn(13);  // Turn on other MPI_Allreduce timer
#endif // DFT_PROFILING

#ifdef DFT_MPI
      // do a global reduction on Ytemp, store in YM;
      MPI_Allreduce ( &(Ytemp.col[0].c[0]), &(YM.col[0].c[0]),
                     N * ncols * SCALAR_SIZE,
                     MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
#endif // DFT_MPI

#ifdef DFT_PROFILING
      timerOff(13);  // Turn off other MPI_Allreduce timer
      counterIncr(12); // Incr other YM = Y*M counter
#endif // DFT_PROFILING

    }

#ifdef DFT_PROFILING
  if (accum)
    timerOff(5); // turn off  YM += Y * M  timer
  else
    timerOff(4); // turn off  YM = Y * M  timer
#endif // DFT_PROFILING
}
