/****************************************************************************
 *
 * DFT++:  density functional package developed by
 *         the research group of Prof. Tomas Arias, MIT.
 *
 * Principal author: Sohrab Ismail-Beigi
 *
 * Modifications for MPI version: Kenneth P Esler,
 *                                Sohrab Ismail-Beigi, and
 *                                Tairan Wang.
 *
 * Modifications for LSD version: Jason A Cline
 *
 * Modifications for lattice/Pulay forces: Gabor Csanyi and
 *                                         Sohrab Ismail-Beigi
 *
 * Copyright (C) 1996-1998 The Massachusetts Institute of Technology (MIT).
 *
 ****************************************************************************/

/*
 * dist_multiply.C -- distributed version of large matrix multiplications
 */

/* $Id: dist_multiply.c,v 1.1.1.1 1999/11/10 01:30:17 tairan Exp $ */

#include <stdio.h>
#include <math.h>

#include "header.h"
#include "parallel.h"   // include MPI related info.

//////////////////////////////////////////////////
// Routine to construct a table of the starting //
// and ending columns owned by each node.       //
//////////////////////////////////////////////////
// One entry of the column-ownership table: the inclusive range of global
// column indices associated with a single node/process.
class node_info
{
 public:
  int start_ncol;   // lowest column index recorded for the node
  int end_ncol;     // highest column index recorded for the node
};

//
// Builds the column-ownership table of the column_bundle Y.
//
// Returns an array allocated with mymalloc that holds one node_info per
// node/process (Y.Get_N_Procs() entries).  Entry p records the smallest and
// the largest column index i in 0 .. Y.tot_ncols-1 for which
// Y.ncol_to_ID(i) == p.  A node that owns no column keeps the sentinel
// values start_ncol = Y.tot_ncols+1 and end_ncol = -1, so that
// end_ncol - start_ncol + 1 is negative for it.
//
// The array is not released here; the caller has to free it.
//
node_info *
find_node_ownership(const column_bundle &Y)
{
  node_info *table =
    (node_info *) mymalloc ( sizeof(node_info) * Y.Get_N_Procs(),
                             "nodes", "find_node_ownership");

  // Start every node with an empty (inverted) range.
  int proc;
  for (proc = 0; proc < Y.Get_N_Procs(); proc++)
    {
      node_info &entry = table[proc];
      entry.start_ncol = Y.tot_ncols + 1;
      entry.end_ncol = -1;
    }

  // Widen the owner's range so that it includes each column in turn.
  int column;
  for (column = 0; column < Y.tot_ncols; column++)
    {
      node_info &owner = table[Y.ncol_to_ID(column)];
      if (column < owner.start_ncol)
        owner.start_ncol = column;
      if (column > owner.end_ncol)
        owner.end_ncol = column;
    }

  return table;
}

// Takes a (possibly) overflowed or underflowed node id and maps it back
// into the range 0 .. n_nodes-1, i.e. imposes circular boundary conditions
// on node numbers.
//
//   node_id : any integer, negative values included
//   n_nodes : number of nodes; must be positive
//
// Returns the id in 0 .. n_nodes-1 that is congruent to node_id modulo
// n_nodes.  The wrap uses a single remainder operation instead of adding
// n_nodes repeatedly, so the cost does not depend on how far below zero
// node_id lies.
int
circ_node_id(int node_id, int n_nodes)
{
  int wrapped = node_id % n_nodes;

  // The remainder may carry the sign of node_id; lift a negative result
  // back into range.  If the remainder is already non-negative this is
  // a no-op.
  if (wrapped < 0)
    wrapped += n_nodes;

  return wrapped;
}

///////////////////////////////////////////////////
// This routine uses mpi calls to transpose the  //
// distributed matrix Y into the distributed     //
// matrix Ytran.  Currently, node n first sends  //
// the info it owes to node n+1 and receives     //
// the info it needs from n-1.  It then sends to //
// n+2 and receives from n-2, and so forth,      //
// with circular boundary conditions.            //
///////////////////////////////////////////////////


/* Transposes the column bundle Y into Ytran.
 *
 *   Y     : source bundle (read only)
 *   Ytran : destination bundle; must already be allocated
 *   accum : if non-zero the transposed data is added to the current
 *           contents of Ytran, otherwise it overwrites them
 *
 * MPI build (DFT_MPI defined): the processes exchange blocks in
 * Y.Get_N_Procs() steps.  In step s a process packs the block destined for
 * process (id+s) and receives the block coming from process (id-s), both
 * taken with circular wrap-around, then scatters the received block into
 * its local columns of Ytran.  Every scalar is transferred as two
 * MPI_DOUBLE values (presumably real and imaginary part -- confirm against
 * the definition of scalar).
 * NOTE(review): the packing/unpacking treats the columns owned by a process
 * as one contiguous run start_ncol..end_ncol whose length matches my_ncols;
 * confirm that column_bundle guarantees this.
 *
 * Serial build: Ytran.col[j].c[i] (+)= Y.col[i].c[j] for
 * i < Y.my_ncols and j < Y.col_length.
 */
void
do_column_bundle_transpose(const column_bundle &Y,
                           column_bundle &Ytran,
                           int accum)
{
#ifdef DFT_PROFILING
  timerOn(6);   // start the transpose timer
#endif // DFT_PROFILING

#ifdef DFT_MPI

  // ---------------- MPI case ----------------

  /* Column ranges owned by every process, for source and destination. */
  node_info *src_owner = find_node_ownership(Y);
  node_info *dst_owner = find_node_ownership(Ytran);

  const int n_procs = Y.Get_N_Procs();
  const int my_id   = Y.Get_procID();

  int p, r, c;

  /* Both buffers are reused in every step, so they are sized for the
     largest block that has to be sent / received in any step. */
  int send_capacity = 0, recv_capacity = 0;
  for (p = 0; p < n_procs; p++)
    {
      const int outgoing =
        (dst_owner[p].end_ncol - dst_owner[p].start_ncol + 1) * Y.my_ncols;
      if (outgoing > send_capacity)
        send_capacity = outgoing;

      const int incoming =
        (src_owner[p].end_ncol - src_owner[p].start_ncol + 1) * Ytran.my_ncols;
      if (incoming > recv_capacity)
        recv_capacity = incoming;
    }

  scalar *send_buff = (scalar *)mymalloc(sizeof(scalar)*send_capacity,
                                         "send_buff",
                                         "do_column_bundle_transpose");
  scalar *recv_buff = (scalar *)mymalloc(sizeof(scalar)*recv_capacity,
                                         "recv_buff",
                                         "do_column_bundle_transpose");

  /* Step 0 exchanges with ourselves, step 1 sends to id+1 and receives  */
  /* from id-1, step 2 uses id+2 / id-2, and so on until the transposed  */
  /* bundle is complete.                                                 */
  int step;
  for (step = 0; step < n_procs; step++)
    {
      const int dest   = circ_node_id(my_id + step, n_procs);
      const int source = circ_node_id(my_id - step, n_procs);

      /* Pack the transposed data chunk for dest: one buffer row per local
         column of Y, one buffer column per column of Ytran owned by dest.
         The buffer is filled row by row, i.e. strictly sequentially. */
      const int out_rows  = Y.my_ncols;
      const int out_first = dst_owner[dest].start_ncol;
      const int out_cols  = dst_owner[dest].end_ncol - out_first + 1;

      scalar *out = send_buff;
      for (r = 0; r < out_rows; r++)
        for (c = 0; c < out_cols; c++)
          *out++ = Y.col[r].c[out_first + c];

      /* Shape of the block we expect back from source. */
      const int in_first = src_owner[source].start_ncol;
      const int in_rows  = src_owner[source].end_ncol - in_first + 1;
      const int in_cols  = Ytran.my_ncols;

      /* Message lengths counted in MPI_DOUBLEs: two per scalar. */
      const int send_count = out_rows * out_cols * 2;
      const int recv_count = in_rows * in_cols * 2;

      MPI_Status status;
      int rc = MPI_SUCCESS;

      /* A count can be zero or negative when one side owns no columns;
         only the directions that actually carry data are posted.  The
         message tag is always the rank of the sending process. */
      if (send_count > 0 && recv_count > 0)
        {
          rc = MPI_Sendrecv(send_buff, send_count, MPI_DOUBLE, dest, my_id,
                            recv_buff, recv_count, MPI_DOUBLE, source, source,
                            MPI_COMM_WORLD, &status);
        }
      else if (send_count > 0)
        {
          rc = MPI_Send(send_buff, send_count, MPI_DOUBLE, dest, my_id,
                        MPI_COMM_WORLD);
        }
      else if (recv_count > 0)
        {
          rc = MPI_Recv(recv_buff, recv_count, MPI_DOUBLE, source, source,
                        MPI_COMM_WORLD, &status);
        }

      // Remark kept from the original author: if only some processes get
      // here, the program may not die gracefully since  die  calls
      // MPI_Finalize; consider using  MPI_Abort  instead.
      if (rc != MPI_SUCCESS)
        die ("MPI Error in MPI_Sendrecv in dist_transpose().\n");

      /* Scatter the received block: buffer row r belongs to row
         in_first + r of Ytran, buffer column c to local column c. */
      scalar *in = recv_buff;
      if (accum)
        {
          for (r = 0; r < in_rows; r++)
            for (c = 0; c < in_cols; c++)
              Ytran.col[c].c[in_first + r] += *in++;
        }
      else
        {
          for (r = 0; r < in_rows; r++)
            for (c = 0; c < in_cols; c++)
              Ytran.col[c].c[in_first + r] = *in++;
        }
    }

  // Release the buffers and the ownership tables.
  if (send_buff != NULL) myfree(send_buff);
  if (recv_buff != NULL) myfree(recv_buff);
  myfree(Y_nodes_placeholder_guard(src_owner));
  myfree(dst_owner);

#else // DFT_MPI

  // ---------------- serial case: a plain local transpose ----------------
  int i, j;
  if (accum)
    {
      for (i = 0; i < Y.my_ncols; i++)
        for (j = 0; j < Y.col_length; j++)
          Ytran.col[j].c[i] += Y.col[i].c[j];
    }
  else
    {
      for (i = 0; i < Y.my_ncols; i++)
        for (j = 0; j < Y.col_length; j++)
          Ytran.col[j].c[i] = Y.col[i].c[j];
    }

#endif // DFT_MPI

#ifdef DFT_PROFILING
  timerOff(6);   // stop the transpose timer
#endif // DFT_PROFILING

}

//
// Computes YM = Y*M (accum == 0) or YM += Y*M (accum != 0) in the
// distributed case.  The calculation actually works with transposes:
//
//         (YM)tran =/+= (Mtran Ytran)tran.
//
// A single temporary, Ytran, holds the transpose of Y **and** later the
// transpose of YM, so the multiply itself is done in place.
//
//   Y     : input column bundle
//   M     : matrix multiplied from the right
//   YM    : result bundle; receives the innards of Y before anything else
//   accum : non-zero to accumulate into YM instead of overwriting it
//
void
do_Y_M_mult_distributed(const column_bundle &Y,
                        const matrix &M,
                        column_bundle &YM,
                        int accum)
{
  // Copy innards
  copy_innards_column_bundle(&Y, &YM);

  // Ytran has to hold Ytranspose first and (YM)transpose afterwards, so
  // it is created with the larger of the two sizes.
  // NOTE(review): the constructor arguments are presumably (number of
  // columns, column length, basis) -- confirm against column_bundle.
  const int larger = (M.nc > Y.tot_ncols) ? M.nc : Y.tot_ncols;
  column_bundle Ytran(Y.col_length, larger, Y.basis);

  // Stage 1: pretend Ytran has as many rows as Y has columns, whatever
  // amount of memory was allocated above, and transpose Y into it
  // (overwriting, never accumulating).
  Ytran.col_length = Y.tot_ncols;
  do_column_bundle_transpose (Y, Ytran, 0);

  // Stage 2: multiply in place -- Ytran is both input and output, so the
  // multiplier routine has to support that.  No accumulation here; the
  // final transpose below takes care of accumulating if requested.
  Y_M_block_matrix_mult(Ytran, M, Ytran,
                        Ytran.my_ncols, Ytran.col_length, M.nc,
                        0, 0, 1, 0);

  // Stage 3: Ytran now holds (YM)tran, which has as many rows as M has
  // columns.  Transpose it into YM, accumulating as requested.
  Ytran.col_length = M.nc;
  do_column_bundle_transpose (Ytran, YM, accum);
}

//
// Does Y1^Y2 multiplication in distributed case.
//
void
do_Y1dag_Y2_distributed(const column_bundle &Y1,
			const column_bundle &Y2,
			matrix &M)
{
  if (Y1.col_length != Y2.col_length)
    die ("Y1.col_length != Y2.col_length in do_Y1dag_Y2_block.\n");

  // First of all, create a temporary column_bundle to hold Y1dagger
  // and Y2 dagger.  We put both of these matrices in the same column
  // bundle to ensure that the distribution of the columns among the
  // processors is the same.  We could not do the multiplication locally
  // otherwise. 
  column_bundle Ytrans(Y1.col_length, Y1.tot_ncols + Y2.tot_ncols,Y1.basis);
  
  // Set the size of Ytrans to the size of Y1dag.  This allows us to
  // transpose Y1 into Ytrans.
  Ytrans.col_length = Y1.tot_ncols;
  do_column_bundle_transpose (Y1, Ytrans, 0);
 
  // Now is the tiny-bit tricky part.  We have to trick the transpose
  // routine into puting Y2transpose into the lower part of Ytrans.
  // We just have to set the number of rows to Y2.tot_ncols and increment
  // the column pointer values to their previous value + Y1.tot_ncols.
  Ytrans.col_length = Y2.tot_ncols;
  int i;
  for (i = 0; i < Ytrans.my_ncols; i++)
    Ytrans.col[i].c += Y1.tot_ncols;

  // Do the transpose
  do_column_bundle_transpose (Y2, Ytrans, 0);

  // Now fix everything up with Ytrans
  for (i = 0; i < Ytrans.my_ncols; i++)
    Ytrans.col[i].c -= Y1.tot_ncols;
  Ytrans.col_length = Y1.tot_ncols + Y2.tot_ncols;

  // We construct our matrix to hold the result and fill the matrix
  // with our local parts of the dot products between the rows of Ytrans.
  // Then we'll do a global sum reduction to get the final result.

#ifdef DFT_MPI
  // MPI case
  matrix local_result(Y1.tot_ncols, Y2.tot_ncols);
#else
  // serial case
  matrix &local_result = M;
#endif 

  // Do the actual hard work of multiplication by
  // passing appropriate sizes, flags, and offsets to the
  // multiplier routine.
  Y1dagY2_block_matrix_mult(Ytrans,Ytrans,local_result,
			    Y1.tot_ncols,Y2.tot_ncols,Ytrans.my_ncols,
			    0,0,1,Y1.tot_ncols);
			    
#ifdef DFT_MPI

  // i.e. MPI version

#ifdef DFT_PROFILING
  timerOn(12);  // Turn on Y1dagY2_MPI_Allreduce timer.
#endif // DFT_PROFILING

  // Now do MPI global sum reduction
  int mpi_error = MPI_Allreduce (local_result.c, M.c,
  				 Y1.tot_ncols * Y2.tot_ncols * 2,
  	  		         MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  if (mpi_error != MPI_SUCCESS)
    die ("MPI error in MPI_Allreduce in do_Y1dag_Y2_distributed.\n");

#ifdef DFT_PROFILING
  timerOff(12);  // Turn off Y1dagY2_MPI_Allreduce timer.
#endif // DFT_PROFILING

#endif //  DFT_MPI

}
