/****************************************************************************
 *
 * DFT++:  density functional package developed by
 *         the research group of Prof. Tomas Arias, MIT.
 *
 * Principal author: Sohrab Ismail-Beigi
 *
 * Modifications for MPI version: Kenneth P Esler,
 *                                Sohrab Ismail-Beigi, and
 *                                Tairan Wang.
 *
 * Modifications for LSD version: Jason A Cline
 *
 * Modifications for lattice/Pulay forces: Gabor Csanyi and
 *                                         Sohrab Ismail-Beigi
 *
 * Copyright (C) 1996-1998 The Massachusetts Institute of Technology (MIT).
 *
 ****************************************************************************/

/*
 * Sohrab Ismail-Beigi,  Dec. 1996, modified Mar 1997.
 *
 * Calculates the diagonals of various products of column_bundles 
 * and diagonal matrices in an efficient manner.
 *
 */

/* $Id: diaginnerouter.c,v 1.1.1.1 1999/11/10 01:30:17 tairan Exp $ */

#include <math.h>
#include <stdio.h>

#include "header.h"
#include "parallel.h"

/*
 * Returns diag(F*X^Y), where X^ is the Hermitian adjoint of X.
 *
 * For each column i owned by this process (0 <= i < X.my_ncols), with
 * global index k = X.start_ncol+i, the result holds
 *
 *     result[k] = F.c[k] * sum_j conjugate(X.col[i].c[j]) * Y.col[i].c[j]
 *
 * (no conjugation when scalars are real).  Y is indexed with the same
 * local column index as X, so Y is assumed to be distributed over
 * processes exactly like X -- this is not checked here.
 *
 * Under DFT_MPI the per-process partial results are summed over
 * MPI_COMM_WORLD, so every process returns the full length-F.n vector.
 *
 * Dies if X and Y differ in shape or if F.n != X.tot_ncols.
 *
 * Useful for the calculation of the kinetic energy.
 */
vector
diaginner(const diag_matrix &F, column_bundle &X,const column_bundle &Y)
{
#ifdef DFT_PROFILING
  timerOn(7);  // Turn on the diaginner timer
#endif

  if (X.col_length != Y.col_length || X.tot_ncols != Y.tot_ncols ||
      X.tot_ncols != F.n)
    die("Size mismatch in diaginner: F.n=%d, X is %d by %d, Y is %d by %d\n",
        F.n,X.col_length,X.tot_ncols,Y.col_length,Y.tot_ncols);

  // Final result; starts at zero so entries not written below stay zero.
  vector diagFXdagY(F.n,F.basis);
  diagFXdagY.zero_out();

#ifdef DFT_MPI
  // In the MPI case, result is a temporary holding this process's
  // contribution; it is summed into diagFXdagY further down.
  vector result(F.n,F.basis);
  result.zero_out();
#else
  // In the serial/thread cases, result is just an alias for diagFXdagY.
  vector &result = diagFXdagY;
#endif

  // Do the work: one inner product per locally owned column.
  scalar z;
  int i,j;
  for (i=0; i < X.my_ncols; i++)
    {
      z = 0.0;
      for (j=0; j < X.col_length; j++)
        {
#if defined SCALAR_IS_COMPLEX
          z += conjugate(X.col[i].c[j])*Y.col[i].c[j];
#elif defined SCALAR_IS_REAL
          z += X.col[i].c[j]*Y.col[i].c[j];
#else
#error scalar is neither complex nor real
#endif
        }
      // Scale by the matching diagonal entry of F and store the value
      // at the column's global index.
      result.c[X.start_ncol+i] = F.c[X.start_ncol+i] * z;
    }

#ifdef DFT_MPI
  // If this is an MPI case, we have to sum the results from all
  // processors before being done.
#ifdef DFT_PROFILING
  timerOn(13);  // Turn on other MPI_Allreduce timer
#endif
  // The buffers are reduced as F.n*SCALAR_SIZE elements of the MPI type
  // matching `real` (DFT_MPI_REAL, the same type diagouterI uses for its
  // reduction) instead of a hard-coded MPI_DOUBLE.
  MPI_Allreduce( &result.c[0], &diagFXdagY.c[0], F.n*SCALAR_SIZE,
                 DFT_MPI_REAL, MPI_SUM, MPI_COMM_WORLD );
#ifdef DFT_PROFILING
  timerOff(13);  // Turn off other MPI_Allreduce timer
#endif
#endif

#ifdef DFT_PROFILING
  timerOff(7);  // Turn off the diaginner timer
#endif

  return diagFXdagY;
}

//
// This local function does the work of diagouterI (i.e. calculating
// electron density):  it calculates the cumulative density of 
// n_cols_todo in the column_bundle X starting at column start_col.
// Notice that these column indices are referenced to the columns
// "owned" by X, i.e. they must be between 0 and X->my_ncols.
//
static void
do_diagouterI_some_columns(const diag_matrix *F,
			   const column_bundle *X,
			   int start_col,
			   int n_cols_todo,
			   vector *rho_local)
{
  register int NxNyNz = X->basis->NxNyNz;

  /* Stores a single column of X */
  column_bundle Xcol(1,X->col_length,"local");
  /* Stores the result of I on a single column of X */
  column_bundle IXcol(1,NxNyNz,"local");
  
  /* loop over the columns of X and accumulate the resulting densities
   * into rho_local */
  int i;
  for (i=start_col; i < start_col+n_cols_todo; i++)
    {
      /* Local vars */
      register int j;

      /* Copy all the descriptive junk inside of X */
      copy_innards_column_bundle(X,&Xcol);
      copy_innards_column_bundle(X,&IXcol);

      /* Copy in i'th column of X into Xcol */
      for (j=0; j < X->col_length; j++)
	Xcol.col[0].c[j] = X->col[i].c[j];

      /* Apply I (i.e. FFT3D) */
      apply_I(Xcol,IXcol);

      /* Square the magnitude of the IXcol entries and scale them by
       *the filling f_i and accumulate into rho_local. */
      register scalar f_i;

      f_i = F->c[ X->start_ncol + i ];
      for (j=0; j < NxNyNz; j++)
#if defined SCALAR_IS_COMPLEX
	{
	  register scalar z = IXcol.col[0].c[j];
	  register real z2;
	  
	  z2 = z.x*z.x + z.y*z.y;
	  rho_local->c[j].x += f_i.x*z2;
	  rho_local->c[j].y += f_i.y*z2;
	  /* The above block does the line below: */	  
	  /* c[j] = f_i*conjugate(c[j])*c[j]; */
	}
#elif defined SCALAR_IS_REAL
        rho_local->c[j] = f_i*z*z;
#else
#error scalar is neither complex nor real
#endif
    }
}

/*
 * Returns diag((I*X)*F*(I*X)^), where ^ denotes the Hermitian adjoint.
 *
 * The calculation applies I to the columns of X one at a time (see
 * do_diagouterI_some_columns) and accumulates F.c[k]*|I*X_k|^2 over
 * the columns owned by this process.  Under DFT_MPI the per-process
 * sums are then added together over MPI_COMM_WORLD, so every process
 * returns the full result.
 *
 * The result has F.basis->NxNyNz entries.
 *
 * Dies if F.n != X.tot_ncols, or if F and X disagree on NxNyNz (the
 * result is sized from F's basis but filled using X's basis).
 *
 * Its prime use is in the calculation of the charge density.
 */
vector
diagouterI(const diag_matrix &F,const column_bundle &X)
{
  if (F.n != X.tot_ncols)
    die("In diagouterI, F.n != X.tot_ncols\n");

  /* The result below is allocated with F.basis->NxNyNz entries, while
   * the accumulation loop and the MPI reduction both touch
   * X.basis->NxNyNz entries.  Refuse to continue if the two differ
   * rather than run past the end of the buffers. */
  if (F.basis->NxNyNz != X.basis->NxNyNz)
    die("In diagouterI, F.basis->NxNyNz=%d != X.basis->NxNyNz=%d\n",
        F.basis->NxNyNz,X.basis->NxNyNz);

  /* Stores the final result */
  vector diagIXFIXdag(F.basis->NxNyNz,F.basis);
  diagIXFIXdag.zero_out();


#ifdef DFT_MPI
  /* Stores intermediate result for columns owned by this MPI node */
  vector rho_local(F.basis->NxNyNz,F.basis);
  rho_local.zero_out();
#else
  /* If we're serial/thread, we don't need any other place to store
   * the result: rho_local is just an alias for the final result. */
  vector &rho_local = diagIXFIXdag;
#endif


  // Now calculate the electron density for the bands owned by this
  // process.  The final accumulated result ends up in rho_local.
  // NOTE: for the thread case the work should be distributed among
  // the threads; at present all local columns are done in one call.
  do_diagouterI_some_columns(&F,&X,0,X.my_ncols,&rho_local);


#ifdef DFT_MPI
  /* If running MPI, then we have to do a global sum of all rho_locals
   * on the different nodes into the final result diagIXFIXdag */
#ifdef DFT_PROFILING
  timerOn(13);  // Turn on other MPI_Allreduce timer
#endif
  MPI_Allreduce( &(rho_local.c[0]), &(diagIXFIXdag.c[0]),
                 X.basis->NxNyNz*SCALAR_SIZE, DFT_MPI_REAL,
                 MPI_SUM, MPI_COMM_WORLD );
#ifdef DFT_PROFILING
  timerOff(13);  // Turn off other MPI_Allreduce timer
#endif
#endif // DFT_MPI


  return diagIXFIXdag;
}
