/****************************************************************************
 *
 * DFT++:  density functional package developed by
 *         the research group of Prof. Tomas Arias, MIT.
 *
 * Principal author: Sohrab Ismail-Beigi
 *
 * Modifications for MPI version: Kenneth P Esler,
 *                                Sohrab Ismail-Beigi, and
 *                                Tairan Wang.
 *
 * Modifications for LSD version: Jason A Cline
 *
 * Modifications for lattice/Pulay forces: Gabor Csanyi and
 *                                         Sohrab Ismail-Beigi
 *
 * Copyright (C) 1996-1998 The Massachusetts Institute of Technology (MIT).
 *
 ****************************************************************************/

/*
 *
 * PHLO.C -- fundamental operators
 *
 */

/* $Id: PHLO.c,v 1.1.1.1 1999/11/10 01:30:17 tairan Exp $ */

#include <stdio.h>
#include <math.h>

#include "header.h"

/*
 * O(v):  multiply a vector by the overlap matrix.
 *
 * For planewaves the overlap matrix is the identity operator, so the
 * result is simply a copy of the input vector.
 */
vector
O(const vector &v)
{
  vector Ov(v);

  return Ov;
}

/*
 * Multiply by overlap matrix but subtract out uniform part:
 *
 * Obar_ij = O_ij - conjugate(s_i)*s_j/vol
 *
 * where vol is the unit cell volume and
 *
 * s_i = integral_{unit cell} { d^3r b_i(r) }
 *
 * where b_i(r) is the i'th basis function.
 *
 * For planewaves, b_j(r) = exp(i*G_j*r)/sqrt(vol), so
 * s_j = Kronecker(G_j,0)*sqrt(vol) and hence
 * Obar_ij = (Kronecker(i,j)-Kronecker(i,0)*Kronecker(j,0)),
 * the identity operator except for a zero in the G=0 place.  In our case,
 * for vectors in G-space, the 0'th entry in the FFT box is the G=0 element.
 *
 * Returns a copy of v whose 0'th entry has been set to zero; v itself
 * is not modified.
 */
vector
Obar(const vector &v)
{
  /* Identity part:  start from a copy of the input */
  vector result(v);

  /* Remove the uniform (G=0) component, which lives in slot 0 */
  result.c[0] = 0.0;

  return result;
}

/*
 * O(Y):  multiply a column_bundle by the overlap matrix.
 *
 * For planewaves this is the identity operator.  The result is built
 * as a copy of Y and then filled in by apply_O().
 */
column_bundle
O(const column_bundle &Y)
{
  column_bundle result(Y);

  apply_O(Y,result);

  return result;
}

/*
 * apply_O(Y,OY):  same operation as O(Y), but the caller supplies the
 * destination OY, so much less temporary memory is needed.
 *
 * For planewaves the overlap is the identity, so Y is just assigned
 * to OY.  The assignment is bracketed by profiling timer 29 when
 * DFT_PROFILING is defined.
 */
void
apply_O(const column_bundle &Y,column_bundle &OY)
{
#ifdef DFT_PROFILING
  timerOn(29);  // O timer: start
#endif // DFT_PROFILING

  /* Identity operator:  plain assignment */
  OY = Y;

#ifdef DFT_PROFILING
  timerOff(29); // O timer: stop
#endif // DFT_PROFILING
}

/*
 * apply_O_inplace(Y):  apply O() to Y in place, i.e. the result is
 * placed back in Y.
 *
 * For plane-waves O is the identity, so there is nothing to do and Y
 * is left untouched.
 */
void
apply_O_inplace(column_bundle &Y)
{
  /* Identity operator:  intentionally empty */
  (void)Y;
}

/*
 * Pbar(C,Y):  returns (I-P)*Y, the complement projector of
 * P = O*C*Cdag, where Cdag is the Hermitian adjoint of C and
 * C = Y*U(-1/2).
 *
 * I.e. the routine returns Pbar*Y = (I - O*C*Cdag)*Y = Y - O(C*(C^Y)).
 * Neither argument is modified.  Profiling timer 30 brackets the work
 * when DFT_PROFILING is defined.
 */
column_bundle
Pbar(const column_bundle &C,const column_bundle &Y)
{
#ifdef DFT_PROFILING
  timerOn(30);  // Pbar timer: start
#endif // DFT_PROFILING

  /* Start from I*Y ... */
  column_bundle PbarY(Y);

  /* ... and remove the projected part O*C*(Cdag*Y) */
  PbarY -= O(C*(C^Y));

#ifdef DFT_PROFILING
  timerOff(30); // Pbar timer: stop
#endif // DFT_PROFILING

  return PbarY;
}

/*
 * apply_Pbar(C,Y,PbarY):  less memory intensive version of Pbar();
 * the result Y - O*C*(C^Y) is built directly in the caller-supplied
 * PbarY.
 *
 * PbarY is written before Y is read for the final addition, so PbarY
 * should be a different object from Y.
 * NOTE(review): the trailing 0 passed to do_column_bundle_matrix_mult()
 * presumably selects "overwrite" rather than "accumulate" -- confirm
 * against its definition.
 */
void
apply_Pbar(const column_bundle &C,
           const column_bundle &Y,
           column_bundle &PbarY)
{
#ifdef DFT_PROFILING
  timerOn(30);  // Pbar timer: start
#endif // DFT_PROFILING

  /* PbarY <- C*(Cdag*Y) */
  do_column_bundle_matrix_mult(C,C^Y,PbarY,0);

  /* PbarY <- O*PbarY */
  apply_O_inplace(PbarY);

  /* PbarY <- Y - PbarY, done as a sign flip followed by an add */
  PbarY.negate();
  PbarY += Y;

#ifdef DFT_PROFILING
  timerOff(30); // Pbar timer: stop
#endif // DFT_PROFILING
}

/*
 * Multiply a vector by the Laplacian.  For planewaves, L is diagonal and
 * is just -|G|^2.
 *
 * v must live on the full FFT box (v.n == basis->NxNyNz) and every box
 * dimension must be even; otherwise die() is called.  The integer
 * G-vector components run over [-N/2, N/2) along each axis, negative
 * components being stored wrapped around at (component + N).  |G|^2 is
 * evaluated with the symmetric metric basis->GGT.
 *
 * Returns a new vector; v is not modified.
 */
vector
L(const vector &v)
{
#ifdef DFT_PROFILING
  timerOn(31);  // L timer: start
#endif // DFT_PROFILING

  if (v.basis == 0)
    die("L(vector) called with v.basis==0\n");
  if (v.n != v.basis->NxNyNz)
    die("L(vector) called with vector.n != FFT box size\n");
  if ((v.basis->Nx%2)!=0 || (v.basis->Ny%2)!=0 || (v.basis->Nz%2)!=0)
    die("L(vector) called with basis->Nx,Ny,or Nz not a multiple of 2\n");

  vector Lv(v);

  Basis *box = v.basis;

  /* FFT box dimensions and their halves */
  const int Nx = box->Nx, halfNx = Nx/2;
  const int Ny = box->Ny, halfNy = Ny/2;
  const int Nz = box->Nz, halfNz = Nz/2;

  /* Independent entries of the symmetric metric G*G^T */
  const real GGTxx = box->GGT.m[0][0];
  const real GGTyy = box->GGT.m[1][1];
  const real GGTzz = box->GGT.m[2][2];
  const real GGTxy = box->GGT.m[0][1];
  const real GGTxz = box->GGT.m[0][2];
  const real GGTyz = box->GGT.m[1][2];

  for (int i = -halfNx; i < halfNx; i++)
    {
      /* Negative components wrap around to the top of the box */
      const int xoffset = Ny*Nz*((i < 0) ? i+Nx : i);

      for (int j = -halfNy; j < halfNy; j++)
        {
          const int yoffset = Nz*((j < 0) ? j+Ny : j);

          for (int k = -halfNz; k < halfNz; k++)
            {
              const int index = xoffset + yoffset + ((k < 0) ? k+Nz : k);
              const real G2 = i*i*GGTxx +
                              j*j*GGTyy +
                              k*k*GGTzz +
                              2*(i*j*GGTxy +
                                 i*k*GGTxz +
                                 j*k*GGTyz);

              Lv.c[index] = -G2*v.c[index];
            }
        }
    }

#ifdef DFT_PROFILING
  timerOff(31); // L timer: stop
#endif // DFT_PROFILING

  return Lv;
}

/*
 * The inverse of the Laplacian on vectors:  this is obviously modulo
 * G=0 where there are infinities.  For planewaves, we have -1/|G|^2.
 *
 * The G=0 entry (slot 0 of the FFT box) of the result is set to zero.
 * The same preconditions as L(vector) apply:  v must live on the full
 * FFT box and every box dimension must be even, otherwise die() is
 * called.
 *
 * Returns a new vector; v is not modified.
 */
vector
invL(const vector &v)
{
#ifdef DFT_PROFILING
  timerOn(32);  // invL timer: start
#endif // DFT_PROFILING

  if (v.basis == 0)
    die("invL(vector) called with v.basis==0\n");
  if (v.n != v.basis->NxNyNz)
    die("invL(vector) called with vector.n != FFT box size\n");
  if ((v.basis->Nx%2)!=0 || (v.basis->Ny%2)!=0 || (v.basis->Nz%2)!=0)
    die("invL(vector) called with basis->Nx,Ny,or Nz not a multiple of 2\n");

  vector invLv(v);

  Basis *box = v.basis;

  /* FFT box dimensions and their halves */
  const int Nx = box->Nx, halfNx = Nx/2;
  const int Ny = box->Ny, halfNy = Ny/2;
  const int Nz = box->Nz, halfNz = Nz/2;

  /* Independent entries of the symmetric metric G*G^T */
  const real GGTxx = box->GGT.m[0][0];
  const real GGTyy = box->GGT.m[1][1];
  const real GGTzz = box->GGT.m[2][2];
  const real GGTxy = box->GGT.m[0][1];
  const real GGTxz = box->GGT.m[0][2];
  const real GGTyz = box->GGT.m[1][2];

  for (int i = -halfNx; i < halfNx; i++)
    {
      /* Negative components wrap around to the top of the box */
      const int xoffset = Ny*Nz*((i < 0) ? i+Nx : i);

      for (int j = -halfNy; j < halfNy; j++)
        {
          const int yoffset = Nz*((j < 0) ? j+Ny : j);

          for (int k = -halfNz; k < halfNz; k++)
            {
              /* G = 0:  1/|G|^2 is infinite, so define the result as 0 */
              if (i == 0 && j == 0 && k == 0)
                {
                  invLv.c[0] = 0.0;
                  continue;
                }

              const int index = xoffset + yoffset + ((k < 0) ? k+Nz : k);
              const real G2 = i*i*GGTxx +
                              j*j*GGTyy +
                              k*k*GGTzz +
                              2*(i*j*GGTxy +
                                 i*k*GGTxz +
                                 j*k*GGTyz);

              invLv.c[index] = -v.c[index]/G2;
            }
        }
    }

#ifdef DFT_PROFILING
  timerOff(32); // invL timer: stop
#endif // DFT_PROFILING

  return invLv;
}

/*
 * Multiply column_bundle by Laplacian.  For planewaves, this means
 * multiply by -|k+G|^2.
 *
 * This is a convenience wrapper:  it validates Y, allocates a result
 * of the same shape (Y.tot_ncols columns of length nbasis), copies
 * Y's bookkeeping data into it with copy_innards_column_bundle(), and
 * lets apply_L() do the actual work.
 */
column_bundle
L(const column_bundle &Y)
{
  if (Y.basis == 0)
    die("L(column_bundle) called with Y.basis == 0\n");
  if (Y.col_length != Y.basis->nbasis)
    die("L(column_bundle) called with Y.col_length != nbasis\n");

  /* Result container, carrying the same innards as Y */
  column_bundle result(Y.tot_ncols,Y.basis->nbasis);
  copy_innards_column_bundle(&Y,&result);

  /* result <- L*Y */
  apply_L(Y,result);

  return result;
}

/*
 * Actually does the multiplication of Y by L (see above).
 */
void
apply_L(const column_bundle &Y,column_bundle &LY)
{
#ifdef DFT_PROFILING
  timerOn(31); // turn on L timer
#endif // DFT_PROFILING

  register int i,j,nbasis;
  real kplusGx,kplusGy,kplusGz,kx,ky,kz;
  real GGTxx,GGTyy,GGTzz,GGTxy,GGTxz,GGTyz;
  register real kplusG2;
  Basis *basis;

  if (Y.basis == 0)
    die("apply_L() called with Y.basis == 0\n");
  if (Y.col_length != Y.basis->nbasis)
    die("apply_L() called with Y.col_length != nbasis\n");
  if ( (Y.col_length!=LY.col_length) || (Y.tot_ncols!=LY.tot_ncols) )
    die("apply_L() called with different sizes for Y and LY\n");

  /* Let's go! */
  nbasis = Y.col_length;
  basis = Y.basis;
  kx = Y.k.v[0];
  ky = Y.k.v[1];
  kz = Y.k.v[2];
  GGTxx = basis->GGT.m[0][0];
  GGTyy = basis->GGT.m[1][1];
  GGTzz = basis->GGT.m[2][2];
  GGTxy = basis->GGT.m[0][1];
  GGTxz = basis->GGT.m[0][2];
  GGTyz = basis->GGT.m[1][2];
  for (j=0; j < nbasis; j++)
    {
      kplusGx = kx + basis->Gx[j];
      kplusGy = ky + basis->Gy[j];
      kplusGz = kz + basis->Gz[j];
      kplusG2 = kplusGx*kplusGx*GGTxx +
	        kplusGy*kplusGy*GGTyy +
	        kplusGz*kplusGz*GGTzz +
	        2*(kplusGx*kplusGy*GGTxy +
	           kplusGx*kplusGz*GGTxz +
	           kplusGy*kplusGz*GGTyz    );
      
      for (i=0; i < Y.my_ncols; i++)
        LY.col[i].c[j] = -kplusG2*Y.col[i].c[j];

    }

#ifdef DFT_PROFILING
  timerOff(31); // turn off L timer
#endif // DFT_PROFILING
}

/*
 * Multiply column_bundle by preconditioner.  In our case, we precondition
 * with the inverse kinetic part of Hsp.  The operation is done in place
 * on the locally held columns of Y.
 *
 * For each column of Y, we multiply the G'th component by
 * f(0.5*|k+G|^2/KErollover) (the kinetic energy) where
 * f(x) satisfies:
 *
 * f(x) = 1 + O(x^N)          for x << 1  (does nothing to low KE parts)
 *      = (1/x)*(1 + O(x^-N)) for x >> 1  (scale by 2*KErollover/|k+G|^2 for
 *                                               high KE components)
 *      = N/(N+1)             for x == 1
 *
 * f(x) = (1+x+x^2+x^3+...+x^(N-1))/(1+x+x^2+...+x^N)=(1-x^N)/(1-x^(N+1))
 * has this property, and it is used below.  Currently, N = 9 below.
 *
 * The function of KErollover is to determine the point where x=1, i.e.
 * the roll-over point for the function f(x).
 *
 * Preconditions (die() is called otherwise):
 *   - Y.basis is set and Y.col_length == nbasis, since the per-basis
 *     arrays Gx/Gy/Gz are indexed up to Y.col_length;
 *   - KErollover > 0, since it is used as a divisor and a non-positive
 *     value makes f(x) meaningless.
 */
void
precond_inv_kinetic(column_bundle &Y,real KErollover)
{
#ifdef DFT_PROFILING
  timerOn(33); // turn on precond timer
#endif // DFT_PROFILING

  if (Y.basis == 0)
    die("precond_inv_kinetic() called with Y.basis == 0\n");
  if (Y.col_length != Y.basis->nbasis)
    die("precond_inv_kinetic() called with Y.col_length != nbasis\n");
  if (!(KErollover > 0.0))
    die("precond_inv_kinetic() called with KErollover <= 0\n");

  /* Let's go! */
  Basis *basis = Y.basis;
  const int nbasis = Y.col_length;
  const int my_ncols = Y.my_ncols;

  /* k-point of this bundle */
  const real kx = Y.k.v[0];
  const real ky = Y.k.v[1];
  const real kz = Y.k.v[2];

  /* Independent entries of the symmetric metric G*G^T */
  const real GGTxx = basis->GGT.m[0][0];
  const real GGTyy = basis->GGT.m[1][1];
  const real GGTzz = basis->GGT.m[2][2];
  const real GGTxy = basis->GGT.m[0][1];
  const real GGTxz = basis->GGT.m[0][2];
  const real GGTyz = basis->GGT.m[1][2];

  for (int j = 0; j < nbasis; j++)
    {
      const real kplusGx = kx + basis->Gx[j];
      const real kplusGy = ky + basis->Gy[j];
      const real kplusGz = kz + basis->Gz[j];
      const real kplusG2 = kplusGx*kplusGx*GGTxx +
                           kplusGy*kplusGy*GGTyy +
                           kplusGz*kplusGz*GGTzz +
                           2*(kplusGx*kplusGy*GGTxy +
                              kplusGx*kplusGz*GGTxz +
                              kplusGy*kplusGz*GGTyz    );

      /* x = kinetic energy in units of the roll-over energy */
      const real x = 0.5*kplusG2/KErollover;

      /* Horner form of 1+x+...+x^8 (numerator, N-1 = 8) ... */
      real f = 1.0+x*(1.0+x*(1.0+x*(1.0+x*(1.0+x*(1.0+x*(1.0+x*(1.0+x)))))));
      /* ... divided by 1+x+...+x^9 = 1 + x*(numerator) */
      f = f/(1.0+x*f);

      /* The factor depends only on j, so apply it to every local column */
      for (int i = 0; i < my_ncols; i++)
        Y.col[i].c[j] *= f;
    }

#ifdef DFT_PROFILING
  timerOff(33); // turn off precond timer
#endif // DFT_PROFILING

}


//
// Switch into sequential code.
//
/*
 * Below we have the routines that multiply a column_bundle by the
 * single-particle Hamiltonian.  The master routine is Hsp() which
 * does three things
 * (1) applies the kinetic operator -0.5*L   (done in serial)
 * (2) applies the local self-consistent potential Vscloc
 *              (done in parallel with threads)
 * (3) applies the non-local potential (uses * and ^ operators on
 *              column_bundles, which operators
 *              should be parallel or optimized; and also fills in
 *              Vnl (the non-local pot. matrix elements) in parallel).
 *
 * See Hsp() below for more details.
 *
 */

/*
 * Multiply column_bundle by single-particle Hamiltonian:
 *
 * Hsp = -0.5*L + Idag*Diag(Vscloc)*I
 *              + sum(ions,l,m,...) { Vnl*M*Vnl^ }
 *
 * i.e. Kinetic operator, local self-consistent potential, and non-local
 * potential respectively.
 *
 * This is a convenience wrapper:  it validates its arguments, allocates
 * the result (Y.tot_ncols columns of length nbasis) and calls
 * apply_Hsp() to do the work.
 *
 * Preconditions (die() is called otherwise):
 *   - ioninfo, Y.basis and Y.Vscloc are non-null;
 *   - Y.col_length == nbasis;
 *   - Y.Vscloc->n equals the FFT box size NxNyNz.
 */
column_bundle
Hsp(const column_bundle &Y,Ioninfo *ioninfo)
{
  if (ioninfo == 0)
    die("Hsp(column_bundle) called with ioninfo == 0\n");
  if (Y.basis == 0)
    die("Hsp(column_bundle) called with Y.basis == 0\n");
  if (Y.col_length != Y.basis->nbasis)
    die("Hsp(column_bundle) called with Y.col_length != nbasis\n");
  /* Vscloc is dereferenced just below, so make sure it is set */
  if (Y.Vscloc == 0)
    die("Hsp(column_bundle) called with Y.Vscloc == 0\n");
  if (Y.Vscloc->n != Y.basis->NxNyNz)
    die("Hsp(column_bundle) called with Y.Vscloc->n != FFT box size\n");

  /* Holds final result */
  column_bundle HspY(Y.tot_ncols,Y.basis->nbasis);

  /* Do the work!! */
  apply_Hsp(Y,ioninfo,HspY);

  return HspY;
}

/*
 * Actually does the work of apply Hsp onto Y.
 */
void
apply_Hsp(const column_bundle &Y,
	  Ioninfo *ioninfo,
	  column_bundle &HspY)
{
#ifdef DFT_PROFILING
  timerOn(26);   // Turn on apply_Hsp timer
#endif // DFT_PROFILING

  register int i;
  int sp,lm;
  Basis *basis;

  if (Y.basis == 0)
    die("apply_Hsp() called with Y.basis == 0\n");
  basis = Y.basis;
  if (Y.col_length != basis->nbasis)
    die("apply_Hsp() called with Y.col_length != nbasis\n");
  if (Y.Vscloc->n != basis->NxNyNz)
    die("apply_Hsp() called with Y.basis->Vscloc->n != FFT box size\n");
  if ((Y.col_length != HspY.col_length) || (Y.tot_ncols != HspY.tot_ncols))
    die("apply_Hsp() called with sizes of Y and Hsp not being equla\n");

  /* Kinetic part: -0.5*L*Y */
  /*does   HspY = -0.5*L(Y); */
  apply_L(Y,HspY);
  HspY *= (scalar)(-0.5);


  /* Local part of self-consistent potential: Idag*Diag(Vsc)*I*Y */
  //
  // HspY(:,i) += Idag*Diag(Vscloc)*I*Y(:,i)   (i'th column of both sides)
  //
  for (i = 0; i < Y.my_ncols; i++) 
    {
      column_bundle Ycol(1,Y.basis->nbasis,"local"); /* Holds one column of Y */
      column_bundle IYcol(1,Y.basis->NxNyNz,"local"); /* Holds I of Ycol */
      copy_innards_column_bundle(&Y,&Ycol);
      copy_innards_column_bundle(&Y,&IYcol);

      /* Copy i'th column of Y into Ycol */
      register scalar *cin = &(Y.col[i].c[0]);
      register scalar *cout = &(Ycol.col[0].c[0]);
      register int j;
      register int nbasis = Y.basis->nbasis;
      register int NxNyNz = Y.basis->NxNyNz;

      for (j=0; j < nbasis; j++)
	cout[j] = cin[j];

      /* Apply I to Ycol */
      apply_I(Ycol,IYcol);

      /* Multiply by Vscloc */
      register scalar *vscloc = &(Y.Vscloc->c[0]);
      register scalar *iycol = &(IYcol.col[0].c[0]);

      for (j=0; j < NxNyNz; j++)
#if defined SCALAR_IS_COMPLEX
	{
	  register scalar IYj = iycol[j];
	  register scalar Vscj = vscloc[j];
	  
	  iycol[j].x = IYj.x*Vscj.x - IYj.y*Vscj.y;
	  iycol[j].y = IYj.x*Vscj.y + IYj.y*Vscj.x;
	  /* The above block does:   */
	  /* 	  IYcol.col[0].c[j] *= Y.Vscloc->c[j]; */
	}
#elif defined SCALAR_IS_REAL
        {
          iycol[j] *= vscloc[j];
	}
#else
#error scalar is neither complex nor real
#endif
	/* Apply Idag in place:  i.e. it destroys the contents of IYcol
	 * in the process...but we don't care! */
	apply_Idag_inplace(IYcol,Ycol);

	/* Accumulate into the i'th column of HspY */
	cin = &(Ycol.col[0].c[0]);
	cout = &(HspY.col[i].c[0]);
	
	for (j=0; j < nbasis; j++)
	  cout[j] += cin[j];
    }


  /* Nonlocal part of potential: sum(ions,l,m,...) { Vnl*Mnl*Vnl^Y } */
  for (sp=0; sp < ioninfo->nspecies; sp++)
    for (lm=0; lm < ioninfo->species[sp].nlm; lm++)
      {
	if (ioninfo->species[sp].ngamma[lm] > 1)
	  {
	    dft_log(DFT_SILENCE,
		    "\nMultiple-projectors:  running slow Hsp()!\n");

	    /* this is the slow way where we go one atom at a time...
	     * the smarter way would be to somehow make a new class
	     * which is a block-diagonal matrix class (a string of matrix
	     * classes on the diagonal of a bigger one), where each
	     * diagonal is just Mnl below, and to define an
	     * block_diag_matrix*matrix (returning matrix) operator.
	     * Then we can do what we do with the Kleinman-Bylander
	     * below with minimal changes. */

	    column_bundle Vnl(ioninfo->species[sp].ngamma[lm],
			      basis->nbasis,"local");
	    matrix &Mnl = ioninfo->species[sp].M[lm]; /* reference */

	    copy_innards_column_bundle(&Y,&Vnl);

	    for (i=0; i < ioninfo->species[sp].natoms; i++)
	      {
		Vnl_pseudo(sp,i,lm,Y.k,Y.basis,ioninfo,Vnl);
		HspY += Vnl*(Mnl*(Vnl^Y));
	      }
	  }
	/* Kleinman-Bylander:  bunch up all local potentials for
	 * the atoms of this species and state into a big column_bundle
	 * and work on them instead (should be faster due to ^ and *
	 * operators being block-multiplies, etc.) */
	else
	  {
	    // Vnl is created as distributed column_bundle.
	    // the dimension that's distributed is ioninfo->species[sp].natoms
	    column_bundle Vnl(ioninfo->species[sp].natoms,basis->nbasis);
	    column_bundle Vnloneatom(1,basis->nbasis,"local");
	    copy_innards_column_bundle(&Y,&Vnl);

 	    for (i= 0; i < Vnl.my_ncols; i++)
 	      {
 		register int j;
 		Vnl_pseudo(sp,i+Vnl.start_ncol,
			   lm,Y.k,basis,ioninfo,Vnloneatom);
 		for (j=0; j < basis->nbasis; j++)
 		  Vnl.col[i].c[j] = Vnloneatom.col[0].c[j];
 	      }


	    /* Now use Vnl! */
/* we want to do:   HspY += Vnl*(Mnl*(Vnl^Y)); */


	    scalar Mnl = ioninfo->species[sp].M[lm](0,0);
	    do_column_bundle_matrix_mult(Vnl,Mnl*(Vnl^Y),HspY,1);
	  }
      }

#ifdef DFT_PROFILING
  timerOff(26);   // Turn off apply_Hsp timer
#endif // DFT_PROFILING
}
