/****************************************************************************
 *
 * DFT++:  density functional package developed by
 *         the research group of Prof. Tomas Arias, MIT.
 *
 * Principal author: Sohrab Ismail-Beigi
 *
 * Modifications for MPI version: Kenneth P Esler,
 *                                Sohrab Ismail-Beigi, and
 *                                Tairan Wang.
 *
 * Modifications for LSD version: Jason A Cline
 *
 * Modifications for lattice/Pulay forces: Gabor Csanyi and
 *                                         Sohrab Ismail-Beigi
 *
 * Copyright (C) 1996-1998 The Massachusetts Institute of Technology (MIT).
 *
 ****************************************************************************/

/************************ Gabor Csanyi **************************
 * This routine calculates forces on the lattice vectors         
 *                                                               
 * the directions are given by the 6 independent components      
 * of the strain tensor                                          
 *                                                               
 * so Fab =(Energy((I+eps Tab) R )-Energy((I-eps Tab)) R))/2eps  
 * lim eps -> 0                                                  
 *                                                               
 * where Txx = 1 0 0    Txy = 0 1 0   etc.                       
 *             0 0 0          1 0 0                              
 *             0 0 0          0 0 0                              
 *                                                               
 ****************************************************************
 */

/* $Id: calclatforces.c,v 1.1.1.1 1999/11/10 01:30:17 tairan Exp $ */

#include <stdio.h>
#include "header.h"


/* this is a piece of old code, I dont remember where this magic number came from,
 * we need to look into this.
 * #define dEdN (-0.00065028473)
 * 
 */

vector dVps(const vector &Vps, const matrix3 &GGbarT);
vector dinvL(const vector &v, const matrix3 &GGbarT);
column_bundle dL(const column_bundle &Y, const matrix3 &GGbarT);
real dEwald(Ioninfo *ions,matrix3 R, matrix3 Rbar, matrix3 Gbar);
real erfcprime(real x);
column_bundle Vnlprime_pseudo(int sp,int ion,int lm, vector3 kvec, Basis *basis, Ioninfo *ions, matrix3 GbarT);
void stress_lat(Basis *basis, const matrix3 &T);



void
calc_latforces(
	       Elecinfo *elecinfo,
	       Elecvars *elecvars,
	       Ioninfo *ions,
	       Energies *ener,
	       Lattice_forces *force)
{
  matrix3 Gbar, Rbar, Rinv, R, G;
  // matrix3 I3(1., 0., 0., 0., 1., 0., 0., 0., 1.);
  matrix3 O3(0., 0., 0., 0., 0., 0., 0., 0., 0.);
  real Omega, dinvOmega, dOmega;
  int k, sp, lm, i, a, b;
  real FKE, FH, Fps, Fxc, Fcore, Fewald, Fnl, Fpulay;
  matrix3 T;

  dft_log("\n--------------------------------------------------\n");
  dft_log("  Calculating lattice forces...\n\n");

  /* copy data from arguments into local vars */
  real EH = ener->EH;
  real Eps = ener->Eloc;
  real Ecore = ener->Ecore;
  real Enl = ener->Enl;
  real *w; w = elecinfo->w;
  int nkpts = elecinfo->nkpts;
  vector n_tot = (elecvars->n);
  diag_matrix *F; F = elecinfo->F;
  column_bundle *C; C = elecvars->C;
  Basis *basis; basis = C->basis;


  G = basis->G;
  R = basis->latvec;
  Rinv = inv3(R);

  Omega = basis->unit_cell_volume;
  vector Vpsprime(basis->NxNyNz,basis); 
  Vlocprime_pseudo(basis,ions,Vpsprime.c);


  /******/
  /* Set the Transformation Matrix to calculate the various matrix elements in turn */
  /******/
  for(a = 0; a < 3; a++)
    for(b = a; b < 3; b++){
      dft_log( "\n------ %d,%d component of lattice stress...\n", a, b);
      T = O3;
      T.m[a][b] = T.m[b][a] = 1.;

      /* R has the lattice vectors in its columns */
      Rbar = T*R;
      Gbar = -2*M_PI*(Rinv*Rbar*Rinv);


      /* d(1/Omega) = -1/Omega * Trace( Rinv Rbar) */
      dinvOmega = -trace3(Rinv*Rbar)/Omega;
      dOmega = trace3(Rinv*Rbar) * Omega;

      dft_log( "dOmega/dR\t= %15.7le\n", dOmega);
      /******/
      /* Now calculate the forces */
      /******/

      Fpulay = -sqrt(2.0)/(3.0*pow(M_PI,2.0))*pow((elecinfo->Ecut), 1.5)*dEdN*dOmega;
      dft_log( "Pulay force\t= %15.7le\n", Fpulay);
      FKE = 0.;
      for (k=0; k < nkpts; k++)
	FKE += 0.5*REAL(sum_vector(diaginner(F[k],C[k],dL(C[k], G*(~Gbar)) )))*w[k];
      dft_log( "KE force\t= %15.7le\n", FKE);
      FH = -(EH*Omega*dinvOmega + 0.5*REAL(n_tot^Jdag(Obar( (scalar) (-4*M_PI)* dinvL(Obar(J(n_tot)), G*(~Gbar))))));
      dft_log( "EH force\t= %15.7le\n", FH);
      Fxc = -REAL(n_tot^(pointwise_mult(Jdag(O(J(excprime(n_tot)))), n_tot)))*dinvOmega*Omega;
      // Include option for GGA:
      if (elecvars->ex_opt == DFT_EXCORR_GGA)
	Fxc -= REAL(n_tot^(Jdag(O(J(exGCprime(n_tot)-exGC(n_tot)))))) * dinvOmega*Omega;
      dft_log( "XC force\t= %15.7le\n", Fxc);
      Fcore = -Ecore*Omega*dinvOmega;
      dft_log( "Core force\t= %15.7le\n", Fcore);
      Fps = -(Eps*Omega*dinvOmega+REAL(n_tot^Jdag(dVps(Vpsprime, G*(~Gbar))))); 
      dft_log( "ps force\t= %15.7le\n", Fps); 
      Fewald = -dEwald(ions, R, Rbar, Gbar);
      dft_log( "Ewald force\t= %15.7le\n", Fewald);
      
      /* Now do nonlocal */
      
      Fnl = -Enl*Omega*dinvOmega; /* the volume dependent part */
      for (sp=0; sp < ions->nspecies; sp++)
	for (lm=0; lm < ions->species[sp].nlm; lm++){
	  column_bundle Vnlprime(ions->species[sp].ngamma[lm],basis->nbasis,"local");
	  column_bundle Vnl(ions->species[sp].ngamma[lm],basis->nbasis,"local");
	  
	  matrix VdagC(ions->species[sp].ngamma[lm],elecinfo->nbands);
	  matrix VprimedagC(ions->species[sp].ngamma[lm],elecinfo->nbands);
	  matrix &Mnl = ions->species[sp].M[lm]; /* reference */
	  copy_innards_column_bundle(&(C[0]),&Vnl);
	  copy_innards_column_bundle(&(C[0]),&Vnlprime);
	  
	  for (i=0; i < ions->species[sp].natoms; i++)
	    for (k=0; k < elecinfo->nkpts; k++){
	      Vnl_pseudo(sp,i,lm,elecinfo->kvec[k],basis,ions, Vnl);
	      Vnlprime = Vnlprime_pseudo(sp,i,lm,elecinfo->kvec[k],basis,ions, ~Gbar);
	      VdagC = Vnl^C[k];
	      VprimedagC = Vnlprime^C[k];
	      Fnl -= REAL( w[k]*trace(Mnl*(VprimedagC*F[k]*herm_adjoint(VdagC)+VdagC*F[k]*herm_adjoint(VprimedagC))  ) );
	    }
	}
      dft_log( "Nonlocal force\t= %15.7le\n", Fnl);
      
      /*******/
      /* Store the result in the force matrix */
      /*******/
      force->Fpulay.m[a][b] = force->Fpulay.m[b][a] = Fpulay; 
      force->FKE.m[a][b] = force->FKE.m[b][a] = FKE;
      force->FH.m[a][b] = force->FH.m[b][a] = FH;
      force->Fps.m[a][b] = force->Fps.m[b][a] = Fps;
      force->Fxc.m[a][b] = force->Fxc.m[b][a] = Fxc;
      force->Fcore.m[a][b] = force->Fcore.m[b][a] = Fcore;
      force->Fewald.m[a][b] = force->Fewald.m[b][a] = Fewald;
      force->Fnl.m[a][b] = force->Fnl.m[b][a] = Fnl;
    }
    

  force->Ftot =
    force->Fpulay+
    force->FKE+
    force->Fps+
    force->Fxc+
    force->Fcore+
    force->FH+
    force->Fewald+
    force->Fnl;

}

/* gradient of the locel pseudo-potential */
/* plane wave basis */
vector
dVps(const vector &Vpsprime, const matrix3 &GGbarT)
{
  int Nx,Ny,Nz,Nx2,Ny2,Nz2,NyNz;
  register int i,j,k,index;
  real dg;
  vector3 x;

  vector dVps(Vpsprime);
  matrix3 GGT=Vpsprime.basis->GGT;

  Nx = Vpsprime.basis->Nx; Nx2 = Nx/2;
  Ny = Vpsprime.basis->Ny; Ny2 = Ny/2;
  Nz = Vpsprime.basis->Nz; Nz2 = Nz/2;
  NyNz = Ny*Nz;
  for (i=-Nx2; i < Nx2; i++)
    for (j=-Ny2; j < Ny2; j++)
      for (k=-Nz2; k < Nz2; k++){
	/* G = 0 case */
	if (i==0 && j==0 && k==0)
	  dVps.c[0] = 0.0;
	/* G != 0 */
	else{
	  index = 0;
	  if (k < 0) index += k+Nz;        else index += k;
	  if (j < 0) index += Nz*(j+Ny);   else index += Nz*j;
	  if (i < 0) index += NyNz*(i+Nx); else index += NyNz*i;
	  x.v[0] = i; x.v[1] = j; x.v[2] = k;
	  dg = (x*(GGbarT*x))/sqrt(x*(GGT*x));
	  dVps.c[index] = Vpsprime.c[index]*dg;
	}
      }
  
  return dVps;

}

/* gradient of the inverse Laplacian */
/* plane wave basis */
vector
dinvL(const vector &v, const matrix3 &GGbarT)
{
  int Nx,Ny,Nz,Nx2,Ny2,Nz2,NyNz;
  register int i,j,k,index;
  real G2, dG2;
  vector3 x;

  if (v.basis == 0)
    die("dinvL(vector) called with v.basis==0\n");
  if (v.n != v.basis->NxNyNz)
    die("dinvL(vector) called with vector.n != FFT box size\n");
  if ((v.basis->Nx%2)!=0 || (v.basis->Ny%2)!=0 || (v.basis->Nz%2)!=0)
    die("dinvL(vector) called with basis->Nx,Ny,or Nz not a multiple of 2\n");

  vector dinvLv(v);
  matrix3 GGT = v.basis->GGT;

  Nx = v.basis->Nx; Nx2 = Nx/2;
  Ny = v.basis->Ny; Ny2 = Ny/2;
  Nz = v.basis->Nz; Nz2 = Nz/2;
  NyNz = Ny*Nz;
  for (i=-Nx2; i < Nx2; i++)
    for (j=-Ny2; j < Ny2; j++)
      for (k=-Nz2; k < Nz2; k++){
	/* G = 0 case */
	if (i==0 && j==0 && k==0)
	  dinvLv.c[0] = 0.0;
	/* G != 0 */
	else{
	  index = 0;
	  if (k < 0) index += k+Nz;        else index += k;
	  if (j < 0) index += Nz*(j+Ny);   else index += Nz*j;
	  if (i < 0) index += NyNz*(i+Nx); else index += NyNz*i;
	  x.v[0] = i; x.v[1] = j; x.v[2] = k;
	  G2 = x*(GGT*x);
	  dG2 = 2.0*x*(GGbarT*x);
	  dinvLv.c[index] = v.c[index]*dG2/(G2*G2);
	}
      }
  
  return dinvLv;
}


/* gradient of the Laplacian in the direction specified by Gbar */
/* plane wave basis */
column_bundle
dL(const column_bundle &Y, const matrix3 &GGbarT)
{
  register int i,j,nbasis;
  real kx,ky,kz;
  real dGk2;
  vector3 x;

  if (Y.basis == 0)
    die("dL(column_bundle, matrix3) called with Y.basis == 0\n");
  if (Y.col_length != Y.basis->nbasis)
    die("dL(column_bundle, matrix3) called with Y.col_length != nbasis\n");


  /* Set up dLY and copy all the required stuff in Y */
  column_bundle dLY(Y.tot_ncols,Y.basis->nbasis);

  copy_innards_column_bundle(&Y,&dLY);

  nbasis = Y.col_length;
  kx = Y.k.v[0];
  ky = Y.k.v[1];
  kz = Y.k.v[2];
  for (j=0; j < nbasis; j++){
    x.v[0] = kx + Y.basis->Gx[j]; 
    x.v[1] = ky + Y.basis->Gy[j]; 
    x.v[2] = kz + Y.basis->Gz[j];
    dGk2 = 2.0*x*(GGbarT*x);

    for (i=0; i < Y.my_ncols; i++)
      dLY.col[i].c[j] = -dGk2*Y.col[i].c[j];
    // Hide ncols
    //dLY.mult_add( Y, -dGk2, j );
  }
  return dLY;
}

/* gradient of the Ewald term */
real 
dEwald(Ioninfo *ions,matrix3 R, matrix3 Rbar, matrix3 Gbar)
{
  /* Constants */
  real const pi = M_PI,
    twopi = 2.0*pi,
    fourpi = 4.0*pi;

  int const Nlat_start = 5,
    Nlat_end   = 5;


  real *Z;
  vector3 *atpos, c;
  int natoms;
  matrix3 RTR,G,GGT, GGbarT, RTRbar, Rinv;
  vector3 x;
  real vol,G2,r, dr,dtemp,sigma,eta, dinvVol, tmp2, temp;
  real angle,SG[2];
  real dEreal,dErecip,dEtot;
  int sp,i,j,k,l,tau,taup,cell[3],Nlat;

  /* Take out all the ions from the Ioninfo structure and put them into
   * a long list of Z[] and atpos[] values */
  natoms = 0;
  for (sp=0; sp < ions->nspecies; sp++)
    natoms += ions->species[sp].natoms;
  Z = (real *)mymalloc(sizeof(real)*natoms,"Z","Ewald()");
  atpos = (vector3 *)mymalloc(sizeof(vector3)*natoms,"atpos","Ewald()");
  k = 0;
  for (sp=0; sp < ions->nspecies; sp++)
    for (j=0; j < ions->species[sp].natoms; j++)
      {
	Z[k] = ions->species[sp].Z;
	atpos[k] = ions->species[sp].atpos[j];
	k++;
      }

  /* Unit cell volume */
  vol = det3(R);

  /* Calcuate matrix of dot-products of lattice vectors */
  RTR = (~R)*R;
 
  /* Calculate G=2*pi*inv(R): recip. lattice vectors in rows of G */
  G = twopi*inv3(R);

  /* Calculate dot products of recip. lattice vectors */
  GGT = G*(~G);

  GGbarT =  G*(~Gbar);
  RTRbar =  (~R)*Rbar;
  Rinv = inv3(R);
  dinvVol = -trace3(Rinv*Rbar)/vol;

  dft_log("\n-- Ewald() --\n");
  dft_log("latvec = \n");
  R.print(dft_global_log, "%lg ");
  dft_log("natoms = %d\n",natoms);

  dft_log(DFT_ANAL_LOG"Z = [ ");
  for (i=0; i < natoms; i++)
    dft_log(DFT_ANAL_LOG, "%lg ",Z[i]);
  dft_log(DFT_ANAL_LOG"]\natpos=\n");
  for (i=0; i < natoms; i++)
    atpos[i].print(DFT_ANAL_LOG"%lg ",logfile,1);


  /* set scale of width of gaussian to roughly the the size of a unit cell */
  /*   sigma = 0.4*pow(vol/(real)natoms,1.0/3.0); */
  sigma = 2.0;
  eta = 1.0/(sqrt(2.0)*sigma);
  dft_log("Using sigma = %lg   eta = %lg for gaussian\n\n",
	    sigma,eta);

  /* Real-space part of gradient */
  for (Nlat = Nlat_start; Nlat <= Nlat_end; Nlat++)
    {
      dEreal = (real)0.0;
      for (tau=0; tau < natoms; tau++)
	for (taup=0; taup < natoms; taup++)
	  {
	    dft_log(-2,"\nReal space potential for tau=%d taup=%d\n",
		      tau,taup);
	    /* The constant part of the gradient from "renomalization" */
	    dtemp = -0.5*Z[tau]*Z[taup]*pi/(eta*eta)*dinvVol;
	    dEreal += dtemp;
	    dft_log(-1,"Constant part = %.10le\n",dtemp);
	    /* loop over cells */
	    tmp2 = dEreal;
	    for (cell[0]=-Nlat; cell[0]<=Nlat; cell[0]++)
	      for (cell[1]=-Nlat; cell[1]<=Nlat; cell[1]++)
		for (cell[2]=-Nlat; cell[2]<=Nlat; cell[2]++)
		  /* Only exclude the cell==0 and tau==tau' term */
		  if (tau!=taup || cell[0]!=0 || cell[1]!=0 || cell[2]!=0)
		    {
		      /* Find the distance |cell+tau'-tau| between atom at tau
		       * and the other atom being considered.
		       * x is this cell+tau'-tau vector in lattice coords;
		       * r is its actual length in real distance units*/
		      for (l=0; l < 3; l++)
			x.v[l] = cell[l] + atpos[taup].v[l] - atpos[tau].v[l];
		      r = sqrt(x*(RTR*x));
		      dr = (x*(RTRbar*x))/r;
		      dtemp = 0.5*Z[tau]*Z[taup]*(erfcprime(eta*r)*eta-erfc(eta*r)/r)*dr/r;
		      dEreal += dtemp;
		      dft_log(-2,"tau=%d\ttaup=%d\tcell=[%d %d %d]\tdr =%15.10f\te = %.16le\n",
				tau, taup, cell[0],cell[1],cell[2],dr,dtemp);
		    }
	  } /* end of tau' loop */
      dft_log("Nlat = %2d  Real-space gradient = %25.15le\n",
		Nlat,dEreal);
    } /* of Nlat loop */

  /*
   * the code below has been modified to calculate the gradient  -CsG
   *
   * Reciprocal space contribution:
   * Erecip = 0.5*sum_{G!=0}
   *            {4*pi*exp(-|G|^2/(4*eta^2))/(vol*|G|^2)*|S(G)|^2}
   * where S(G) = sum_{tau} { Z[tau]*exp(-i*G*r_tau) }
   *
   * r_tau = R*tau (R is matrix, tau is 3-vector) and
   *     G = cell*G (second G is matrix, cell is row-vector of integers)
   * so G*r_tau = 2*pi*cell*tau.
   */
  for (Nlat = Nlat_start; Nlat <= Nlat_end; Nlat++)
    {
      dErecip = (real)0.0;
      for (cell[0]=-Nlat; cell[0]<=Nlat; cell[0]++)
	for (cell[1]=-Nlat; cell[1]<=Nlat; cell[1]++)
	  for (cell[2]=-Nlat; cell[2]<=Nlat; cell[2]++)
	    /* Skip G=0 */
	    if (cell[0]!=0 || cell[1]!=0 || cell[2]!=0)
	      {
		/* Calculate structure factor */
		SG[0] = SG[1] = 0.0;
		c.v[0] = cell[0]; c.v[1] = cell[1]; c.v[2] = cell[2];
		for (tau=0; tau < natoms; tau++)
		  {
		    angle = -twopi*(c*atpos[tau]);
		    SG[0] += Z[tau]*cos(angle);
		    SG[1] += Z[tau]*sin(angle);
		  }
		/* Calculate |G|^2 */
		G2 = c*(GGT*c);
		/* The gradient for G */
		tmp2 = exp(-G2/(4.0*eta*eta))/G2;
		temp = 0.5*fourpi*tmp2/vol*(SG[0]*SG[0]+SG[1]*SG[1]);	       
		dtemp = temp*(vol*dinvVol-1/G2*2.0*(c*(GGbarT*c))-1/(4.0*eta*eta)*2.0*(c*(GGbarT*c)));
		dErecip += dtemp;
		dft_log(-2,"G=[%d %d %d] G2 =%lg  e = %le\n",
			  cell[0],cell[1],cell[2],G2,dtemp);
	      }
      dft_log("Nlat = %2d  Reciprocal space gradient = %25.15le\n",
		Nlat,dErecip);
    } /* of Nlat loop */
  dEtot = dEreal + dErecip;
  dft_log("\nEwald gradient = %25.15le\n\n",dEtot);

  myfree(Z);
  myfree(atpos);
  return dEtot;
}


real
erfcprime(real x)
{
  return(-2.0/sqrt(M_PI)*exp(-x*x));
}


/*
 * Code by Sohrab Ismail-Beigi,  Mar. 1997
 * 
 * Non-local pseudopotentials, with multiple-projector extension of
 * basic Kleinman-Bylander form.
 *
 * Adapted to calculate forces on the lattice vectors by Gabor Csanyi
 * April 1997
 *
 */

/*
 * the original commets for the energy calculation follow 
 * 
 * note : the volume dependece is taken care of elsewhere
 */

/*
 * Calculates the matrix elements of the non-local pseudopotential
 * (multiple-projector form) for the ion'th ion of species sp
 * for atomic state lm at k-vector kvec and returs the result
 * as a column bundle of ions->species[sp].ngamma[lm] columns of
 * length basis->nbasis.  ngamma is the number of multiple projectors
 * for the species sp and state lm.
 *
 * Letting q = k + G, the matrix we are computing is Vnl with entries:
 * 
 * Vnl(q,gamma) = <q|V_gamma*phi_gamma>
 *
 * where |q> is a plane-wave state with <r|q> = exp(i*q*r)/sqrt(Vol)
 * and V_gamma is the non-local potential for the gamma'th state
 * and phi_gamma is the atomic wave-function for the gamma'th state.
 *
 * Now, V_gamma*phi_gamma(r) = V_l(|r|)*phi_l(|r|)*Ylm(rhat)
 *                                           (hat == unit vec.)
 *
 * and using the relation:
 *
 *   exp(iq.r) = sum_{l,m} { 4pi(i^l)j_l(|q|*|r|)Ylm*(qhat)Ylm(rhat) } 
 *
 * We arrivate at,
 *
 *   Vnl(q,gamma) = Y_lm(qhat)*(4*pi)*f_l(|q|)/sqrt(Vol)
 *
 * where
 *
 *   f_l(q) = integral{0,inf} { dr*r^2*V_l(r)*phi_l(r)*(i^l*j_l(r))* }
 *
 * We get f_l(q) from the internal tables stored in ions via interpolation.
 *
 */
column_bundle
Vnlprime_pseudo(int sp,int ion,int lm,
		vector3 kvec,
		Basis *basis,
		Ioninfo *ions,
		matrix3 GbarT)
{
  complex S,Ylm, Ylmprime;
  real dot,posx,posy,posz,kx,ky,kz,kplusGx,kplusGy,kplusGz;
  real qx,qy,qz,q,invq,qhatx,qhaty,qhatz, dq=0.0, dqhatx, dqhaty, dqhatz;
  real flq=0.0, flqprime=0.0, r,invdq=0.0;
  real GGTxx,GGTyy,GGTzz,GGTxy,GGTxz,GGTyz;
  real Vol,invsqrtVol;
  register real ym1,y0,y1,y2,a,b,c,x;
  int ngamma,gamma,n,j=0,nmax=0,l,m;
  matrix3 G;
  static int sampling_too_close = 0;
  vector3 myx, dqvec;

#define N_COMPLAIN_TOO_CLOSE 103

  /* pi et al. */
  const real pi = M_PI;
  const real twopi = (real)2.0*pi;
  const real fourpi = (real)4.0*pi;
  const real sixth = (real)1.0/(real)6.0;
  /* Ylm normalization constants */
  const real N00  =  sqrt(1.0/(4.0*pi));
  const real N10  =  sqrt(3.0/(4.0*pi));
  const real N11  = -sqrt(3.0/(8.0*pi));
  const real N1m1 =  sqrt(3.0/(8.0*pi));
  const real N20  =  sqrt(5.0/(16.0*pi));
  const real N21  = -sqrt(15.0/(8.0*pi));
  const real N2m1 =  sqrt(15.0/(8.0*pi));
  const real N22  =  sqrt(15.0/(32.0*pi));
  const real N2m2 =  sqrt(15.0/(32.0*pi));

  Speciesinfo *sp_info = &(ions->species[sp]);

  /* Vnlprime will contain the final result */
  ngamma = sp_info->ngamma[lm];
  column_bundle Vnlprime(ngamma,basis->nbasis,basis,"local");
  Vnlprime.zero_out();

  /* Get ionic position, k-vector, cell volume, G, and GGT matrix entries */
  posx = sp_info->atpos[ion].v[0];
  posy = sp_info->atpos[ion].v[1];
  posz = sp_info->atpos[ion].v[2];
  kx = kvec.v[0];
  ky = kvec.v[1];
  kz = kvec.v[2];
  G = basis->G;
  GGTxx = basis->GGT.m[0][0];
  GGTyy = basis->GGT.m[1][1];
  GGTzz = basis->GGT.m[2][2];
  GGTxy = basis->GGT.m[0][1];
  GGTxz = basis->GGT.m[0][2];
  GGTyz = basis->GGT.m[1][2];
  Vol = basis->unit_cell_volume;
  invsqrtVol = (real)1.0/sqrt(Vol);

  /* Get l,m values for the potential */
  l = sp_info->l[lm];
  m = sp_info->m[lm];
  if (l < 0 || l > 2)
    die("l != {0,1,2} nonlocal pseudopotentials not supported yet!\n");
  if (m < -l || m > l)
    die("Nonlocal pseudopotential with |m| > l !?!?\n");


  /* Calculate the non-local pseudopotential matrix entries */
  for (gamma = 0; gamma < ngamma; gamma++){

    invdq = (real)1.0/sp_info->dq_nl[lm][gamma];
    nmax = sp_info->ngrid_nl[lm][gamma]-3;

    for (n = 0; n < basis->nbasis; n++) {
      /* q = k+G:  here q is in reciprocal lattice coordinates */
      qx = kplusGx = kx + basis->Gx[n];
      qy = kplusGy = ky + basis->Gy[n];
      qz = kplusGz = kz + basis->Gz[n];
      /* S = exp(-i*dot(q,pos)) */
      dot = -twopi*(posx*qx + posy*qy + posz*qz);
      S.x = cos(dot);
      S.y = sin(dot);

      /* Interpolate flq using internal tables */
      q = sqrt( qx*qx*GGTxx + qy*qy*GGTyy + qz*qz*GGTzz +
		2.0*(qx*qy*GGTxy + qx*qz*GGTxz + qy*qz*GGTyz) );

      r = q*invdq;
      j = (int)r;
      if (j > nmax) {
	dft_log("G=[%d %d %d] q=[%g %g %g] k=[%g %g %g]\n",
		basis->Gx[n],basis->Gy[n],basis->Gz[n],
		qx,qy,qz,kx,ky,kz);
	die("Nonlocal pseudopotential grid is too small!\n");
      }
      /* If j < 1, then it had best be the q=0 point! Otherwise,
       * complain periodically and do a linear interpolation. */
      if (j < 1){
	if (q < (real)1.0e-12){
	  flq = sp_info->flq[lm][gamma][0];
	  flqprime = 0.0;
	}
	else{
	  if (sampling_too_close%N_COMPLAIN_TOO_CLOSE==0){
	    dft_log("\n");
	    dft_log("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@");
	    dft_log("\n");
	    dft_log("@@@@@@@@@@@@@ WARNING @@@@@@@@@@@@@@");
	    dft_log("\nSampling nonlocal pseudopot. too\n");
	    dft_log("close to q=0 for species=%d, lm=%d,\n",
		    sp,lm);
	    dft_log("G=[%d %d %d] k=[%g %g %g].\n",
		    basis->Gx[n],basis->Gy[n],basis->Gz[n],kx,ky,kz);
	    dft_log("Doing linear interpolation.\n");
	    dft_log("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@");
	    dft_log("\n\n");
	  }
	  sampling_too_close++;
	  x = r;
	  y0 = sp_info->flq[lm][gamma][0];
	  y1 = sp_info->flq[lm][gamma][1];
	  flq = y0 + x*(y1-y0);
	  flqprime = (y1-y0)*invdq;
	}
      }
    
      else{
	/* Cubic interpolation with j-1, j, j+1, and j+2 points */
	ym1 = sp_info->flq[lm][gamma][j-1];
	y0  = sp_info->flq[lm][gamma][j  ];
	y1  = sp_info->flq[lm][gamma][j+1];
	y2  = sp_info->flq[lm][gamma][j+2];
	a = sixth*(y2-ym1+3.0*(y0-y1));
	b = 0.5*(y1+ym1-y0-y0);
	c = y1-sixth*(y2+3.0*y0+ym1+ym1);
	x = r-j;
	flq = ((a*x+b)*x+c)*x+y0;
	
	/* now do prime */
	flqprime = ((3.0*a*x+2.0*b)*x+c)*invdq;
	
	/* The line below is quadratic interpolation: */
	/* flq = y0 + 0.5*x*(y1-ym1+x*(y1+ym1-y0-y0)); */
      }

      /* If l>0 and q=0, then flq=0 anyways, so we can skip calculating
       * Ylm (of course, we can't calculate Ylm anyways since for
       * q=0, qhat is undefined! */
      if (j==0){
	if(l == 0){
	  Ylm.x = N00;
	  Ylm.y = 0.0;
	  Ylmprime.x = 0.0;
	  Ylmprime.y = 0.0;
	}else {
	  /* the only nontrivial case for the derivative is l==1, but */
	  /* it turns out that (Ylm*Flq)' is 0 for q==0 */
	  Ylm.x = Ylm.y = 0.0;
	  Ylmprime.x = Ylmprime.y = 0.0;
	}
	goto skipcalcYlm;
      }
      /* Otherwise, we have to calculate Ylm na dYlm' : 
       * qhat is a unit vector in direction of q:  here q is in "real"
       * bohr^-1 units and q = (k+G)*G where (k+G) is a row-vector. */	  
      qx = kplusGx*G.m[0][0] + kplusGy*G.m[1][0] + kplusGz*G.m[2][0];
      qy = kplusGx*G.m[0][1] + kplusGy*G.m[1][1] + kplusGz*G.m[2][1];
      qz = kplusGx*G.m[0][2] + kplusGy*G.m[1][2] + kplusGz*G.m[2][2];
      invq = (real)1.0/q;
      qhatx = qx*invq;
      qhaty = qy*invq;
      qhatz = qz*invq;

      myx.v[0] = kplusGx; myx.v[1] = kplusGy; myx.v[2] = kplusGz;
      dq = (myx*((G*GbarT)*myx))/q;
      dqvec = GbarT*myx;
      dqhatx = dqvec.v[0]*invq - qx/(q*q)*dq;
      dqhaty = dqvec.v[1]*invq - qy/(q*q)*dq;
      dqhatz = dqvec.v[2]*invq - qz/(q*q)*dq;
      
      /* Calculate Ylm and Ylm' */
      
      if (l==0){
	Ylm.x = N00;
	Ylm.y = 0.0;
	Ylmprime.x = 0.0;
	Ylmprime.y = 0.0;
      }
      else if (l==1){
	if (m==0){
	  Ylm.x = N10*qhatz;
	  Ylm.y = 0.0;
	  Ylmprime.x = N10*dqhatz;
	  Ylmprime.y = 0.0;
	}
	else if (m==1){
	  Ylm.x = N11*qhatx;
	  Ylm.y = N11*qhaty;
	  Ylmprime.x = N11*dqhatx;
	  Ylmprime.y = N11*dqhaty;
	}
	else if (m==-1){
	  Ylm.x =  N1m1*qhatx;
	  Ylm.y = -N1m1*qhaty;
	  Ylmprime.x =  N1m1*dqhatx;
	  Ylmprime.y = -N1m1*dqhaty;
	}
      }
      else if (l==2){
	if (m==0){
	  Ylm.x = N20*(3.0*qhatz*qhatz-1.0);
	  Ylm.y = 0.0;
	  Ylmprime.x = N20*3.0*2.0*qhatz*dqhatz;
	  Ylmprime.y = 0.0;
	}
	else if (m==1){
	  Ylm.x = N21*(qhatz*qhatx);
	  Ylm.y = N21*(qhatz*qhaty);
	  Ylmprime.x = N21*(dqhatz*qhatx + qhatz*dqhatx);
	  Ylmprime.y = N21*(dqhatz*qhaty + qhatz*dqhaty);
	}
	else if (m==-1){
	  Ylm.x = N2m1*( qhatz*qhatx);
	  Ylm.y = N2m1*(-qhatz*qhaty);
	  Ylmprime.x = N2m1*( dqhatz*qhatx + qhatz*dqhatx);
	  Ylmprime.y = N2m1*(-dqhatz*qhaty - qhatz*dqhaty);
	}
	else if (m==2){
	  Ylm.x = N22*(qhatx*qhatx-qhaty*qhaty);
	  Ylm.y = N22*(2.0*qhatx*qhaty);
	  Ylmprime.x = N22*(2.0*dqhatx*qhatx-2.0*dqhaty*qhaty);
	  Ylmprime.y = N22*(2.0*dqhatx*qhaty + 2.0*qhatx*dqhaty);
	}
	else if (m==-2){
	  Ylm.x = N2m2*(qhatx*qhatx-qhaty*qhaty);
	  Ylm.y = N2m2*(-2.0*qhatx*qhaty);
	  Ylmprime.x = N2m2*(2.0*dqhatx*qhatx-2.0*dqhaty*qhaty);
	  Ylmprime.y = N2m2*(-2.0*dqhatx*qhaty - 2.0*qhatx*dqhaty);
	}
      }
      else
	die("nonlocal pseudopot: l ?!?!!\n");
    skipcalcYlm: 
      /* Calculate Vnlprime */
      scalar temp1, temp2;
      temp1 = S*Ylmprime; temp1 *= flq;
      temp2 = S*Ylm;      temp2 *= flqprime*dq;
      Vnlprime.col[gamma].c[n] = temp1 + temp2;
      Vnlprime.col[gamma].c[n] *= fourpi*invsqrtVol;
    }
  }

  /* We know at least these two things...set them to be safe. */
  Vnlprime.basis = basis;
  Vnlprime.k = kvec;
  
  return Vnlprime;
}

/*
     Textbook formulae from Liboff:

  Y0,0  = 1/sqrt(4pi)

  Y1,0      = sqrt(3/(4*pi))*cos(theta)
  Y1,(+-)1  = (-+)sqrt(3/(8pi))*sin(theta)*exp((+-)i*phi)

  Y2,0      = sqrt(5/(16*pi))*(3*cos(theta)^2 -1)
  Y2,(+-)1  = (-+)sqrt(15/(8pi))(sin(theta)*cos(theta))*exp((+-)i*phi)
  Y2,(+-)2  = sqrt(15/(32*pi))(sin(theta)^2)*exp((+-)2i*phi)
*/


/* this is a hack to test the analytic stresses */

void
latforces_finite_difs(Elecinfo *elecinfo, Elecvars *elecvars, Ioninfo *ions, Energies *ener, Lattice_forces *force)
{

  real epsilon; 
  Basis *basis; basis = elecvars->C->basis;
  Basis tmpbasis = (*basis);
  Energies ener_p;


  dft_log("\nepsilon\t  Etot\t\tK\t\tH\txc\tcore\t\tps\tEwald\t\tnl \n");
  for(epsilon = 0.1; epsilon > 0.0000009; epsilon /= 10.){
    {
      matrix3 T(1.+epsilon, 0, 0., 0,  1., 0., 0., 0., 1.);
      *basis = tmpbasis; 
      stress_lat(basis, T); 
      calc_n(elecinfo, elecvars, ions);
      solve_poisson(elecvars);
      Vloc_pseudo(basis,ions,elecvars->Vlocps.c);
      calc_all_energies(basis, ions, elecinfo, elecvars, &ener_p);
    }
    dft_log("%f ", epsilon);
    dft_log("%.4e ", -1-(ener_p.Etot - ener->Etot)/(epsilon*force->Ftot.m[0][0]));
    dft_log("%.4e ", -1-(ener_p.KE - ener->KE)/(epsilon*force->FKE.m[0][0]));
    dft_log("%.4e ", -1-(ener_p.EH - ener->EH)/(epsilon*force->FH.m[0][0]));
    dft_log("%.4e ", -1-(ener_p.Exc - ener->Exc)/(epsilon*force->Fxc.m[0][0]));
    dft_log("%.4e ", -1-(ener_p.Ecore - ener->Ecore)/(epsilon*force->Fcore.m[0][0]));
    dft_log("%.4e ", -1-(ener_p.Eloc - ener->Eloc)/(epsilon*force->Fps.m[0][0]));
    dft_log("%.4e ", -1-(ener_p.Eewald - ener->Eewald)/(epsilon*force->Fewald.m[0][0]));
    dft_log("%.4e\n", -1-(ener_p.Enl - ener->Enl)/(epsilon*force->Fnl.m[0][0]));
  }
  dft_log( "\n");

  /*   *basis = tmpbasis; */
  /*   calc_Eewald(ions, basis, ener, MUMBLE, logfile); */


  /*   matrix3 T(1.0001, 0, 0, 0, 1, 0, 0, 0, 1.); */
  /*   *basis = tmpbasis;  */
  /*   stress_lat(basis, T);  */
  /*   calc_n(elecinfo, elecvars); */
  /*   solve_poisson(elecvars); */
  /*   Vloc_pseudo(basis,ions,elecvars->Vlocps.c, 0,logfile); */
  /*   calc_all_energies(basis, ions, elecinfo, elecvars, &ener_p, SILENT, logfile); */
  /*   fprintf(logfile, "total energy for T11=1.0001  Etot=%.10f\n", ener_p.Etot); */

  /*   calc_Eewald(ions, basis, &ener_p, MUMBLE, logfile); */


}



void
stress_lat(Basis *basis, const matrix3 &T)
{

  basis->latvec = T*(basis->latvec);

  basis->unit_cell_volume = det3(basis->latvec);
  basis->G = (2.0*M_PI)*inv3(basis->latvec);
  basis->GGT = (basis->G)*(~basis->G);

}
