[Q-e-developers] [Q-e-commits] r12231 - trunk/espresso/GWW/bse

Thu Mar 24 17:31:56 CET 2016

Ciao Paolo,

the code you commit has few issues on different compilers. Here one for example:

 pgf90 -fast -r8 -Mcache_align -mp -D__PGI -D__OPENMP -D__FFTW   -I../../include -I../../iotk/src -I../../Modules -I../../PHonon/Gamma/ -I../../PHonon/PH/ -I../../PW/src -I../../FFTXlib -I../../LAXlib -I../pw4gww -I../gww -I. -c plot_excwfn.F90 -o plot_excwfn.o
PGF90-S-0141-Derived Type object required on left of % (plot_excwfn.F90: 70)
PGF90-S-0075-Subscript, substring, or argument illegal in this context for v_rt (plot_excwfn.F90: 70)
PGF90-S-0038-Symbol, v_rt, has not been explicitly declared (plot_excwfn.F90)
  0 inform,   0 warnings,   3 severes, 0 fatal for plot_excwfn
make[2]: *** [plot_excwfn.o] Error 2

I will correct minor issues myself if those are immediate to fix. 

--
Mr. Filippo SPIGA, M.Sc.
Quantum ESPRESSO Foundation
http://www.quantum-espresso.org ~ skype: filippo.spiga

*****
Disclaimer: "Please note this message and any attachments are CONFIDENTIAL and may be privileged or otherwise protected from disclosure. The contents are not to be disclosed to anyone other than the addressee. Unauthorized recipients are requested to preserve this confidentiality and to advise the sender immediately of any error in transmission."

> On Mar 21, 2016, at 11:09 AM, puma at qeforge.qe-forge.org wrote:
> 
> Author: puma
> Date: 2016-03-21 12:09:18 +0100 (Mon, 21 Mar 2016)
> New Revision: 12231
> 
> Added:
>   trunk/espresso/GWW/bse/absorption.f90
>   trunk/espresso/GWW/bse/bse_basic_structure.f90
>   trunk/espresso/GWW/bse/bse_main.f90
>   trunk/espresso/GWW/bse/bse_wannier.f90
>   trunk/espresso/GWW/bse/cgsolve.f90
>   trunk/espresso/GWW/bse/check_basis.f90
>   trunk/espresso/GWW/bse/conj_grad_stuff.f90
>   trunk/espresso/GWW/bse/conjgrad.f90
>   trunk/espresso/GWW/bse/contract_w_exc.f90
>   trunk/espresso/GWW/bse/diago_exc.f90
>   trunk/espresso/GWW/bse/direct_v_exc.f90
>   trunk/espresso/GWW/bse/direct_w_exc.f90
>   trunk/espresso/GWW/bse/dvpsi_bse.f90
>   trunk/espresso/GWW/bse/exc_h_a.f90
>   trunk/espresso/GWW/bse/exchange_exc.f90
>   trunk/espresso/GWW/bse/exciton.f90
>   trunk/espresso/GWW/bse/find_eig.f90
>   trunk/espresso/GWW/bse/h_h.f90
>   trunk/espresso/GWW/bse/lanczos.f90
>   trunk/espresso/GWW/bse/openfil_bse.f90
>   trunk/espresso/GWW/bse/plot_excwfn.f90
>   trunk/espresso/GWW/bse/print_bse.f90
>   trunk/espresso/GWW/bse/print_spectrum.f90
>   trunk/espresso/GWW/bse/qpcorrections.f90
>   trunk/espresso/GWW/bse/qpe_exc.f90
>   trunk/espresso/GWW/bse/read_export.f90
>   trunk/espresso/GWW/bse/rotate_wannier.f90
>   trunk/espresso/GWW/bse/sdescent.f90
>   trunk/espresso/GWW/bse/spectrum.f90
>   trunk/espresso/GWW/bse/start_bse.f90
>   trunk/espresso/GWW/bse/tmp.f90
>   trunk/espresso/GWW/bse/transitions.f90
>   trunk/espresso/GWW/bse/tspace.f90
>   trunk/espresso/GWW/bse/write_wannier_matrix.f90
> Log:
> Fortran files of new GWL BSE code.
> 
> 
> 
> Added: trunk/espresso/GWW/bse/absorption.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/absorption.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/absorption.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,280 @@
> +subroutine absorption(vstate_r,psibar,fc,ieig,ampl,ipol)
> +!this subroutine handles the computation of the absorption spectrum
> +
> +USE bse_wannier, ONLY : l_finite,r_pola,num_nbndv
> +USE cell_base,   ONLY : at,alat
> +USE bse_basic_structures
> +USE fft_custom_gwl
> +USE wvfct,       ONLY : npw
> +
> +
> +implicit none
> +TYPE(v_state_r),INTENT(in) :: vstate_r
> +TYPE(fft_cus),  INTENT(in) :: fc
> +REAL(DP),       INTENT(out):: ampl
> +INTEGER,        INTENT(in) :: ieig,ipol
> +COMPLEX(DP),    INTENT(in) :: psibar(npw,num_nbndv(1))
> +
> +REAL(DP) ::  imod_rpola
> +REAL(DP) :: upol(3,3)
> +
> +data upol /1.0d0,0.0d0,0.0d0, 0.0d0,1.0d0,0.0d0, 0.0d0,0.0d0,1.0d0/
> +
> +call start_clock('absorption')
> +if(l_finite) then
> +   r_pola(1:3)=upol(1:3,ipol)
> +   call amplitude_finite(vstate_r,fc,ieig,ampl)
> +else
> +   call amplitude(psibar(1,1),fc,ieig,ampl)
> +endif
> +
> +call stop_clock('absorption')
> +return
> +end subroutine
> +
> +subroutine amplitude_finite(vstate_r,fc,ieig,ampl)
> +!this subroutine computes the amplitude of each exciton for finite systems using 
> +!the matrix element of the position operator
> +!WARNING: for now, it should be used only for calculation where the molecule is centered around the
> +!origin of the supercell 
> +
> +USE exciton
> +USE bse_wannier, ONLY : num_nbndv,l_finite,r_pola
> +USE cell_base,   ONLY : at,alat
> +USE fft_custom_gwl
> +USE bse_basic_structures
> +USE mp_world,   ONLY : mpime, nproc
> +USE mp_pools,    ONLY: nproc_pool
> +USE mp,          ONLY : mp_barrier
> +USE io_global,   ONLY : stdout
> +USE wvfct,       ONLY : npw
> +USE mp_world,             ONLY : world_comm
> +
> +
> +implicit none
> +TYPE(v_state_r), INTENT(in) :: vstate_r
> +TYPE(fft_cus),   INTENT(in) :: fc
> +REAL(DP),        INTENT(out):: ampl
> +INTEGER,         INTENT(in) :: ieig
> +
> +TYPE(exc_r) :: rpsiv_r
> +TYPE(exc)   :: rpsiv
> +
> +REAL(DP) :: r(3),rr(3),rdote
> +INTEGER  :: ikstart,iklocal,ij,ii,ik,ifft,iktotal
> +INTEGER  :: iv
> +LOGICAL  :: debug
> +INTEGER  :: iz,iy,ix,iqq
> +REAL(DP) :: prod,prod2
> +
> +call start_clock('amplitude_finite')
> +debug=.true.
> +
> +! each processor finds the starting index of its collection of FFT planes  
> +ikstart=1
> +do ii=1,mpime
> +   ikstart=ikstart+fc%dfftt%npp(ii)
> +enddo
> +
> +iktotal=0
> +do ii=1,nproc_pool
> +   iktotal=iktotal+fc%dfftt%npp(ii)
> +enddo
> +
> +!if(debug) then
> +!   write(stdout,*) 'mpime, iktotal=',mpime,iktotal
> +!   write(stdout,*) 'mpime,fc%nrx3t',mpime,fc%nrx3t
> +!   write(stdout,*) 'ikstart=',ikstart
> +!endif
> +
> +!call flush_unit( stdout )
> +!call mp_barrier
> +
> +
> +! create the rpsiv_r excitonic vector (in real space)
> +
> +call initialize_exc_r(rpsiv_r)
> +rpsiv_r%nrxxt=fc%nrxxt 
> +rpsiv_r%numb_v=num_nbndv(1)
> +rpsiv_r%label=12
> +allocate(rpsiv_r%ar(rpsiv_r%nrxxt,rpsiv_r%numb_v))
> +
> +rpsiv_r%ar(1:rpsiv_r%nrxxt,1:rpsiv_r%numb_v)=0.d0
> +
> +do iz=1,fc%dfftt%npp(mpime+1) 
> +   do iy=1,fc%nr2t
> +      do ix=1,fc%nr1t
> +          iqq=(iz-1)*(fc%nrx1t*fc%nrx2t)+(iy-1)*fc%nrx1t+ix 
> +          r(:)= (dble(ix-1)/dble(fc%nr1t)-int(2.d0*dble(ix-1)/dble(fc%nr1t)))*at(:,1)*alat+&
> +            &(dble(iy-1)/dble(fc%nr2t)-int(2.d0*dble(iy-1)/dble(fc%nr2t)))*at(:,2)*alat+&
> +            &(dble(iz-1+ikstart-1)/dble(fc%nr3t)-int(2.d0*dble(iz-1+ikstart-1)/dble(fc%nr3t)))*at(:,3)*alat
> +
> +!          r(:)=dble(ix-1)/dble(fc%nr1t)*at(:,1)*alat+&
> +!               dble(iy-1)/dble(fc%nr2t)*at(:,2)*alat+&
> +!               dble(iz-1+ikstart-1)/dble(fc%nr3t)*at(:,3)*alat
> +
> +!          if(debug) then
> +!             rr(1)=dble(ix-1)/dble(fc%nr1t)-int(2.d0*dble((ix)-1)/dble(fc%nr1t))
> +!             rr(2)=dble(iy-1)/dble(fc%nr2t)-int(2.d0*dble((iy)-1)/dble(fc%nr2t))
> +!             rr(3)= dble(iz-1+ikstart-1)/dble(fc%nr3t)-int(2.d0*dble(iz-1+ikstart-1)/dble(fc%nr3t))
> +!             write(stdout,*) 'rr',rr(1),rr(2),rr(3) 
> +!             write(stdout,*) 'rc',r(1),r(2),r(3)
> +!             CALL flush_unit( stdout )   
> +!          endif
> +
> +          rdote=r(1)*r_pola(1)+r(2)*r_pola(2)+r(3)*r_pola(3)   
> +
> +          rpsiv_r%ar(iqq,1:num_nbndv(1))=rdote*vstate_r%wfnrt(iqq,1:num_nbndv(1),1)
> +  
> +      enddo
> +   enddo
> +enddo
> +
> +!do ifft=0,fc%nrx1t*fc%nrx2t*fc%dfftt%npp(mpime+1)-1
> +!
> +!   iklocal=ifft/(fc%nrx1t*fc%nrx2t)+1
> +!   ik=ikstart+iklocal-1
> +!   ij=(ifft-(fc%nrx1t*fc%nrx2t)*(iklocal-1))/fc%nrx1t+1
> +!   ii=ifft-(fc%nrx1t*fc%nrx2t)*(iklocal-1)-fc%nrx1t*(ij-1)
> +!!   
> +!   r(:)= (dble(ii-1)/dble(fc%nrx1t)-int(2.d0*dble((ii)-1)/dble(fc%nrx1t)))*at(:,1)*alat+&
> +!        &(dble(ij-1)/dble(fc%nrx2t)-int(2.d0*dble((ij)-1)/dble(fc%nrx2t)))*at(:,2)*alat+&
> +!        &(dble(ik-1)/dble(iktotal)-int(2.d0*dble((ik)-1)/dble(iktotal)))*at(:,3)*alat
> +!!
> +!   if(debug) then
> +!      rr(1)=(dble((ii)-1)/dble(fc%nrx1t)-int(2.d0*dble((ii)-1)/dble(fc%nrx1t)))
> +!      rr(2)=(dble((ij)-1)/dble(fc%nrx2t)-int(2.d0*dble((ij)-1)/dble(fc%nrx2t)))
> +!      rr(3)=(dble((ik)-1)/dble(iktotal)-int(2.d0*dble((ik)-1)/dble(iktotal)))
> +!      write(stdout,*) 'rr',rr(1),rr(2),rr(3) 
> +!      write(stdout,*) 'rc',r(1),r(2),r(3)
> +!      CALL flush_unit( stdout )   
> +!   endif
> +!
> +!   rdote=r(1)*r_pola(1)+r(2)*r_pola(2)+r(3)*r_pola(3)   
> +!
> +!   rpsiv_r%ar(ifft+1,1:num_nbndv(1))=rdote*vstate_r%wfnrt(ifft+1,1:num_nbndv(1),1)
> +!
> +!enddo
> +
> +
> +
> +!fft rpsiv_r to reciprocal space
> +call initialize_exc(rpsiv)
> +rpsiv%label=100
> +rpsiv%npw=npw
> +rpsiv%numb_v=num_nbndv(1)
> +allocate(rpsiv%a(rpsiv%npw,rpsiv%numb_v)) 
> +
> +!if (debug) then
> +!   call mp_barrier
> +!   write(stdout,*) 'rpsiv allocated'
> +!   CALL flush_unit( stdout )   
> +!endif
> +
> +call fftback_a_exc(rpsiv_r,fc,rpsiv)
> +
> +!if (debug) then
> +!   call mp_barrier
> +!   write(stdout,*) 'fft_performed'
> +!   CALL flush_unit( stdout )   
> +!endif
> +
> +
> +!compute the exciton amplitude
> +if(debug) then
> + call sproduct_exc(rpsiv,rpsiv,prod)
> + call sproduct_exc(bse_spectrum(ieig),bse_spectrum(ieig),prod2)
> + write(*,*) 'ieig, prod1', ieig, prod
> + write(*,*) 'ieig, prod2', ieig, prod2
> +endif
> +
> +call sproduct_exc(bse_spectrum(ieig),rpsiv,ampl)
> +ampl=ampl*ampl
> +
> +
> +!if (debug) then
> +!   call mp_barrier
> +!   write(stdout,*) 'amplitude computed'
> +!   CALL flush_unit( stdout )   
> +!endif
> +
> +FLUSH( stdout )   
> +call free_memory_exc_a_r(rpsiv_r)
> +call free_memory_exc_a(rpsiv)
> +
> +call stop_clock('amplitude_finite')
> +return
> +end subroutine
> +
> +subroutine amplitude(psibar,fc,ieig,ampl)
> +!this subroutine computes the amplitude of each exciton 
> +
> +USE exciton
> +USE bse_wannier, ONLY : num_nbndv,l_finite,r_pola
> +USE cell_base,   ONLY : at,alat
> +USE fft_custom_gwl
> +USE bse_basic_structures
> +USE mp_world,   ONLY : mpime, nproc
> +USE mp_pools,    ONLY:nproc_pool
> +USE mp,          ONLY : mp_barrier
> +USE mp_world,             ONLY : world_comm
> +USE io_global,   ONLY : stdout
> +USE wvfct,       ONLY : npw
> +
> +
> +implicit none
> +TYPE(fft_cus),   INTENT(in) :: fc
> +REAL(DP),        INTENT(out):: ampl
> +INTEGER,         INTENT(in) :: ieig
> +COMPLEX(DP),    INTENT(in) :: psibar(npw,num_nbndv(1))
> +
> +
> +REAL(DP) :: r(3),rr(3),rdote,prod,prod2
> +INTEGER  :: ikstart,iklocal,ij,ii,ik,ifft,iktotal
> +INTEGER  :: iv
> +LOGICAL  :: debug
> +INTEGER  :: iz,iy,ix,iqq
> +TYPE(exc)   :: rpsiv
> +
> +call start_clock('amplitude')
> +debug=.false.
> +
> +call initialize_exc(rpsiv)
> +rpsiv%label=100
> +rpsiv%npw=npw
> +rpsiv%numb_v=num_nbndv(1)
> +allocate(rpsiv%a(rpsiv%npw,rpsiv%numb_v)) 
> +
> +do iv=1,num_nbndv(1)
> +   rpsiv%a(1:rpsiv%npw,iv)=psibar(1:npw,iv) 
> +enddo
> +
> +!check if there is something in the psibar vector
> +! and in the bse_spectrum_vector
> +if(debug) then
> + call sproduct_exc(rpsiv,rpsiv,prod)
> + call sproduct_exc(bse_spectrum(ieig),bse_spectrum(ieig),prod2)
> + write(*,*) 'ieig, prod1', ieig, prod
> + write(*,*) 'ieig, prod2', ieig, prod2
> +endif
> +
> +
> +!compute the exciton amplitude
> +
> +call sproduct_exc(bse_spectrum(ieig),rpsiv,ampl)
> +ampl=ampl*ampl
> +
> +
> +!if (debug) then
> +!   call mp_barrier
> +!   write(stdout,*) 'amplitude computed'
> +!endif
> +
> +call free_memory_exc_a(rpsiv)
> +
> +FLUSH( stdout )   
> +call mp_barrier(world_comm)
> +
> +call stop_clock('amplitude')
> +return
> +end subroutine
> 
> Added: trunk/espresso/GWW/bse/bse_basic_structure.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/bse_basic_structure.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/bse_basic_structure.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,920 @@
> +MODULE bse_basic_structures
> +!this module describes the basis structures 
> +!which are obtained from the DFT and GW code
> +  USE kinds, ONLY : DP
> +
> +  REAL(kind=DP), ALLOCATABLE :: vg_q(:) ! contains the elements V(G) of the Coulomb potential obtained upon integration over q
> +
> +  COMPLEX(kind=DP), ALLOCATABLE :: u_trans(:,:,:)!unitarian transformation from bloch wfcs to wannier'
> +
> +  TYPE wannier_o
> +! this structures contains the overlap between the wannier square modules
> +     integer :: numb_v ! number of valence bands for the two spin channels 
> +     real(kind=dp),dimension(:,:), pointer ::o(:,:) ! overlap matrix (numb_v*numb_v)   
> +  END TYPE
> +
> +  TYPE ii_mat
> +     integer :: numb_v! number of valence bands for the two spin channels 
> +     integer :: np_max ! maximum number of overlapping wannier orbitals for a given v
> +     integer, dimension (:,:), pointer :: iimat(:,:) ! (np_max,numb_v) the rows of this matrix contain for each iv, 
> +                                                     !the set of jv for which   o_mat(iv,jv)>=s_bse
> + 
> +  END TYPE
> +
> +  TYPE vww_prod
> +!this type contains the v*wv*wv' products needed for the exchange part of the
> +!direct interaction term of the excitonic Hamiltonian
> +     integer :: numb_v! number of valence bands for the two spin channels 
> +     integer :: npw ! number of plane wave per processor
> +     integer :: np_max ! maximum number of overlapping wannier orbitals for a given v
> +     complex(kind=dp), dimension (:,:,:), pointer :: vww(:,:,:) ! v*wv*wv' product in G space (npw,np_max,numb_v)
> +
> +  END TYPE
> +  
> +  TYPE bse_z
> +! this type contains the z terms to build up the Wc term of the excitonic Hamiltonian
> +!z_beta_v_v'=(v*phi_beta)*wv*Wv'
> +     integer :: numb_v! number of valence bands for the two spin channels 
> +     integer :: np_max ! maximum number of overlapping wannier orbitals for a given v
> +     integer :: numw_prod ! dimension of the polarizability basis
> +     real(kind=dp), dimension (:,:,:), pointer :: z(:,:,:) ! v*phi_beta*wv*wv' product (numw_prod,np_max,numb_v)
> +  END TYPE
> +
> +  TYPE v_state
> +! this type contains the valence states wavefunctions and single particle energies
> +!
> +  integer :: nspin  ! number of spin channels
> +  integer :: numb_v(2) ! number valence state
> +  integer :: npw ! number of g-vectors per processor
> +  real(kind=dp), dimension (:,:),pointer  :: esp(:,:) ! single particle energies (numb_v,nspin) 
> +  complex(kind=dp), dimension(:,:,:), pointer :: wfn(:,:,:) ! wave function in G space (npw,numb_v,nspin)  
> +  integer ::gstart
> +
> +  END TYPE
> +
> +  TYPE v_state_r
> +! this type contains the valence states wfns in real space on the dual grid 
> +  integer :: nspin  ! number of spin channels
> +  integer :: numb_v(2) ! number of valence states
> +  integer :: nrxxt ! number of r points per processor
> +  real(kind=dp), dimension(:,:,:), pointer :: wfnrt(:,:,:) ! wave function in r-spce (dual grid) (nrxxt,numb_v,nspin)
> +
> +  END TYPE
> +
> +  TYPE c_state
> +! this type contains the valence states wavefunctions and single particle energies
> +!
> +  integer :: nspin  ! number of spin channels
> +  integer :: numb_c ! number valence state
> +  integer :: npw ! number of g-vectors per processor
> +  real(kind=dp), dimension (:,:),pointer  :: esp(:) ! single particle energies (numb_c) 
> +  complex(kind=dp), dimension(:,:), pointer :: wfn(:,:) ! wave function in G space (npw,numb_c)  
> +  integer ::gstart
> +
> +  END TYPE
> +
> +  TYPE c_state_r
> +! this type contains the valence states wfns in real space on the dual grid 
> +  integer :: nspin  ! number of spin channels
> +  integer :: numb_c ! number of valence states
> +  integer :: nrxxt ! number of r points per processor
> +  real(kind=dp), dimension(:,:), pointer :: wfnrt(:,:) ! wave function in r-spce (dual grid) (nrxxt,numb_c)
> +
> +  END TYPE
> +
> +  CONTAINS
> +
> +      subroutine initialize_v_state_r(v_wfnr)
> +      implicit none
> +      type(v_state_r) :: v_wfnr
> +      nullify(v_wfnr%wfnrt)
> +      return
> +      end subroutine
> +
> +      subroutine initialize_v_state(v_wfn)
> +      implicit none
> +      type(v_state) :: v_wfn
> +      nullify(v_wfn%wfn)
> +      nullify(v_wfn%esp)
> +      return
> +      end subroutine
> +
> +      subroutine initialize_c_state_r(c_wfnr)
> +      implicit none
> +      type(c_state_r) :: c_wfnr
> +      nullify(c_wfnr%wfnrt)
> +      return
> +      end subroutine
> +
> +      subroutine initialize_c_state(c_wfn)
> +      implicit none
> +      type(c_state) :: c_wfn
> +      nullify(c_wfn%wfn)
> +      nullify(c_wfn%esp)
> +      return
> +      end subroutine
> +  
> +      subroutine initialize_wannier_o(o)
> +      implicit none
> +      type(wannier_o) :: o
> +      nullify(o%o)
> +      return
> +      end subroutine
> +  
> +      subroutine initialize_imat(iimat)
> +      implicit none
> +      type(ii_mat) :: iimat
> +      nullify(iimat%iimat)
> +      return
> +      end subroutine
> +
> +      subroutine initialize_vww_prod(vww)
> +      implicit none
> +      type(vww_prod) :: vww
> +      nullify(vww%vww)
> +      return
> +      end subroutine
> +
> +      subroutine initialize_bse_z(z)
> +      implicit none
> +      type(bse_z) :: z
> +      nullify(z%z)
> +      return
> +      end subroutine
> +
> +      subroutine free_v_state_r(v_wfnr)
> +      implicit none
> +      type(v_state_r) :: v_wfnr
> +      if(associated(v_wfnr%wfnrt)) deallocate (v_wfnr%wfnrt)
> +      nullify(v_wfnr%wfnrt)
> +      return
> +      end subroutine
> +
> +      subroutine free_v_state(v_wfn)
> +      implicit none
> +      type(v_state) :: v_wfn
> +      if(associated(v_wfn%wfn)) deallocate (v_wfn%wfn)
> +      nullify(v_wfn%wfn)
> +      if(associated(v_wfn%esp)) deallocate (v_wfn%esp)
> +      nullify(v_wfn%esp)
> +      return
> +      end subroutine
> +
> +      subroutine free_c_state_r(c_wfnr)
> +      implicit none
> +      type(c_state_r) :: c_wfnr
> +      if(associated(c_wfnr%wfnrt)) deallocate (c_wfnr%wfnrt)
> +      nullify(c_wfnr%wfnrt)
> +      return
> +      end subroutine
> +
> +      subroutine free_c_state(c_wfn)
> +      implicit none
> +      type(c_state) :: c_wfn
> +      if(associated(c_wfn%wfn)) deallocate (c_wfn%wfn)
> +      nullify(c_wfn%wfn)
> +      if(associated(c_wfn%esp)) deallocate (c_wfn%esp)
> +      nullify(c_wfn%esp)
> +      return
> +      end subroutine
> +  
> +      subroutine free_wannier_o(o)
> +      implicit none
> +      type(wannier_o) :: o
> +      if(associated(o%o)) deallocate (o%o)
> +      nullify(o%o)
> +      return
> +      end subroutine
> +  
> +      subroutine free_imat(iimat)
> +      implicit none
> +      type(ii_mat) :: iimat
> +      if(associated(iimat%iimat)) deallocate (iimat%iimat)
> +      nullify(iimat%iimat)
> +      return
> +      end subroutine
> +
> +      subroutine free_vww_prod(vww)
> +      implicit none
> +      type(vww_prod) :: vww
> +      if(associated(vww%vww)) deallocate (vww%vww)
> +      nullify(vww%vww)
> +      return
> +      end subroutine
> +
> +      subroutine free_bse_z(z)
> +      implicit none
> +      type(bse_z) :: z
> +      if(associated(z%z)) deallocate (z%z)
> +      nullify(z%z)
> +      return
> +      end subroutine
> +
> +      subroutine make_v_state(numb_v,v)
> +      use io_global, ONLY : stdout, ionode 
> +      USE gvect,                 ONLY : gstart
> +      USE lsda_mod,              ONLY : nspin
> +      use wavefunctions_module,  ONLY : evc
> +      use io_files,  ONLY : prefix, iunwfc, tmp_dir
> +      USE io_files, ONLY: nwordwfc
> +      USE wvfct,    ONLY : nbnd, npwx,npw,et
> +      use mp_world, ONLY : mpime
> +      USE mp,          ONLY :mp_barrier
> +      USE mp_world,             ONLY : world_comm
> +
> +      implicit none
> +
> +      type(v_state) :: v
> +      integer :: numb_v(2)
> +  
> +      integer :: is,ivmax,iv
> +      logical :: debug
> +
> +      debug=.false.
> +     
> +      call start_clock('make_v_state')
> +     
> +      if(debug) then
> +         write(*,*) 'make_v_state: in, mpime=',mpime
> +         ! debug MARGHE
> +         write(*,*) 'nbnd=', nbnd
> +         write(*,*) 'numb_v(1)=', numb_v(1)
> +      endif
> +
> +
> +      v%nspin=nspin
> +      v%numb_v(:)=numb_v(:)
> +      v%npw=npw
> +      v%gstart=gstart
> +
> +
> +      allocate( evc( npwx, nbnd ) )
> +  
> +      if (nspin==1) then
> +         ivmax= v%numb_v(1)
> +      else 
> +         ivmax=max(v%numb_v(1),v%numb_v(2))
> +      endif
> +      
> +
> +
> +      allocate( v%wfn(v%npw,ivmax,v%nspin))
> +      allocate( v%esp(ivmax,v%nspin))
> +
> +
> +      do is=1,nspin
> +         call davcio(evc,2*nwordwfc,iunwfc,is,-1)
> +         do iv=1,v%numb_v(is)
> +            v%wfn(1:v%npw,1:v%numb_v(is),is)=evc(1:v%npw,1:v%numb_v(is))
> +         enddo  
> +            v%esp(1:v%numb_v(is),is)=et(1:v%numb_v(is),is)
> +      enddo
> +
> +      deallocate(evc)
> +
> +      if(debug) then
> +         write(*,*) 'make_v_state: out, mpime=',mpime
> +      endif
> +
> +      call mp_barrier( world_comm )
> +      call stop_clock('make_v_state')
> +
> +      return
> +      end subroutine
> +
> +      subroutine make_c_state(numb_v,c)
> +      use io_global, ONLY : stdout, ionode 
> +      USE gvect,                 ONLY : gstart
> +      USE lsda_mod,              ONLY : nspin
> +      use wavefunctions_module,  ONLY : evc
> +      use io_files,  ONLY : prefix, iunwfc, tmp_dir
> +      USE io_files, ONLY: nwordwfc
> +      USE wvfct,    ONLY : nbnd, npwx,npw,et
> +      use mp_world, ONLY : mpime
> +      USE mp,          ONLY :mp_barrier
> +      USE mp_world,             ONLY : world_comm
> +
> +      implicit none
> +
> +      type(c_state) :: c
> +      integer :: numb_v(2)
> +  
> +      integer :: is,ic
> +      logical :: debug
> +
> +      debug=.false.
> +     
> +      call start_clock('make_c_state')
> +     
> +      if(debug) then
> +         write(*,*) 'make_c_state: in, mpime=',mpime
> +         ! debug MARGHE
> +         write(*,*) 'nbnd=', nbnd
> +         write(*,*) 'numb_v(1)=', numb_v(1)
> +      endif
> +
> +
> +      c%nspin=nspin
> +      c%numb_c=nbnd-numb_v(1)
> +      c%npw=npw
> +      c%gstart=gstart
> +
> +
> +      allocate( evc( npwx, nbnd ) )
> +  
> +!      if (nspin==1) then
> +!         ivmax= v%numb_v(1)
> +!      else 
> +!         ivmax=max(v%numb_v(1),v%numb_v(2))
> +!      endif
> +      
> +
> +
> +      allocate( c%wfn(c%npw,c%numb_c))
> +      allocate( c%esp(c%numb_c))
> +
> +
> +      do is=1,nspin
> +         call davcio(evc,2*nwordwfc,iunwfc,is,-1)
> +         do ic=1,c%numb_c
> +            c%wfn(1:c%npw,1:c%numb_c)=evc(1:c%npw,numb_v(is)+1:nbnd)
> +         enddo  
> +            c%esp(1:c%numb_c)=et(numb_v(is)+1:nbnd,is)
> +      enddo
> +
> +      deallocate(evc)
> +
> +      if(debug) then
> +         write(*,*) 'make_c_state: out, mpime=',mpime
> +      endif
> +
> +      call mp_barrier( world_comm )
> +      call stop_clock('make_c_state')
> +
> +      return
> +      end subroutine
> +
> +      subroutine c_times_cstate(v,cstate_in,cstate_out)
> +      ! this subroutine multiplies each line ic of the c_state vector by the ic real component of the v vector
> +      use kinds, only:DP
> +      !use bse_wannier, only: qpe_imin,qpe_imax
> +
> +      implicit none
> +
> +      type(c_state),intent(in) :: cstate_in
> +      type(c_state),intent(out) :: cstate_out
> +
> +      integer :: ib
> +      real(kind=DP) :: v(cstate_in%numb_c)
> +
> +      do ib=1,cstate_in%numb_c
> +         cstate_out%wfn(1:cstate_out%npw,ib)=cmplx(v(ib),0.d0)* cstate_out%wfn(1:cstate_in%npw,ib)
> +      enddo
> +     
> +      return
> +      end subroutine
> +
> + 
> +      subroutine v_wfng_to_wfnr(vwfng,fc,vwfnr)
> +     !this subroutine FFT the valence wfns to real space in the dual grid
> +
> +      USE kinds, ONLY : DP
> +      USE fft_custom_gwl
> +      USE bse_wannier, ONLY : dual_bse 
> +      USE wvfct,    ONLY : igk, g2kin, npwx, npw, nbnd, nbndx
> +      USE io_global, ONLY : stdout, ionode, ionode_id
> +      USE mp_world, ONLY : mpime, nproc,world_comm
> +      USE mp_wave, ONLY : mergewf,splitwf
> +      USE mp,             ONLY : mp_sum
> +      USE gvect
> +      USE wavefunctions_module, ONLY :  psic
> +
> +      implicit none
> +
> +      type(v_state) vwfng
> +      type(v_state_r) vwfnr
> +      type(fft_cus) :: fc
> +
> +      COMPLEX(kind=DP), allocatable :: vwfng_t(:,:,:)
> +      COMPLEX(kind=DP), ALLOCATABLE :: evc_g(:)
> +
> +      integer :: ii,is
> +      integer ::ivmax  
> +
> +      call start_clock('v_wfng_to_wfnr')
> +  
> + 
> +      if (vwfng%nspin==1) then
> +         ivmax= vwfng%numb_v(1)
> +      else 
> +         ivmax=max(vwfng%numb_v(1),vwfng%numb_v(2))
> +      endif
> +
> +
> +      allocate(vwfng_t(fc%npwt,ivmax,vwfng%nspin))
> +
> +      vwfnr%nspin=vwfng%nspin
> +      vwfnr%nrxxt=fc%nrxxt       
> +      vwfnr%numb_v=vwfng%numb_v
> +
> +      allocate(vwfnr%wfnrt(vwfnr%nrxxt,ivmax,vwfnr%nspin))
> +      
> +      allocate(evc_g(fc%ngmt_g ))
> +
> +      if(fc%dual_t==4.d0) then
> +      do is=1,vwfng%nspin
> +         vwfng_t(1:fc%npwt,1:vwfng%numb_v(is),is)= vwfng%wfn(1:fc%npwt,1:vwfng%numb_v(is),is)
> +      enddo
> +      else
> +         do is=1,vwfng%nspin
> +           call reorderwfp_col(vwfng%numb_v(is),vwfng%npw,fc%npwt,vwfng%wfn(1,1,is),vwfng_t(1,1,is),vwfng%npw,&
> +                 & fc%npwt,ig_l2g,fc%ig_l2gt,fc%ngmt_g,mpime, nproc,world_comm )
> +
> +           !do ii=1,vwfng%numb_v(is)
> +           !   call mergewf(vwfng%wfn(:,ii,is),evc_g,vwfng%npw,ig_l2g,mpime,nproc,ionode_id,intra_pool_comm)
> +           !   call splitwf(vwfng_t(:,ii,is),evc_g,fc%npwt,fc%ig_l2gt,mpime,nproc,ionode_id,intra_pool_comm)
> +           !enddo
> +         enddo
> +      endif
> +
> +      do is=1,vwfng%nspin 
> +         do ii=1,vwfng%numb_v(is),2
> +            psic(1:fc%nrxxt)=(0.d0,0.d0)
> +            if (ii==vwfng%numb_v(is)) then
> +               psic(fc%nlt(1:fc%npwt))  = vwfng_t(1:fc%npwt,ii,is)
> +               psic(fc%nltm(1:fc%npwt)) = CONJG( vwfng_t(1:fc%npwt,ii,is) )
> +            else
> +               psic(fc%nlt(1:fc%npwt))=vwfng_t(1:fc%npwt,ii,is)+(0.d0,1.d0)*vwfng_t(1:fc%npwt,ii+1,is)
> +               psic(fc%nltm(1:fc%npwt)) =CONJG(vwfng_t(1:fc%npwt,ii,is))+(0.d0,1.d0)*CONJG(vwfng_t(1:fc%npwt,ii+1,is))
> +            endif
> +            CALL cft3t( fc, psic, fc%nr1t, fc%nr2t, fc%nr3t, fc%nrx1t, fc%nrx2t, fc%nrx3t, 2 )
> +            vwfnr%wfnrt(1:fc%nrxxt,ii,is)= DBLE(psic(1:fc%nrxxt))
> +            if(ii/=vwfng%numb_v(is)) vwfnr%wfnrt(1:fc%nrxxt,ii+1,is)= DIMAG(psic(1:fc%nrxxt))
> +         enddo
> +      enddo
> +
> +      deallocate(evc_g)
> +
> +      call stop_clock('v_wfng_to_wfnr')
> +
> +      return
> +      end subroutine
> +
> +      subroutine c_wfng_to_wfnr(cwfng,fc,cwfnr)
> +     !this subroutine FFT the conduction wfns to real space in the dual grid
> +
> +      USE kinds, ONLY : DP
> +      USE fft_custom_gwl
> +      USE bse_wannier, ONLY : dual_bse, num_nbndv 
> +      USE wvfct,    ONLY : igk, g2kin, npwx, npw, nbnd, nbndx
> +      USE io_global, ONLY : stdout, ionode, ionode_id
> +      USE mp_world, ONLY : mpime, nproc,world_comm
> +      USE mp_wave, ONLY : mergewf,splitwf
> +      USE mp,             ONLY : mp_sum
> +      USE gvect
> +      USE wavefunctions_module, ONLY :  psic
> +
> +      implicit none
> +
> +      type(c_state) cwfng
> +      type(c_state_r) cwfnr
> +      type(fft_cus) :: fc
> +
> +      COMPLEX(kind=DP), allocatable :: cwfng_t(:,:)
> +      COMPLEX(kind=DP), ALLOCATABLE :: evc_g(:)
> +
> +      integer :: ii,is
> +      integer ::icmax  
> +
> +      call start_clock('c_wfng_to_wfnr')
> +  
> + 
> +!      if (vwfng%nspin==1) then
> +!         ivmax= vwfng%numb_v(1)
> +!      else 
> +!         ivmax=max(vwfng%numb_v(1),vwfng%numb_v(2))
> +!      endif
> +!       icmax=nbnd-num_nbndv(1)
> +
> +
> +      allocate(cwfng_t(fc%npwt,cwfng%numb_c))
> +
> +      cwfnr%nrxxt=fc%nrxxt       
> +      cwfnr%numb_c=cwfng%numb_c
> +
> +      allocate(cwfnr%wfnrt(cwfnr%nrxxt,cwfnr%numb_c))
> +      
> +      allocate(evc_g(fc%ngmt_g ))
> +
> +      if(fc%dual_t==4.d0) then
> +         cwfng_t(1:fc%npwt,1:cwfng%numb_c)= cwfng%wfn(1:fc%npwt,1:cwfng%numb_c)
> +      else
> +         call reorderwfp_col(cwfng%numb_c,cwfng%npw,fc%npwt,cwfng%wfn(1,1),cwfng_t(1,1),cwfng%npw,&
> +                 & fc%npwt,ig_l2g,fc%ig_l2gt,fc%ngmt_g,mpime, nproc,world_comm )
> +
> +      endif
> +
> +      do ii=1,cwfng%numb_c,2
> +           psic(1:fc%nrxxt)=(0.d0,0.d0)
> +           if (ii==cwfng%numb_c) then
> +              psic(fc%nlt(1:fc%npwt))  = cwfng_t(1:fc%npwt,ii)
> +              psic(fc%nltm(1:fc%npwt)) = CONJG( cwfng_t(1:fc%npwt,ii) )
> +           else
> +              psic(fc%nlt(1:fc%npwt))=cwfng_t(1:fc%npwt,ii)+(0.d0,1.d0)*cwfng_t(1:fc%npwt,ii+1)
> +              psic(fc%nltm(1:fc%npwt)) =CONJG(cwfng_t(1:fc%npwt,ii))+(0.d0,1.d0)*CONJG(cwfng_t(1:fc%npwt,ii+1))
> +           endif
> +           CALL cft3t( fc, psic, fc%nr1t, fc%nr2t, fc%nr3t, fc%nrx1t, fc%nrx2t, fc%nrx3t, 2 )
> +           cwfnr%wfnrt(1:fc%nrxxt,ii)= DBLE(psic(1:fc%nrxxt))
> +           if(ii/=cwfng%numb_c) cwfnr%wfnrt(1:fc%nrxxt,ii+1)= DIMAG(psic(1:fc%nrxxt))
> +      enddo
> +
> +      deallocate(evc_g)
> +
> +      call stop_clock('c_wfng_to_wfnr')
> +
> +      return
> +      end subroutine
> +
> +      subroutine write_wfnr(wfnr)
> +      ! this subroutines writes on disk the type v_state_r for every processor
> +!      USE io_files,             ONLY : find_free_unit, prefix
> +      USE io_files,             ONLY :  tmp_dir,prefix
> +      USE mp_world,  ONLY : mpime
> +      implicit none
> +
> +      INTEGER, EXTERNAL :: find_free_unit
> +      type(v_state_r) wfnr
> +      INTEGER :: iw, iunw,is
> +      CHARACTER(5) :: nproc
> +
> +      iunw=find_free_unit()
> +      
> +      write(nproc,'(5i1)') &
> +           & mpime/10000,mod(mpime,10000)/1000,mod(mpime,1000)/100,mod(mpime,100)/10,mod(mpime,10)
> +
> +
> +      open( unit=iunw, file=trim(tmp_dir)//trim(prefix)//'.wfnr_t.'// nproc , status='unknown',form='unformatted')
> +
> +      write(iunw) wfnr%numb_v  
> +      write(iunw) wfnr%nspin
> +      write(iunw) wfnr%nrxxt
> +      
> +      do is=1,wfnr%nspin
> +         do iw=1,wfnr%numb_v(is)
> +            write(iunw)  wfnr%wfnrt(1:wfnr%nrxxt,iw,is)
> +         enddo
> +      enddo 
> +      close(iunw)
> +      end subroutine
> +
> +      subroutine read_wfnr(wfnr)
> +      ! this subroutines reads from disk the type v_state_r for every processor
> +!      USE io_files,             ONLY : find_free_unit, prefix
> +      USE io_files,             ONLY : tmp_dir, prefix
> +      USE mp_world,  ONLY : mpime
> +      implicit none
> +
> +      INTEGER, EXTERNAL :: find_free_unit
> +      type(v_state_r) wfnr
> +      INTEGER :: iw, iunw,is
> +      CHARACTER(5) :: nproc
> +
> +      iunw=find_free_unit()
> +      
> +      write(nproc,'(5i1)') &
> +           & mpime/10000,mod(mpime,10000)/1000,mod(mpime,1000)/100,mod(mpime,100)/10,mod(mpime,10)
> +
> +
> +      open( unit=iunw, file=trim(tmp_dir)//trim(prefix)//'.wfnr_t.'// nproc , status='old',form='unformatted')
> +
> +      read(iunw) wfnr%numb_v  
> +      read(iunw) wfnr%nspin
> +      read(iunw) wfnr%nrxxt
> +
> +      do is=1,wfnr%nspin
> +         do iw=1,wfnr%numb_v(is)
> +            read(iunw)  wfnr%wfnrt(1:wfnr%nrxxt,iw,is)
> +         enddo
> +      enddo
> + 
> +      close(iunw)
> +      end subroutine
> +	
> +      subroutine write_cwfnr(wfnr)
> +      ! this subroutines writes on disk the type v_state_r for every processor
> +!      USE io_files,             ONLY : find_free_unit, prefix
> +      USE io_files,             ONLY :  tmp_dir,prefix
> +      USE mp_world,  ONLY : mpime
> +      implicit none
> +
> +      INTEGER, EXTERNAL :: find_free_unit
> +      type(c_state_r) wfnr
> +      INTEGER :: iw, iunw,is
> +      CHARACTER(5) :: nproc
> +
> +      iunw=find_free_unit()
> +      
> +      write(nproc,'(5i1)') &
> +           & mpime/10000,mod(mpime,10000)/1000,mod(mpime,1000)/100,mod(mpime,100)/10,mod(mpime,10)
> +
> +
> +      open( unit=iunw, file=trim(tmp_dir)//trim(prefix)//'.cwfnr_t.'// nproc , status='unknown',form='unformatted')
> +
> +      write(iunw) wfnr%numb_c  
> +      write(iunw) wfnr%nrxxt
> +      
> +      do iw=1,wfnr%numb_c
> +         write(iunw)  wfnr%wfnrt(1:wfnr%nrxxt,iw)
> +      enddo
> +
> +      close(iunw)
> +      end subroutine
> +
> +      subroutine read_cwfnr(wfnr)
> +      ! this subroutines reads from disk the type v_state_r for every processor
> +!      USE io_files,             ONLY : find_free_unit, prefix
> +      USE io_files,             ONLY : tmp_dir,prefix
> +      USE mp_world,  ONLY : mpime
> +      implicit none
> +
> +      INTEGER, EXTERNAL :: find_free_unit
> +      type(c_state_r) wfnr
> +      INTEGER :: iw, iunw,is
> +      CHARACTER(5) :: nproc
> +
> +      iunw=find_free_unit()
> +      
> +      write(nproc,'(5i1)') &
> +           & mpime/10000,mod(mpime,10000)/1000,mod(mpime,1000)/100,mod(mpime,100)/10,mod(mpime,10)
> +
> +
> +      open( unit=iunw, file=trim(tmp_dir)//trim(prefix)//'.cwfnr_t.'// nproc , status='old',form='unformatted')
> +
> +      read(iunw) wfnr%numb_c  
> +      read(iunw) wfnr%nrxxt
> +
> +      do iw=1,wfnr%numb_c
> +         read(iunw)  wfnr%wfnrt(1:wfnr%nrxxt,iw)
> +      enddo
> +
> +      close(iunw)
> +      end subroutine
> + 
> +
> +      subroutine read_omat(ispin,o)
> +      ! this subroutines reads the overlap matrix written by pw4gww
> +!      USE io_files,             ONLY : find_free_unit, prefix
> +      USE io_files,             ONLY : prefix,tmp_dir
> +      USE io_global,            ONLY : ionode, ionode_id
> +      USE mp,                   ONLY : mp_bcast
> +      USE kinds,                ONLY : DP
> +      USE mp_world,             ONLY : world_comm
> +      
> +      implicit none
> +
> +      INTEGER, EXTERNAL :: find_free_unit
> +
> +      type(wannier_o) :: o 
> +      integer ispin
> +
> +      integer ii,iunu
> +      real(kind=DP) :: s_bse
> +
> +      if(ionode) then
> +         iunu = find_free_unit()
> +         if (ispin==1) open(unit=iunu,file=trim(tmp_dir)//trim(prefix)//'.wbse1',status='old',form='unformatted')
> +         if (ispin==2) open(unit=iunu,file=trim(tmp_dir)//trim(prefix)//'.wbse2',status='old',form='unformatted')
> +
> +         read(iunu) o%numb_v
> +         read(iunu) s_bse
> +
> +         allocate(o%o(o%numb_v,o%numb_v))
> +
> +         do ii=1,o%numb_v
> +            read(iunu) o%o(1:o%numb_v,ii)
> +         enddo
> +         close(iunu)
> +      endif
> +      
> +      CALL mp_bcast(o%numb_v, ionode_id , world_comm)
> +      if(.not.ionode) then     
> +        allocate(o%o(o%numb_v,o%numb_v))
> +      endif
> +      CALL mp_bcast(o%o, ionode_id, world_comm )
> +
> +      return
> +      end subroutine
> +
> +      subroutine read_iimat(iimat,ispin) 
> +      ! this subroutines reads the ii matrix written by pw4gww
> +!      USE io_files,             ONLY : find_free_unit, prefix
> +      USE io_files,             ONLY :  prefix, tmp_dir
> +      USE io_global,            ONLY : ionode, ionode_id
> +      USE mp,                   ONLY : mp_bcast
> +      USE mp_world,             ONLY : world_comm
> +      USE kinds,                ONLY : DP
> +
> +      implicit none
> +      INTEGER, EXTERNAL :: find_free_unit
> +      type(ii_mat) :: iimat
> +      integer ispin
> +
> +      real(kind=DP) :: s_bse
> +      integer       :: iv,iuni
> +      logical       :: debug
> +
> +      debug=.false.
> +
> +      if(ionode) then
> +        iuni = find_free_unit()
> +        if (ispin==1) open(unit=iuni,file=trim(tmp_dir)//trim(prefix)//'.iwwbse1',status='old',form='unformatted')
> +        if (ispin==2) open(unit=iuni,file=trim(tmp_dir)//trim(prefix)//'.iwwbse2',status='old',form='unformatted')
> +        read(iuni) iimat%numb_v
> +        read(iuni) s_bse
> +        read(iuni) iimat%np_max
> +        if(debug) then
> +          write(*,*) 'From read_iimat numb_v',iimat%numb_v
> +          write(*,*) 'From read_iimat s_bse', s_bse
> +          write(*,*) 'From read_iimat np_max',iimat%np_max
> +        endif
> +      endif
> +
> +      CALL mp_bcast(iimat%numb_v, ionode_id, world_comm )
> +      CALL mp_bcast(iimat%np_max, ionode_id, world_comm )
> +
> +      allocate(iimat%iimat(iimat%np_max,iimat%numb_v))
> +
> +      if(ionode) then
> +        if(debug) then
> +           write(*,*) 'iimat matrix'  
> +        endif
> +        do iv=1, iimat%numb_v
> +           read(iuni) iimat%iimat(1:iimat%np_max,iv)
> +           if(debug) then
> +              write(*,*) 'iv=',iv, iimat%iimat(1:iimat%np_max,iv)
> +           endif
> +        enddo
> +        close(iuni)
> +      endif
> +
> +      CALL mp_bcast(iimat%iimat, ionode_id, world_comm )
> +
> +      return 
> +      end subroutine
> +
> +      subroutine read_vww_prod(ispin,numb_v,npw,np_max,iimat,vww)
> +      !each processor reads the vww(G) written by pw4gww
> +      !be careful to check that the iimat that is passed to the subroutine is the related to the correct spin channel
> +
> +!      USE io_files,             ONLY : find_free_unit, prefix,diropn
> +      USE io_files,             ONLY : prefix,diropn
> +      USE io_global, ONLY : stdout, ionode 
> +
> +      implicit none
> +      INTEGER, EXTERNAL :: find_free_unit
> +      type(vww_prod) :: vww
> +      type(ii_mat)   :: iimat
> +      integer        :: numb_v,npw,np_max,ispin
> +
> +      integer iv, ip, iungprod, ii,iundebug,i
> +      logical exst,debug
> +
> +      debug=.false.
> +     
> +      if(debug) then
> +         iundebug = find_free_unit()
> +         open(iundebug,file='vww_bse.dat')
> +      endif      
> + 
> +      vww%numb_v=numb_v
> +      vww%npw=npw
> +      vww%np_max=np_max
> +
> +      allocate(vww%vww(npw,np_max,numb_v))
> +
> +      vww%vww(1:npw,1:np_max,1:numb_v)=dcmplx(0.d0,0.d0)
> +      
> +      iungprod = find_free_unit()
> +      if (ispin==1)  CALL diropn( iungprod, 'vww_bse1.',npw*2, exst)
> +      if (ispin==2)  CALL diropn( iungprod, 'vww_bse2.',npw*2, exst)
> +
> +!      if(debug) then 
> +!         if(ionode) write(stdout,*) 'Read_vww_prod #1'
> +!      endif
> +
> +      ii=0
> +      do iv=1,numb_v
> +         do ip=1, np_max 
> +            if(iimat%iimat(ip,iv)>0) then
> +!               if(debug) then 
> +!                  if(ionode) write(stdout,*) 'Read_vww_prod #', ii
> +!               endif
> +               ii=ii+1
> +               call davcio(vww%vww(:,ip,iv),npw*2,iungprod,ii,-1)
> +               if(debug) then
> +                  if(ionode) then
> +                     do i=1,npw
> +                        write(iundebug,*) vww%vww(i,ip,iv)   
> +                     enddo
> +                  endif
> +               endif
> +            endif
> +         enddo
> +      enddo
> +
> +      close(iungprod)   
> +      if (debug) close(iundebug)   
> +      return
> +      end subroutine
> +
> +      subroutine read_z(ispin,iimat,z)
> +      ! the ionode reads the z matrix and broadcast its value to the rest of the
> +      ! processors.
> +      !be careful to check that the iimat that is passed to the subroutine is the related to the correct spin channel
> +
> +
> +!      USE io_files,             ONLY : find_free_unit, prefix
> +      USE io_files,             ONLY :  prefix, tmp_dir
> +      USE io_global,            ONLY : ionode, ionode_id
> +      USE mp,                   ONLY : mp_bcast, mp_barrier
> +      USE mp_world, ONLY : world_comm
> +      USE kinds,                ONLY : DP
> +      USE io_global, ONLY : stdout,ionode
> +
> +
> +      implicit none
> +      INTEGER, EXTERNAL :: find_free_unit
> +      type(bse_z)    ::z 
> +      type(ii_mat)   :: iimat
> +!      integer        :: numw_prod
> +      integer        ::ispin
> +
> +      real(kind=DP) :: s_bse
> +      integer       :: iv,iunz,ii
> +
> +      logical debug
> +
> +      debug=.false.
> +
> +      if(ionode) then 
> +         iunz = find_free_unit()
> +         if(debug) then
> +           if(ionode) write(stdout,*) 'read_z ',trim(tmp_dir)//trim(prefix)//'.zbse1'
> +         endif
> +
> +
> +         if (ispin==1) open(unit=iunz,file=trim(tmp_dir)//trim(prefix)//'.zbse1',status='old',form='unformatted')
> +         if (ispin==2) open(unit=iunz,file=trim(tmp_dir)//trim(prefix)//'.zbse2',status='old',form='unformatted')
> +         read(iunz) z%numb_v
> +         read(iunz) s_bse 
> +         read(iunz) z%np_max
> +         read(iunz) z%numw_prod
> +
> +         if(debug) then
> +           if(ionode) write(stdout,*) 'z%numb_v=', z%numb_v
> +           if(ionode) write(stdout,*) 's_bse=',s_bse
> +           if(ionode) write(stdout,*) 'z%np_max=',z%np_max
> +           if(ionode) write(stdout,*) 'z%numw_prod=', z%numw_prod
> +         endif
> +
> +      endif 
> +
> +      CALL mp_bcast(z%numb_v, ionode_id, world_comm )
> +      CALL mp_bcast(z%np_max, ionode_id, world_comm )
> +      CALL mp_bcast(z%numw_prod, ionode_id, world_comm )
> +      call mp_barrier(world_comm)
> +      
> +      allocate(z%z(z%numw_prod,z%np_max,z%numb_v))
> +     
> +      if(ionode) then
> +         do iv=1, z%numb_v
> +            do ii=1,z%np_max
> +               if(debug) then
> +                 if(ionode) write(stdout,*)'read_z, ii=',ii 
> +               endif
> +               if (iimat%iimat(ii,iv)>0) read(iunz) z%z(:,ii,iv)
> +            enddo
> +         enddo
> +      endif
> +
> +      if(debug) then
> +        if(ionode) write(stdout,*) 'read_z #1'
> +      endif
> +!
> +
> +      CALL mp_bcast(z%z, ionode_id, world_comm )
> +      call mp_barrier(world_comm)
> +
> +      if(debug) then
> +        if(ionode) write(stdout,*) 'read_z #2'
> +      endif
> +
> +
> +      if(ionode) close(iunz)
> +      FLUSH( stdout )      
> + 
> +      return
> +
> +      end subroutine
> +
> +END MODULE
> 
> Added: trunk/espresso/GWW/bse/bse_main.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/bse_main.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/bse_main.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,380 @@
> +program bse_punch
> +
> +use io_global, ONLY : stdout, ionode, ionode_id
> +use io_files,  ONLY : psfile, pseudo_dir,diropn,outdir
> +!use io_files,  ONLY : prefix,outdir,tmp_dir,iunwfc,find_free_unit
> +use io_files,  ONLY : prefix,outdir,tmp_dir,iunwfc
> +use mp_world, ONLY : mpime
> +use mp_pools, ONLY : kunit
> +USE wvfct,     ONLY : nbnd, et, npwx
> +USE gvecw,              ONLY : ecutwfc
> +use pwcom
> +USE wavefunctions_module, ONLY : evc
> +use mp, ONLY: mp_bcast
> +USE mp_world, ONLY : world_comm
> +USE fft_base,             ONLY : dfftp
> +use scf, only : vrs, vltot, v, kedtau
> +USE fft_custom_gwl
> +use bse_basic_structures
> +use exciton
> +USE constants,   ONLY: RYTOEV
> +USE mp,          ONLY: mp_barrier
> +USE qpe_exc,         ONLY: qpc
> +use bse_wannier, ONLY: num_nbndv,&
> +           l_truncated_coulomb,&
> +           truncation_radius, &
> +           numw_prod,&
> +           dual_bse,&
> +           l_verbose, &
> +           lambda,eps,&
> +           l_cgrad,maxit,cg_nreset,lm_delta,n_eig,eps_eig, scissor,&
> +           l_plotexc,plotn_min,plotn_max,r_hole,l_plotaverage,&
> +           l_tspace,nbndt,l_finite,r_pola,&
> +           spectra_e_min,spectra_e_max,spectra_nstep,spectra_broad,&
> +           l_restart,n_eig_start, nit_lcz,l_lanczos, l_restart_lcz, nlcz_restart,&
> +           l_tdhf,l_fullbse,l_lf,l_rpa, l_contraction, l_gtrick, qpe_imin, qpe_imax,&
> +           l_scissor
> +implicit none
> +INTEGER, EXTERNAL :: find_free_unit
> +
> +integer :: i, kunittmp, ios, is
> +CHARACTER(LEN=256), EXTERNAL :: trimcheck
> +character(len=200) :: pp_file
> +logical ::  uspp_spsi, ascii, single_file, raw
> +
> +
> +type(v_state) :: vstate
> +type(v_state_r) :: vstate_r
> +type(c_state)   :: cstate
> +type(c_state)   :: wcstate
> +type(exc) :: a_exc
> +type(exc) :: b_exc
> +!type(exc) :: a_excdiago,a_exchange 
> +!type(exc):: a_dirv
> +!type(exc):: a_dirw
> +!type(exc):: a_rot
> +type(fft_cus) :: fc
> +
> +
> +logical exst
> +integer iuv
> +
> +logical :: debug
> +
> +real(kind=DP) :: sdeig
> +
> +
> +
> +NAMELIST /inputbse/ prefix,num_nbndv,dual_bse,outdir,l_truncated_coulomb,&
> +                    truncation_radius, numw_prod, l_verbose,lambda,eps,&
> +                    l_cgrad,maxit,cg_nreset,lm_delta,n_eig,eps_eig,&
> +                    scissor,l_plotexc,plotn_min,plotn_max,r_hole,&
> +                    l_plotaverage,l_tspace,nbndt,l_finite,r_pola,&
> +                    spectra_e_min,spectra_e_max,spectra_nstep,spectra_broad,&
> +                    l_restart,n_eig_start, nit_lcz,l_lanczos, l_restart_lcz, nlcz_restart,&
> +                    l_fullbse,l_tdhf,l_lf,l_rpa,l_contraction,l_gtrick, qpe_imin, qpe_imax,&
> +                    l_scissor
> +
> +debug=.false.
> +            
> +call start_bse( )
> +call start_clock('bse_main')
> +
> +
> +!
> +!   set default values for variables in namelist
> +!
> +prefix='export'
> + CALL get_environment_variable( 'ESPRESSO_TMPDIR', outdir )	
> +IF ( TRIM( outdir ) == ' ' ) outdir = './'
> +  pp_file= ' '
> +  uspp_spsi = .FALSE.
> +  ascii = .FALSE.
> +  single_file = .FALSE.
> +  raw = .FALSE.
> +
> +
> +
> +num_nbndv(1:2) = 1
> +l_truncated_coulomb = .false.
> +truncation_radius = 10.d0
> +numw_prod=1
> +dual_bse=1.d0
> +l_verbose=.true.
> +lambda=0.001d0
> +eps=0.0001d0
> +maxit=100
> +cg_nreset=50
> +l_cgrad=.false.
> +lm_delta=0.3
> +n_eig=10
> +eps_eig=0.0000000001
> +scissor=0.d0
> +l_plotexc=.false.
> +plotn_min=0 
> +plotn_max=0 
> +r_hole(1:3)=0.d0
> +l_plotaverage=.false.
> +l_tspace=.false.
> +nbndt=1
> +l_finite=.false.
> +r_pola(1:3)=1.d0
> +spectra_e_min=0.d0
> +spectra_e_max=10.d0
> +spectra_nstep=100
> +spectra_broad=0.01d0
> +l_restart=0
> +n_eig_start=0
> +nit_lcz=100
> +l_lanczos=.false.
> +l_restart_lcz=.false.
> +nlcz_restart=1
> +l_fullbse=.true.
> +l_tdhf=.false.
> +l_lf=.false.
> +l_rpa=.false.
> +l_contraction=.false.
> +l_gtrick=.false.
> +qpe_imin=1
> +qpe_imax=1
> +l_scissor=.true.
> +!
> +!    Reading input file
> +!
> +IF ( ionode ) THEN
> +      !
> +      CALL input_from_file ( )
> +      !
> +      READ(5,inputbse,IOSTAT=ios)
> +      !
> +!      call read_namelists( 'PW4GWW' )
> +      !
> +      IF (ios /= 0) CALL errore ('pw4gww', 'reading inputbse namelist', ABS(ios) )
> +      scissor=scissor/RYTOEV
> +ENDIF
> +
> +
> +!-------------------------------------------------------------------------
> +! ... Broadcasting variables
> +!------------------------------------------------------------------------
> +
> +   
> +
> +  tmp_dir = trimcheck( outdir )
> +  CALL mp_bcast( outdir, ionode_id, world_comm )
> +  CALL mp_bcast( tmp_dir, ionode_id, world_comm )
> +  CALL mp_bcast( prefix, ionode_id , world_comm)
> +  CALL mp_bcast( num_nbndv,     ionode_id , world_comm)
> +  CALL mp_bcast(l_truncated_coulomb, ionode_id, world_comm)
> +  CALL mp_bcast(truncation_radius, ionode_id, world_comm)
> +  call mp_bcast(numw_prod, ionode_id, world_comm)
> +  CALL mp_bcast(dual_bse, ionode_id, world_comm)
> +  CALL mp_bcast( pp_file, ionode_id , world_comm)
> +  CALL mp_bcast( uspp_spsi, ionode_id , world_comm)
> +  CALL mp_bcast( ascii, ionode_id , world_comm)
> +  CALL mp_bcast( single_file, ionode_id , world_comm)
> +  CALL mp_bcast( raw, ionode_id , world_comm)
> +  CALL mp_bcast( pseudo_dir, ionode_id , world_comm)
> +  CALL mp_bcast( psfile, ionode_id , world_comm)
> +  CALL mp_bcast( lambda, ionode_id , world_comm)
> +  CALL mp_bcast( eps, ionode_id , world_comm)
> +  CALL mp_bcast( maxit, ionode_id , world_comm)
> +  CALL mp_bcast( cg_nreset, ionode_id , world_comm)
> +  CALL mp_bcast( l_cgrad, ionode_id , world_comm)
> +  CALL mp_bcast( lm_delta, ionode_id , world_comm)
> +  CALL mp_bcast( n_eig, ionode_id , world_comm)
> +  CALL mp_bcast( eps_eig, ionode_id , world_comm)
> +  CALL mp_bcast( scissor, ionode_id , world_comm)
> +  CALL mp_bcast( l_plotexc, ionode_id , world_comm)
> +  CALL mp_bcast( plotn_min, ionode_id , world_comm) 
> +  CALL mp_bcast( plotn_max, ionode_id , world_comm) 
> +  CALL mp_bcast( r_hole, ionode_id, world_comm ) 
> +  CALL mp_bcast( l_plotaverage, ionode_id, world_comm ) 
> +  CALL mp_bcast( l_tspace, ionode_id, world_comm ) 
> +  CALL mp_bcast( nbndt, ionode_id, world_comm ) 
> +  CALL mp_bcast( l_finite, ionode_id, world_comm ) 
> +  CALL mp_bcast( r_pola, ionode_id, world_comm ) 
> +  CALL mp_bcast( spectra_e_min, ionode_id, world_comm ) 
> +  CALL mp_bcast( spectra_e_max, ionode_id, world_comm ) 
> +  CALL mp_bcast( spectra_nstep, ionode_id, world_comm ) 
> +  CALL mp_bcast( spectra_broad, ionode_id, world_comm ) 
> +  CALL mp_bcast( l_restart, ionode_id, world_comm ) 
> +  CALL mp_bcast( n_eig_start, ionode_id, world_comm ) 
> +  CALL mp_bcast( nit_lcz, ionode_id, world_comm ) 
> +  CALL mp_bcast( l_lanczos, ionode_id, world_comm ) 
> +  CALL mp_bcast( l_restart_lcz, ionode_id, world_comm ) 
> +  CALL mp_bcast( nlcz_restart, ionode_id, world_comm ) 
> +  CALL mp_bcast( l_fullbse, ionode_id, world_comm ) 
> +  CALL mp_bcast( l_tdhf, ionode_id, world_comm ) 
> +  CALL mp_bcast( l_lf, ionode_id, world_comm ) 
> +  CALL mp_bcast( l_rpa, ionode_id, world_comm ) 
> +  CALL mp_bcast( l_contraction, ionode_id, world_comm)
> +  CALL mp_bcast( l_gtrick, ionode_id, world_comm)
> +  CALL mp_bcast( l_scissor, ionode_id, world_comm)
> +  CALL mp_bcast( qpe_imin, ionode_id, world_comm)
> +  CALL mp_bcast( qpe_imax, ionode_id, world_comm)
> +
> +  call read_file 
> +! after read_file everything is known
> +
> +#if defined __PARA
> +  kunittmp = kunit
> +#else
> +  kunittmp = 1
> +#endif
> +
> +  call openfil_bse
> +
> +  call read_export(pp_file,kunittmp,uspp_spsi, ascii, single_file, raw)
> +  call summary()  
> +  call print_bseinfo()
> +
> +  CALL hinit0()
> +  CALL set_vrs(vrs, vltot, v%of_r, kedtau, v%kin_r, dfftp%nnr, nspin, doublegrid )
> +
> +  if(l_verbose) write(stdout,*) 'To check, we print the KS eigenvalues:'
> +  FLUSH( stdout )
> +  !
> +  CALL print_ks_energies()
> +
> +! inizialize dual grid once for all
> +  fc%dual_t=dual_bse
> +  fc%ecutt=ecutwfc
> +  call initialize_fft_custom(fc)
> +
> +! read ks wavefunction, allocate and fill up the v_state object and c_state object
> +  if (allocated(evc)) deallocate (evc)
> +  call initialize_v_state(vstate)
> +  call make_v_state(num_nbndv,vstate)
> +  
> +  call initialize_c_state(cstate)
> +  call make_c_state(num_nbndv,cstate)
> +
> +  call initialize_c_state(wcstate)
> +  call make_c_state(num_nbndv,wcstate)
> +
> +! FFT the valence states vector into r-space (using dual_bse)
> +  call initialize_v_state_r(vstate_r)
> +  call v_wfng_to_wfnr(vstate,fc,vstate_r)
> + 
> +  if(.not.l_gtrick) then
> +     call v_wfng_to_wfnr(vstate,fc,vstate_r)
> +  endif
> +
> +! if debug mode check polarizability basis orthonormality
> +  if(debug) then
> +     call check_basis(numw_prod,npw) 
> +  endif
> +
> +! allocate once for all vg_q
> +  allocate(vg_q(npwx))
> +  if(.not.l_truncated_coulomb) then
> +     iuv = find_free_unit()
> +     CALL diropn( iuv, 'vgq', npwx, exst )
> +     CALL davcio(vg_q,npwx,iuv,1,-1)
> +     close(iuv)
> +  endif
> +     
> +
> +  if(debug) write(*,*) 'vgq allocated'
> +
> +! QP corrections read and used to prepare wcstate
> +  if(.not.l_scissor) then
> +     allocate(qpc(qpe_imax))
> +     call qpcorrections(wcstate)
> +  endif
> +
> +  if(l_tspace) then
> +!    solve the BSE in transition space
> +     call tspace_diago(vstate,vstate_r,fc)
> +  else
> +!    solve the BSE with cg, or steepest descent procedure, compute the optical spectrum,
> +!    and the excitonic wfns 
> +     if(l_lanczos) then
> +        if(debug) write(*,*) 'Solve using Lanczos'
> +        if(l_gtrick) call v_wfng_to_wfnr(vstate,fc,vstate_r)
> +        call lanczos(vstate,vstate_r,cstate,wcstate,fc)
> +     else
> +        if(l_gtrick) call v_wfng_to_wfnr(vstate,fc,vstate_r)!still to be implemented
> +        call find_eig(vstate,vstate_r,cstate,wcstate,fc)
> +     endif
> +     call mp_barrier(world_comm)
> +  endif 
> +
> +! 
> +
> +
> +! free memory
> +  call free_v_state_r(vstate_r)
> +  call free_v_state(vstate)
> +  call free_c_state(cstate)
> +  call free_c_state(wcstate)
> +  if(.not.l_scissor) deallocate(qpc)
> +
> +  write(stdout,*) 'BSE COMPLETED'
> +  call stop_clock('bse_main')
> +  call print_clock('bse_main')
> +  call print_clock('fft')
> +  call print_clock('ffts')
> +  call print_clock('fftw')
> +  call print_clock('cft3t')
> +  call print_clock('davcio')
> +  call print_clock('make_v_state')
> +  call print_clock('make_c_state')
> +  call print_clock('v_wfng_to_wfnr')
> +  call print_clock('c_wfng_to_wfnr')
> +  call print_clock('c_times_exc')
> +  call print_clock('pc_operator_exc')
> +  call print_clock('sproduct_exc')
> +  call print_clock('normalize_exc')
> +  call print_clock('pout_operator_exc')
> +  call print_clock('fft_a_exc')
> +  call print_clock('fftback_a_exc')
> +  call print_clock('urot_a')
> +  CALL print_clock('cgsolve')
> +  call print_clock('conjgrad')
> +  call print_clock('linmin')
> +  call print_clock('diago_exc')
> +  call print_clock('direct_v_exc')
> +  call print_clock('direct_w_exc')
> +  call print_clock('direct_w_dgemv')
> +  call print_clock('dgemv1')
> +  call print_clock('dgemv2')
> +  call print_clock('dgemv3')
> +  call print_clock('dgemv4')
> +  call print_clock('direct_w_cft3t')
> +  call print_clock('wdirect_fftback')
> +  call print_clock('exchange_exc')
> +  call print_clock('direct_w_contract')
> +  call print_clock('direct_v_contract')
> +  call print_clock('dvpsi_e')
> +  call print_clock('exc_h_a')
> +  call print_clock('find_eig')
> +  call print_clock('h_h')
> +  call print_clock('lanczos')
> +  call print_clock('lanczos_iterations')
> +  call print_clock('lanczos_cf')
> +  call print_clock('plot_excwfn')
> +  call print_clock('print_spectrum')
> +  call print_clock('read_export')
> +  call print_clock('rotate_wannier_gamma_bse')
> +  call print_clock('sdescent')
> +  call print_clock('build_spectrum')
> +  call print_clock('absorption')
> +  call print_clock('amplitude_finite')
> +  call print_clock('amplitude')
> +  call print_clock('tspace_diago')
> +  call print_clock('build_exch')
> +  call print_clock('read_wannier_matrix')
> +
> +  CALL FLUSH( stdout )
> +
> +  call stop_pp
> +
> +  stop
> +end program bse_punch
> +
> +
> +
> +
> +
> 
> Added: trunk/espresso/GWW/bse/bse_wannier.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/bse_wannier.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/bse_wannier.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,63 @@
> +!this module contains bse code input variables  
> +MODULE bse_wannier
> +
> +  USE kinds, ONLY: DP
> +
> +  INTEGER :: numw_prod!number of products w_i(r)*w_j(r) then of orthonormalized products
> +  INTEGER :: num_nbndv(2) !number of valence bands
> +  INTEGER :: nset!number of states to be read  written from/to file simultaneously
> +  INTEGER :: maxit!maximun number of iterarion for conj gradient minimiaztion
> +  INTEGER :: n_eig!number of excitoninc eigenvalues to be found
> +  INTEGER :: cg_nreset!conj gradient variable number of steps after which 
> +                      !the residual is effectively computed with the application
> +                      ! of the Hamiltonian  
> +  INTEGER :: plotn_min!start plotting from plot_nmin eigenstate 
> +  INTEGER :: plotn_max!end plotting at plot_nmax eigenstate
> +  INTEGER :: nbndt! total number of bands
> +  INTEGER :: spectra_nstep! number of frequencies for which the spectrum will be
> +                          !computed 
> +  INTEGER :: l_restart! if 1 restart from the computation of the n_eig_start
> +                      ! vector; if 2 restart from the calculation of the
> +                      ! absorption spectrum
> +  INTEGER :: n_eig_start! eigenvector index where the calculation restarts from 
> +  INTEGER :: nlcz_restart! number of lanczos iteration where the calculation
> +                            !restarts from
> +  INTEGER :: nit_lcz! number of total lanczos iteration to be performed 
> +  INTEGER :: qpe_imin ! index of the lowest band for which GW energies are known
> +  INTEGER :: qpe_imax ! index of the highest band for which GW energies are known 
> +  LOGICAL :: l_truncated_coulomb!if true the Coulomb potential is truncated
> +  LOGICAL :: l_verbose!if true higher verbosity output
> +  LOGICAL :: l_cgrad! if true conjgradient diago
> +  LOGICAL :: l_plotexc !if true plot the excitonic wavefunctions
> +  LOGICAL :: l_plotaverage !if true plot the average excitonic wavefunction (for degenerate states)
> +  LOGICAL :: l_tspace !if true solve the BSE in transition space
> +  LOGICAL :: l_finite !if true computes the spectrum using the expectation
> +                       !value of the position operator, makes sense only for finite
> +                       !systems
> +  LOGICAL :: l_lanczos !if true computes absorption through a lanczos procedure
> +  LOGICAL :: l_restart_lcz! if true restart the lanczos calculation from
> +                          ! previous run
> +  LOGICAL :: l_fullbse ! if true perform full BSE calculation
> +  LOGICAL :: l_tdhf ! if true perform do not include Wc,i.e. perform
> +                    ! a td-hf like calculation
> +  LOGICAL :: l_lf   ! do not include the W=v+Wc term local fields only
> +  LOGICAL :: l_rpa  ! RPA non-local field calculation
> +  LOGICAL :: l_scissor! if true use scissor operator, else use computed QP energies
> +  REAL(kind=DP) :: truncation_radius!truncation radius for Coulomb potential
> +  REAL(kind=DP) :: dual_bse!dual factor for bse calculations
> +  REAL(kind=DP) :: lambda!small positive real for steepest descendent
> +  REAL(kind=DP) :: eps!small positive real for steepest descendent convergence
> +  REAL(kind=DP) :: eps_eig!small positive real for steepest descendent convergence check on eigenstate
> +  REAL(kind=DP) :: lm_delta! magnitude of the step along the search direction in
> +                           ! the line minimization subroutine  
> +  REAL(kind=DP) :: scissor!scissor operator opening the KS gap in eV
> +  REAL(kind=DP) :: r_hole(3)!coordinate of the hole (in alat units) for the excitonic wavefunction plot
> +  REAL(kind=DP) :: r_pola(3)!polarization direction
> +  REAL(kind=DP) :: spectra_e_min!minimum energy for the absorption spectrum
> +  REAL(kind=DP) :: spectra_e_max!max energy for the absorption spectrum
> +  REAL(kind=DP) :: spectra_broad!range of broadening for the spectra (in eV) 
> +  LOGICAL :: l_contraction!if true uses contraction for speeding-up
> +  LOGICAL :: l_gtrick!save arrays in G space for limiting memory usage
> +
> +END MODULE bse_wannier
> +
> 
> Added: trunk/espresso/GWW/bse/cgsolve.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/cgsolve.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/cgsolve.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,189 @@
> +!
> +! Copyright (C) 2003 PWSCF group
> +! This file is distributed under the terms of the
> +! GNU General Public License. See the file `License'
> +! in the root directory of the present distribution,
> +! or http://www.gnu.org/copyleft/gpl.txt .
> +!
> +!
> +!-----------------------------------------------------------------------
> +SUBROUTINE cgsolve (npw,evc,npwx,nbnd,overlap,      &
> +     &              nbndx,orthonormal,precondition,diagonal, &
> +     &              startwith0,e,b,u,h,Ah,pu,niter,eps,iter,x)
> +  !-----------------------------------------------------------------------
> +  !
> +  !  conjugate-gradient solution of a system of constrained linear equations
> +  !  "operator" is the linear operator - diagonal preconditioning allowed
> +  !  x = solution, u = gradient, h = conjugate gradient, Ah = operator*h
> +  !
> +  USE io_global, ONLY : stdout
> +  USE kinds,     ONLY : DP
> +  USE becmod,    ONLY : calbec
> +  USE io_global,   ONLY : ionode,stdout
> +  USE gvect,                 ONLY : gstart
> +  USE mp, ONLY : mp_sum, mp_barrier
> +  USE mp_world,             ONLY : world_comm
> +
> +  IMPLICIT NONE
> +  INTEGER npw, npwx, nbnd, nbndx, niter, iter
> +  real(DP) :: diagonal(npw), e(nbnd), overlap(nbndx,nbnd)
> +  COMPLEX(DP) :: x(npwx,nbnd), b(npwx,nbnd), u(npwx,nbnd),          &
> +       h(npwx,nbnd),Ah(npwx,nbnd),evc(npwx,nbnd), pu(npwx,nbnd)
> +  LOGICAL :: orthonormal, precondition,startwith0
> +  !
> +  INTEGER :: ibnd, jbnd, i, info
> +  real(DP) :: lagrange(nbnd,nbnd)
> +  real(DP) :: lambda, u_u, uu0, u_A_h, alfa, eps, uu(nbnd), ddot
> +  logical ::  debug
> +  real(kind=DP), allocatable :: omat(:,:)
> +  integer j
> +  EXTERNAL ddot, H_h
> +  !
> +  CALL start_clock('cgsolve')
> +  debug=.false.
> +  !
> +  ! starting gradient |u> = (A|x>-|b>)-lambda|psi> (lambda=<Ax-b|psi_i>)
> +  !
> +  IF (.not.startwith0) THEN
> +     CALL H_h(e,x,u)
> +  ELSE
> +     u (:,:) = (0.d0, 0.d0)
> +     ! note that we assume x=0 on input
> +  ENDIF
> +  !
> +
> +  allocate(omat(nbnd,nbnd))
> +  omat(1:nbnd,1:nbnd)=0.d0
> +  if(debug) then
> +     call dgemm('T','N',nbnd,nbnd,2*npw,2.d0,b,2*npwx,b,2*npwx,0.d0,omat,nbnd)
> +     if(gstart==2) then
> +        do i=1,nbnd
> +           do j=1,nbnd
> +              omat(i,j)=omat(i,j)-dble(b(1,i)*conjg(b(1,j)))
> +           enddo
> +        enddo
> +     endif
> +   
> +     call mp_sum(omat,world_comm)  
> +     do i=1,nbnd
> +        do j=1,nbnd
> +           if(ionode) write(*,*) 'nbnd1,nbnd2,omat4(1,2)', i,j,omat(i,j)
> +        enddo
> +     enddo
> +  endif
> +
> +
> + 
> +  CALL daxpy(2*npwx*nbnd,-1.d0,b,1,u,1)
> +
> +  IF (precondition) THEN
> +     CALL zvscal(npw,npwx,nbnd,diagonal,u,pu)
> +     CALL calbec ( npw, evc, pu, lagrange )
> +  ELSE
> +     CALL calbec ( npw, evc,  u, lagrange )
> +   ENDIF
> +  IF (.not. orthonormal) &
> +       CALL DPOTRS('U',nbnd,nbnd,overlap,nbndx,lagrange,nbnd,info)
> +  IF (info/=0) CALL errore('cgsolve','error in potrs',info)
> +  !
> +  CALL dgemm ('N', 'N', 2*npw, nbnd, nbnd, -1.d0, evc, &
> +       2*npwx, lagrange, nbnd, 1.d0, u, 2*npwx)
> +  !
> +  ! starting conjugate gradient |h> = |u>
> +  IF (precondition) THEN
> +     CALL zvscal(npw,npwx,nbnd,diagonal,u,h)
> +  ELSE
> +     CALL zcopy(npwx,nbnd,u,1,h,1)
> +  ENDIF
> +  ! uu = <u|h>
> +  CALL pw_dot('Y',npw,nbnd,u,npwx,h,npwx,uu)
> +  u_u = 0.0d0
> +  DO ibnd=1,nbnd
> +     u_u = u_u + uu(ibnd)
> +  ENDDO
> +  !
> +  !      print '("  iter # ",i3,"  u_u = ",e10.4)', 0, u_u
> +  !
> +  !   main iteration loop
> +  !
> +  DO iter = 1, niter
> +  
> +     !
> +     ! calculate A|h>
> +     !
> +     CALL H_h(e,h(1,1),Ah(1,1))
> +     !
> +     ! u_A_h = <u|A|h> (NB: must be equal to <h|A|h>)
> +     IF (precondition) THEN
> +        CALL zvscal(npw,npwx,nbnd,diagonal,u,pu)
> +        ! uu = <u|PA|h>
> +        CALL pw_dot('Y',npw,nbnd,pu,npwx,Ah,npwx,uu)
> +     ELSE
> +        ! uu = <u|A|h>
> +        CALL pw_dot('Y',npw,nbnd, u,npwx,Ah,npwx,uu)
> +     ENDIF
> +     u_A_h = 0.0d0
> +     DO ibnd=1,nbnd
> +        u_A_h = u_A_h + uu(ibnd)
> +     ENDDO
> +     !
> +     lambda = - u_u / u_A_h
> +     ! update the gradient and the trial solution
> +     uu0 = u_u
> +     u_u = 0.0d0
> +     CALL daxpy(2*npwx*nbnd,lambda, h,1,x,1)
> +     CALL daxpy(2*npwx*nbnd,lambda,Ah,1,u,1)
> +     ! lagrange multipliers ensure orthogonality of the solution
> +     IF (precondition) THEN
> +        CALL zvscal(npw,npwx,nbnd,diagonal,u,pu)
> +        CALL calbec ( npw, evc, pu, lagrange )
> +     ELSE
> +        CALL calbec ( npw, evc,  u, lagrange )
> +     ENDIF
> +     IF (.not. orthonormal) &
> +          CALL DPOTRS('U',nbnd,nbnd,overlap,nbndx,lagrange,nbnd,info)
> +     IF (info/=0) CALL errore('cgsolve','error in potrs',info)
> +     CALL dgemm ('N', 'N', 2*npw, nbnd, nbnd,-1.d0, evc, &
> +          2*npwx, lagrange, nbndx, 1.d0, u, 2*npwx)
> +     IF (precondition) THEN
> +        CALL zvscal(npw,npwx,nbnd,diagonal,u,pu)
> +        ! uu = <u|A|u>
> +        CALL pw_dot('Y',npw,nbnd, u,npwx,pu,npwx,uu)
> +     ELSE
> +        ! uu = <u|u>
> +        CALL pw_dot('Y',npw,nbnd, u,npwx, u,npwx,uu)
> +     ENDIF
> +     u_u = 0.0d0
> +     DO ibnd=1,nbnd
> +        u_u = u_u + uu(ibnd)
> +     ENDDO
> +     !         print '("  iter # ",i3,"  u_u = ",e10.4)', iter, u_u
> +     !
> +     IF( u_u <= eps) GOTO 10
> +     IF (iter==niter) THEN
> +        WRITE( stdout,'("   *** Conjugate Gradient minimization",   &
> +             &    " not converged after ",i3," iterations"/ &
> +             &    " residual norm |Ax-b|^2 : ",e10.4)') iter,u_u
> +        GOTO 10
> +     ENDIF
> +     !   update the conjugate gradient
> +     alfa =  u_u / uu0
> +     DO ibnd = 1,nbnd
> +        IF (precondition) THEN
> +           DO i=1,npw
> +              h(i,ibnd) = alfa*h(i,ibnd) + u(i,ibnd)*diagonal(i)
> +           ENDDO
> +        ELSE
> +           DO i=1,npw
> +              h(i,ibnd) = alfa*h(i,ibnd) + u(i,ibnd)
> +           ENDDO
> +        ENDIF
> +     ENDDO
> +  ENDDO
> +  !
> +10 CONTINUE
> +  CALL stop_clock('cgsolve')
> +  !
> +  deallocate(omat)
> +  RETURN
> +END SUBROUTINE cgsolve
> 
> Added: trunk/espresso/GWW/bse/check_basis.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/check_basis.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/check_basis.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,82 @@
> +subroutine check_basis(numwprod,npw)
> +! checking if the polarizability basis is orthonormal
> +
> +USE fft_custom_gwl
> +!USE io_files,             ONLY : find_free_unit, prefix, diropn
> +USE io_files,             ONLY : prefix, diropn
> +USE wavefunctions_module, ONLY :  psic
> +USE mp,          ONLY :mp_barrier
> +use io_global, ONLY : stdout, ionode 
> +USE kinds, ONLY : DP
> +USE mp,             ONLY : mp_sum
> +use mp_world, ONLY : mpime
> +USE mp_world,             ONLY : world_comm
> +USE gvect,          ONLY : gstart,ngm_g
> +
> +
> +implicit none
> +INTEGER, EXTERNAL :: find_free_unit
> +REAL(kind=DP), EXTERNAL :: ddot
> +
> +integer numwprod
> +integer npw
> +
> +COMPLEX(kind=DP), ALLOCATABLE :: p_basis(:,:)
> +real(kind=DP) :: prod
> +INTEGER ::iungprod
> +INTEGER ::iunnorm
> +LOGICAL       :: exst
> +integer :: ii,jj
> +
> +
> +
> +iungprod = find_free_unit()
> +allocate(p_basis(npw,numwprod))
> +CALL diropn( iungprod, 'wiwjwfc_red', npw*2, exst )
> +
> +do ii=1,numwprod
> +   call davcio(p_basis(:,ii),npw*2,iungprod,ii,-1)
> +enddo
> +
> +call mp_barrier(world_comm)
> +close(iungprod)
> +
> +
> +! check normalization
> +if(ionode) then
> +   iunnorm = find_free_unit()
> +   open(iunnorm, file='pol_basis_norm.dat',status='unknown',form='formatted')
> +   write(iunnorm,*) '# Pol_vector_i, Norm'
> +endif
> +
> +do ii=1,numwprod   
> +   prod=2.d0*ddot(2*npw,p_basis(:,ii),1,p_basis(:,ii),1)
> +   if (gstart==2) prod=prod-p_basis(1,ii)*p_basis(1,ii)
> +   call mp_sum(prod,world_comm)
> +!   prod=prod/ngm_g
> +   if(ionode) write(iunnorm,*) ii,prod
> +enddo   
> +
> +if(ionode) close(iunnorm)
> +
> +!check orthogonality
> +
> +if(ionode) then
> +   iunnorm = find_free_unit()
> +   open(iunnorm, file='pol_basis_ortho.dat',status='unknown',form='formatted')
> +   write(iunnorm,*) '# Pol_vector_i, #Polarization vector j, Product'
> +endif
> +
> +do ii=1,numwprod
> +   do jj=ii+1,numwprod   
> +      prod=2.d0*ddot(2*npw,p_basis(:,ii),1,p_basis(:,jj),1)
> +      if (gstart==2) prod=prod-p_basis(1,ii)*p_basis(1,jj)
> +      call mp_sum(prod,world_comm)
> +!      prod=prod/ngm_g
> +      if(ionode) write(iunnorm,*) ii,jj,prod
> +   enddo
> +enddo   
> +
> +if(ionode) close(iunnorm)
> +return
> +end subroutine
> 
> Added: trunk/espresso/GWW/bse/conj_grad_stuff.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/conj_grad_stuff.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/conj_grad_stuff.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,305 @@
> +subroutine linmin(i_state,a_exc,fpa,dfa,h,modh,x,vstatecg,vstate_rcg,cstate,wcstate,fc,delta_eig,restart,iter,cgstatus,step_rel)
> +! This subroutine approximates the minimum of the functional
> +! f=<a_exc|Hexc|a_exc>-lambda*(<a_exc|a_exc>-1)
> +! along a given direction |h> with the vertex of a 
> +! parabola passing through the points (a_exc,fpa), and (b_exc=a_exc+delta*grad f,fpb) and having
> +! slope -dfa at the a_exc point.
> +! it updates the vector |a_exc> setting it at the 
> +! found minimum position, and it updates the value of the functional fpa, and returns its 
> +! gradient, |x>, at the new position  
> +
> +USE exciton 
> +USE fft_custom_gwl 
> +USE io_global, ONLY : stdout,ionode
> +USE wvfct,    ONLY : npw
> +use bse_wannier, ONLY:num_nbndv,lm_delta,eps,maxit
> +USE bse_basic_structures  
> +!USE mp,          ONLY :mp_barrier
> +!USE constants,        ONLY: RYTOEV
> +
> +implicit none
> +type(exc), intent(inout) :: a_exc
> +type(exc), intent(out) :: x
> +type(exc), intent(in) :: h
> +real(kind=DP), intent(inout) :: dfa,fpa
> +real(kind=DP), intent(inout) :: modh
> +real(kind=DP), intent(inout):: step_rel
> +logical, intent(inout) ::cgstatus
> +
> +type(v_state) :: vstatecg
> +type(v_state_r) :: vstate_rcg
> +type(c_state) :: cstate
> +type(c_state) :: wcstate
> +type(fft_cus) :: fc
> +
> +real(kind=DP) :: fp_out, fpb, fpv,delta_new,b,c,a,delta_eig,hsquare,fpc
> +
> +type(exc) :: b_exc
> +type(exc) :: c_exc
> +type(exc) :: v_exc
> +type(exc) :: xb
> +type(exc) :: xv
> +
> +integer :: is,j,i,i_state,iter
> +real(kind=DP):: step,fptmp
> +real(kind=DP) ::e_expected,pb,pc
> +logical ::restart,fit_3p
> +integer :: fit_ok
> +
> +real(kind=DP) :: deri,sca
> +
> +call start_clock('linmin')
> +fit_3p=.true.
> +
> +
> +!initialize internal vectors
> +!second point for the qudratic fit
> +call initialize_exc(b_exc)
> +b_exc%label=1
> +b_exc%npw=npw
> +b_exc%numb_v=num_nbndv(1)
> +allocate(b_exc%a(b_exc%npw,b_exc%numb_v))
> +
> +if(fit_3p) then
> +!this variable is needed if we want to find the 
> +!parabola equations given three points
> +   call initialize_exc(c_exc)
> +   c_exc%label=1
> +   c_exc%npw=npw
> +   c_exc%numb_v=num_nbndv(1)
> +   allocate(c_exc%a(c_exc%npw,c_exc%numb_v))
> +endif
> +
> +!gradient at the second point for the qudratic fit
> +call initialize_exc(xb)
> +xb%label=1
> +xb%npw=npw
> +xb%numb_v=num_nbndv(1)
> +allocate(xb%a(xb%npw,xb%numb_v))
> +
> +!position of the vertex of the qudratic fit
> +call initialize_exc(v_exc)
> +v_exc%label=1
> +v_exc%npw=npw
> +v_exc%numb_v=num_nbndv(1)
> +allocate(v_exc%a(v_exc%npw,v_exc%numb_v))
> +
> +!gradient at the position of the vertex of the qudratic fit
> +call initialize_exc(xv)
> +xv%label=1
> +xv%npw=npw
> +xv%numb_v=num_nbndv(1)
> +allocate(xv%a(xv%npw,xv%numb_v))
> +
> +!for debug purpose plot the energies along the search line
> +!if(.not.cgstatus) then
> +!   do j=0,50
> +!    step=step_rel/25
> +!      b_exc%a(1:b_exc%npw,1:b_exc%numb_v)=a_exc%a(1:a_exc%npw,1:a_exc%numb_v)+&
> +!                             step*dble(j)*h%a(1:h%npw,1:h%numb_v) 
> +!      do is = 1,vstatecg%nspin
> +!         call pc_operator_exc(b_exc,vstatecg,is)
> +!      enddo
> +!      call pout_operator_exc(b_exc,i_state)
> +!
> +!      call normalize_exc(b_exc)
> +!
> +!!compute the function (and the gradient) at the new b_exc position
> +!      call exc_h_a(b_exc,xb,vstatecg,vstate_rcg,fc) 
> +!      call sproduct_exc(b_exc,xb,fpb)
> +!
> +!      write(stdout,*) 'en_prof', iter, dble(j), fpb
> +!   enddo
> +!endif
> +
> +
> +
> +!take one step along the search direction
> +b_exc%a(1:b_exc%npw,1:b_exc%numb_v)=a_exc%a(1:a_exc%npw,1:a_exc%numb_v)+step_rel*h%a(1:h%npw,1:h%numb_v) 
> +
> +
> +!project into the conduction manifold,remove any component along previous
> +!eigenstates, and normalize b_exc
> +do is = 1,vstatecg%nspin
> +   call pc_operator_exc(b_exc,vstatecg,is)
> +enddo
> +
> +call pout_operator_exc(b_exc,i_state)!DEBUG
> +
> +call normalize_exc(b_exc)
> +
> +!compute the function at the new b_exc position
> +call exc_h_a(b_exc,xb,vstatecg,vstate_rcg,cstate,wcstate,fc) 
> +call sproduct_exc(b_exc,xb,fpb)
> +
> +
> +call exc_h_a(a_exc,xb,vstatecg,vstate_rcg,cstate,wcstate,fc)
> +call sproduct_exc(a_exc,xb,sca)
> +xb%a(1:xb%npw,1:xb%numb_v)=xb%a(1:xb%npw,1:xb%numb_v)&
> +                          -sca*a_exc%a(1:b_exc%npw,1:b_exc%numb_v)
> +call sproduct_exc(xb,h,deri)
> +deri=deri*2.d0
> +
> +!compute the quadratic fit
> +if(.not.fit_3p) then 
> +   b=deri!-dfa DEBUG
> +   c=fpa
> +   pb=step_rel
> +   a= (fpb-b*pb-c)/((pb)**2.d0)
> +
> +endif
> +
> +if(fit_3p) then
> +! find the third point C and compute there the function fpc
> +   c_exc%a(1:c_exc%npw,1:c_exc%numb_v)=a_exc%a(1:a_exc%npw,1:a_exc%numb_v)+0.5d0*step_rel*h%a(1:h%npw,1:h%numb_v) 
> +   do is = 1,vstatecg%nspin
> +      call pc_operator_exc(c_exc,vstatecg,is)
> +   enddo
> +
> +   call pout_operator_exc(c_exc,i_state)
> +
> +   call normalize_exc(c_exc)
> +
> +!  xv is used here a tmp variable
> +   call exc_h_a(c_exc,xv,vstatecg,vstate_rcg,cstate,wcstate,fc) 
> +   call sproduct_exc(c_exc,xv,fpc)
> +
> +   pb=step_rel
> +   pc=0.5d0*step_rel
> +   
> +   c=fpa 
> +   a=(fpb*pc-fpc*pb-fpa*(pc-pb))/(pc*pb*(pb-pc))
> +   b=(-fpb*pc*pc+fpc*pb*pb+fpa*(pc*pc-pb*pb))/(pc*pb*(pb-pc))
> +
> +endif
> +
> +
> +delta_new=-b/(2.d0*a)
> +
> +!compute the coordinate of the parabola vertex
> +v_exc%a(:,:)=a_exc%a(:,:)+delta_new*h%a(:,:) 
> +
> +!write(stdout,*) 'delta_new',iter, delta_new
> +
> +!compute the expected value for the minimum from the parabolic fit
> +e_expected=-b**2/(4*a)+c
> +
> +!project into the conduction manifold, remove any component along previous
> +!eigenstates and normalize v_exc
> +do is = 1,vstatecg%nspin
> +   call pc_operator_exc(v_exc,vstatecg,is)
> +enddo
> +
> +call pout_operator_exc(v_exc,i_state)
> +call normalize_exc(v_exc)
> +
> +!compute the function (and the gradient) at the new v_exc position
> +call exc_h_a(v_exc,xv,vstatecg,vstate_rcg,cstate,wcstate,fc) 
> +call sproduct_exc(v_exc,xv,fpv)
> +xv%a(1:xv%npw,1:xv%numb_v)=xv%a(1:xv%npw,1:xv%numb_v)&
> +                          -fpv*v_exc%a(1:v_exc%npw,1:v_exc%numb_v)
> +
> +!write(stdout,*) 'min', iter, e_expected
> +!write(stdout,*) 'fpa', iter, fpa
> +!write(stdout,*) 'fpb', iter, fpb
> +!write(stdout,*) 'fpv', iter, fpv
> +!write(stdout,*) 'FIT PARAMETER A', iter, a
> +!write(stdout,*) 'FIT PARAMETER B', iter, b
> +!write(stdout,*) 'FIT PARAMETER C', iter, c
> +
> +!find where our functional is minimum among the last three points
> +
> +if ((fpv<=fpa).and.(fpv<=fpb)) then
> +!     the quadratic fit found a good position update the information
> +   fit_ok=1
> +   cgstatus=.true.
> +   a_exc%a(:,:)=v_exc%a(:,:)
> +!   write(stdout,*) 'Quadratic fit: V new point, fpv=',fpv, 'fpa=',fpa  
> +   fpa=fpv      
> +   x%a(1:x%npw,1:x%numb_v)=xv%a(1:xv%npw,1:xv%numb_v)
> +else
> +!take a steepest descent step and restart the cg
> +   write(stdout,*) 'WARNING:I restart the cg'
> +   cgstatus=.false.
> +   fit_ok=0
> +   j=1
> +   fptmp=fpa+0.1d0
> +   do while (fptmp>=fpa)
> +      step=(0.3d0)*(0.1d0**j)
> +      b_exc%a(:,:)=a_exc%a(:,:)+step_rel*step*h%a(:,:) 
> +
> +!  project into the conduction manifold remove any component along previous
> +!  eigenstates, and normalize b_exc
> +      do is = 1,vstatecg%nspin
> +         call pc_operator_exc(b_exc,vstatecg,is)
> +      enddo
> +      call pout_operator_exc(b_exc,i_state)
> +      call normalize_exc(b_exc)
> +
> +!  compute the function (and the gradient) at the new b_exc position
> +      call exc_h_a(b_exc,xb,vstatecg,vstate_rcg,cstate,wcstate,fc) 
> +      call sproduct_exc(b_exc,xb,fptmp)
> +      xb%a(1:xb%npw,1:xb%numb_v)=xb%a(1:xb%npw,1:xb%numb_v)&
> +                          -fptmp*b_exc%a(1:b_exc%npw,1:b_exc%numb_v)
> +  
> +      if(fptmp>=fpa) then! look in the opposite direction
> +         b_exc%a(:,:)=a_exc%a(:,:)-step_rel*step*h%a(:,:)
> +!        project into the conduction manifold remove any component along previous
> +!        eigenstates, and normalize b_exc
> +         do is = 1,vstatecg%nspin
> +            call pc_operator_exc(b_exc,vstatecg,is)
> +         enddo
> +         call pout_operator_exc(b_exc,i_state)
> +         call normalize_exc(b_exc)
> +
> +!  compute the function (and the gradient) at the new b_exc position
> +         call exc_h_a(b_exc,xb,vstatecg,vstate_rcg,cstate,wcstate,fc) 
> +         call sproduct_exc(b_exc,xb,fptmp)
> +         xb%a(1:xb%npw,1:xb%numb_v)=xb%a(1:xb%npw,1:xb%numb_v)&
> +                          -fptmp*b_exc%a(1:b_exc%npw,1:b_exc%numb_v)
> +    
> +      endif
> +      j=j+1
> +   enddo 
> +   
> +   a_exc%a(:,:)=b_exc%a(:,:)
> +   fpa=fptmp      
> +   x%a(1:x%npw,1:x%numb_v)=xb%a(1:xb%npw,1:xb%numb_v)
> +endif   
> +
> +
> +!project into the conduction manifold
> +do is = 1,vstatecg%nspin
> +   call pc_operator_exc(x,vstatecg,is)
> +enddo
> +call pout_operator_exc(x,i_state)
> +!compute dfa, i.e. the magnitude of the gradient at the position a_exc
> +call sproduct_exc(x,x,dfa)
> +dfa=sqrt(dfa)
> +
> +
> +!compute delta_eig (using as a tmp variable b)
> +call exc_h_a(a_exc,b_exc,vstatecg,vstate_rcg,cstate,wcstate,fc) 
> +
> +call sproduct_exc(b_exc,b_exc,hsquare)
> +
> +delta_eig=hsquare-fpa**2
> +
> +! set the magnitude of the next step 
> +if(cgstatus) then 
> +   step_rel=2.d0*delta_new 
> +else
> +   step_rel=(0.3d0)*(0.1d0**(j-1))
> +endif
> +
> +!write(stdout,*) 'fit_ok', iter, fit_ok
> +!free memory
> +call free_memory_exc_a(b_exc)
> +if(fit_3p) call free_memory_exc_a(c_exc)
> +call free_memory_exc_a(v_exc)
> +call free_memory_exc_a(xb)
> +call free_memory_exc_a(xv)
> +
> +call stop_clock('linmin')
> +return
> +end subroutine 
> 
> Added: trunk/espresso/GWW/bse/conjgrad.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/conjgrad.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/conjgrad.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,198 @@
> +subroutine conjgrad(i_state,vstatecg,vstate_rcg,cstate,wcstate,fc,en)
> +use exciton 
> +use bse_basic_structures 
> +USE fft_custom_gwl 
> +USE io_global, ONLY : stdout,ionode
> +USE wvfct,    ONLY : npw
> +use bse_wannier, ONLY:num_nbndv,eps,maxit,eps_eig,lm_delta
> +USE mp,          ONLY :mp_barrier
> +USE mp_world,             ONLY : world_comm
> +USE constants,        ONLY: RYTOEV
> +
> +implicit none
> +
> +type(exc) :: a_exc
> +type(exc) :: x
> +type(exc) :: g
> +type(exc) :: h
> +
> +type(v_state) :: vstatecg
> +type(v_state_r) :: vstate_rcg
> +type(c_state) :: cstate
> +type(c_state) :: wcstate
> +type(fft_cus) :: fc
> +
> +!real(kind=DP) :: ha,hb,hc
> +
> +real(kind=dp), intent(out) :: en
> +
> +real(kind=DP) :: fp,fp_out,dfp,cg_lambda,dfg,step_rel
> +real(kind=DP) :: eigout,gg,dgg,gam,delta,hsquare,delta_eig,modh
> +integer :: i, is, i_state
> +
> +logical :: restart
> +logical :: cgstatus
> +
> +call start_clock('conjgrad')
> +
> +restart=.false.
> +cgstatus=.false.
> +
> +if(ionode) write(stdout,*) 'Conjugate gradient started.'
> +if(ionode) write(stdout,*) 'Looking for eigenvalue number:',i_state
> +if(ionode) write(stdout,*) 'eps=',eps
> +if(ionode) write(stdout,*) 'eps_eig=',eps_eig
> +
> +!create initial random excitonic wavefunction vector a_exc 
> +call initialize_exc(a_exc)
> +a_exc%label=1
> +a_exc%npw=npw
> +a_exc%numb_v=num_nbndv(1)
> +allocate(a_exc%a(a_exc%npw,a_exc%numb_v))
> +
> +call random_exc(a_exc)
> +
> +!project into the conduction manifold
> +do is = 1,vstatecg%nspin
> +   call pc_operator_exc(a_exc,vstatecg,is)
> +enddo
> +
> +!project out all the previous found state
> +call pout_operator_exc(a_exc,i_state)
> + 
> +!and normalize it
> +call normalize_exc(a_exc)
> +
> +!initialize the other (internal) vectors
> +!within the cc iteration x is the gradient vector 
> +call initialize_exc(x)
> +x%label=1
> +x%npw=npw
> +x%numb_v=num_nbndv(1)
> +allocate(x%a(x%npw,x%numb_v))
> +
> +!within the cg iteration g is the vector that stores (minus) the previous-step's gradient  
> +call initialize_exc(g)
> +g%label=1
> +g%npw=npw
> +g%numb_v=num_nbndv(1)
> +allocate(g%a(g%npw,g%numb_v))
> +
> +!within the cg gradient iteration h is storing the search direction 
> +!check if really needed
> +call initialize_exc(h)
> +h%label=1
> +h%npw=npw
> +h%numb_v=num_nbndv(1)
> +allocate(h%a(h%npw,h%numb_v))
> +
> +
> +!Compute gradient at the initial guess position  
> +call exc_h_a(a_exc,x,vstatecg,vstate_rcg,cstate,wcstate,fc) 
> +
> +!Compute function value fp at the initial guess position a_exc
> +call sproduct_exc(a_exc,x,fp)
> +!call sproduct_exc(x,x,hsquare)
> +
> +x%a(1:x%npw,1:x%numb_v)=x%a(1:x%npw,1:x%numb_v)&
> +                          -fp*a_exc%a(1:a_exc%npw,1:a_exc%numb_v)
> +
> +!Project the gradient into the conduction states manifold
> +!Remove any component along the eigenstates already found
> +do is = 1,vstatecg%nspin
> +   call pc_operator_exc(x,vstatecg,is)
> +enddo
> +call pout_operator_exc(x,i_state)
> +
> +!Compute dfp, i.e. the magnitude of the gradient at the initial guess position a_exc
> +call sproduct_exc(x,x,dfp)
> +dfp=sqrt(dfp)
> +dfg=dfp
> +modh=dfp
> +
> +!Set initial g and h values 
> +g%a(1:g%npw,1:g%numb_v)=-x%a(1:x%npw,1:x%numb_v)
> +h%a(1:h%npw,1:h%numb_v)=g%a(1:g%npw,1:g%numb_v)
> +
> +!Start CG iterations
> +step_rel=lm_delta
> +delta=100
> +delta_eig=dfp-fp**2
> +i=1
> +do while ((i<=maxit).and.((delta>=eps).or.(delta_eig>eps_eig)))
> +   
> +   if(i>1)delta=abs(eigout-fp*RYTOEV)
> +!  note that delta_eig is calculated for each step within linmin
> +   
> +   eigout=fp*RYTOEV
> +   if(ionode) write(stdout,*) 'CG: eig#',i_state,'it=', i, 'Eig (eV)=',eigout 
> +!   if(ionode) write(stdout,*) 'CG: eig#',i_state,'it=', i, 'delta_eig=',delta_eig
> +
> +!   if(ionode) write(stdout,*) 'CG: it=', i, 'Delta(eV)=',delta
> +
> +   call linmin(i_state,a_exc,fp,dfp,h,modh,x,vstatecg,vstate_rcg,cstate,wcstate,fc,delta_eig,restart,i,cgstatus,step_rel)
> +
> +   if(cgstatus) then   
> +
> +      gg=dfg**2.d0 
> +      if (gg==0.d0) exit
> +
> +      call sproduct_exc(g,x,dgg)
> +      dgg=dfp**2+dgg
> +
> +      gam=max(dgg/gg,0.d0)
> +
> +      g%a(1:g%npw,1:g%numb_v)=-x%a(1:x%npw,1:x%numb_v)
> +      dfg=dfp
> +
> +      h%a(1:h%npw,1:h%numb_v)= g%a(1:g%npw,1:g%numb_v)+gam*h%a(1:h%npw,1:h%numb_v)
> +!      x%a(1:x%npw,1:x%numb_v)=h%a(1:h%npw,1:h%numb_v)
> +     
> +      call sproduct_exc(h,h,modh)
> +      modh=sqrt(modh)
> +
> +   else 
> +   ! every 20 iterations restart the CG or if parabolic fit didn't work
> +   
> + 
> +      h%a(1:h%npw,1:h%numb_v)=-x%a(1:x%npw,1:x%numb_v)
> +      g%a(1:g%npw,1:g%numb_v)=-x%a(1:x%npw,1:x%numb_v)
> +      dfg=dfp
> +      modh=dfp
> +
> +
> +   endif
> +   
> +   i=i+1
> +
> +   call mp_barrier(world_comm)
> +enddo
> +
> +bse_spectrum(i_state)%a(1:bse_spectrum(i_state)%npw,1:bse_spectrum(i_state)%numb_v)=&
> +                         a_exc%a(1:a_exc%npw,1:a_exc%numb_v)
> +
> +
> +bse_spectrum(i_state)%e=fp*RYTOEV
> +
> +en=fp
> +
> +
> +if(i==maxit)  then
> +   if(ionode) write(stdout,*) 'WARNING Conjugate gradient: Max iteration reached'
> +   if(ionode) write(stdout,*) 'Please increase the max iteration number or decrease accuracy' 
> +endif
> +
> +
> +
> +!free memory
> +call free_memory_exc_a(a_exc)
> +call free_memory_exc_a(h)
> +call free_memory_exc_a(g)
> +call free_memory_exc_a(x)
> +
> +
> +call stop_clock('conjgrad')
> +
> +if(ionode) write(stdout,*) 'Conjugate gradient ended.'
> +return
> +end subroutine
> 
> Added: trunk/espresso/GWW/bse/contract_w_exc.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/contract_w_exc.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/contract_w_exc.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,716 @@
> +MODULE contract_w
> +
> +USE kinds, ONLY : DP,sgl
> +use bse_basic_structures, ONLY : ii_mat
> +
> +SAVE
> +
> +type(ii_mat) :: iimat_contract
> +REAL(kind=sgl), POINTER  :: vphipizeta_save(:,:)
> +REAL(kind=sgl), POINTER  :: vww_save(:,:)
> +COMPLEX(kind=sgl), POINTER  :: vphipizeta_save_g(:,:)
> +COMPLEX(kind=sgl), POINTER  :: vww_save_g(:,:)
> +INTEGER, POINTER ::  vpmax_ii(:),vpmax_ii_start(:),vpmax_ii_end(:)
> +INTEGER :: vpmax_tot
> +
> +CONTAINS
> +
> +subroutine free_memory_contrac_w
> +  use bse_wannier, ONLY : l_gtrick
> +  implicit none
> +  if(.not.l_gtrick) then
> +     deallocate(vphipizeta_save)
> +  else
> +     deallocate(vphipizeta_save_g)
> +  endif
> +  deallocate(vpmax_ii)
> +  deallocate(vpmax_ii_start,vpmax_ii_end)
> +  if(.not.l_gtrick) then
> +     deallocate(vww_save)
> +  else
> +     deallocate(vww_save_g)
> +  endif
> +end subroutine free_memory_contrac_w
> +
> +subroutine contract_w_build(fc)
> +! this subroutine computes the w part of the direct term of the exc Hamiltonian
> +
> +USE wvfct,                 ONLY :  npw 
> +USE fft_custom_gwl
> +use bse_basic_structures
> +use exciton
> +USE gvect
> +use bse_wannier, ONLY: l_truncated_coulomb, &
> +           truncation_radius, l_gtrick
> +USE constants,        ONLY : e2, fpi
> +USE cell_base,        ONLY : tpiba,omega,tpiba2
> +!USE io_files,             ONLY : find_free_unit, prefix, diropn
> +USE io_files,             ONLY :  prefix, diropn
> +USE wavefunctions_module, ONLY :  psic
> +USE io_global, ONLY : stdout, ionode, ionode_id
> +USE mp_world, ONLY : mpime, nproc
> +USE mp_pools, ONLY: intra_pool_comm
> +USE mp_wave, ONLY : mergewf,splitwf
> +USE polarization
> +USE lsda_mod, ONLY :nspin
> +USE io_global, ONLY : stdout,ionode
> +USE mp,          ONLY :mp_barrier
> +USE mp_world,             ONLY : world_comm
> +
> +
> +
> +
> +
> +implicit none
> +INTEGER, EXTERNAL :: find_free_unit
> +logical :: debug=.false.
> +
> +type(bse_z) :: z
> +type(polaw) :: pw
> +
> +type(fft_cus) :: fc
> +type(ii_mat) :: iimat
> +
> +
> +
> +REAL(kind=DP), ALLOCATABLE :: fac(:)
> +COMPLEX(kind=DP), ALLOCATABLE :: p_basis(:,:)
> +COMPLEX(kind=DP), ALLOCATABLE :: p_basis_t(:,:)
> +REAL(kind=DP), ALLOCATABLE :: p_basis_r(:,:)
> +REAL(kind=DP), ALLOCATABLE :: zvphi(:)
> +REAL(kind=DP), ALLOCATABLE :: zvv(:)
> +COMPLEX(kind=DP), ALLOCATABLE :: evc_g(:)
> +
> +INTEGER ::iungprod
> +INTEGER :: ig,ii,iv,ispin
> +REAL(kind=DP) :: qq
> +LOGICAL       :: exst
> +
> +
> +INTEGER :: vpmax,k
> +REAL(kind=DP), allocatable :: zp(:,:)
> +REAL(kind=DP), allocatable :: pizeta(:,:)
> +REAL(kind=DP), allocatable :: vphipizeta(:,:)
> +
> +INTEGER ::  kilobytes
> +
> +
> +call start_clock('direct_w_exc')
> +CALL memstat( kilobytes )
> +write(stdout,*) 'memory0', kilobytes
> +FLUSH(stdout)
> +
> +
> +
> +
> +! read  iimat
> +call initialize_imat(iimat)
> +
> +do ispin=1,nspin
> +   call read_iimat(iimat,ispin) 
> +enddo
> +
> +! read z terms
> +call initialize_bse_z(z)
> +call read_z(1,iimat,z)
> +
> +FLUSH( stdout ) 
> +
> +! get Coulomb potential
> +allocate(fac(npw))
> +if(l_truncated_coulomb) then
> +   do ig=1,npw
> +      qq = g(1,ig)**2.d0 + g(2,ig)**2.d0 + g(3,ig)**2.d0
> +      if (qq > 1.d-8) then
> +         fac(ig)=(e2*fpi/(tpiba2*qq))*(1.d0-dcos(dsqrt(qq)*truncation_radius*tpiba))
> +      else
> +         fac(ig)=e2*fpi*(truncation_radius**2.d0/2.d0)
> +     endif
> +   enddo
> +    fac(:)=fac(:)/omega
> +else
> +
> +   fac(:)=0.d0
> +   fac(1:npw)=vg_q(1:npw)
> +endif
> +
> +
> +! read polarization basis and multiply per V
> +
> +iungprod = find_free_unit()
> +allocate(p_basis(npw,z%numw_prod))
> +CALL diropn( iungprod, 'wiwjwfc_red', npw*2, exst )
> +
> +do ii=1,z%numw_prod
> +   call davcio(p_basis(:,ii),npw*2,iungprod,ii,-1)
> +   p_basis(1:npw,ii)=p_basis(1:npw,ii)*dcmplx(fac(1:npw))
> +enddo
> +
> +call mp_barrier(world_comm)
> +
> +close(iungprod)
> +CALL memstat( kilobytes )
> +write(stdout,*) 'memory1', kilobytes
> +FLUSH(stdout)
> +! FFT to real space (dual grid)
> +allocate(p_basis_t(fc%npwt,z%numw_prod)) 
> +allocate(p_basis_r(fc%nrxxt,z%numw_prod))
> +allocate(evc_g(fc%ngmt_g ))
> +
> +
> +if(fc%dual_t==4.d0) then
> +   p_basis_t(1:fc%npwt,1:z%numw_prod)=p_basis(1:npw,1:z%numw_prod)
> +else
> +    call reorderwfp_col(z%numw_prod,npw,fc%npwt,p_basis(1,1),p_basis_t(1,1),npw,fc%npwt, &
> +           & ig_l2g,fc%ig_l2gt,fc%ngmt_g,mpime, nproc,intra_pool_comm )
> +
> +!   do ii=1,z%numw_prod
> +!      call mergewf(p_basis(:,ii),evc_g,a_in%npw,ig_l2g,mpime,nproc,ionode_id,intra_pool_comm)
> +!      call splitwf(p_basis_t(:,ii),evc_g,fc%npwt,fc%ig_l2gt,mpime,nproc,ionode_id,intra_pool_comm)
> +!   enddo
> +endif
> +
> +deallocate(evc_g)
> +deallocate(p_basis)
> +
> +call start_clock('direct_w_cft3t')
> +do ii=1,z%numw_prod,2
> +   psic(1:fc%nrxxt)=(0.d0,0.d0)
> +   if (ii==z%numw_prod) then
> +      psic(fc%nlt(1:fc%npwt))  = p_basis_t(1:fc%npwt,ii)
> +      psic(fc%nltm(1:fc%npwt)) = CONJG( p_basis_t(1:fc%npwt,ii) )
> +   else
> +      psic(fc%nlt(1:fc%npwt))=p_basis_t(1:fc%npwt,ii)+(0.d0,1.d0)*p_basis_t(1:fc%npwt,ii+1)
> +      psic(fc%nltm(1:fc%npwt))=CONJG(p_basis_t(1:fc%npwt,ii))+(0.d0,1.d0)*CONJG(p_basis_t(1:fc%npwt,ii+1))
> +   endif
> +      CALL cft3t( fc, psic, fc%nr1t, fc%nr2t, fc%nr3t, fc%nrx1t, fc%nrx2t, fc%nrx3t, 2 )
> +       p_basis_r(1:fc%nrxxt,ii)= DBLE(psic(1:fc%nrxxt))
> +      if(ii/=z%numw_prod) p_basis_r(1:fc%nrxxt,ii+1)= DIMAG(psic(1:fc%nrxxt))
> +enddo
> +call stop_clock('direct_w_cft3t')
> +
> +deallocate(p_basis_t) 
> +
> +!read P
> +call initialize_polaw(pw)
> +call read_polaw_global(0, pw)
> +
> +
> +call mp_barrier(world_comm)
> +
> +CALL memstat( kilobytes )
> +write(stdout,*) 'memory2', kilobytes
> +FLUSH(stdout)
> +
> +
> +! allocate tmp matrix
> +
> +
> +!compute line by line the output excitonic vector
> +
> +!!!!!!!!!!!!!!!!!dgemm subroutine!!!!!!!!!!!!!!!!!!!!!
> +call start_clock('direct_w_dgemv')
> +write(stdout,*) 'memory2',z%numw_prod,iimat%np_max
> +allocate(zp(z%numw_prod,iimat%np_max))
> +!!allocate(pizeta(z%numw_prod,iimat%np_max)) 
> +!!allocate(vphipizeta(fc%nrxxt,iimat%np_max)) 
> +!calculate vpmax_tot
> +vpmax_tot=0
> +allocate(vpmax_ii( iimat%numb_v))
> +allocate(vpmax_ii_start( iimat%numb_v))
> +allocate(vpmax_ii_end( iimat%numb_v))
> +do iv=1, iimat%numb_v
> +
> +   vpmax=0
> +   vpmax_ii_start(iv)=vpmax_tot+1
> +   do ii=1, iimat%np_max
> +      if (iimat%iimat(ii,iv)==0) cycle
> +      vpmax=vpmax+1
> +   enddo
> +   vpmax_ii(iv)=vpmax
> +   vpmax_tot=vpmax_tot+vpmax
> +   vpmax_ii_end(iv)=vpmax_tot
> +enddo
> +write(stdout,*)  'VPHIZETA_SAVE :', fc%nrxxt,vpmax_tot
> +FLUSH(stdout)
> +if(.not.l_gtrick) then
> +   allocate(vphipizeta_save(fc%nrxxt,vpmax_tot))
> +else
> +   allocate(vphipizeta_save_g(fc%npwt,vpmax_tot))
> +endif
> +write(stdout,*)  'VPHIZETA_SAVE :', fc%nrxxt,vpmax_tot
> +!
> +CALL memstat( kilobytes )
> +write(stdout,*) 'memory3', kilobytes
> +FLUSH(stdout)
> +do iv=1, iimat%numb_v
> +   zp(1:z%numw_prod,1:iimat%np_max)=0.d0
> +   vpmax=0
> + 
> +   write(stdout,*) 'DEBUG1'
> +   FLUSH(stdout)
> +   call start_clock('dgemv1')
> +   do ii=1, iimat%np_max
> +      if (iimat%iimat(ii,iv)==0) cycle
> +      vpmax=vpmax+1
> +      do k=1, z%numw_prod
> +         zp(k,vpmax)=z%z(k,ii,iv)!ATTENZIONE era zp(ii)
> +      enddo
> +   enddo
> +   write(stdout,*) 'DEBUG2'
> +  FLUSH(stdout)
> +  if(vpmax>0) then
> +     allocate(pizeta(z%numw_prod,vpmax)) 
> +     allocate(vphipizeta(fc%nrxxt,vpmax)) 
> +  else
> +     allocate(pizeta(z%numw_prod,1))
> +     allocate(vphipizeta(fc%nrxxt,1))
> +  endif
> +   call stop_clock('dgemv1')
> +
> +
> +   call start_clock('dgemv2.1')
> +   write(stdout,*) 'DEBUG2', z%numw_prod,pw%numpw
> +   FLUSH(stdout)
> +
> +   call dgemm('N','N', z%numw_prod,vpmax, z%numw_prod,1.d0,pw%pw,z%numw_prod,zp,&
> +              z%numw_prod,0.d0,pizeta,z%numw_prod)
> +   call stop_clock('dgemv2')
> +
> +   write(stdout,*) 'DEBUG3'
> +   FLUSH(stdout)
> +   call start_clock('dgemv3')
> +   call dgemm('N','N', fc%nrxxt, vpmax, z%numw_prod,1.d0,p_basis_r(1,1),fc%nrxxt,&
> +              pizeta(1,1), z%numw_prod, 0.d0, vphipizeta(1,1),fc%nrxxt)
> +   call stop_clock('dgemv3')
> +   if(.not.l_gtrick) then
> +      vphipizeta_save(1:fc%nrxxt,vpmax_ii_start(iv):vpmax_ii_end(iv))= real(vphipizeta(1:fc%nrxxt,1:vpmax))
> +   else
> +      do ii=1,vpmax,2
> +         if(ii==vpmax) then
> +            psic(1:fc%nrxxt)=dcmplx(vphipizeta(1:fc%nrxxt,ii),0.d0)
> +         else
> +            psic(1:fc%nrxxt)=dcmplx(vphipizeta(1:fc%nrxxt,ii),vphipizeta(1:fc%nrxxt,ii+1))
> +         endif
> +         CALL cft3t(fc, psic, fc%nr1t, fc%nr2t, fc%nr3t, fc%nrx1t, fc%nrx2t, fc%nrx3t, -2 )
> +         if(ii==vpmax) then
> +            vphipizeta_save_g(1:fc%npwt,vpmax_ii_start(iv)+ii-1)=cmplx(psic(fc%nlt(1:fc%npwt)))
> +         else
> +            vphipizeta_save_g(1:fc%npwt,vpmax_ii_start(iv)+ii-1)=&
> +                 &cmplx(0.5d0*(psic(fc%nlt(1:fc%npwt))+conjg( psic(fc%nltm(1:fc%npwt)))))
> +            vphipizeta_save_g(1:fc%npwt,vpmax_ii_start(iv)+ii-1+1)=&
> +                 &cmplx((0.d0,-0.5d0)*(psic(fc%nlt(1:fc%npwt)) - conjg(psic(fc%nltm(1:fc%npwt)))))
> +         endif
> +      enddo
> +   endif
> +     write(stdout,*) 'DEBUG4'
> +     FLUSH(stdout)
> +   !
> +!
> +   deallocate(pizeta)
> +   deallocate(vphipizeta)
> +call mp_barrier(world_comm)
> +enddo
> +call stop_clock('direct_w_dgemv')
> +
> +deallocate(zp)
> +deallocate(p_basis_r)
> +
> +call free_bse_z(z)
> +call free_memory_polaw(pw)
> +call free_imat(iimat)
> +
> +
> +FLUSH( stdout )
> +CALL memstat( kilobytes )
> +write(stdout,*) 'memory4', kilobytes
> +FLUSH(stdout)
> +call stop_clock('direct_w_exc')
> +
> +return
> +end subroutine contract_w_build
> +
> +
> +
> +subroutine contract_w_apply(a_in,fc,a_out)
> +! this subroutine computes the w part of the direct term of the exc Hamiltonian
> + 
> +USE fft_custom_gwl
> +use bse_basic_structures
> +use exciton
> +USE gvect
> +use bse_wannier, ONLY: l_truncated_coulomb, &
> +           truncation_radius,l_gtrick
> +USE constants,        ONLY : e2, fpi
> +USE cell_base,        ONLY : tpiba,omega,tpiba2
> +!USE io_files,             ONLY : find_free_unit, prefix, diropn
> +USE io_files,             ONLY :  prefix, diropn
> +USE wavefunctions_module, ONLY :  psic
> +USE io_global, ONLY : stdout, ionode, ionode_id
> +USE mp_world, ONLY : mpime, nproc
> +USE mp_pools, ONLY: intra_pool_comm
> +USE mp_wave, ONLY : mergewf,splitwf
> +USE polarization
> +USE lsda_mod, ONLY :nspin
> +USE io_global, ONLY : stdout,ionode
> +USE mp,          ONLY :mp_barrier
> +USE mp_world,             ONLY : world_comm
> +
> +
> +
> +
> +
> +implicit none
> +INTEGER, EXTERNAL :: find_free_unit
> +
> +type(polaw) :: pw
> +type(exc):: a_in
> +type(exc):: a_out
> +type(exc_r):: a_in_rt
> +type(exc_r):: a_tmp_rt
> +type(fft_cus) :: fc
> +
> +
> +
> +
> +
> +INTEGER ::iungprod
> +INTEGER :: ig,ii,iv,ispin
> +REAL(kind=DP) :: qq
> +LOGICAL       :: exst
> +
> +
> +INTEGER ::k
> +
> +
> +logical debug
> +
> +call start_clock('direct_w_contract')
> +
> +debug=.false.
> +! read  iimat
> +
> +!FFT the input excitonic vector to real space (dual grid)
> +call initialize_exc_r(a_in_rt)
> +call fft_a_exc(a_in,fc,a_in_rt)
> +
> +call mp_barrier(world_comm)
> +
> +! allocate tmp matrix
> +call initialize_exc_r(a_tmp_rt)
> +a_tmp_rt%nrxxt=fc%nrxxt 
> +a_tmp_rt%numb_v=a_in%numb_v
> +a_tmp_rt%label=12
> +allocate(a_tmp_rt%ar(a_tmp_rt%nrxxt,a_tmp_rt%numb_v))
> +
> +!compute line by line the output excitonic vector
> +
> +!!!!!!!!!!!!!!!!!dgemm subroutine!!!!!!!!!!!!!!!!!!!!!
> +call start_clock('contract_w_dgemv')
> +
> +
> +!
> +a_tmp_rt%ar(1:a_tmp_rt%nrxxt,1:a_tmp_rt%numb_v) =0.d0
> +!
> +do iv=1, a_in%numb_v
> +
> +   if(.not.l_gtrick) then
> +      do ii=1,vpmax_ii(iv)
> +         a_tmp_rt%ar(1:fc%nrxxt,iv)= &
> +              &a_tmp_rt%ar(1:fc%nrxxt,iv)+a_in_rt%ar(1:fc%nrxxt,iimat_contract%iimat(ii,iv))*&
> +              &dble(vphipizeta_save(1:fc%nrxxt,vpmax_ii_start(iv)-1+ii))
> +      enddo
> +   else
> +      do ii=1,vpmax_ii(iv),2
> +         psic(1:fc%nrxxt)=(0.d0,0.d0)
> +         if (ii==vpmax_ii(iv)) then
> +            psic(fc%nlt(1:fc%npwt))  = dcmplx(vphipizeta_save_g(1:fc%npwt,vpmax_ii_start(iv)-1+ii))
> +            psic(fc%nltm(1:fc%npwt)) = dcmplx(CONJG( vphipizeta_save_g(1:fc%npwt,vpmax_ii_start(iv)-1+ii) ))
> +         else
> +            psic(fc%nlt(1:fc%npwt))= dcmplx(vphipizeta_save_g(1:fc%npwt,vpmax_ii_start(iv)-1+ii))+&
> +                 &(0.0,1.0)* dcmplx(vphipizeta_save_g(1:fc%npwt,vpmax_ii_start(iv)-1+ii+1))
> +            psic(fc%nltm(1:fc%npwt))=DCMPLX(CONJG( vphipizeta_save_g(1:fc%npwt,vpmax_ii_start(iv)-1+ii))+&
> +                 &(0.0,1.0)*CONJG( vphipizeta_save_g(1:fc%npwt,vpmax_ii_start(iv)-1+ii+1)))
> +         endif
> +         CALL cft3t( fc, psic, fc%nr1t, fc%nr2t, fc%nr3t, fc%nrx1t, fc%nrx2t, fc%nrx3t, 2 )
> +         a_tmp_rt%ar(1:fc%nrxxt,iv)= &
> +              &a_tmp_rt%ar(1:fc%nrxxt,iv)+a_in_rt%ar(1:fc%nrxxt,iimat_contract%iimat(ii,iv))*&
> +              &dble(psic(1:fc%nrxxt))
> +         if (ii/=vpmax_ii(iv)) then
> +             a_tmp_rt%ar(1:fc%nrxxt,iv)= &
> +                  &a_tmp_rt%ar(1:fc%nrxxt,iv)+a_in_rt%ar(1:fc%nrxxt,iimat_contract%iimat(ii+1,iv))*&
> +                  &dimag(psic(1:fc%nrxxt))
> +
> +         endif
> +      enddo
> +   endif
> +
> +
> +
> +enddo
> +call stop_clock('contract_w_dgemv')
> +
> +
> +
> +
> +call free_memory_exc_a_r(a_in_rt)
> +
> +
> +
> +call start_clock('wdirect_fftback')
> +!FFT back to provide the output excitonic wave vector in G-space
> +call fftback_a_exc(a_tmp_rt,fc,a_out)
> +call stop_clock('wdirect_fftback')
> +
> +call free_memory_exc_a_r(a_tmp_rt)
> +
> +FLUSH( stdout )
> +call stop_clock('direct_w_contract')
> +
> +return
> +end subroutine contract_w_apply
> +
> +
> +subroutine contract_v_build(fc)
> +!TO BE CALLED AFTER CONTRACT_W_BUILD  
> +!IMPORTANT: REQUIRES IIMAT_CONTRACT
> +! computes the v part of the direct term of the exc Hamiltonian 
> +USE fft_custom_gwl
> +use bse_basic_structures
> +use exciton
> +USE wavefunctions_module, ONLY :  psic
> +USE gvect,                 ONLY : ig_l2g
> +USE io_global, ONLY : stdout, ionode, ionode_id
> +USE mp_world, ONLY : mpime, nproc
> +USE mp_pools, ONLY: intra_pool_comm
> +USE mp_wave, ONLY : mergewf,splitwf
> +USE io_global, ONLY : stdout,ionode
> +USE lsda_mod, ONLY :nspin
> +USE gvect, ONLY : gstart
> +USE mp, ONLY : mp_sum
> +USE mp_world,             ONLY : world_comm
> +USE wvfct,                 ONLY :  npw
> +USE bse_wannier, ONLY : l_gtrick
> +
> +implicit none
> +
> +
> +type(vww_prod) :: vww
> +type(fft_cus) :: fc
> +
> +
> +COMPLEX(kind=DP), allocatable :: vwwg_t(:,:)
> +COMPLEX(kind=DP), ALLOCATABLE :: evc_g(:)
> +
> +
> +!real(kind=dp), allocatable :: phivwwr(:,:)
> +COMPLEX(kind=DP) :: csca
> +
> +integer ii, iv,ispin, iimax
> +
> +logical debug
> +
> +write(stdout,*) 'Routine contract_v_build'
> +
> +debug=.false.
> +
> +!if(debug) then 
> +!   if(ionode) write(stdout,*) 'Direct_v_exc #1'
> +!endif
> +write(stdout,*) 'VWW_SAVE :', fc%nrxxt,vpmax_tot
> +if(.not.l_gtrick) then
> +   allocate(vww_save(fc%nrxxt,vpmax_tot))
> +else
> +   allocate(vww_save_g(fc%npwt,vpmax_tot))
> +endif
> +
> +
> +call initialize_vww_prod(vww)
> +call read_vww_prod(1,iimat_contract%numb_v,npw,iimat_contract%np_max,iimat_contract,vww)
> +
> +!if(debug) then 
> +!   if(ionode) write(stdout,*) 'Direct_v_exc #6'
> +!endif
> +
> +! for every element iv of the excitonic wavefunction vector, here we FFT all
> +! the available v*w_iv*w_ivp(G) products, multiply by a_in_rt%ar(:,ivp)
> +! sum over ivp, and FFT back
> +
> +allocate(vwwg_t(fc%npwt,iimat_contract%np_max))
> +!allocate(phivwwr(fc%nrxxt,a_in%numb_v))
> +allocate(evc_g(fc%ngmt_g ))
> +
> +
> +write(stdout,*) 'ATT1'
> +do iv=1,iimat_contract%numb_v
> +    
> +   vwwg_t(1:fc%npwt,1:iimat_contract%np_max)=dcmplx(0.d0,0.d0)
> +   iimax=0
> +   do ii=1,iimat_contract%np_max
> +      if (iimat_contract%iimat(ii,iv)>0) then
> +         iimax=iimax+1
> +      else
> +         exit
> +      endif
> +   enddo
> +   if(iimax>0) then
> +      call reorderwfp_col(iimax,vww%npw,fc%npwt,vww%vww(1,1,iv),vwwg_t, vww%npw,fc%npwt, &
> +           & ig_l2g,fc%ig_l2gt,fc%ngmt_g,mpime, nproc,intra_pool_comm )
> +   endif
> +   
> +   if(.not.l_gtrick) then
> +      do ii=1,vpmax_ii(iv),2
> +        
> +      
> +         psic(1:fc%nrxxt)=(0.d0,0.d0)
> +         if (ii==vpmax_ii(iv)) then
> +            psic(fc%nlt(1:fc%npwt))  = vwwg_t(1:fc%npwt,ii)
> +            psic(fc%nltm(1:fc%npwt)) = CONJG( vwwg_t(1:fc%npwt,ii) )
> +         else      
> +            psic(fc%nlt(1:fc%npwt))=vwwg_t(1:fc%npwt,ii)+(0.d0,1.d0)*vwwg_t(1:fc%npwt,ii+1)
> +            psic(fc%nltm(1:fc%npwt))=CONJG(vwwg_t(1:fc%npwt,ii))+(0.d0,1.d0)*CONJG(vwwg_t(1:fc%npwt,ii+1))
> +         endif
> +         CALL cft3t( fc, psic, fc%nr1t, fc%nr2t, fc%nr3t, fc%nrx1t, fc%nrx2t, fc%nrx3t, 2 )
> +         vww_save(1:fc%nrxxt,vpmax_ii_start(iv)+ii-1)=real(psic(1:fc%nrxxt))
> +         if (ii/=vpmax_ii(iv)) then
> +            vww_save(1:fc%nrxxt,vpmax_ii_start(iv)+ii)=aimag(psic(1:fc%nrxxt))
> +         endif
> +
> +      enddo
> +   else
> +      do ii=1,vpmax_ii(iv)
> +         vww_save_g(1:fc%npwt,vpmax_ii_start(iv)+ii-1)=cmplx(vwwg_t(1:fc%npwt,ii))
> +      end do
> +   endif
> +enddo
> +
> +!if(debug) then 
> +!   if(ionode) write(stdout,*) 'Direct_v_exc #7'
> +!endif
> +
> +FLUSH( stdout )
> +
> +
> +call free_vww_prod(vww)
> +deallocate(vwwg_t)
> +!deallocate(vwwr_t)
> +deallocate(evc_g)
> +
> +
> +end subroutine
> +
> +
> +subroutine contract_v_apply(a_in,fc,a_out)
> +! computes the v part of the direct term of the exc Hamiltonian
> +USE fft_custom_gwl
> +use bse_basic_structures
> +use exciton
> +USE wavefunctions_module, ONLY :  psic
> +USE gvect,                 ONLY : ig_l2g
> +USE io_global, ONLY : stdout, ionode, ionode_id
> +USE mp_world, ONLY : mpime, nproc
> +USE mp_pools, ONLY: intra_pool_comm
> +USE mp_wave, ONLY : mergewf,splitwf
> +USE io_global, ONLY : stdout,ionode
> +USE lsda_mod, ONLY :nspin
> +USE gvect, ONLY : gstart
> +USE mp, ONLY : mp_sum
> +USE mp_world,             ONLY : world_comm
> +USE bse_wannier, ONLY : l_gtrick
> +
> +
> +implicit none
> +type(exc), intent(in) :: a_in
> +type(exc):: a_out
> +type(exc_r):: a_in_rt
> +type(exc_r):: a_tmp_rt
> +
> +
> +type(fft_cus) :: fc
> +
> +
> +COMPLEX(kind=DP), allocatable :: vwwg_t(:,:)
> +COMPLEX(kind=DP), ALLOCATABLE :: evc_g(:)
> +
> +COMPLEX(kind=DP) :: csca
> +
> +integer ii, iv,ispin, iimax
> +
> +logical debug
> +
> +call start_clock('direct_v_contract')
> +debug=.false.
> +
> +call initialize_exc_r(a_tmp_rt)
> +a_tmp_rt%nrxxt=fc%nrxxt
> +a_tmp_rt%numb_v=a_in%numb_v
> +a_tmp_rt%label=12
> +allocate(a_tmp_rt%ar(a_tmp_rt%nrxxt,a_tmp_rt%numb_v))
> +
> +
> +! FFT a_in to real space (dual grid)
> +call initialize_exc_r(a_in_rt)
> +call fft_a_exc(a_in,fc,a_in_rt)
> +
> +
> +
> +
> +
> +a_tmp_rt%ar(1:a_tmp_rt%nrxxt,1:a_tmp_rt%numb_v) =0.d0
> +do iv=1,a_in%numb_v
> +  
> +
> +   if(.not.l_gtrick) then
> +      do ii=1,vpmax_ii(iv)
> +         a_tmp_rt%ar(1:fc%nrxxt,iv)= &
> +              &a_tmp_rt%ar(1:fc%nrxxt,iv)+DBLE(vww_save(1:fc%nrxxt,vpmax_ii_start(iv)+ii-1))*&
> +              &a_in_rt%ar(1:a_in_rt%nrxxt,iimat_contract%iimat(ii,iv))
> +      enddo
> +   else
> +      do ii=1,vpmax_ii(iv),2
> +         psic(1:fc%nrxxt)=(0.d0,0.d0)
> +         if (ii==vpmax_ii(iv)) then
> +            psic(fc%nlt(1:fc%npwt))  = dcmplx(vww_save_g(1:fc%npwt,vpmax_ii_start(iv)+ii-1))
> +            psic(fc%nltm(1:fc%npwt)) = dcmplx(CONJG(  vww_save_g(1:fc%npwt,vpmax_ii_start(iv)+ii-1) ))
> +         else
> +            psic(fc%nlt(1:fc%npwt))=dcmplx(vww_save_g(1:fc%npwt,vpmax_ii_start(iv)+ii-1)+&
> +                 &(0.0,1.0)*vww_save_g(1:fc%npwt,vpmax_ii_start(iv)+ii-1+1))
> +            psic(fc%nltm(1:fc%npwt))=dcmplx(CONJG(vww_save_g(1:fc%npwt,vpmax_ii_start(iv)+ii-1))&
> +                 &+(0.0,1.0)*CONJG(vww_save_g(1:fc%npwt,vpmax_ii_start(iv)+ii-1+1)))
> +         endif
> +         call start_clock('d_v_fft')
> +         CALL cft3t( fc, psic, fc%nr1t, fc%nr2t, fc%nr3t, fc%nrx1t, fc%nrx2t, fc%nrx3t, 2 )
> +         call stop_clock('d_v_fft')
> +         a_tmp_rt%ar(1:fc%nrxxt,iv)= &
> +              &a_tmp_rt%ar(1:fc%nrxxt,iv)+DBLE(psic(1:fc%nrxxt))*&
> +              &a_in_rt%ar(1:a_in_rt%nrxxt,iimat_contract%iimat(ii,iv))
> +         if (ii/=vpmax_ii(iv)) then
> +            a_tmp_rt%ar(1:fc%nrxxt,iv)= &
> +                 &a_tmp_rt%ar(1:fc%nrxxt,iv)+DIMAG(psic(1:fc%nrxxt))*&
> +                 &a_in_rt%ar(1:a_in_rt%nrxxt,iimat_contract%iimat(ii+1,iv))
> +         endif
> +      enddo
> +   endif
> +enddo
> +
> +
> +
> +call free_memory_exc_a_r(a_in_rt)
> +
> +call fftback_a_exc(a_tmp_rt,fc,a_out)
> +
> +! free memory                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
> +call free_memory_exc_a_r(a_tmp_rt)
> +
> +
> +
> +call stop_clock('direct_v_contract')
> +end subroutine
> +
> +
> +
> +
> +
> +
> +
> +END MODULE contract_w
> 
> Added: trunk/espresso/GWW/bse/diago_exc.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/diago_exc.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/diago_exc.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,166 @@
> +subroutine diago_exc(a,v,cstate,wcstate)
> +! this subroutine applies the diagonal part of the excitonic Hamiltonian to the excitonic
> +! wavefunction vector (a%a)
> +
> +USE kinds,            ONLY : DP
> +USE exciton
> +use bse_basic_structures
> +USE wvfct,            ONLY : igk,g2kin,npwx
> +USE noncollin_module, ONLY : npol
> +USE uspp,             ONLY : vkb,nkb
> +USE becmod,           ONLY : becp,allocate_bec_type,deallocate_bec_type
> +USE g_psi_mod,        ONLY : h_diag, s_diag
> +USE klist,            ONLY : xk
> +USE gvect
> +USE cell_base,        ONLY: tpiba,tpiba2
> +USE constants,        ONLY: RYTOEV 
> +use io_global,        ONLY : stdout, ionode 
> +use bse_wannier,      ONLY : scissor,l_scissor,qpe_imin,qpe_imax
> +use qpe_exc
> +
> +
> +implicit none
> +
> +type(exc) :: a
> +type(v_state) :: v
> +type(c_state) :: cstate
> +type(c_state) :: wcstate
> +
> +
> +type(exc) :: a1,a2
> +COMPLEX(kind=DP), ALLOCATABLE :: psi_1(:,:)
> +COMPLEX(kind=DP), ALLOCATABLE :: u_0(:,:)
> +
> +
> +
> +logical :: debug
> +real(kind=dp) :: prod
> +real(kind=dp), allocatable :: vb_en(:)
> +integer :: is
> +
> +call start_clock('diago_exc')
> +debug=.false.
> +
> +allocate(psi_1(a%npw,a%numb_v))
> +allocate(u_0(a%npw,a%numb_v))
> +
> +ALLOCATE( h_diag( npwx,npol ) )
> +ALLOCATE( s_diag( npwx,npol ) )
> +
> +ALLOCATE(vb_en(a%numb_v))
> +
> +!just copy a in a temporary variable to apply the different part of the diago
> +!Hamiltonian 
> +
> +call initialize_exc(a1)
> +call initialize_exc(a2)
> +
> +allocate(a1%a(a%npw,a%numb_v))
> +
> +a1%npw=a%npw
> +a1%numb_v=a%numb_v
> +a1%label=20
> +
> +allocate(a2%a(a%npw,a%numb_v))
> +
> +a2%npw=a%npw
> +a2%numb_v=a%numb_v
> +a2%label=30
> +
> +a2%a(1:a2%npw,1:a2%numb_v)=a%a(1:a%npw,1:a%numb_v)
> +
> +call allocate_bec_type ( nkb, a%numb_v, becp)
> +
> +IF ( nkb > 0 )  CALL init_us_2( a%npw, igk, xk(1,1), vkb )
> +g2kin(1:a%npw) = ( (g(1,igk(1:a%npw)) )**2 + &
> +       ( g(2,igk(1:a%npw)) )**2 + &
> +       ( g(3,igk(1:a%npw)) )**2 ) * tpiba2
> +
> +
> +psi_1(1:a%npw,1:a%numb_v)=a%a(1:a%npw,1:a%numb_v)
> +
> +    
> +!calculate H|\phi_i>
> +call h_psi( a%npw, a%npw, a%numb_v,psi_1(1,1), u_0 )
> +a1%a(1:a%npw,1:a%numb_v)=u_0(1:a%npw,1:a%numb_v)
> +
> +
> +!project into the conduction manifold
> +do is = 1,v%nspin
> +  call pc_operator_exc(a1,v,is)
> +enddo
> +
> +!check if everything is ok, the 'scalar' product of a1%a with a%a should be
> +!greater than e_lumo
> +
> +if (debug) then
> +  call sproduct_exc(a,a1,prod)
> +  prod=prod*RYTOEV
> +  if(ionode) write(stdout,*) 'exc_diago, prod (eV)=',prod
> +  if(ionode) write(stdout,*) 'prod should be greater than LUMO level'
> +  CALL flush_unit( stdout )
> +end if
> +
> +if(.not.l_scissor) then
> +   if (qpe_imax>a%numb_v) then
> +      do is=1,a%numb_v
> +         vb_en(is)= qpcbarc
> +      enddo
> +      call c_times_exc(a2,vb_en)
> +      a1%a(1:a%npw,1:a%numb_v)=a1%a(1:a%npw,1:a%numb_v)+a2%a(1:a%npw,1:a%numb_v)
> +      a2%a(1:a2%npw,1:a2%numb_v)=a%a(1:a%npw,1:a%numb_v) 
> +      call poutcstate_exc(a2,a2,cstate,wcstate)
> +      a1%a(1:a%npw,1:a%numb_v)=a1%a(1:a%npw,1:a%numb_v)+a2%a(1:a%npw,1:a%numb_v)
> +   else 
> +      do is=1,a%numb_v
> +         vb_en(is)= qpcbarc
> +      enddo
> +      call c_times_exc(a2,vb_en)
> +      a1%a(1:a%npw,1:a%numb_v)=a1%a(1:a%npw,1:a%numb_v)+a2%a(1:a%npw,1:a%numb_v)
> +   endif
> +endif
> +
> +
> +
> +!multiply each line of the excitonic wavefunction vector with the corresponding
> +!single particle valence state energy
> +
> +if(l_scissor) then
> +   do is=1,a%numb_v
> +      vb_en(is)= v%esp(is,1)-scissor 
> +   enddo
> +
> +   call c_times_exc(a,vb_en)
> +   if (debug) then
> +     do is=1,a%numb_v
> +        prod=vb_en(is)*RYTOEV
> +        if(ionode) write(stdout,*) 'exc_diago, band i  (eV)=',prod
> +     enddo
> +   end if
> +else !not scissor
> +   do is=1,a%numb_v
> +      vb_en(is)= v%esp(is,1)+qpc(is)
> +   enddo
> +   call c_times_exc(a,vb_en)
> +endif
> +
> +
> +! sum-up the two terms
> +a%a(1:a%npw,1:a%numb_v)=-a%a(1:a%npw,1:a%numb_v)+a1%a(1:a%npw,1:a%numb_v)
> +
> +deallocate(psi_1)
> +deallocate(u_0)
> +
> +deallocate(h_diag)
> +deallocate(s_diag)
> +
> +deallocate(vb_en)
> +
> +call deallocate_bec_type(becp)
> +call free_memory_exc_a(a1)
> +call free_memory_exc_a(a2)
> +
> +call stop_clock('diago_exc')
> +return
> +end subroutine
> +
> 
> Added: trunk/espresso/GWW/bse/direct_v_exc.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/direct_v_exc.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/direct_v_exc.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,173 @@
> +subroutine direct_v_exc(a_in,fc,a_out)
> +! computes the v part of the direct term of the exc Hamiltonian 
> +USE fft_custom_gwl
> +use bse_basic_structures
> +use exciton
> +USE wavefunctions_module, ONLY :  psic
> +USE gvect,                 ONLY : ig_l2g
> +USE io_global, ONLY : stdout, ionode, ionode_id
> +USE mp_world, ONLY : mpime, nproc
> +USE mp_pools, ONLY: intra_pool_comm
> +USE mp_wave, ONLY : mergewf,splitwf
> +USE io_global, ONLY : stdout,ionode
> +USE lsda_mod, ONLY :nspin
> +USE gvect, ONLY : gstart
> +USE mp, ONLY : mp_sum
> +USE mp_world,             ONLY : world_comm
> +
> +
> +
> +implicit none
> +type(exc), intent(in) :: a_in
> +type(exc):: a_out
> +type(exc_r):: a_in_rt
> +type(exc_r):: a_tmp_rt
> +type(ii_mat) :: iimat
> +type(vww_prod) :: vww
> +type(fft_cus) :: fc
> +
> +
> +COMPLEX(kind=DP), allocatable :: vwwg_t(:,:)
> +COMPLEX(kind=DP), ALLOCATABLE :: evc_g(:)
> +
> +
> +!real(kind=dp), allocatable :: phivwwr(:,:)
> +COMPLEX(kind=DP) :: csca
> +
> +integer ii, iv,ispin, iimax
> +
> +logical debug
> +
> +call start_clock('direct_v_exc')
> +debug=.false.
> +
> +!if(debug) then 
> +!   if(ionode) write(stdout,*) 'Direct_v_exc #1'
> +!endif
> +
> +
> +! allocate tmp matrix
> +call initialize_exc_r(a_tmp_rt)
> +a_tmp_rt%nrxxt=fc%nrxxt 
> +a_tmp_rt%numb_v=a_in%numb_v
> +a_tmp_rt%label=12
> +allocate(a_tmp_rt%ar(a_tmp_rt%nrxxt,a_tmp_rt%numb_v))
> +
> +
> +! FFT a_in to real space (dual grid)
> +call initialize_exc_r(a_in_rt)
> +call fft_a_exc(a_in,fc,a_in_rt)
> +
> +
> +! read iimat, that tells us for every w_iv which other valence band ivp is
> +! overlapping, and read the corresponding v*w_iv*w_ivp(G) products
> +
> +call initialize_imat(iimat)
> +
> +do ispin=1,nspin
> +! note that for spin-polarized case, probably this do-loop will have to include
> +! also the rest of the subroutine, or something like that
> +   call read_iimat(iimat,ispin)   
> +enddo
> +
> +
> +if(debug) then 
> +   if(ionode) write(stdout,*) 'Direct_v_exc #5'
> +   if(ionode) write(stdout,*) 'a_in%numb_v=',a_in%numb_v
> +   if(ionode) write(stdout,*) 'a_in%npw=',a_in%npw
> +   if(ionode) write(stdout,*) 'iimat%np_max=',iimat%np_max
> +endif
> +
> +call initialize_vww_prod(vww)
> +call read_vww_prod(1,a_in%numb_v,a_in%npw,iimat%np_max,iimat,vww)
> +
> +!if(debug) then 
> +!   if(ionode) write(stdout,*) 'Direct_v_exc #6'
> +!endif
> +
> +! for every element iv of the excitonic wavefunction vector, here we FFT all
> +! the available v*w_iv*w_ivp(G) products, multiply by a_in_rt%ar(:,ivp)
> +! sum over ivp, and FFT back
> +
> +allocate(vwwg_t(fc%npwt,iimat%np_max))
> +!allocate(phivwwr(fc%nrxxt,a_in%numb_v))
> +allocate(evc_g(fc%ngmt_g ))
> +
> +
> +
> +a_tmp_rt%ar(1:a_tmp_rt%nrxxt,1:a_tmp_rt%numb_v) =0.d0
> +do iv=1,a_in%numb_v
> +    
> +   vwwg_t(1:fc%npwt,1:iimat%np_max)=dcmplx(0.d0,0.d0)
> +   iimax=0
> +   do ii=1,iimat%np_max
> +      if (iimat%iimat(ii,iv)>0) then
> +         iimax=iimax+1
> +      else
> +         exit
> +      endif
> +   enddo
> +   if(iimax>0) then
> +      call reorderwfp_col(iimax,vww%npw,fc%npwt,vww%vww(1,1,iv),vwwg_t, vww%npw,fc%npwt, &
> +           & ig_l2g,fc%ig_l2gt,fc%ngmt_g,mpime, nproc,intra_pool_comm )
> +   endif
> +!!!!!!!!!!!!!!!!
> +!   do ii=1,iimat%np_max
> +!      if (iimat%iimat(ii,iv)==0) exit
> +!      if(fc%dual_t==4.d0) then
> +!         vwwg_t(1:fc%npwt,ii)= vww%vww(1:fc%npwt,ii,iv)
> +!      else
> +!        call reorderwfp_col(1,vww%npw,fc%npwt,vww%vww(1,ii,iv),vwwg_t(1,ii), vww%npw,fc%npwt, &
> +!           & ig_l2g,fc%ig_l2gt,fc%ngmt_g,mpime, nproc,intra_pool_comm )
> +!
> +!      endif
> +!
> +!      
> +!   enddo
> +
> +   
> +   do ii=1,iimat%np_max,2
> +      if(debug) then
> +         if(ionode) write(stdout,*) 'ii,iv,iimat', ii,iv,iimat%iimat(ii,iv)
> +      endif 
> +      if (iimat%iimat(ii,iv)==0) exit
> +      psic(:)=(0.d0,0.d0)
> +      if ((ii==iimat%np_max).or.(iimat%iimat(ii+1,iv)==0)) then
> +          psic(fc%nlt(1:fc%npwt))  = vwwg_t(1:fc%npwt,ii)
> +          psic(fc%nltm(1:fc%npwt)) = CONJG( vwwg_t(1:fc%npwt,ii) )
> +      else      
> +          psic(fc%nlt(1:fc%npwt))=vwwg_t(1:fc%npwt,ii)+(0.d0,1.d0)*vwwg_t(1:fc%npwt,ii+1)
> +          psic(fc%nltm(1:fc%npwt))=CONJG(vwwg_t(1:fc%npwt,ii))+(0.d0,1.d0)*CONJG(vwwg_t(1:fc%npwt,ii+1))
> +      endif
> +      CALL cft3t( fc, psic, fc%nr1t, fc%nr2t, fc%nr3t, fc%nrx1t, fc%nrx2t, fc%nrx3t, 2 )
> +      a_tmp_rt%ar(1:fc%nrxxt,iv)= &
> +            &a_tmp_rt%ar(1:fc%nrxxt,iv)+DBLE(psic(1:fc%nrxxt))*a_in_rt%ar(1:a_in_rt%nrxxt,iimat%iimat(ii,iv))
> +      if ((ii/=iimat%np_max).and.(iimat%iimat(ii+1,iv)/=0)) then
> +         a_tmp_rt%ar(1:fc%nrxxt,iv)=&
> +         &a_tmp_rt%ar(1:fc%nrxxt,iv)+DIMAG(psic(1:fc%nrxxt))*a_in_rt%ar(1:a_in_rt%nrxxt,iimat%iimat(ii+1,iv))
> +      endif
> +   enddo
> +enddo
> +
> +!if(debug) then 
> +!   if(ionode) write(stdout,*) 'Direct_v_exc #7'
> +!endif
> +
> +
> +call free_memory_exc_a_r(a_in_rt)
> +
> +call fftback_a_exc(a_tmp_rt,fc,a_out)
> +
> +! free memory
> +call free_memory_exc_a_r(a_tmp_rt)
> +call free_imat(iimat)
> +call free_vww_prod(vww)
> +deallocate(vwwg_t)
> +!deallocate(vwwr_t)
> +deallocate(evc_g)
> +
> +
> +call stop_clock('direct_v_exc')
> +end subroutine
> +
> +
> 
> Added: trunk/espresso/GWW/bse/direct_w_exc.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/direct_w_exc.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/direct_w_exc.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,273 @@
> +subroutine direct_w_exc(a_in,fc,a_out)
> +! this subroutine computes the w part of the direct term of the exc Hamiltonian
> + 
> +USE fft_custom_gwl
> +use bse_basic_structures
> +use exciton
> +USE gvect
> +use bse_wannier, ONLY: l_truncated_coulomb, &
> +           truncation_radius
> +USE constants,        ONLY : e2, fpi
> +USE cell_base,        ONLY : tpiba,omega,tpiba2
> +!USE io_files,             ONLY : find_free_unit, prefix, diropn
> +USE io_files,             ONLY :  prefix, diropn
> +USE wavefunctions_module, ONLY :  psic
> +USE io_global, ONLY : stdout, ionode, ionode_id
> +USE mp_world, ONLY : mpime, nproc
> +USE mp_pools, ONLY: intra_pool_comm
> +USE mp_wave, ONLY : mergewf,splitwf
> +USE polarization
> +USE lsda_mod, ONLY :nspin
> +USE io_global, ONLY : stdout,ionode
> +USE mp,          ONLY :mp_barrier
> +USE mp_world,             ONLY : world_comm
> +
> +
> +
> +
> +
> +implicit none
> +INTEGER, EXTERNAL :: find_free_unit
> +type(bse_z) :: z
> +type(polaw) :: pw
> +type(exc):: a_in
> +type(exc):: a_out
> +type(exc_r):: a_in_rt
> +type(exc_r):: a_tmp_rt
> +type(fft_cus) :: fc
> +type(ii_mat) :: iimat
> +
> +
> +
> +REAL(kind=DP), ALLOCATABLE :: fac(:)
> +COMPLEX(kind=DP), ALLOCATABLE :: p_basis(:,:)
> +COMPLEX(kind=DP), ALLOCATABLE :: p_basis_t(:,:)
> +REAL(kind=DP), ALLOCATABLE :: p_basis_r(:,:)
> +REAL(kind=DP), ALLOCATABLE :: zvphi(:)
> +REAL(kind=dp), ALLOCATABLE :: zvv(:)
> +COMPLEX(kind=DP), ALLOCATABLE :: evc_g(:)
> +
> +INTEGER ::iungprod
> +INTEGER :: ig,ii,iv,ispin
> +REAL(kind=DP) :: qq
> +LOGICAL       :: exst
> +
> +
> +INTEGER :: vpmax,k
> +REAL(kind=DP), allocatable :: zp(:,:)
> +REAL(kind=DP), allocatable :: pizeta(:,:)
> +REAL(kind=DP), allocatable :: vphipizeta(:,:)
> +
> +logical debug
> +
> +call start_clock('direct_w_exc')
> +
> +debug=.false.
> +
> +
> +
> +! read  iimat
> +call initialize_imat(iimat)
> +
> +do ispin=1,nspin
> +   call read_iimat(iimat,ispin) 
> +enddo
> +
> +! read z terms
> +call initialize_bse_z(z)
> +call read_z(1,iimat,z)
> +
> +
> +! get Coulomb potential
> +allocate(fac(a_in%npw))
> +if(l_truncated_coulomb) then
> +   do ig=1,a_in%npw
> +      qq = g(1,ig)**2.d0 + g(2,ig)**2.d0 + g(3,ig)**2.d0
> +      if (qq > 1.d-8) then
> +         fac(ig)=(e2*fpi/(tpiba2*qq))*(1.d0-dcos(dsqrt(qq)*truncation_radius*tpiba))
> +      else
> +         fac(ig)=e2*fpi*(truncation_radius**2.d0/2.d0)
> +     endif
> +   enddo
> +    fac(:)=fac(:)/omega
> +else
> +   fac(:)=0.d0
> +   fac(1:a_in%npw)=vg_q(1:a_in%npw)
> +endif
> +
> +
> +! read polarization basis and multiply per V
> +
> +iungprod = find_free_unit()
> +allocate(p_basis(a_in%npw,z%numw_prod))
> +CALL diropn( iungprod, 'wiwjwfc_red', a_in%npw*2, exst )
> +
> +do ii=1,z%numw_prod
> +   call davcio(p_basis(:,ii),a_in%npw*2,iungprod,ii,-1)
> +   p_basis(1:a_in%npw,ii)=p_basis(1:a_in%npw,ii)*dcmplx(fac(1:a_in%npw))
> +enddo
> +
> +call mp_barrier(world_comm)
> +
> +close(iungprod)
> +
> +! FFT to real space (dual grid)
> +allocate(p_basis_t(fc%npwt,z%numw_prod)) 
> +allocate(p_basis_r(fc%nrxxt,z%numw_prod))
> +allocate(evc_g(fc%ngmt_g ))
> +
> +if(fc%dual_t==4.d0) then
> +   p_basis_t(1:fc%npwt,1:z%numw_prod)=p_basis(1:a_in%npw,1:z%numw_prod)
> +else
> +    call reorderwfp_col(z%numw_prod,a_in%npw,fc%npwt,p_basis(1,1),p_basis_t(1,1),a_in%npw,fc%npwt, &
> +           & ig_l2g,fc%ig_l2gt,fc%ngmt_g,mpime, nproc,intra_pool_comm )
> +
> +!   do ii=1,z%numw_prod
> +!      call mergewf(p_basis(:,ii),evc_g,a_in%npw,ig_l2g,mpime,nproc,ionode_id,intra_pool_comm)
> +!      call splitwf(p_basis_t(:,ii),evc_g,fc%npwt,fc%ig_l2gt,mpime,nproc,ionode_id,intra_pool_comm)
> +!   enddo
> +endif
> +
> +deallocate(evc_g)
> +deallocate(p_basis)
> +
> +call start_clock('direct_w_cft3t')
> +do ii=1,z%numw_prod,2
> +   psic(1:fc%nrxxt)=(0.d0,0.d0)
> +   if (ii==z%numw_prod) then
> +      psic(fc%nlt(1:fc%npwt))  = p_basis_t(1:fc%npwt,ii)
> +      psic(fc%nltm(1:fc%npwt)) = CONJG( p_basis_t(1:fc%npwt,ii) )
> +   else
> +      psic(fc%nlt(1:fc%npwt))=p_basis_t(1:fc%npwt,ii)+(0.d0,1.d0)*p_basis_t(1:fc%npwt,ii+1)
> +      psic(fc%nltm(1:fc%npwt))=CONJG(p_basis_t(1:fc%npwt,ii))+(0.d0,1.d0)*CONJG(p_basis_t(1:fc%npwt,ii+1))
> +   endif
> +      CALL cft3t( fc, psic, fc%nr1t, fc%nr2t, fc%nr3t, fc%nrx1t, fc%nrx2t, fc%nrx3t, 2 )
> +       p_basis_r(1:fc%nrxxt,ii)= DBLE(psic(1:fc%nrxxt))
> +      if(ii/=z%numw_prod) p_basis_r(1:fc%nrxxt,ii+1)= DIMAG(psic(1:fc%nrxxt))
> +enddo
> +call stop_clock('direct_w_cft3t')
> +
> +deallocate(p_basis_t) 
> +
> +!read P
> +call initialize_polaw(pw)
> +call read_polaw_global(0, pw)
> +
> +
> +call mp_barrier(world_comm)
> +
> +!FFT the input excitonic vector to real space (dual grid)
> +call initialize_exc_r(a_in_rt)
> +call fft_a_exc(a_in,fc,a_in_rt)
> +
> +call mp_barrier(world_comm)
> +
> +! allocate tmp matrix
> +call initialize_exc_r(a_tmp_rt)
> +a_tmp_rt%nrxxt=fc%nrxxt 
> +a_tmp_rt%numb_v=a_in%numb_v
> +a_tmp_rt%label=12
> +allocate(a_tmp_rt%ar(a_tmp_rt%nrxxt,a_tmp_rt%numb_v))
> +
> +!compute line by line the output excitonic vector
> +
> +!!!!!!!!!!!!!!!!!dgemm subroutine!!!!!!!!!!!!!!!!!!!!!
> +call start_clock('direct_w_dgemv')
> +allocate(zp(z%numw_prod,iimat%np_max))
> +!!allocate(pizeta(z%numw_prod,iimat%np_max)) 
> +!!allocate(vphipizeta(fc%nrxxt,iimat%np_max)) 
> +!
> +a_tmp_rt%ar(1:a_tmp_rt%nrxxt,1:a_tmp_rt%numb_v) =0.d0
> +!
> +do iv=1, a_in%numb_v
> +   zp(1:z%numw_prod,1:iimat%np_max)=0.d0
> +   vpmax=0
> + 
> +   call start_clock('dgemv1')
> +   do ii=1, iimat%np_max
> +      if (iimat%iimat(ii,iv)==0) cycle
> +      vpmax=vpmax+1
> +      do k=1, z%numw_prod
> +         zp(k,ii)=z%z(k,ii,iv)
> +      enddo
> +   enddo
> +
> +   allocate(pizeta(z%numw_prod,vpmax)) 
> +   allocate(vphipizeta(fc%nrxxt,vpmax)) 
> +
> +   call stop_clock('dgemv1')
> +
> +
> +   call start_clock('dgemv2')
> +   call dgemm('N','N', z%numw_prod,vpmax, z%numw_prod,1.d0,pw%pw,z%numw_prod,zp(1,1),&
> +              z%numw_prod,0.d0,pizeta(1,1),z%numw_prod)
> +   call stop_clock('dgemv2')
> +
> +
> +   call start_clock('dgemv3')
> +   call dgemm('N','N', fc%nrxxt, vpmax, z%numw_prod,1.d0,p_basis_r(1,1),fc%nrxxt,&
> +              pizeta(1,1), z%numw_prod, 0.d0, vphipizeta(1,1),fc%nrxxt)
> +   call stop_clock('dgemv3')
> +
> +!!     sum up
> +   call start_clock('dgemv4')
> +   do ii=1,vpmax
> +      a_tmp_rt%ar(1:fc%nrxxt,iv)= &
> +            &a_tmp_rt%ar(1:fc%nrxxt,iv)+a_in_rt%ar(1:fc%nrxxt,iimat%iimat(ii,iv))*vphipizeta(1:fc%nrxxt,ii)
> +   enddo
> +!
> +   call stop_clock('dgemv4')
> +!
> +!
> +   deallocate(pizeta)
> +   deallocate(vphipizeta)
> +call mp_barrier(world_comm)
> +enddo
> +call stop_clock('direct_w_dgemv')
> +
> +deallocate(zp)
> +
> +!!!!!!!!!!!!!!!!!!dgemv subroutine!!!!!!!!!!!!!!!!!!!!!
> +!call start_clock('direct_w_dgemv')
> +!allocate(zvv(z%numw_prod))
> +!allocate(zvphi(fc%nrxxt))
> +!a_tmp_rt%ar(1:a_tmp_rt%nrxxt,1:a_tmp_rt%numb_v) =0.d0
> +!!
> +!do iv=1, a_in%numb_v
> +!   do ii=1, iimat%np_max !ii is ivp
> +!      if (iimat%iimat(ii,iv)==0) exit
> +!      call start_clock('dgemv1')
> +!      call dgemv('N',z%numw_prod,z%numw_prod,1.d0,pw%pw,z%numw_prod,z%z(1,ii,iv),1,0.d0,zvv,1)
> +!      call stop_clock('dgemv1')
> +!      call start_clock('dgemv2')
> +!      call dgemv('N',fc%nrxxt,z%numw_prod,1.d0,p_basis_r,fc%nrxxt,zvv,1,0.d0,zvphi,1)
> +!      call stop_clock('dgemv2')
> +!!     sum up
> +!      call start_clock('dgemv3')
> +!      a_tmp_rt%ar(1:fc%nrxxt,iv)= &
> +!            &a_tmp_rt%ar(1:fc%nrxxt,iv)+a_in_rt%ar(1:a_in_rt%nrxxt,iimat%iimat(ii,iv))*zvphi(1:fc%nrxxt)
> +!      call stop_clock('dgemv3')
> +!   enddo
> +!enddo
> +!call stop_clock('direct_w_dgemv')
> +
> +!deallocate(zvv)
> +!deallocate(zvphi)
> +
> +call free_memory_exc_a_r(a_in_rt)
> +call free_bse_z(z)
> +call free_memory_polaw(pw)
> +call free_imat(iimat)
> +
> +call start_clock('wdirect_fftback')
> +!FFT back to provide the output excitonic wave vector in G-space
> +call fftback_a_exc(a_tmp_rt,fc,a_out)
> +call stop_clock('wdirect_fftback')
> +
> +call free_memory_exc_a_r(a_tmp_rt)
> +
> +call stop_clock('direct_w_exc')
> +
> +return
> +end subroutine
> +
> 
> Added: trunk/espresso/GWW/bse/dvpsi_bse.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/dvpsi_bse.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/dvpsi_bse.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,254 @@
> +!
> +! Copyright (C) 2003-2007 Quantum ESPRESSO group
> +! This file is distributed under the terms of the
> +! GNU General Public License. See the file `License'
> +! in the root directory of the present distribution,
> +! or http://www.gnu.org/copyleft/gpl.txt .
> +!
> +!
> +!----------------------------------------------------------------------
> +SUBROUTINE dvpsi_e(kpoint,ipol,dvpsi2)
> +  !----------------------------------------------------------------------
> +  ! MARGHE: DA RIPULIRE E OTTIMIZZARE
> +  ! Calculates x * psi_k  for each k-points and for the 3 polarizations
> +  ! Requires on input: vkb, evc, igk
> +  !
> +  USE ions_base, ONLY : ntyp => nsp, nat, ityp
> +  USE kinds, ONLY: DP
> +  USE pwcom
> +  USE uspp, ONLY: nkb, vkb, dvan
> +  USE uspp_param, ONLY: nh
> +  USE wavefunctions_module,  ONLY: evc
> +  USE wvfct,    ONLY : nbnd, npwx,et
> +  USE becmod, ONLY: bec_type, becp, calbec, allocate_bec_type, deallocate_bec_type
> +  USE io_files, ONLY: nwordwfc,iunwfc
> +  USE lsda_mod,              ONLY : nspin
> +  USE control_flags,    ONLY : gamma_only
> +  USE io_global,   ONLY : ionode,stdout
> +  USE gvect,                 ONLY : gstart
> +  USE mp, ONLY : mp_sum, mp_barrier
> +  USE mp_world,             ONLY : world_comm
> +  use bse_wannier, ONLY:num_nbndv
> +  USE gvecw,              ONLY :  ecutwfc
> +!  USE cgcom
> +  !
> +  IMPLICIT NONE
> +  INTEGER :: kpoint, ipol,is,niter_ph
> +  INTEGER :: i,l, na,nt, ibnd,jbnd, info, ih,jkb, iter
> +  real(DP) :: upol(3,3),tr2_ph
> +  real(DP), ALLOCATABLE :: gk(:,:), q(:), overlap(:,:), &
> +       becp_(:,:), dbec(:,:), dbec_(:,:)
> +  real(DP), ALLOCATABLE :: becp2_(:,:), dbec2(:,:), dbec2_(:,:)
> +  COMPLEX(DP), ALLOCATABLE :: dvkb(:,:), dvkb1(:,:), work(:,:), &
> +       &           gr(:,:), h(:,:)
> +  COMPLEX(DP), ALLOCATABLE:: dpsi(:,:), evc2(:,:),dpsi2(:,:)
> +  COMPLEX(DP) :: dvpsi(npwx , nbnd)
> +  COMPLEX(DP) :: dvpsi2(npwx , num_nbndv(1))
> +  
> +  LOGICAL:: precondition, orthonormal,startwith0,debug
> +  EXTERNAL H_h
> +  data upol /1.0d0,0.0d0,0.0d0, 0.0d0,1.0d0,0.0d0, 0.0d0,0.0d0,1.0d0/
> +  real(kind=DP), allocatable :: omat(:,:)
> +  integer j
> +
> +  !
> +  CALL start_clock('dvpsi_e')
> +
> +  CALL gk_sort (xk(1,kpoint),ngm,g,ecutwfc/tpiba2,npw,igk,g2kin)
> +  CALL init_us_2 (npw, igk, xk(1,kpoint), vkb)
> +  
> +  gamma_only=.true.
> +  debug=.true.
> +
> +
> +  allocate(omat(nbnd,nbnd))
> +
> +  allocate( evc( npwx, nbnd ) )
> +  allocate( evc2( npwx, num_nbndv(1) ) )
> +  do is=1,nspin
> +     call davcio(evc,2*nwordwfc,iunwfc,is,-1) 
> +  enddo
> +  evc2(1:npwx,1:num_nbndv(1))=evc(1:npwx, 1:num_nbndv(1))
> +
> +  !
> +  !   becp contains <beta|psi> - used in H_h
> +  !
> +  CALL allocate_bec_type ( nkb, nbnd, becp )
> +  ALLOCATE ( gk   ( 3, npwx) )
> +  ALLOCATE ( dvkb ( npwx, nkb) )
> +  ALLOCATE ( dvkb1( npwx, nkb) )
> +  ALLOCATE ( becp_(nkb,nbnd), dbec ( nkb, nbnd), dbec_(nkb, nbnd) )
> +  ALLOCATE ( becp2_(nkb,num_nbndv(1)), dbec2 ( nkb, num_nbndv(1)), dbec2_(nkb, num_nbndv(1)) )
> +  ALLOCATE ( dpsi (npwx , nbnd))
> +  ALLOCATE ( dpsi2 (npwx , num_nbndv(1)))
> +  !
> +  DO i = 1,npw
> +     gk(1,i) = (xk(1,kpoint)+g(1,igk(i)))*tpiba
> +     gk(2,i) = (xk(2,kpoint)+g(2,igk(i)))*tpiba
> +     gk(3,i) = (xk(3,kpoint)+g(3,igk(i)))*tpiba
> +     g2kin(i)= gk(1,i)**2 + gk(2,i)**2 + gk(3,i)**2
> +  ENDDO
> +  !
> +  !  this is  the kinetic contribution to [H,x]:  -2i (k+G)_ipol * psi
> +  !
> +  dpsi(1:npwx,1:nbnd)=(0.d0,0.d0)  
> +  dpsi2(1:npwx,1:num_nbndv(1))=(0.d0,0.d0)  
> +  
> +  DO ibnd = 1,nbnd
> +     DO i = 1,npw
> +        dpsi(i,ibnd) = gk(ipol,i)*(0.0d0,-2.0d0) * evc(i,ibnd)
> +     ENDDO
> +  ENDDO
> +  DO ibnd = 1,num_nbndv(1)
> +     DO i = 1,npw
> +        dpsi2(i,ibnd) = gk(ipol,i)*(0.0d0,-2.0d0) * evc2(i,ibnd)
> +     ENDDO
> +  ENDDO
> +
> +  !
> +  DO i = 1,npw
> +     IF (g2kin(i)>1.0d-10) THEN
> +        gk(1,i) = gk(1,i)/sqrt(g2kin(i))
> +        gk(2,i) = gk(2,i)/sqrt(g2kin(i))
> +        gk(3,i) = gk(3,i)/sqrt(g2kin(i))
> +     ENDIF
> +  ENDDO
> +  !
> +  ! and these are the contributions from nonlocal pseudopotentials
> +  ! ( upol(3,3) are the three unit vectors along x,y,z)
> +  !
> +
> +
> +  CALL gen_us_dj(kpoint,dvkb)
> +  CALL gen_us_dy(kpoint,upol(1,ipol),dvkb1)
> +
> +  !
> +  DO jkb = 1, nkb
> +     DO i = 1,npw
> +        dvkb(i,jkb) =(0.d0,-1.d0)*(dvkb1(i,jkb) + dvkb(i,jkb)*gk(ipol,i))
> +     ENDDO
> +  ENDDO
> +  !
> +  CALL calbec ( npw,  vkb, evc,  becp )
> +  CALL calbec ( npw, dvkb, evc,  dbec )
> +  !
> +  jkb = 0
> +  DO nt=1, ntyp
> +     DO na = 1,nat
> +        IF (nt==ityp(na)) THEN
> +           DO ih=1,nh(nt)
> +              jkb=jkb+1
> +              DO ibnd = 1,nbnd
> +                 dbec_(jkb,ibnd) = dbec(jkb,ibnd)*dvan(ih,ih,nt)
> +!                     if(ionode) write(*,*) 'dbec(j,ib)',  dbec(jkb,ibnd)
> +!                     if(ionode) write(*,*) 'dvan(ih,ih,nt)', dvan(ih,ih,nt)
> +                 becp_(jkb,ibnd) =becp%r(jkb,ibnd)*dvan(ih,ih,nt)
> +              ENDDO
> +           ENDDO
> +        ENDIF
> +     ENDDO
> +  ENDDO
> +
> + dbec2_(1:nkb,1:num_nbndv(1))=dbec_(1:nkb,1:num_nbndv(1)) 
> + becp2_(1:nkb,1:num_nbndv(1))=becp_(1:nkb,1:num_nbndv(1)) 
> +
> +
> +
> +  !
> +  IF (jkb/=nkb) CALL errore('dvpsi_e','unexpected error',1)
> +  !
> +  CALL dgemm ('N', 'N', 2*npw, nbnd, nkb,-1.d0, vkb, &
> +       2*npwx, dbec_, nkb, 1.d0, dpsi, 2*npwx)
> +
> +  CALL dgemm ('N', 'N', 2*npw, num_nbndv(1), nkb,-1.d0, vkb, &
> +       2*npwx, dbec2_, nkb, 1.d0, dpsi2, 2*npwx)
> +
> +  FLUSH( stdout )
> +
> +  CALL dgemm ('N', 'N', 2*npw, nbnd, nkb, 1.d0,dvkb, &
> +       2*npwx, becp_, nkb, 1.d0, dpsi, 2*npwx)
> +
> +  CALL dgemm ('N', 'N', 2*npw, num_nbndv(1), nkb, 1.d0,dvkb, &
> +       2*npwx, becp2_, nkb, 1.d0, dpsi2, 2*npwx)
> +  !
> +  DEALLOCATE(dbec, dbec_, becp_)
> +  DEALLOCATE(dbec2, dbec2_, becp2_)
> +  DEALLOCATE(dvkb1)
> +  DEALLOCATE(dvkb)
> +  DEALLOCATE(gk)
> +
> +  !
> +  !   dpsi contains now [H,x] psi_v  for the three cartesian polarizations.
> +  !   Now solve the linear systems (H-e_v)*(x*psi_v) = [H,x]*psi_v
> +  !
> +!  ALLOCATE  ( overlap( nbnd, nbnd))
> +!  ALLOCATE  ( work(npwx, nbnd))
> +!  ALLOCATE  ( gr( npwx, nbnd))
> +!  ALLOCATE  ( h ( npwx, nbnd))
> +!  ALLOCATE  ( q ( npwx))
> +  !
> +  ALLOCATE  ( overlap( num_nbndv(1), num_nbndv(1)))
> +  ALLOCATE  ( work(npwx, num_nbndv(1)))
> +  ALLOCATE  ( gr( npwx, num_nbndv(1)))
> +  ALLOCATE  ( h ( npwx, num_nbndv(1)))
> +  ALLOCATE  ( q ( npwx))
> +!
> +  orthonormal = .false.
> +  precondition= .true.
> +  !
> +  IF (precondition) THEN
> +     DO i = 1,npw
> +        q(i) = 1.0d0/max(1.d0,g2kin(i))
> +     ENDDO
> +     CALL zvscal(npw,npwx,num_nbndv(1),q,evc2,work)
> +     CALL calbec ( npw, work, evc2, overlap)
> +     CALL DPOTRF('U',num_nbndv(1),overlap,num_nbndv(1),info)
> +     IF (info/=0) CALL errore('solve_ph','cannot factorize',info)
> +  ENDIF
> +  !
> +  startwith0= .true.
> +  dvpsi2(:,:) = (0.d0, 0.d0)
> +  niter_ph = 50
> +  tr2_ph = 1.0d-12
> +  !
> +  CALL cgsolve (npw,evc2,npwx,num_nbndv(1),overlap,num_nbndv(1),   &
> +       orthonormal,precondition,q,startwith0,et(1,1),&
> +       dpsi2,gr,h,dpsi2,work,niter_ph,tr2_ph,iter,dvpsi2)
> +
> +!  IF (precondition) THEN
> +!     DO i = 1,npw
> +!        q(i) = 1.0d0/max(1.d0,g2kin(i))
> +!     ENDDO
> +!     CALL zvscal(npw,npwx,nbnd,q,evc,work)
> +!     CALL calbec ( npw, work, evc, overlap)
> +!     CALL DPOTRF('U',nbnd,overlap,nbnd,info)
> +!     IF (info/=0) CALL errore('solve_ph','cannot factorize',info)
> +!  ENDIF
> +!  !
> +!  startwith0= .true.
> +!  dvpsi(:,:) = (0.d0, 0.d0)
> +!  niter_ph = 50
> +!  tr2_ph = 1.0d-12
> +  !
> +!  CALL cgsolve (H_h,npw,evc,npwx,nbnd,overlap,nbnd,   &
> +!       orthonormal,precondition,q,startwith0,et(1,1),&
> +!       dpsi,gr,h,dpsi,work,niter_ph,tr2_ph,iter,dvpsi)
> +  !
> +  DEALLOCATE(q)
> +  DEALLOCATE(h)
> +  DEALLOCATE(gr)
> +  DEALLOCATE(work)
> +  DEALLOCATE(overlap)
> +  DEALLOCATE(dpsi)
> +  DEALLOCATE(evc)
> +  DEALLOCATE(dpsi2)
> +  DEALLOCATE(evc2)
> +  DEALLOCATE(omat)
> +
> +  CALL deallocate_bec_type ( becp )
> +  
> +  !
> +  CALL stop_clock('dvpsi_e')
> +  !
> +  RETURN
> +END SUBROUTINE dvpsi_e
> 
> Added: trunk/espresso/GWW/bse/exc_h_a.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/exc_h_a.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/exc_h_a.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,222 @@
> +subroutine exc_h_a(a_in,a_out,vstate,vstate_r,cstate,wcstate,fc) 
> +!this subroutine applies the excitonic hamiltonian exc_h_a on a given vector
> +!(a_in) and returns the transformed vector (a_out)
> +!if l_gtrick==.true. uses vstate only and does not use vstate_r
> +
> +use exciton
> +use bse_basic_structures
> +
> +use bse_wannier, ONLY:num_nbndv,l_truncated_coulomb, &
> +                      truncation_radius,l_fullbse,l_tdhf,l_lf,l_rpa,&
> +                      l_contraction
> +use pwcom
> +USE wvfct,     ONLY : npwx
> +!use io_files,  ONLY : find_free_unit,diropn
> +use io_files,  ONLY : diropn
> +USE io_global, ONLY : stdout,ionode
> +USE fft_custom_gwl
> +USE mp,          ONLY :mp_barrier
> +USE mp_world,             ONLY : world_comm
> +USE contract_w
> +
> +implicit none
> +INTEGER, EXTERNAL :: find_free_unit
> +
> +type(exc) :: a_in
> +type(exc) :: a_out ! to be initialized and allocated outside this subroutine
> +type(v_state) :: vstate
> +type(v_state_r) :: vstate_r
> +type(c_state) :: cstate
> +type(c_state) :: wcstate
> +type(fft_cus) :: fc
> +
> +type(exc) :: a_excdiago,a_exchange 
> +type(exc):: a_dirv
> +type(exc):: a_dirw
> +type(exc):: a_rot
> +
> +
> +integer iuv
> +logical exst
> +logical debug
> +
> +!variables introduced for debug purposes
> +real(kind=DP) prod_test1 
> +real(kind=DP) prod_test2 
> +
> +call start_clock('exc_h_a')
> +debug=.true.
> +
> +  if(debug) then 
> +     write(stdout,*) 'Starting exc_h_a subroutine'
> +  endif
> +
> +  call mp_barrier(world_comm)
> +
> +! initialize and nullify all needed excitonic vectors
> +  call initialize_exc(a_excdiago)
> +  a_excdiago%label=2
> +  a_excdiago%npw=npw
> +  a_excdiago%numb_v=num_nbndv(1)
> +  allocate(a_excdiago%a(a_excdiago%npw,a_excdiago%numb_v))
> +  a_excdiago%a(1:a_excdiago%npw,1:a_excdiago%numb_v)=dcmplx(0.d0,0.d0)
> +
> +  call initialize_exc(a_exchange)
> +  a_exchange%label=3
> +  a_exchange%npw=npw
> +  a_exchange%numb_v=num_nbndv(1)
> +  allocate(a_exchange%a(a_exchange%npw,a_exchange%numb_v))
> +  a_exchange%a(1:a_exchange%npw,1:a_exchange%numb_v)=dcmplx(0.d0,0.d0)
> +
> +  call initialize_exc(a_dirv)
> +  a_dirv%label=4
> +  a_dirv%npw=npw
> +  a_dirv%numb_v=num_nbndv(1)
> +  allocate(a_dirv%a(a_dirv%npw,a_dirv%numb_v))
> +  a_dirv%a(1:a_dirv%npw,1:a_dirv%numb_v)=dcmplx(0.d0,0.d0)
> +
> +  call initialize_exc(a_dirw)
> +  a_dirw%label=6
> +  a_dirw%npw=npw
> +  a_dirw%numb_v=num_nbndv(1)
> +  allocate(a_dirw%a(a_dirw%npw,a_dirw%numb_v))
> +  a_dirw%a(1:a_dirw%npw,1:a_dirw%numb_v)=dcmplx(0.d0,0.d0)
> +
> +! apply the diagonal part of the excitonic Hamiltonian to a copy of a_exc
> +  a_excdiago%a(1:a_excdiago%npw,1:a_excdiago%numb_v)=a_in%a(1:a_in%npw,1:a_in%numb_v)
> +  call diago_exc(a_excdiago,vstate,cstate,wcstate)
> +
> +  if(debug) then 
> +     write(stdout,*) 'Diagonal part computed'
> +  endif
> +  call mp_barrier(world_comm)
> +  call initialize_exc(a_rot)
> +
> +! apply the exchange term of the Hamiltonian
> +  if(.not.l_rpa) then
> +!     if(.not.l_truncated_coulomb) then
> +!        iuv = find_free_unit()
> +!        CALL diropn( iuv, 'vgq', npwx, exst )
> +!        CALL davcio(vg_q,npwx,iuv,1,-1)
> +!        close(iuv)
> +!     endif
> +
> +
> +     if(debug) then 
> +        write(stdout,*) 'vg_q read'
> +     call mp_barrier(world_comm)
> +     endif
> +
> +     call exchange_exc(a_in,vstate,vstate_r,fc,a_exchange)
> +
> +     if(debug) then 
> +        write(stdout,*) 'Exchange part computed'
> +     endif
> +
> +     call mp_barrier(world_comm)
> +
> +!    apply the direct term of the Hamiltonian (v part)
> +     
> +     if(.not.l_lf) then
> +   
> +        call initialize_exc(a_rot)
> +        a_rot%label=5
> +        a_rot%npw=npw
> +        a_rot%numb_v=num_nbndv(1)
> +        allocate(a_rot%a(a_rot%npw,a_rot%numb_v))
> +
> +!       first rotate the excitonic wave function wave vector to use the wannier
> +!       wavefunctions
> +        if(debug) write(stdout,*)  'DEBUG1'
> +        Call urot_a(a_in,a_rot,1)
> +        if(debug) write(stdout,*)  'DEBUG2'
> +        if(.not.l_contraction) then
> +           call direct_v_exc(a_rot,fc,a_dirv)
> +        else
> +           call contract_v_apply(a_rot,fc,a_dirv)
> +        endif
> +        if(debug) write(stdout,*)  'DEBUG3'
> +        call pc_operator_exc(a_dirv,vstate,1)
> +        if(debug) write(stdout,*)  'DEBUG4'
> +!        and rotate back
> +        call urot_a(a_dirv,a_rot,0)
> +        a_dirv%a(1:a_dirv%npw,1:a_dirv%numb_v)=a_rot%a(1:a_rot%npw,1:a_rot%numb_v)
> +
> +        if(debug) write(stdout,*)  'DEBUG5'
> +        call mp_barrier(world_comm)
> +!
> +        if(debug) then 
> +            write(stdout,*) 'After direct_v_exc'
> +        endif
> +
> +! apply the direct term of the excitonic Hamiltonian (Wc part)
> +
> +  
> +        if(.not.l_tdhf) then
> +           call urot_a(a_in,a_rot,1)
> +
> +           if(debug) then 
> +              write(stdout,*) 'Before direct_W_exc'
> +           endif
> +
> +           !if(.true.) then !DEBUG
> +           if(.not.l_contraction) then
> +              call direct_w_exc(a_rot,fc,a_dirw)
> +           else
> +              call contract_w_apply(a_rot,fc,a_dirw)
> +           endif
> +           call pc_operator_exc(a_dirw,vstate,1)
> +
> +!           and rotate back
> +           call urot_a(a_dirw,a_rot,0)
> +           a_dirw%a(1:a_dirw%npw,1:a_dirw%numb_v)=a_rot%a(1:a_rot%npw,1:a_rot%numb_v)
> +
> +           call mp_barrier(world_comm)
> +           if(debug) then 
> +              write(stdout,*) 'After direct_W_exc'
> +           endif
> +
> +
> +
> +
> +        endif ! .not.l_tdhf
> +     endif ! .not.l_lf
> +  endif  ! .not. l_rpa  
> +
> +! sum up all the terms
> +! only valid for spin-singlet class of solutions (non-spin polarized case)
> +  a_out%a(1:a_out%npw,1:a_out%numb_v)=a_excdiago%a(1:a_out%npw,1:a_out%numb_v)&
> +                                      -a_dirv%a(1:a_out%npw,1:a_out%numb_v)&
> +                                     -a_dirw%a(1:a_out%npw,1:a_out%numb_v)&
> +                                     +2.d0*a_exchange%a(1:a_out%npw,1:a_out%numb_v)
> +
> +! free memory
> +  call free_memory_exc_a(a_rot)
> +  call free_memory_exc_a(a_excdiago)
> +  call free_memory_exc_a(a_exchange)
> +  call free_memory_exc_a(a_dirv)
> +  call free_memory_exc_a(a_dirw)
> +
> +
> +
> +  call stop_clock('exc_h_a')
> +
> +if(debug) then
> +  call print_clock('exc_h_a')
> +  call print_clock('diago_exc')
> +  call print_clock('exchange_exc')
> +  call print_clock('direct_v_exc')
> +  call print_clock('direct_w_contract')
> +  call print_clock('wdirect_fftback')
> +  call print_clock('contract_w_dgemv')
> +  call print_clock('direct_v_contract')
> +  call print_clock('d_v_fft')
> +endif
> +
> + return
> +end subroutine exc_h_a
> +
> +
> +
> +
> +
> 
> Added: trunk/espresso/GWW/bse/exchange_exc.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/exchange_exc.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/exchange_exc.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,188 @@
> +subroutine exchange_exc(a_in,v,v_rt,fc,a_out)
> +! this subroutine applies the exchange term of the Hamiltonian to the excitonic
> +! wavefuntion vector a 
> +!if l_gtrick==.true. uses vonly and does not use v_rt     
> +
> +USE kinds,            ONLY : DP
> +USE exciton
> +use bse_basic_structures
> +use bse_wannier,      ONLY: l_truncated_coulomb, &
> +                             truncation_radius, &
> +                             dual_bse
> +USE gvect
> +USE constants,        ONLY : e2, fpi
> +USE cell_base,        ONLY : tpiba,omega,tpiba2
> +!USE mp_wave,          ONLY : mergewf,splitwf
> +USE fft_custom_gwl
> +USE gvecw,              ONLY : ecutwfc
> +USE io_global, ONLY : stdout, ionode, ionode_id
> +USE mp_world, ONLY : mpime, nproc
> +USE mp_pools, ONLY: intra_pool_comm
> +USE mp_world,             ONLY : world_comm
> +USE wavefunctions_module, ONLY :  psic
> +USE mp,          ONLY :mp_barrier
> +
> +!USE io_files,         ONLY : find_free_unit
> +
> +
> +implicit none
> +INTEGER, EXTERNAL :: find_free_unit
> +type(exc) :: a_in,a_out
> +type(exc_r) :: a_rt
> +type(v_state) :: v
> +type(v_state_r) :: v_rt
> +type(fft_cus) :: fc
> +
> +
> +REAL(kind=DP), ALLOCATABLE :: fac(:)
> +COMPLEX(kind=DP), ALLOCATABLE :: fac_t(:)
> +COMPLEX(kind=DP), ALLOCATABLE :: cfac(:)
> +COMPLEX(kind=DP), ALLOCATABLE :: evc_g(:)
> +REAL(kind=DP) :: qq
> +integer ig,iv,iunu
> +COMPLEX(kind=DP), allocatable :: psiv_phiv(:)
> +
> +logical :: debug
> +
> +call start_clock('exchange_exc')
> +debug=.false.
> +
> +if(debug) then 
> +  write(*,*) 'Starting to compute the exchange term'
> +  call mp_barrier(world_comm)
> +endif
> +
> +allocate(fac(a_in%npw))
> +allocate(cfac(a_in%npw))
> +
> +if(l_truncated_coulomb) then
> +   do ig=1,a_in%npw
> +     qq = g(1,ig)**2.d0 + g(2,ig)**2.d0 + g(3,ig)**2.d0
> +     if (qq > 1.d-8) then
> +        fac(ig)=(e2*fpi/(tpiba2*qq))*(1.d0-dcos(dsqrt(qq)*truncation_radius*tpiba))
> +     else
> +        fac(ig)=e2*fpi*(truncation_radius**2.d0/2.d0)
> +     endif
> +   enddo
> +   fac(:)=fac(:)/omega
> +else
> +   fac(:)=0.d0
> +   fac(1:a_in%npw)=vg_q(1:a_in%npw)
> +   !set the 0 component of v to 0 to have vbar (see S. Albrecht phD thesis or Onida,Reining,and Rubio RMP)
> +   if (v%gstart==2) fac(1)=0.d0
> +endif
> +
> +cfac(1:a_in%npw)=dcmplx(fac(1:a_in%npw))
> +
> +
> +if(debug) then 
> +  write(*,*) 'vbar built'
> +  call mp_barrier(world_comm)
> +endif
> +
> +! now distribuite the G vector in the dual grid order
> +
> +allocate(fac_t(fc%npwt))
> +allocate(evc_g(fc%ngmt_g))
> +call reorderwfp_col(1,a_in%npw,fc%npwt,cfac(1),fac_t(1),a_in%npw,fc%npwt, &
> +     & ig_l2g,fc%ig_l2gt,fc%ngmt_g,mpime, nproc,intra_pool_comm )
> +
> +!call  mergewf(cfac,evc_g,a_in%npw,ig_l2g,mpime,nproc,ionode_id,intra_pool_comm)
> +!call  splitwf(fac_t,evc_g,fc%npwt,fc%ig_l2gt,mpime,nproc,ionode_id,intra_pool_comm)
> +deallocate(evc_g)
> +
> +
> +if(debug) then 
> +  write(*,*) 'vbar distributed'
> +  call mp_barrier(world_comm)
> +endif
> +
> +!FFT the excitonic wavefunction vector to real space (dual grid)
> +call initialize_exc_r(a_rt)
> +call fft_a_exc(a_in,fc,a_rt)
> +
> +if(debug) then 
> +  write(*,*) 'fft exc state'
> +  call mp_barrier(world_comm)
> +endif
> +!compute the psi_iv(r)*phi_iv(r) product in r-space (dual grid) for every iv 
> +!FFT back to G space (dual grid order), multiply with vbar, and sum over iv
> + 
> +!be careful! for the moment there is no spin structure in the a_in_r object so this works
> +!only for nspin==1
> +
> +allocate(psiv_phiv(fc%npwt))
> +
> +psiv_phiv(1:fc%npwt)=(0.d0,0.d0)
> +
> +do iv=1,a_in%numb_v,2
> +   if(debug) then 
> +     write(*,*) 'exchange term main loop iv=',iv
> +     write(*,*) 'a_in%numb_v=',a_in%numb_v
> +     write(*,*) 'a_rt%nrxxt=',a_rt%nrxxt
> +!     call mp_barrier
> +   endif
> +   a_rt%ar(1:a_rt%nrxxt,iv) = a_rt%ar(1:a_rt%nrxxt,iv)*v_rt%wfnrt(1:v_rt%nrxxt,iv,1)
> +   if(iv/=a_in%numb_v) a_rt%ar(1:a_rt%nrxxt,iv+1) = a_rt%ar(1:a_rt%nrxxt,iv+1)*v_rt%wfnrt(1:v_rt%nrxxt,iv+1,1)
> +   if (iv==a_in%numb_v) then 
> +      psic(1:fc%nrxxt)=dcmplx(a_rt%ar(1:fc%nrxxt,iv),0.d0)
> +   else
> +      psic(1:fc%nrxxt)=dcmplx(a_rt%ar(1:fc%nrxxt,iv),a_rt%ar(1:fc%nrxxt,iv+1))
> +   endif 
> +   if(debug) then 
> +     write(*,*) 'before fft'
> +     write(*,*) 'fc%nr1t',fc%nr1t
> +     write(*,*) 'fc%nr2t',fc%nr2t
> +     write(*,*) 'fc%nr3t',fc%nr3t
> +     write(*,*) 'fc%nrx1t',fc%nrx1t
> +     write(*,*) 'fc%nrx2t',fc%nrx2t
> +     write(*,*) 'fc%nrx3t',fc%nrx3t
> +!     call mp_barrier
> +   endif
> +   CALL cft3t(fc, psic, fc%nr1t, fc%nr2t, fc%nr3t, fc%nrx1t, fc%nrx2t, fc%nrx3t, -2 )
> +   if(debug) then 
> +     write(*,*) 'after fft'
> +     call mp_barrier(world_comm)
> +   endif
> +   if (iv==a_in%numb_v) then 
> +      psiv_phiv(1:fc%npwt)= psiv_phiv(1:fc%npwt)+psic(fc%nlt(1:fc%npwt))*fac_t(1:fc%npwt)
> +   else
> +      psiv_phiv(1:fc%npwt)= psiv_phiv(1:fc%npwt)+&
> +           &0.5*(psic(fc%nlt(1:fc%npwt))+conjg( psic(fc%nltm(1:fc%npwt))))*fac_t(1:fc%npwt)+&
> +           &(0.d0,-0.5d0)*(psic(fc%nlt(1:fc%npwt))-conjg(psic(fc%nltm(1:fc%npwt))))*fac_t(1:fc%npwt)
> +   endif
> +   if(debug) then 
> +     write(*,*) 'end of loop'
> +     call mp_barrier(world_comm)
> +   endif
> +enddo
> +
> +if(fc%gstart_t==2) psiv_phiv(1)= (0.d0,0.d0)
> +
> +!FFT to real space and multiply by valence state wavefunction vector to create
> +!the components of the excitonic vector and then FFT back 
> +
> +psic(:)=0.d0
> +psic(fc%nlt(1:fc%npwt))  = psiv_phiv(1:fc%npwt)
> +psic(fc%nltm(1:fc%npwt)) = CONJG(psiv_phiv(1:fc%npwt))
> +
> +CALL cft3t( fc, psic, fc%nr1t, fc%nr2t, fc%nr3t, fc%nrx1t, fc%nrx2t, fc%nrx3t, 2 )
> +
> +do iv=1,a_in%numb_v
> +    a_rt%ar(1:a_rt%nrxxt,iv)=v_rt%wfnrt(1:v_rt%nrxxt,iv,1)*dble(psic(1:fc%nrxxt))
> +enddo
> +
> +call fftback_a_exc(a_rt,fc,a_out)
> +
> +! project into the conduction band manifold
> +
> +call pc_operator_exc(a_out,v,1)
> +
> +deallocate(fac_t)
> +call free_memory_exc_a_r(a_rt)
> +
> +call stop_clock('exchange_exc')
> +
> +return
> +end subroutine
> +
> 
> Added: trunk/espresso/GWW/bse/exciton.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/exciton.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/exciton.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,621 @@
> +module exciton
> +! this module cointains variables and subroutines related to 
> +! the excitonic wave functions and energies
> +
> +USE kinds, ONLY : DP
> +
> +type exc
> +!excitonic wavefunction vector in G space
> +    integer npw ! number of plane waves per processor
> +    integer numb_v ! number of valnce state
> +!    real(kind=dp), dimension (:), pointer :: e_ks ! Kohn-Sham energy of valence states  
> +    complex(kind=dp), dimension (:,:), pointer :: a ! vector on which Hexc can be applied  a(npw,nbnd_v)
> +    INTEGER :: label!label to read/write to disk
> +    real(kind=DP) :: e ! energy of the excitonic eigenstate
> +end type exc
> +
> +type exc_r
> +!excitonic wavefunction vector in real space (double grid)
> +    integer nrxxt ! number of plane waves per processor
> +    integer numb_v ! number of valnce state
> +!    real(kind=dp), dimension (:), pointer :: e_ks ! Kohn-Sham energy of valence states  
> +    real(kind=dp), dimension (:,:), pointer :: ar ! vector on which Hexc can be applied  a(npw,nbnd_v)
> +    INTEGER :: label!label to read/write to disk
> +end type exc_r
> +
> +type spectrum
> +!collection of eigenvalues and optical amplitudes 
> +    integer neig
> +    real(kind=dp), dimension(:), pointer:: en 
> +    real(kind=dp), dimension(:,:), pointer:: a
> +end type spectrum
> +
> +
> +type(exc), allocatable :: bse_spectrum(:)
> +!type(exc), dimension(:), pointer :: bse_spectrum
> +!this variable contains the excitonic eigenvectors and eigenvalues
> +!to be deleted probably for memory issues
> +
> +
> +
> +contains
> +
> +   SUBROUTINE initialize_exc(a)
> +   !this subroutine initializes exc
> +      implicit none
> +      TYPE(exc) :: a
> +      nullify(a%a)
> +!      nullify(a%e_ks)
> +      return
> +   END SUBROUTINE 
> +
> +   SUBROUTINE initialize_exc_r(a_r)
> +   !this subroutine initializes exc_r
> +      implicit none
> +      TYPE(exc_r) :: a_r
> +      nullify(a_r%ar)
> +!      nullify(a%e_ks)
> +      return
> +   END SUBROUTINE 
> +
> +   SUBROUTINE initialize_spectrum(s)
> +   !this subroutine initializes spectrum
> +      implicit none
> +      TYPE(spectrum) :: s
> +      nullify(s%en)
> +      nullify(s%a)
> +      return
> +   END SUBROUTINE 
> +
> +   SUBROUTINE free_memory_exc_a(a)
> +   !this subroutine deallocates exc
> +      implicit none
> +      TYPE(exc) a
> +      if(associated(a%a)) deallocate(a%a)
> +      nullify(a%a)
> +!      if(associated(a%e_ks)) deallocate(a%e_ks)
> +!      nullify(a%e_ks)
> +      return
> +   END SUBROUTINE
> +
> +   SUBROUTINE free_memory_exc_a_r(a_r)
> +   !this subroutine deallocates exc_r 
> +      implicit none
> +      TYPE(exc_r) a_r
> +      if(associated(a_r%ar)) deallocate(a_r%ar)
> +      nullify(a_r%ar)
> +      return
> +   END SUBROUTINE
> +
> +   SUBROUTINE free_memory_spectrum(s)
> +   !this subroutine deallocates exc
> +      implicit none
> +      TYPE(spectrum) s
> +      if(associated(s%en)) deallocate(s%en)
> +      nullify(s%en)
> +      if(associated(s%a)) deallocate(s%a)
> +      nullify(s%a)
> +      return
> +   END SUBROUTINE
> +
> +
> +    SUBROUTINE write_exc(a)
> +    !this subroutine writes the excitonic vectors on disk
> +    !the file name is taken from the label
> +!    USE io_files,             ONLY : find_free_unit, prefix
> +    USE io_files,             ONLY : prefix,tmp_dir
> +    USE mp_world,  ONLY : mpime
> +    implicit none
> +    INTEGER, EXTERNAL :: find_free_unit
> +    TYPE(exc) :: a!the exc wavefunction to be written
> +
> +    INTEGER :: iw, jw, iuna
> +    CHARACTER(5) :: nfile
> +    CHARACTER(5) :: nproc
> +
> +    if(a%label >= 0 ) then
> +      write(nfile,'(5i1)') &
> +        & a%label/10000,mod(a%label,10000)/1000,mod(a%label,1000)/100,mod(a%label,100)/10,mod(a%label,10)
> +      write(nproc,'(5i1)') &
> +        & mpime/10000,mod(mpime,10000)/1000,mod(mpime,1000)/100,mod(mpime,100)/10,mod(mpime,10)
> +      iuna = find_free_unit()
> +      open( unit=iuna, file=trim(tmp_dir)//trim(prefix)//'-exc_a.'// nfile //'.'// nproc , &
> +           &status='unknown',form='unformatted')
> +    else
> +      write(nfile,'(5i1)') &
> +        & -a%label/10000,mod(-a%label,10000)/1000,mod(-a%label,1000)/100,mod(-a%label,100)/10,mod(-a%label,10)
> +      write(nproc,'(5i1)') &
> +        & mpime/10000,mod(mpime,10000)/1000,mod(mpime,1000)/100,mod(mpime,100)/10,mod(mpime,10)
> +      iuna = find_free_unit()
> +      open( unit=iuna, file=trim(tmp_dir)//trim(prefix)//'-exc_a.-'// nfile //'.'// nproc , &
> +           &status='unknown',form='formatted')
> +    endif
> +    write(iuna) a%label
> +    write(iuna) a%npw
> +    write(iuna) a%numb_v
> +    write(iuna) a%e
> +    do iw=1,a%numb_v
> +       write(iuna)  a%a(1:a%npw,iw)
> +    enddo
> +    close(iuna)
> +
> +    return
> +    END SUBROUTINE
> +
> +    SUBROUTINE  read_exc(label, a,l_verbose)
> +    !this subroutine reads the excitonic vectors from disk
> +    !the file name is taken from the label
> +
> +
> +!    USE io_files,             ONLY : find_free_unit, prefix
> +    USE io_files,             ONLY :  prefix,tmp_dir
> +    USE io_global,            ONLY : stdout
> +    USE mp_world,  ONLY : mpime
> +    implicit none
> +    INTEGER, EXTERNAL :: find_free_unit
> +    TYPE(exc) :: a!the excitonic wave function to be read
> +    INTEGER :: label! the label identifing the required excitonic wavefunction
> +    LOGICAL, INTENT(in) :: l_verbose
> +    INTEGER :: iw, jw, iuna
> +    CHARACTER(5) :: nfile
> +    CHARACTER(5) :: nproc
> +
> +
> +
> +!first deallocate
> +    call free_memory_exc_a(a)
> +
> +
> +    if(label >= 0 ) then
> +      write(nfile,'(5i1)') label/10000,mod(label,10000)/1000,mod(label,1000)/100,mod(label,100)/10,mod(label,10)
> +      write(nproc,'(5i1)') mpime/10000,mod(mpime,10000)/1000,mod(mpime,1000)/100,mod(mpime,100)/10,mod(mpime,10)
> +      iuna = find_free_unit()
> +      open( unit=iuna, file=trim(tmp_dir)//trim(prefix)//'-exc_a.'//nfile//'.'//nproc, status='old',form='unformatted')
> +    else
> +      write(nfile,'(5i1)') -label/10000,mod(-label,10000)/1000,mod(-label,1000)/100,mod(-label,100)/10,mod(-label,10)
> +      write(nproc,'(5i1)') mpime/10000,mod(mpime,10000)/1000,mod(mpime,1000)/100,mod(mpime,100)/10,mod(mpime,10)
> +      iuna = find_free_unit()
> +      open( unit=iuna, file=trim(tmp_dir)//trim(prefix)//'-exc_a.-'//nfile//'.'//nproc, status='old',form='unformatted')
> +    endif
> +    read(iuna) a%label
> +    read(iuna) a%npw
> +    read(iuna) a%numb_v
> +    read(iuna) a%e
> +
> +!now allocate
> +    allocate(a%a(a%npw,a%numb_v))
> +    do iw=1,a%numb_v
> +       read(iuna)  a%a(1:a%npw,iw)
> +    enddo
> +    close(iuna)
> +
> +    return
> +    END SUBROUTINE
> +
> +    SUBROUTINE c_times_exc(a, c )
> +! this subroutine multiplies each iv line of the excitonic wave-function
> +! matrix (a%a) with the iv element of the real number vector (c).
> +
> +    USE kinds, ONLY : DP
> +
> +    implicit none
> +    type(exc) :: a
> +    real(kind=dp) :: c(a%numb_v)
> +
> +    integer iv
> +    
> +    call start_clock('c_times_exc')
> +
> +    do iv=1, a%numb_v
> +       a%a(1:a%npw,iv)=cmplx(c(iv),0.d0)*a%a(1:a%npw,iv)  
> +    enddo
> +
> +    call stop_clock('c_times_exc')
> +
> +    return
> +    END SUBROUTINE
> +
> +    SUBROUTINE random_exc(a)
> +! this subroutine returns a random excitonic-wavefunction vector (a%a) 
> +
> +    USE random_numbers, ONLY : randy
> +    USE kinds, ONLY : DP
> +    USE gvect,          ONLY : gstart
> +
> +    implicit none
> +    type(exc) ::a
> +
> +    real(kind=DP):: r1,r2
> +    integer      ::  iv,ig
> +
> +    do iv=1,a%numb_v
> +       do ig=1,a%npw
> +          r1=randy()
> +          r2=randy()
> +          a%a(ig,iv)=cmplx(r1,r2)
> +          if (gstart==2) a%a(1,iv)=cmplx(r1,0.d0)
> +       enddo
> +    enddo  
> +    return
> +    END SUBROUTINE
> +
> +
> +    SUBROUTINE pc_operator_exc(a,v,is)
> +! this subroutine projects the excitonic wave-function vector into the conduction states manifold    
> +    USE bse_basic_structures
> +    USE mp, ONLY : mp_sum
> +    USE mp_world,             ONLY : world_comm
> +    USE wvfct,    ONLY : npw,npwx
> +
> +    implicit none
> +    type(exc) :: a ! excitonic wfns vector to be projected on the conduction state manifold 
> +    type(v_state) :: v ! valence states vector in G space  
> +
> +    REAL(kind=DP), ALLOCATABLE :: prod(:)
> +    integer :: iv, iiv,is
> +
> +    call start_clock('pc_operator_exc')
> +
> +    allocate(prod(a%numb_v))
> +
> +    do iv=1,a%numb_v
> +       call dgemm('T','N', a%numb_v,1,2*a%npw,2.d0,v%wfn(:,:,is),2*npw,a%a(:,iv),2*a%npw,&
> +            & 0.d0,prod,a%numb_v)
> +        do iiv=1,a%numb_v
> +           if(v%gstart==2) prod(iiv)=prod(iiv)-dble(conjg(v%wfn(1,iiv,is))*a%a(1,iv))
> +        enddo
> +        call mp_sum(prod(:), world_comm )
> +        call dgemm('N','N',2*a%npw,1,a%numb_v,-1.d0,v%wfn(:,:,is),2*npw,prod,&
> +          &a%numb_v,1.d0,a%a(:,iv),2*a%npw)
> +    enddo
> +
> +    deallocate(prod)
> +
> +    call stop_clock('pc_operator_exc')
> +
> +    return
> +    END SUBROUTINE
> +
> +    
> +    SUBROUTINE poutcstate_exc(a_in,a_out,cstate,wcstate)
> +! this subroutine projects out from the excitonic vectors the component along the conduction states for which the 
> +! the QP are known, and it multiplies it by a weighted vector (used to avoid scissor)
> +    USE bse_basic_structures
> +!    USE qpe, ONLY : qpc
> +    USE mp, ONLY : mp_sum
> +    USE mp_world,             ONLY : world_comm
> +    USE wvfct,    ONLY : npw,npwx
> +   
> +
> +    implicit none
> +    type(exc), intent(in) :: a_in ! excitonic wfns vector to be projected on the conduction state manifold 
> +    type(exc), intent(out) :: a_out ! final
> +    
> +    type(c_state) :: cstate ! GW corrected conduction states vector in G space  
> +    type(c_state) :: wcstate ! weighted GW corrected conduction states vector in G space  
> +
> +    REAL(kind=DP), ALLOCATABLE :: prod(:)
> +    integer :: iv, ic
> +
> +    call start_clock('poutcstate_exc')
> +
> +    allocate(prod(cstate%numb_c))
> +
> +    do iv=1,a_in%numb_v
> +       call dgemm('T','N', cstate%numb_c,1,2*a_in%npw,2.d0,cstate%wfn(:,:),2*npw,a_in%a(:,iv),2*a_in%npw,&
> +            & 0.d0,prod,cstate%numb_c)
> +!       call dgemm('T','N', a%numb_v,1,2*a%npw,2.d0,v%wfn(:,:,is),2*npwx,a%a(:,iv),2*a%npw,&
> +!            & 0.d0,prod,a%numb_v)
> +        do ic=1,cstate%numb_c
> +           if(cstate%gstart==2) prod(ic)=prod(ic)-dble(conjg(cstate%wfn(1,ic))*a_in%a(1,iv))
> +        enddo
> +        call mp_sum(prod(:), world_comm )
> +        call dgemm('N','N',2*a_in%npw,1,cstate%numb_c,1.d0,wcstate%wfn(:,:),2*npw,prod,&
> +          &cstate%numb_c,0.d0,a_out%a(:,iv),2*a_in%npw)
> +    enddo
> +
> +    deallocate(prod)
> +
> +    call stop_clock('poutcstate_exc')
> +
> +    return
> +    END SUBROUTINE
> +
> +
> +    SUBROUTINE sproduct_exc(a1,a2,prod)
> +! this subroutine returns the scalar product (prod) between two
> +! excitonic-wavefunctions, input wave functions are given in G-space
> +    use io_global, ONLY : stdout, ionode 
> +    USE kinds, ONLY : DP
> +    USE mp,             ONLY : mp_sum, mp_barrier
> +    USE mp_world,             ONLY : world_comm
> +    use mp_world, ONLY : mpime
> +    USE gvect,          ONLY : gstart,ngm_g
> +
> +    implicit none
> +    REAL(kind=DP), EXTERNAL :: ddot
> +    type(exc) :: a1,a2
> +    real(kind=DP) :: prod
> +    integer :: ii
> +
> +    logical :: debug
> +    debug=.false.
> +
> +!compute the dot product
> +!    if(debug) then
> +!      write(*,*) 'sproduct_exc in, mpime=',mpime
> +!    endif
> +    call start_clock('sproduct_exc')
> +    prod=0.d0
> +    do ii=1,a1%numb_v
> +       prod=prod+2.d0*ddot(2*a1%npw,a1%a(:,ii),1,a2%a(:,ii),1)
> +       if (gstart==2) prod=prod-a1%a(1,ii)*a2%a(1,ii)
> +    enddo
> +! sum over processor
> +    call mp_sum(prod, world_comm)
> +
> +
> +
> +!    if(debug) then
> +!      write(*,*) 'sproduct_exc out, mpime=',mpime
> +!      write(*,*) 'sproduct_exc out, ngm_g= ',ngm_g
> +!    endif
> +
> +    call stop_clock('sproduct_exc')
> +    return
> +    END SUBROUTINE
> +
> +
> +    SUBROUTINE normalize_exc(a)
> +! this subroutine normalizes the excitonic wave function to 1   
> +    USE kinds, ONLY : DP
> +    use io_global, ONLY : stdout, ionode 
> +    use mp_world, ONLY : mpime
> +
> +
> +    implicit none
> +    type(exc) :: a
> +    real(kind=DP) :: prod
> +
> +    logical :: debug
> +    debug=.false. 
> +
> +    call start_clock('normalize_exc')
> +
> +    if(debug) then
> +      write(*,*) 'normalize_exc in, mpime=',mpime
> +    endif
> +
> +    call sproduct_exc(a,a,prod)
> +
> +    prod=1/sqrt(prod)
> +
> +    a%a(1:a%npw,1:a%numb_v)= a%a(1:a%npw,1:a%numb_v)*prod
> + 
> +    if(debug) then
> +    ! check normalization'
> +       call sproduct_exc(a,a,prod)
> +       if(ionode) write(stdout,*) 'normalize exc check, prod=',prod
> +    endif 
> +
> +    if(debug) then
> +      write(*,*) 'normalize_exc out, mpime=',mpime
> +    endif
> +    
> +    call stop_clock('normalize_exc')
> +
> +    return
> +    END SUBROUTINE
> +    
> +    SUBROUTINE pout_operator_exc(a,i_state)
> +!   this subroutine projects out a component |b> from an excitonic
> +!   wavevector |a> 
> +
> +    implicit none
> +    type(exc) :: a
> +    integer   :: i_state, i
> +    real(kind=DP), allocatable :: prod(:)
> +
> +    call start_clock('pout_operator_exc')
> +    allocate(prod(i_state-1))
> +    
> +    do i=1,(i_state-1) 
> +       call sproduct_exc(a,bse_spectrum(i),prod(i))
> +    enddo
> +    
> +    do i=1,(i_state-1)
> +       a%a(1:a%npw,1:a%numb_v)=a%a(1:a%npw,1:a%numb_v)-prod(i)*&
> +                      bse_spectrum(i)%a(1:bse_spectrum(i)%npw,1:bse_spectrum(i)%numb_v)
> +    enddo
> +
> +!    call normalize_exc(a)
> +    deallocate(prod)
> +    call stop_clock('pout_operator_exc')
> +    return
> +    END SUBROUTINE
> +
> +    SUBROUTINE fft_a_exc(ag,fc,ar)
> +!   this subroutine performs an FFT to real space of the excitonic wavefunction
> +!   vector using the dual grid
> +    USE kinds, ONLY : DP
> +    USE gvect,                 ONLY : ig_l2g
> +    USE fft_custom_gwl
> +    USE bse_wannier, ONLY : dual_bse 
> +    USE wvfct,    ONLY : npwx
> +    USE gvecw,              ONLY : gcutw, ecutwfc
> +    USE io_global, ONLY : stdout, ionode, ionode_id
> +    USE mp_world, ONLY : mpime, nproc
> +    USE mp_pools, ONLY : intra_pool_comm
> +!    USE mp_wave, ONLY : mergewf,splitwf
> +    USE wavefunctions_module, ONLY :  psic
> +
> +
> +    implicit none
> + 
> +    type(exc) ag
> +    type(exc_r) ar
> +    type(fft_cus) :: fc
> +
> +    COMPLEX(kind=DP), allocatable :: ag_t(:,:)
> +    COMPLEX(kind=DP), ALLOCATABLE :: evc_g(:)
> +
> +    integer :: ii
> +
> +! FFT the wannier function to r-space (dual grid)
> +    
> +   call start_clock('fft_a_exc')
> +
> +    allocate(ag_t(fc%npwt,ag%numb_v))
> +    
> +    ar%nrxxt=fc%nrxxt 
> +    ar%numb_v=ag%numb_v
> +    ar%label=ag%label
> +
> +    allocate(ar%ar(ar%nrxxt,ar%numb_v))
> +    
> +    allocate(evc_g(fc%ngmt_g )) 
> +
> +    if(fc%dual_t==4.d0) then
> +       ag_t(1:fc%npwt,1:ag%numb_v)= ag%a(1:fc%npwt,1:ag%numb_v)
> +    else
> +       call reorderwfp_col(ag%numb_v,ag%npw,fc%npwt,ag%a(1,1),ag_t(1,1),ag%npw ,fc%npwt, &
> +           & ig_l2g,fc%ig_l2gt,fc%ngmt_g,mpime, nproc,intra_pool_comm )
> +
> +!       do ii=1,ag%numb_v
> +!          call mergewf(ag%a(:,ii),evc_g,ag%npw,ig_l2g,mpime,nproc,ionode_id,intra_pool_comm)
> +!          call splitwf(ag_t(:,ii),evc_g,fc%npwt,fc%ig_l2gt,mpime,nproc,ionode_id,intra_pool_comm)
> +!       enddo
> +    endif
> +
> +    do ii=1,ag%numb_v,2
> +       psic(:)=(0.d0,0.d0)
> +       if (ii==ag%numb_v) then
> +          psic(fc%nlt(1:fc%npwt))  = ag_t(1:fc%npwt,ii)
> +          psic(fc%nltm(1:fc%npwt)) = CONJG( ag_t(1:fc%npwt,ii) )
> +       else
> +          psic(fc%nlt(1:fc%npwt))=ag_t(1:fc%npwt,ii)+(0.d0,1.d0)*ag_t(1:fc%npwt,ii+1)
> +          psic(fc%nltm(1:fc%npwt))=CONJG(ag_t(1:fc%npwt,ii))+(0.d0,1.d0)*CONJG(ag_t(1:fc%npwt,ii+1))
> +       endif
> +       CALL cft3t( fc, psic, fc%nr1t, fc%nr2t, fc%nr3t, fc%nrx1t, fc%nrx2t, fc%nrx3t, 2 )
> +       ar%ar(1:fc%nrxxt,ii)= DBLE(psic(1:fc%nrxxt))
> +       if(ii/=ag%numb_v) ar%ar(1:fc%nrxxt,ii+1)= DIMAG(psic(1:fc%nrxxt))
> +    enddo
> +
> +    deallocate(evc_g)
> +    call stop_clock('fft_a_exc')
> +    return
> +    END SUBROUTINE 
> +
> +    SUBROUTINE fftback_a_exc(ar,fc,ag)
> +!   this subroutine performs an FFT G space of the excitonic wavefunction
> +!   from the dual R-grid and reorders the wavefunction in the pw grid order 
> +    USE kinds, ONLY : DP
> +    USE gvect,                 ONLY : ig_l2g
> +    USE fft_custom_gwl
> +    USE bse_wannier, ONLY : dual_bse 
> +    USE wvfct,    ONLY : npwx
> +    USE gvecw,              ONLY : gcutw, ecutwfc
> +    USE io_global, ONLY : stdout, ionode, ionode_id
> +    USE mp_world, ONLY : mpime, nproc
> +    USE mp_pools, ONLY : intra_pool_comm
> +!    USE mp_wave, ONLY : mergewf,splitwf
> +    USE wavefunctions_module, ONLY :  psic
> +
> +
> +    implicit none
> + 
> +    type(exc) ag
> +    type(exc_r) ar
> +    type(fft_cus) :: fc
> +    integer :: ii,iv
> +    COMPLEX(kind=DP), ALLOCATABLE :: evc_g(:)
> +    COMPLEX(kind=DP), allocatable :: ag_t(:,:)
> +
> +
> +! FFT the wannier function to g-space ( from dual grid), and put back in the pw
> +! grid order 
> +
> +    call start_clock('fftback_a_exc')
> +    allocate(ag_t(fc%npwt,ar%numb_v))
> +    allocate(evc_g(fc%ngmt_g ))
> +
> +
> +    do iv=1, ag%numb_v,2
> +       if (iv==ag%numb_v) then 
> +          psic(1:fc%nrxxt)=dcmplx(ar%ar(1:ar%nrxxt,iv),0.d0)
> +       else
> +          psic(1:fc%nrxxt)=dcmplx(ar%ar(1:fc%nrxxt,iv),ar%ar(1:fc%nrxxt,iv+1))
> +       endif
> +       CALL cft3t(fc, psic, fc%nr1t, fc%nr2t, fc%nr3t, fc%nrx1t, fc%nrx2t, fc%nrx3t, -2 )
> +       if (iv==ag%numb_v) then 
> +          ag_t(1:fc%npwt,iv)=psic(fc%nlt(1:fc%npwt))
> +       else
> +          ag_t(1:fc%npwt,iv)=0.5d0*(psic(fc%nlt(1:fc%npwt))+conjg( psic(fc%nltm(1:fc%npwt))))
> +          ag_t(1:fc%npwt,iv+1)=(0.d0,-0.5d0)*(psic(fc%nlt(1:fc%npwt)) - conjg(psic(fc%nltm(1:fc%npwt))))
> +       endif
> +    enddo
> + 
> +    if(fc%dual_t==4.d0) then
> +       ag%a(1:fc%npwt,1:ag%numb_v)=ag_t(1:fc%npwt,1:ag%numb_v)
> +    else
> +        call reorderwfp_col(ag%numb_v,fc%npwt,ag%npw,ag_t(1,1),ag%a(1,1),fc%npwt,ag%npw, &
> +           & fc%ig_l2gt,ig_l2g,fc%ngmt_g,mpime, nproc,intra_pool_comm )
> +
> +!       do iv=1, ag%numb_v
> +!          call mergewf(ag_t(:,iv),evc_g,fc%npwt,fc%ig_l2gt,mpime,nproc,ionode_id,intra_pool_comm)
> +!          call splitwf(ag%a(:,iv),evc_g,ag%npw,ig_l2g,mpime,nproc,ionode_id,intra_pool_comm)
> +!       enddo
> +    endif 
> +
> +    deallocate(evc_g)
> +    call stop_clock('fftback_a_exc')
> +    return
> +  END SUBROUTINE fftback_a_exc
> +
> +    SUBROUTINE urot_a(a_in,a_out,itrasp)
> +    USE wvfct,     ONLY : nbnd,npwx
> +    USE bse_basic_structures, ONLY : u_trans
> +    USE lsda_mod,             ONLY : nspin
> +    USE io_global, ONLY : stdout 
> +
> +
> +
> +    implicit none 
> +    type(exc):: a_in
> +    type(exc):: a_out
> +    integer  :: itrasp ! if 1 takes U^T
> +    REAL(kind=DP), ALLOCATABLE :: tmp_rot(:,:)
> +    logical debug
> +    integer :: ii
> +
> +
> +    call start_clock('urot_a')
> +
> +    debug=.false.
> +    allocate(u_trans(nbnd,nbnd,nspin))
> +    call read_wannier_matrix
> +
> +
> +    allocate(tmp_rot(a_in%numb_v,a_in%numb_v))
> +    tmp_rot(1:a_in%numb_v,1:a_in%numb_v)=dble(u_trans(1:a_in%numb_v,1:a_in%numb_v,1))
> +
> +!DEBUG 
> +!    tmp_rot=0.d0
> +!    do ii=1, a_in%numb_v
> +!       tmp_rot(ii,ii)=1.d0
> +!    enddo
> +!fine DEBUG
> +
> +   
> +    if (itrasp==0) call rotate_wannier_gamma_bse(tmp_rot,a_in,a_out,1,0)
> +    if (itrasp==1) call rotate_wannier_gamma_bse(tmp_rot,a_in,a_out,1,1)
> +
> +    deallocate(u_trans)
> +    deallocate(tmp_rot)
> +
> +    call stop_clock('urot_a')
> +    return
> +    END SUBROUTINE 
> +     
> +
> +end module  exciton
> 
> Added: trunk/espresso/GWW/bse/find_eig.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/find_eig.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/find_eig.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,197 @@
> +subroutine find_eig(vstate,vstate_r,cstate,wcstate,fc)
> +!this subroutine finds the lowest n_eig eigenvectors and eigenvalues 
> +!through the  conjugate gradient or steepest descent minimization scheme
> +!for each eigenvector it computes the optical amplitude and eventually the
> +!excitonic wavefunction
> +
> +USE exciton
> +USE io_global, ONLY : stdout,ionode
> +
> +USE bse_basic_structures
> +USE fft_custom_gwl
> +USE wvfct,    ONLY : npw,npwx,nbnd
> +USE bse_wannier, ONLY:num_nbndv,eps,lambda,maxit,n_eig,l_cgrad,l_plotexc,&
> +                      plotn_min,plotn_max,l_plotaverage,l_restart,l_verbose,&
> +                      n_eig_start,l_finite,l_contraction,l_gtrick
> +USE mp,          ONLY : mp_barrier
> +USE mp_world,             ONLY : world_comm
> +USE constants,   ONLY : RYTOEV
> +USE contract_w
> +USE lsda_mod, ONLY :nspin
> +!USE eqv,         ONLY : dpsi, dvpsi, eprec
> +
> +
> +
> +implicit none
> +!type(exc), allocatable :: bse_spectrum(:)
> +type(v_state) :: vstate
> +type(v_state_r) :: vstate_r
> +type(c_state) :: cstate
> +type(c_state) :: wcstate
> +type(fft_cus) :: fc
> +
> +type(spectrum) :: bse_sp
> +
> +type(v_state) :: psibar(3) ! formula (43) of Rev. Mod. Phys. 73, 515 
> +                           ! for the three polarization directions computed through DFPT 
> +integer :: i,nstart,ipol,ispin
> +complex(DP), allocatable:: dvpsi(:,:)
> +
> +real(DP) :: EVTORY
> +
> +
> +
> +EVTORY=1.d0/RYTOEV
> +
> +call start_clock('find_eig')
> +
> +write(stdout,*) 'Routine find_eig'
> +FLUSH(stdout)
> +
> +call initialize_spectrum(bse_sp)
> +
> +if(l_contraction) then
> +   write(stdout,*) 'CALL contract_w_build'
> +   FLUSH(stdout)
> +   call contract_w_build(fc)
> +endif
> +bse_sp%neig=n_eig
> +allocate(bse_sp%en(bse_sp%neig))
> +allocate(bse_sp%a(bse_sp%neig,3))
> +
> +allocate(bse_spectrum(n_eig))
> +
> +if(l_contraction) then
> +! read  iimat
> +   call initialize_imat(iimat_contract)
> +   do ispin=1,nspin
> +      call read_iimat(iimat_contract,ispin)
> +   enddo
> +   write(stdout,*) 'CALL contract_v_build'
> +   call FLUSH(stdout)
> +   call contract_v_build(fc)
> +endif
> +
> +write(stdout,*) 'FIND_EIG 1'
> +call FLUSH(stdout)
> +
> +do i=1,n_eig
> +   call initialize_exc(bse_spectrum(i))
> +   bse_spectrum(i)%npw=npw
> +   bse_spectrum(i)%numb_v=num_nbndv(1)
> +   allocate(bse_spectrum(i)%a(bse_spectrum(i)%npw,bse_spectrum(i)%numb_v)) 
> +   bse_spectrum(i)%label=i
> +enddo
> +
> +
> +write(stdout,*) 'FIND_EIG 2'
> +call FLUSH(stdout)
> +
> +if(l_restart==1)then
> +   nstart=n_eig_start
> +   do i=1,n_eig_start-1
> +      call read_exc(i, bse_spectrum(i),l_verbose)
> +      bse_sp%en(i)=bse_spectrum(i)%e*EVTORY
> +   enddo
> +else
> +   nstart=1
> +endif
> +
> +write(stdout,*) 'FIND_EIG 3'
> +call FLUSH(stdout)
> +
> +!compute the eigenfunction and eigenvalues
> +if(l_restart<2) then
> +   do i=nstart,n_eig
> +      if(l_cgrad) then
> +         call conjgrad(i,vstate,vstate_r,cstate,wcstate,fc,bse_sp%en(i)) 
> +      else
> +         call sdescent(i,vstate,vstate_r,cstate,wcstate,fc,bse_sp%en(i))
> +      endif
> +      call write_exc(bse_spectrum(i))
> +   enddo
> +else if(l_restart==2) then
> +   do i=1,n_eig
> +      call read_exc(i, bse_spectrum(i),l_verbose)
> +      bse_sp%en(i)=bse_spectrum(i)%e*EVTORY
> +   enddo
> +endif
> +
> +
> +call mp_barrier(world_comm)
> +
> +if(l_gtrick)  call v_wfng_to_wfnr(vstate,fc,vstate_r)
> +
> +!compute the optical amplitudes 
> +
> +!compute the  |psibar(iv)>
> +if(.not.l_finite) then
> +   allocate (dvpsi ( npwx , num_nbndv(1)))
> +   do ipol=1,3 
> +      call initialize_v_state(psibar(ipol))
> +      psibar(ipol)%nspin= vstate%nspin
> +      psibar(ipol)%numb_v(:)=vstate%numb_v(:)
> +      psibar(ipol)%npw=npw
> +      psibar(ipol)%gstart=vstate%gstart
> +      
> +      allocate( psibar(ipol)%wfn(psibar(ipol)%npw,psibar(ipol)%numb_v(1),psibar(ipol)%nspin))
> +     
> +      call dvpsi_e (1, ipol,dvpsi(1,1))
> +      do i=1,num_nbndv(1)
> +         psibar(ipol)%wfn(1:npw,i,1)&
> +             & = dvpsi(1:npw,i)
> +      enddo
> +   enddo
> +   deallocate (dvpsi)
> +endif
> +call mp_barrier(world_comm)
> +
> +do ipol=1,3
> +   do i=1,n_eig
> +      call absorption(vstate_r,psibar(ipol)%wfn(1,1,1),fc,i,bse_sp%a(i,ipol),ipol)
> +!      if(ionode) write(stdout,*)'Eigv#',i,'E',bse_spectrum(i)%e, 'Amp',bse_sp%a(i)  
> +!      if(ionode) write(stdout,*)'Eigv#',i,'E',bse_sp%en(i), 'Amp',bse_sp%a(i)  
> +   enddo
> +enddo
> +call mp_barrier(world_comm)
> +
> +!build up the spectrum
> +do ipol=1,3
> +   call build_spectrum(bse_sp%a(1,ipol),bse_sp%en(1),ipol)
> +enddo
> +call mp_barrier(world_comm)
> +
> +!plot the excitonic wfn
> +if(l_plotexc) then
> +   if(l_plotaverage) then
> +      call plot_excwfn(plotn_min,plotn_max,vstate_r,fc)
> +   else
> +      do i=plotn_min,plotn_max
> +         call plot_excwfn(i,i,vstate_r,fc)
> +      enddo 
> +   endif
> +endif
> +
> +
> +do i=1,n_eig
> +   call free_memory_exc_a(bse_spectrum(i))
> +enddo
> +
> +deallocate(bse_spectrum)
> +
> +call free_memory_spectrum(bse_sp)
> +do ipol=1,3
> +   call free_v_state(psibar(ipol))
> +enddo
> +
> +if(l_contraction) then
> +   call free_memory_contrac_w
> +   call free_imat(iimat_contract)
> +endif
> +
> +call stop_clock('find_eig')
> +return
> +
> +
> +
> +end subroutine
> 
> Added: trunk/espresso/GWW/bse/h_h.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/h_h.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/h_h.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,69 @@
> +!
> +! Copyright (C) 2003-2007 Quantum ESPRESSO group
> +! This file is distributed under the terms of the
> +! GNU General Public License. See the file `License'
> +! in the root directory of the present distribution,
> +! or http://www.gnu.org/copyleft/gpl.txt .
> +!
> +!
> +!-----------------------------------------------------------------------
> +SUBROUTINE H_h(e,h,Ah)
> +  !-----------------------------------------------------------------------
> +  !
> +  USE kinds, ONLY: DP
> +!  USE wvfct, ONLY: nbnd, npwx, npw, g2kin, igk
> +  USE wvfct, ONLY: npwx, npw, g2kin, igk
> +  USE gvect, ONLY : gstart
> +  USE uspp,     ONLY : vkb, nkb
> +  USE lsda_mod, ONLY : current_spin
> +  USE scf,      ONLY : vrs
> +  USE becmod, ONLY: bec_type, becp, calbec
> +!  USE cgcom
> +  USE electrons_base, ONLY: nel
> +  use bse_wannier, ONLY:num_nbndv
> +  !
> +  IMPLICIT NONE
> +  !
> +  real(DP):: e(num_nbndv(1))
> +  COMPLEX(DP):: h(npwx,num_nbndv(1)), Ah(npwx,num_nbndv(1))
> +!  real(DP), allocatable     :: e(:)
> +!  COMPLEX(DP), allocatable  :: h(:,:), Ah(:,:)
> +  !
> +  INTEGER:: j,ibnd,nbnd
> +  !
> +  CALL start_clock('h_h')
> +
> +  ! valid only for non-spin resolved calculations
> +  nbnd=num_nbndv(1)
> +
> +  !
> +  ! [(k+G)^2 - e ]psi
> +  DO ibnd = 1,nbnd
> +     ! set to zero the imaginary part of h at G=0
> +     !  needed for numerical stability
> +     IF (gstart==2) h(1,ibnd) = cmplx( dble(h(1,ibnd)),0.d0,kind=DP)
> +     DO j = 1,npw
> +        ah(j,ibnd) = (g2kin(j)-e(ibnd)) * h(j,ibnd)
> +     ENDDO
> +  ENDDO
> +  ! V_Loc psi
> +  CALL vloc_psi_gamma(npwx, npw, nbnd, h, vrs(1,current_spin), ah)
> +  ! V_NL psi
> +   CALL calbec  ( npw, vkb, h, becp, nbnd )
> +  IF (nkb > 0) CALL add_vuspsi (npwx, npw, nbnd, ah)
> +  ! set to zero the imaginary part of ah at G=0
> +  !  needed for numerical stability
> +  IF (gstart==2) THEN
> +     DO ibnd = 1, nbnd
> +        ah(1,ibnd) = cmplx( dble(ah(1,ibnd)),0.d0,kind=DP)
> +     ENDDO
> +  ENDIF
> +
> +!  DEALLOCATE(h)
> +!  DEALLOCATE(Ah)
> +!  DEALLOCATE(e)
> +  !
> +  CALL stop_clock('h_h')
> +  !
> +  RETURN
> +END SUBROUTINE H_h
> 
> Added: trunk/espresso/GWW/bse/lanczos.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/lanczos.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/lanczos.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,426 @@
> +subroutine lanczos(vstate,vstate_r,cstate,wcstate,fc)
> +!this subroutine computes the absorption spectrum through the lanczos procedure
> +USE exciton
> +
> +USE bse_basic_structures
> +USE fft_custom_gwl
> +USE bse_wannier, ONLY: nit_lcz, l_contraction
> +USE contract_w
> +USE lsda_mod, ONLY :nspin
> +USE io_global, ONLY : stdout
> +
> +implicit none
> +type(v_state),   intent(in) :: vstate
> +type(v_state_r), intent(in) :: vstate_r
> +type(c_state),   intent(in) :: cstate
> +type(c_state),   intent(in) :: wcstate
> +type(fft_cus),   intent(in) :: fc
> +
> +
> +real(kind=DP), allocatable :: a(:,:),b(:,:)
> +integer :: ispin
> +
> +call start_clock('lanczos')
> +allocate (a(nit_lcz,3))
> +allocate (b(nit_lcz,3))
> +if(l_contraction) then
> +   write(stdout,*) 'CALL contract_w_build'
> +   FLUSH(stdout)
> +   call contract_w_build(fc)
> +   call initialize_imat(iimat_contract)
> +   do ispin=1,nspin
> +      call read_iimat(iimat_contract,ispin)
> +   enddo
> +   write(stdout,*) 'CALL contract_v_build'
> +   FLUSH(stdout)
> +   call contract_v_build(fc)
> +
> +endif
> +
> +
> +!perform lanczos iterations
> +
> +call lanczos_iterations(vstate,vstate_r,cstate,wcstate,fc,a(1,1),b(1,1))
> +
> +!build the continuum fraction
> +
> +call lanczos_cf(a(1,1),b(1,1))
> +
> +if(l_contraction) then
> +   call free_memory_contrac_w
> +   call free_imat(iimat_contract)
> +endif
> +
> +deallocate(a)
> +deallocate(b)
> +
> +call stop_clock('lanczos')
> +return
> +end subroutine
> +
> +subroutine lanczos_iterations(vstate,vstate_r,cstate,wcstate,fc,a,b)
> +!this subroutine computes the lanczos iteration to get the a(i) and b(i) for the
> +!continued fraction
> +
> +USE exciton
> +
> +USE bse_basic_structures
> +USE fft_custom_gwl
> +USE wvfct,       ONLY : npw,npwx,nbnd
> +USE bse_wannier, ONLY: num_nbndv, nit_lcz,l_restart_lcz, nlcz_restart
> +USE mp,          ONLY : mp_barrier,mp_bcast
> +USE mp_world,    ONLY : world_comm,mpime
> +USE io_global,   ONLY : ionode,ionode_id
> +use io_files,    ONLY : tmp_dir, prefix
> +
> +
> +implicit none
> +INTEGER, EXTERNAL :: find_free_unit
> +type(v_state),   intent(in) :: vstate
> +type(v_state_r), intent(in) :: vstate_r
> +type(c_state),   intent(in) :: cstate
> +type(c_state),   intent(in) :: wcstate
> +type(fft_cus),   intent(in) :: fc
> +
> +integer :: i,ipol,j,is,iunab,iuni,nstart
> +complex(DP), allocatable:: dvpsi(:,:) !formula (43) of Rev. Mod. Phys. 73, 515 
> +
> +
> +type(exc) im1_s ! |i-1> 
> +type(exc) i_s ! |i>
> +type(exc) ip1_s ! |i+1>
> +
> +real(kind=DP) :: bi,bim1
> +real(kind=DP), intent(inout) :: a(nit_lcz,3),b(nit_lcz,3)
> +CHARACTER(5) :: nproc
> +CHARACTER(5) :: nfile
> +
> +logical :: debug
> +
> +call start_clock('lanczos_iterations')
> +debug=.true.
> +
> +call initialize_exc(im1_s)
> +im1_s%label=1
> +im1_s%npw=npw
> +im1_s%numb_v=num_nbndv(1)
> +allocate(im1_s%a(im1_s%npw,im1_s%numb_v))
> +
> +call initialize_exc(i_s)
> +i_s%label=1
> +i_s%npw=npw
> +i_s%numb_v=num_nbndv(1)
> +allocate(i_s%a(i_s%npw,i_s%numb_v))
> +
> +call initialize_exc(ip1_s)
> +ip1_s%label=1
> +ip1_s%npw=npw
> +ip1_s%numb_v=num_nbndv(1)
> +allocate(ip1_s%a(ip1_s%npw,ip1_s%numb_v))
> +
> +allocate (dvpsi ( npwx , num_nbndv(1)))
> +
> +if(l_restart_lcz) then
> +   if(ionode) then
> +      if(debug) write(*,*) 'Restarting lanczos'
> +      iunab = find_free_unit()
> +      open(unit=iunab, file=trim(tmp_dir)//trim(prefix)//'.lczrestart_ab.dat', status='unknown', form='unformatted')
> +      read(iunab) a(1:nlcz_restart,1), a(1:nlcz_restart,2), a(1:nlcz_restart,3)
> +      read(iunab) b(1:nlcz_restart,1), b(1:nlcz_restart,2), b(1:nlcz_restart,3)
> +      close(iunab)
> +   endif
> +   call mp_bcast(a(1:(nlcz_restart),1:3),ionode_id, world_comm)
> +   call mp_bcast(b(1:(nlcz_restart),1:3),ionode_id, world_comm)
> +endif
> +
> +do ipol=1,3 
> +
> +   if(.not.l_restart_lcz) then
> +!     compute the  |psibar(iv)> and set it as initial excitonic state |i-1> for 
> +!     the lanczos procedure
> +      if(debug) write(*,*) 'before dvpsi'
> +      call dvpsi_e (1, ipol,dvpsi(1,1))
> +      if(debug) write(*,*) 'after dvpsi'
> +      do i=1,num_nbndv(1)
> +         im1_s%a(1:npw,i)= dvpsi(1:npw,i)
> +      enddo
> +
> +      call normalize_exc(im1_s)
> +      if(debug) write(*,*) 'after normalize_exc'
> + 
> +!     apply the exc Hamiltonian  
> +      call exc_h_a(im1_s,i_s,vstate,vstate_r,cstate,wcstate,fc) 
> +      if(debug) write(*,*) 'after exc_h_a'
> +
> +!     a(1)= <1|H|1>
> +      call sproduct_exc(im1_s,i_s,a(1,ipol))
> +
> +      do i=1,num_nbndv(1)
> +         i_s%a(1:npw,i)=i_s%a(1:npw,i)-dcmplx(a(1,ipol),0.d0)*im1_s%a(1:npw,i)
> +      enddo
> +
> +!     b(1)=bim1=|H|1>-a(1)|1>|
> +      call sproduct_exc(i_s,i_s,bim1)
> +      b(1,ipol)=sqrt(bim1)
> +
> +!     project into the conduction manifold
> +      do is = 1,vstate%nspin
> +         call pc_operator_exc(i_s,vstate,is)
> +      enddo
> + 
> +!     and normalize 
> +      call normalize_exc(i_s)
> +
> +
> +!     apply the exc Hamiltonian  
> +      call exc_h_a(i_s,ip1_s,vstate,vstate_r,cstate,wcstate,fc) 
> +   
> +!     a(2)= <2|H|2>
> +      call sproduct_exc(i_s,ip1_s,a(2,ipol))
> +
> +      do i=1,num_nbndv(1)
> +         ip1_s%a(1:npw,i)=ip1_s%a(1:npw,i)-dcmplx(a(2,ipol),0.d0)*i_s%a(1:npw,i)-dcmplx(b(1,ipol),0.d0)*im1_s%a(1:npw,i)
> +      enddo
> +
> +!    b(2)=bi=|H|2>-a(2)|2>-b(1)|1>|
> +      call sproduct_exc(ip1_s,ip1_s,bi)
> +      b(2,ipol)=sqrt(bi)
> +
> +!     project into the conduction manifold
> +      do is = 1,vstate%nspin
> +         call pc_operator_exc(ip1_s,vstate,is)
> +      enddo
> + 
> +!     and normalize 
> +      call normalize_exc(ip1_s)
> +      nstart=3
> +    else
> +!   read starting excitonic vector
> +      nstart=nlcz_restart+1
> +      iuni = find_free_unit()
> +      write(nproc,'(5i1)') &
> +              & mpime/10000,mod(mpime,10000)/1000,mod(mpime,1000)/100,mod(mpime,100)/10,mod(mpime,10)
> +      write(nfile,'(5i1)') &
> +           & ipol/10000,mod(ipol,10000)/1000,mod(ipol,1000)/100,mod(ipol,100)/10,mod(ipol,10)
> +      open( unit=iuni, file=trim(tmp_dir)//trim(prefix)//'.lcz_is.'// nfile //'.'// nproc , status='unknown',form='unformatted')
> +      read(iuni) i_s%label
> +      read(iuni) i_s%npw
> +      read(iuni) i_s%numb_v
> +      read(iuni) i_s%e
> +      do j=1,i_s%numb_v
> +         read(iuni)  i_s%a(1:i_s%npw,j)
> +      enddo
> +      close(iuni)
> +
> +      iuni = find_free_unit()
> +      write(nproc,'(5i1)') &
> +              & mpime/10000,mod(mpime,10000)/1000,mod(mpime,1000)/100,mod(mpime,100)/10,mod(mpime,10)
> +      write(nfile,'(5i1)') &
> +           & ipol/10000,mod(ipol,10000)/1000,mod(ipol,1000)/100,mod(ipol,100)/10,mod(ipol,10)
> +      open( unit=iuni, file=trim(tmp_dir)//trim(prefix)//'.lcz_ip1s.'// nfile //'.'// nproc , status='unknown',form='unformatted')
> +      read(iuni) ip1_s%label
> +      read(iuni) ip1_s%npw
> +      read(iuni) ip1_s%numb_v
> +      read(iuni) ip1_s%e
> +      do j=1,ip1_s%numb_v
> +         read(iuni)  ip1_s%a(1:ip1_s%npw,j)
> +      enddo
> +      close(iuni)
> +    endif
> +
> +!   Now start lanczos iteration
> +
> +    do j=nstart,nit_lcz
> +
> +      if(ionode.and.(mod(j,10)==0)) write(*,*) 'lanczos iteration #', j
> +      do i=1,num_nbndv(1)
> +         im1_s%a(1:npw,i)=i_s%a(1:npw,i) ! |j-1>=|j>
> +      enddo
> +
> +      do i=1,num_nbndv(1)
> +         i_s%a(1:npw,i)=ip1_s%a(1:npw,i) ! |j>=|j+1>
> +      enddo
> +     
> +!     apply the exc Hamiltonian  
> +      call exc_h_a(i_s,ip1_s,vstate,vstate_r,cstate,wcstate,fc) 
> +   
> +!     a(j)= <j|H|j>
> +      call sproduct_exc(i_s,ip1_s,a(j,ipol))
> +
> +      do i=1,num_nbndv(1)
> +         ip1_s%a(1:npw,i)=ip1_s%a(1:npw,i)-dcmplx(a(j,ipol),0.d0)*i_s%a(1:npw,i)&
> +                          -dcmplx(b(j-1,ipol),0.d0)*im1_s%a(1:npw,i)
> +      enddo
> +
> +!     b(j)=|H|j>-a(j)|j>-b(j-1)|j-1>|
> +      call sproduct_exc(ip1_s,ip1_s,bi)
> +      b(j,ipol)=sqrt(bi)
> +
> +!     project into the conduction manifold
> +      do is = 1,vstate%nspin
> +         call pc_operator_exc(ip1_s,vstate,is)
> +      enddo
> + 
> +!     and normalize 
> +      call normalize_exc(ip1_s)
> +     
> +      call mp_barrier(world_comm)
> +    enddo ! end of lanczos iterations
> +
> +!    write restart information on file
> +
> +   iuni = find_free_unit()
> +   write(nproc,'(5i1)') &
> +           & mpime/10000,mod(mpime,10000)/1000,mod(mpime,1000)/100,mod(mpime,100)/10,mod(mpime,10)
> +   write(nfile,'(5i1)') &
> +        & ipol/10000,mod(ipol,10000)/1000,mod(ipol,1000)/100,mod(ipol,100)/10,mod(ipol,10)
> +   open( unit=iuni, file=trim(tmp_dir)//trim(prefix)//'.lcz_is.'// nfile //'.'// nproc , status='unknown',form='unformatted')
> +   write(iuni) i_s%label
> +   write(iuni) i_s%npw
> +   write(iuni) i_s%numb_v
> +   write(iuni) i_s%e
> +   do j=1,i_s%numb_v
> +      write(iuni)  i_s%a(1:i_s%npw,j)
> +   enddo
> +   close(iuni)
> +
> +   iuni = find_free_unit()
> +   write(nproc,'(5i1)') &
> +           & mpime/10000,mod(mpime,10000)/1000,mod(mpime,1000)/100,mod(mpime,100)/10,mod(mpime,10)
> +   write(nfile,'(5i1)') &
> +        & ipol/10000,mod(ipol,10000)/1000,mod(ipol,1000)/100,mod(ipol,100)/10,mod(ipol,10)
> +   open( unit=iuni, file=trim(tmp_dir)//trim(prefix)//'.lcz_ip1s.'// nfile //'.'// nproc , status='unknown',form='unformatted')
> +   write(iuni) ip1_s%label
> +   write(iuni) ip1_s%npw
> +   write(iuni) ip1_s%numb_v
> +   write(iuni) ip1_s%e
> +   do j=1,ip1_s%numb_v
> +      write(iuni)  ip1_s%a(1:ip1_s%npw,j)
> +   enddo
> +   close(iuni)
> +
> +enddo !ipol
> +
> +!    write restart information on file
> +if(ionode) then
> +      iunab = find_free_unit()
> +      open(unit=iunab, file=trim(tmp_dir)//trim(prefix)//'.lczrestart_ab.dat', status='unknown', form='unformatted')
> +      write(iunab) a(1:nit_lcz,1), a(1:nit_lcz,2), a(1:nit_lcz,3)
> +      write(iunab) b(1:nit_lcz,1), b(1:nit_lcz,2), b(1:nit_lcz,3)
> +      close(iunab)
> +endif
> +
> +if(debug) then
> +   if(ionode) then
> +     do ipol=1,3
> +       do j=1,nit_lcz
> +         write(*,*) 'ipol, it, a', ipol, j, a(j,ipol)
> +       enddo
> +     enddo
> +     do ipol=1,3
> +       do j=1,nit_lcz
> +         write(*,*) 'ipol, it, b', ipol, j, b(j,ipol)
> +       enddo
> +     enddo
> +    endif
> +endif 
> +
> +!free memory
> +deallocate (dvpsi)
> +call free_memory_exc_a(im1_s)
> +call free_memory_exc_a(i_s)
> +call free_memory_exc_a(ip1_s)
> +
> +call stop_clock('lanczos_iterations')
> +return
> +end subroutine
> +
> +
> +subroutine lanczos_cf(a,b)
> +
> +USE bse_wannier, ONLY: nit_lcz, spectra_e_min,spectra_e_max,spectra_nstep
> +USE constants,   ONLY : RYTOEV, PI
> +USE kinds,       ONLY: DP
> +USE io_global, ONLY : ionode
> +
> +implicit none
> +real(DP), intent(in) :: a(nit_lcz,3),b(nit_lcz,3)
> +
> +complex(DP), allocatable      :: comega_g(:)
> +complex(DP), allocatable      :: den(:)
> +real(DP), allocatable         :: abss(:,:)    ! epsilon2
> +real(DP), allocatable         :: rp_abss(:,:) ! epsilon1
> +real(DP)                      :: eta,step,e_start
> +
> +integer :: ipol,j,i
> +logical :: debug, im
> +
> +call start_clock('lanczos_cf')
> +debug=.false.
> +eta=0.001d0
> +
> +allocate(comega_g(spectra_nstep))
> +allocate(den(spectra_nstep))
> +allocate(abss(spectra_nstep,3))
> +allocate(rp_abss(spectra_nstep,3))
> +
> +!build the energy grid (including a small imaginary part eta)
> +step=(spectra_e_max-spectra_e_min)/(dble(spectra_nstep-1)*RYTOEV)
> +e_start=spectra_e_min/RYTOEV
> +
> +do i=0, spectra_nstep-1
> +   comega_g(i+1)=dcmplx((e_start+dble(i)*step),eta)
> +enddo
> +
> +do ipol=1,3
> +!  build the continued fraction
> +   
> +   den(1:spectra_nstep)=comega_g(1:spectra_nstep)-dcmplx(a(nit_lcz,ipol),0.d0)
> +
> +   if((debug).and.(ionode)) then
> +            write(*,*) 'ipol, den'
> +         do i=1,spectra_nstep
> +            write(*,'(I1,I6,2F8.4)') ipol, i, real(den(i)), aimag(den(i))
> +      enddo
> +   endif 
> +
> +   do j=nit_lcz-1,1,-1
> +      den(1:spectra_nstep)= comega_g(1:spectra_nstep)&
> +                            -dcmplx(a(j,ipol),0.d0)&
> +                            -dcmplx(b(j,ipol)**2.d0,0.d0)/den(1:spectra_nstep)
> +
> +      if((debug).and.(ionode)) then
> +            write(*,*) 'ipol, den'
> +         do i=1,spectra_nstep
> +            write(*,'(I1,I6,2F8.4)') ipol, i, real(den(i)), aimag(den(i))
> +         enddo
> +      endif 
> +
> +   enddo
> +
> +   abss(1:spectra_nstep,ipol)=-4*PI*aimag(dcmplx(1.d0,0.d0)/den(1:spectra_nstep))
> +   rp_abss(1:spectra_nstep,ipol)=1.d0-4*PI*real(dcmplx(1.d0,0.d0)/den(1:spectra_nstep))
> +
> +   if((debug).and.(ionode)) then
> +     write(*,*) 'ABSORPTION ipol', ipol
> +     do i=1,spectra_nstep
> +        write(*,'(I6,F8.4,F12.4)') i, real(comega_g(i))*RYTOEV, abss(i,ipol)
> +     enddo
> +   endif 
> +
> +
> +enddo !ipol
> +im=.true.
> +call print_spectrum(abss,im)
> +im=.false.
> +call print_spectrum(rp_abss,im)
> +
> +
> +deallocate(comega_g)
> +deallocate(den)
> +deallocate(abss)
> +
> +call stop_clock('lanczos_cf')
> +return
> +end subroutine
> +
> 
> Added: trunk/espresso/GWW/bse/openfil_bse.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/openfil_bse.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/openfil_bse.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,39 @@
> +SUBROUTINE openfil_bse()
> +!
> +! ... This routine opens all files needed to the self consistent run,
> +! ... sets various file names, units, record lengths
> +  USE wvfct,          ONLY : nbnd, npwx
> +  use control_flags,  ONLY:  twfcollect
> +  USE io_files,       ONLY : prefix, iunwfc, nwordwfc,nwordatwfc, diropn
> +  USE noncollin_module, ONLY : npol
> +  USE basis,            ONLY : natomwfc
> +  USE ions_base,        ONLY : nat, ityp
> +  USE noncollin_module,   ONLY : noncolin
> +  USE uspp_param,         ONLY : n_atom_wfc
> +
> +
> +
> +  IMPLICIT NONE
> +  !
> +  LOGICAL       :: exst
> +  !
> +  !
> +  twfcollect=.false.
> +  !
> +  ! ... nwordwfc is the record length for the direct-access file
> +  ! ... containing wavefunctions
> +  !
> +  nwordwfc = nbnd * npwx * npol
> +  !
> +  CALL diropn( iunwfc, 'wfc', 2*nwordwfc, exst )
> +  !
> +  IF ( .NOT. exst ) THEN
> +     call errore ('openfil_pw4gww','file '//TRIM( prefix )//'.wfc'//' not found',1)     
> +  END IF
> +  natomwfc = n_atom_wfc( nat, ityp, noncolin )
> +  nwordatwfc = 2*npwx*natomwfc*npol
> +
> +  RETURN
> +  !
> +END SUBROUTINE openfil_bse
> +
> 
> Added: trunk/espresso/GWW/bse/plot_excwfn.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/plot_excwfn.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/plot_excwfn.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,182 @@
> +subroutine plot_excwfn(nstart,nend,vstate_r,fc)
> +! this subroutine computes and writes on file the nplot-th excitonic wavefunction
> +! to be read by pp.x
> +! note that this subroutine is working only for gamma only calculations 
> +
> +USE exciton
> +USE bse_wannier, ONLY:num_nbndv,&
> +                      r_hole,l_plotaverage
> +use bse_basic_structures
> +USE pwcom
> +USE fft_custom_gwl
> +USE io_global, ONLY : stdout,ionode,ionode_id
> +USE io_files, ONLY : tmp_dir,prefix
> +USE mp_world, ONLY : mpime, nproc
> +USE mp, ONLY: mp_sum
> +USE mp_world,             ONLY : world_comm
> +!USE io_files, ONLY : find_free_unit
> +USE ions_base,      ONLY : nat, tau, atm,ityp
> + 
> +implicit none
> +INTEGER, EXTERNAL :: find_free_unit
> +
> +integer :: nplot,nstart,nend
> +type(v_state_r) :: vstate_r
> +type(exc_r) :: a_rt
> +type(fft_cus) :: fc
> +
> +integer ::nxh,nyh,nzh,nh
> +INTEGER :: nr3s_start, nr3s_end
> +real(kind=dp), allocatable :: psi_exc(:)
> +real(kind=dp), allocatable :: psi_excio(:)
> +real(kind=dp), allocatable :: psi_excsum(:)
> +real(kind=dp), allocatable :: v_rh(:)
> +
> +integer :: iv,ii,iplane,iz,ounit,ix,iy,iip
> +logical :: debug
> +
> +CHARACTER(5) :: nfile
> +
> +call start_clock('plot_excwfn')
> +debug=.true.
> +
> +!check if all variables are ok from read_file in main 
> +if (debug) then
> +   if(ionode) then
> +      !bg(:,i) are the reciprocal lattice vectors, b_i,
> +      !in tpiba=2pi/alat units: b_i(:) = bg(:,i)/tpiba
> +      !at(:,i) are the lattice vectors of the simulation cell, a_i,
> +      !in alat units: a_i(:) = at(:,i)/alat
> +      write(stdout,*) 'plotexcwfn bg(:,1)=',bg(1,1),bg(2,1),bg(3,1)
> +      write(stdout,*) 'plotexcwfn bg(:,2)=',bg(1,2),bg(2,2),bg(3,2)
> +      write(stdout,*) 'plotexcwfn bg(:,3)=',bg(1,3),bg(2,3),bg(3,3)
> +      write(stdout,*) 'plotexcwfn alat=',alat
> +   endif
> +endif
> +
> +!find FFT grid point (dual grid) closer to r_hole (given in alat units)
> +
> +nxh = nint ( (r_hole(1)*bg(1,1) + r_hole(2)*bg(2,1) + r_hole(3)*bg(3,1) )*fc%nr1t) + 1
> +nyh = nint ( (r_hole(1)*bg(1,2) + r_hole(2)*bg(2,2) + r_hole(3)*bg(3,2) )*fc%nr2t) + 1
> +nzh = nint ( (r_hole(1)*bg(1,3) + r_hole(2)*bg(2,3) + r_hole(3)*bg(3,3) )*fc%nr3t) + 1
> +
> +allocate(v_rh(num_nbndv(1)))
> +v_rh(:)=0.d0
> +
> +
> +!get the valence wavefunctions at the nxh,nyh,nzh (only one processor has it!) 
> +#ifndef __PARA
> +nh=(nzh-1)*fc%nrx1t*fc%nrx2t+(nyh-1)*fc%nrx1t+nxh
> +v_rh(:)=v_rt%wfnrt(nh,:,1)
> +#else
> +nr3s_start=0
> +nr3s_end =0
> +do ii=1,mpime+1
> +   nr3s_start=nr3s_end+1
> +   nr3s_end=nr3s_end+fc%dfftt%npp(ii)
> +enddo
> +
> +
> +do iplane=1,fc%dfftt%npp(mpime+1)
> +   iz=nr3s_start+iplane-1
> +   if (iz==nzh) then
> +      nh=(iplane-1)*fc%nrx1t*fc%nrx2t+(nyh-1)*fc%nrx1t+nxh
> +      v_rh(:)=vstate_r%wfnrt(nh,:,1)
> +   endif
> +enddo
> +call mp_sum(v_rh,world_comm)
> +#endif
> +
> +
> +if (debug) then
> +   if(ionode) write(stdout,*) 'plotexcwfn qui'
> +endif
> +!stop
> +
> +!allocate and initialize the excitonic wavefunction 
> +allocate(psi_exc(fc%nrxxt))
> +psi_exc(1:fc%nrxxt)=0.d0
> +
> +allocate(psi_excsum(fc%nrx1t*fc%nrx2t*fc%nrx3t))
> +psi_excsum(1:fc%nrx1t*fc%nrx2t*fc%nrx3t)=0.d0
> +
> +do nplot=nstart,nend
> +if (debug) then
> +   if(ionode) write(stdout,*) 'plotexcwfn qui2',nplot
> +endif
> +!
> +!FFT the excitonic wavefunction vector to real space (dual grid)
> +   call initialize_exc_r(a_rt)
> +   call fft_a_exc(bse_spectrum(nplot),fc,a_rt)
> +
> +!now compute the exitonic wavefunction
> +   do iv=1,num_nbndv(1)
> +      psi_exc(1:fc%nrxxt)=psi_exc(1:fc%nrxxt)+v_rh(iv)*&
> +                                           a_rt%ar(1:a_rt%nrxxt,iv)
> +   enddo
> +
> +if (debug) then
> +   if(ionode) write(stdout,*) 'plotexcwfn qui3',nplot
> +endif
> +!square modulus
> +   psi_exc(1:fc%nrxxt)=psi_exc(1:fc%nrxxt)**2
> +
> +   if(debug) then
> +      if(ionode) write(stdout,*) 'fc%nr1t, fc%nr2t, fc%nr3t', fc%nr1t, fc%nr2t, fc%nr3t
> +      if(ionode) write(stdout,*) 'fc%nrx1t, fc%nrx2t, fc%nrx3t', fc%nrx1t, fc%nrx2t, fc%nrx3t
> +   endif
> +
> +!Now gather the excitonic wavefunction from all the processors
> +!and sum for the l_plotaverage case (when nstart is different from nend) 
> +
> +   allocate(psi_excio(fc%nrx1t*fc%nrx2t*fc%nrx3t))
> +
> +   psi_excio(1:fc%nrx1t*fc%nrx3t*fc%nrx3t)=0.d0
> +   do iplane=1,fc%dfftt%npp(mpime+1)
> +     iz=nr3s_start+iplane-1
> +     do iy=1,fc%nr2t
> +        do ix=1,fc%nr1t
> +           ii=(iz-1)*(fc%nrx1t*fc%nrx2t)+(iy-1)*fc%nrx1t+ix
> +           iip=(iplane-1)*fc%nrx1t*fc%nrx2t+(iy-1)*fc%nrx1t+ix
> +           psi_excio(ii)=psi_exc(iip) 
> +        enddo
> +     enddo 
> +   enddo
> +   call mp_sum(psi_excio,world_comm)
> +
> +   psi_excsum(1:fc%nrx1t*fc%nrx3t*fc%nrx3t)=psi_excsum(1:fc%nrx1t*fc%nrx3t*fc%nrx3t)+&
> +                      psi_excio(1:fc%nrx1t*fc%nrx3t*fc%nrx3t)/(real(nend)-real(nstart)+1.d0)
> +
> +   if (debug) then
> +      if(ionode) write(stdout,*) 'plotexcwfn qui3',nplot
> +   endif
> +   call free_memory_exc_a_r(a_rt)
> +   deallocate(psi_excio)
> +enddo ! nplot
> +
> +
> +!
> +! XCRYSDEN FORMAT
> +!
> +if(ionode) then
> +   ounit=find_free_unit()
> +!   open(ounit,file='exc_average.xsf',form='formatted')
> +   if(l_plotaverage)   open(ounit,file='exc_average.xsf',form='formatted')
> +   if(.not.l_plotaverage)   then
> +       write(nfile,'(5i1)') &
> +        & nstart/10000,mod(nstart,10000)/1000,mod(nstart,1000)/100,mod(nstart,100)/10,mod(nstart,10)
> +      open(ounit,file=trim(tmp_dir)//trim(prefix)//'.exc.xsf'//nfile,form='formatted')
> +   endif
> +   CALL xsf_struct (alat, at, nat, tau, atm, ityp, ounit)
> +   CALL xsf_fast_datagrid_3d &
> +             (psi_excsum, fc%nr1t, fc%nr2t, fc%nr3t, fc%nrx1t, fc%nrx2t, fc%nrx3t, at, alat, ounit)
> +!   close(ounit)   
> +endif
> +
> +
> +deallocate(v_rh,psi_exc)
> +
> +deallocate(psi_excsum)
> +
> +call stop_clock('plot_excwfn')
> +end subroutine plot_excwfn
> 
> Added: trunk/espresso/GWW/bse/print_bse.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/print_bse.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/print_bse.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,24 @@
> +subroutine print_bseinfo()
> +!prints basic info from the BSE input file
> +USE io_global, ONLY : ionode
> +USE bse_wannier, ONLY : l_truncated_coulomb, truncation_radius, &
> +           numw_prod,&
> +           dual_bse,&
> +           lambda,eps,&
> +           l_cgrad,maxit,n_eig,eps_eig, scissor,&
> +           l_plotexc,plotn_min,plotn_max,r_hole,l_plotaverage,&
> +           spectra_e_min,spectra_e_max,spectra_broad,&
> +           l_restart,n_eig_start, nit_lcz,l_lanczos
> +implicit none
> +
> +if(ionode) then
> +   write(*,*) 'Dimension of the polarizability basis:', numw_prod
> +   write(*,*) 'Scissor operator (eV)=', scissor
> +   if(l_truncated_coulomb) then
> +      write(*,*) 'Using truncated Coulomb interaction'
> +      write(*,*) 'Truncation Radius (a.u.)=', truncation_radius
> +   endif
> +endif
> +
> +return
> +end subroutine
> 
> Added: trunk/espresso/GWW/bse/print_spectrum.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/print_spectrum.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/print_spectrum.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,107 @@
> +subroutine print_spectrum(aspectrum,im)
> +!this subroutine applies a gaussian broadening and prints the absorption
> +!spectrum on file
> +
> +USE exciton
> +USE io_global,   ONLY : stdout,ionode
> +USE bse_wannier, ONLY : spectra_e_min,spectra_e_max,n_eig,spectra_nstep,spectra_broad,l_lanczos
> +USE constants,   ONLY : RYTOEV, PI
> +USE cell_base,   ONLY : omega
> +USE io_files,    ONLY : tmp_dir,prefix
> +
> +implicit none
> +
> +INTEGER, EXTERNAL :: find_free_unit
> +
> +REAL(kind=DP), INTENT(inout)  :: aspectrum(spectra_nstep,3)
> +INTEGER           :: ipol
> +
> +REAL(DP), ALLOCATABLE :: omega_g(:),broad_abs(:,:)
> +REAL(DP)    :: step,prefac,sumdos,norm
> +INTEGER     :: i,j,iun 
> +
> +LOGICAL     :: debug, im
> +
> +call start_clock('print_spectrum')
> +debug=.false.
> + 
> +allocate(omega_g(spectra_nstep))
> +allocate(broad_abs(spectra_nstep,3))
> +
> +!build the omega grid (in eV)
> +step=(spectra_e_max-spectra_e_min)/dble(spectra_nstep-1)
> +
> +do i=0, spectra_nstep-1
> +   omega_g(i+1)=(spectra_e_min+dble(i)*step)
> +enddo
> +
> +
> +prefac=4.d0*PI/omega
> +!prefac=1.d0
> +do ipol=1,3
> +   aspectrum(1:spectra_nstep,ipol)=prefac*aspectrum(1:spectra_nstep,ipol)
> +
> +   broad_abs(1:spectra_nstep, ipol)=0.d0
> +   do i=1,spectra_nstep 
> +        norm=0.d0
> +        do j=1,spectra_nstep
> +           broad_abs(i,ipol)=broad_abs(i,ipol)+&
> +                    aspectrum(j,ipol)*exp(-((omega_g(i)-omega_g(j))**2)/(2.d0*spectra_broad**2))
> +           norm=norm+exp(-((omega_g(i)-omega_g(j))**2)/(2.d0*spectra_broad**2))
> +        enddo
> +        broad_abs(i, ipol)=broad_abs(i, ipol)/norm
> +   enddo
> +enddo
> +
> +!print absorption aspectrum on file
> +if(im) then
> +   do ipol=1,3
> +      if(ionode) then
> +         iun = find_free_unit()
> +
> +         if (ipol==1) then
> +            open(unit=iun, file=trim(tmp_dir)//trim(prefix)//'.eps2x.dat', status='unknown', form='formatted')
> +         elseif (ipol==2) then
> +            open(unit=iun, file=trim(tmp_dir)//trim(prefix)//'.eps2y.dat', status='unknown', form='formatted')
> +         elseif (ipol==3) then
> +            open(unit=iun, file=trim(tmp_dir)//trim(prefix)//'.eps2z.dat', status='unknown', form='formatted')
> +         endif
> +      
> +!         write(*,*) '# Energy(eV)   Eps2 Eps2(Nogaussbroad)'
> +         write(iun,*) '# Energy(eV)   Eps2 Eps2(Nogaussbroad)'
> +         do i=1,spectra_nstep
> +            write(iun,*) omega_g(i),broad_abs(i,ipol),aspectrum(i,ipol)
> +            !write(*,*) omega_g(i),broad_abs(i,ipol),aspectrum(i,ipol)
> +         enddo
> +         close(iun)
> +      endif
> +   enddo 
> +else
> +   do ipol=1,3
> +      if(ionode) then
> +         iun = find_free_unit()
> +
> +         if (ipol==1) then
> +            open(unit=iun, file=trim(tmp_dir)//trim(prefix)//'.eps1x.dat', status='unknown', form='formatted')
> +         elseif (ipol==2) then
> +            open(unit=iun, file=trim(tmp_dir)//trim(prefix)//'.eps1y.dat', status='unknown', form='formatted')
> +         elseif (ipol==3) then
> +            open(unit=iun, file=trim(tmp_dir)//trim(prefix)//'.eps1z.dat', status='unknown', form='formatted')
> +         endif
> +      
> +!         write(*,*) '# Energy(eV)   Eps1 Eps1(Nogaussbroad)'
> +         write(iun,*) '# Energy(eV)   Eps1 Eps1(Nogaussbroad)'
> +         do i=1,spectra_nstep
> +            write(iun,*) omega_g(i),broad_abs(i,ipol),aspectrum(i,ipol)
> +            !write(*,*) omega_g(i),broad_abs(i,ipol),aspectrum(i,ipol)
> +         enddo
> +         close(iun)
> +      endif
> +   enddo 
> +endif
> +
> +deallocate(omega_g,broad_abs)
> +call stop_clock('print_spectrum')
> +
> +end subroutine
> +
> 
> Added: trunk/espresso/GWW/bse/qpcorrections.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/qpcorrections.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/qpcorrections.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,42 @@
> +subroutine qpcorrections(wcstate)
> +! this subroutine reads the qp corrections and gives the correct weight to each line of the 
> +! wcstate vector
> +
> +!USE qpe,         ONLY: qpc,qpcbarc,qpcbarv
> +USE qpe_exc         
> +USE bse_wannier, ONLY: qpe_imin, qpe_imax, num_nbndv,scissor
> +USE kinds,        ONLY : DP
> +USE bse_basic_structures
> +
> +implicit none
> +type(c_state)   :: wcstate
> +real(kind=DP), allocatable :: qpcw(:)
> +
> +
> +call build_qpc(qpc)
> +allocate(qpcw(wcstate%numb_c))
> +qpcw=0.d0
> +
> +if (qpe_imin <= num_nbndv(1)) then
> +   qpcbarv=qpc(qpe_imin)
> +   qpc(1:qpe_imin)=qpcbarv
> +else
> +!case only conduction bands corrections are computed, valence shifts rigidly 
> +   qpc(1:qpe_imin)=-scissor
> +endif
> +
> +if (qpe_imax > num_nbndv(1)) then
> +   qpcbarc=qpc(qpe_imax)
> +   qpcw(1:qpe_imax-num_nbndv(1))=qpc(num_nbndv(1)+1:qpe_imax)-qpcbarc
> +   
> +   call c_times_cstate(qpcw,wcstate,wcstate)
> +else
> +!case only valence bands corrections are computed, conduction shifts rigidly 
> +   qpcbarc=scissor
> +endif
> +
> +
> +
> +
> +return
> +end subroutine
> 
> Added: trunk/espresso/GWW/bse/qpe_exc.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/qpe_exc.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/qpe_exc.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,61 @@
> +MODULE qpe_exc
> +!this modules contains variables and subroutines related to the use of QP energies
> +!in the BSE kernel beyond the scissor operator
> + 
> +USE kinds, ONLY : DP
> +USE io_global, ONLY : ionode,ionode_id
> +USE mp, ONLY : mp_bcast
> +USE mp_world, ONLY : world_comm
> +
> +REAL(kind=DP), pointer :: qpc(:)  ! vector containing QPC
> +REAL(kind=DP)          :: qpcbarc ! average qpc to be applied for higher 
> +                                  ! (band index above qpc_imax) energy states
> +REAL(kind=DP)          :: qpcbarv ! average qpc to be applied for lower 
> +                                  ! (band index below qpc_imin) energy states
> +
> +CONTAINS
> +
> +    SUBROUTINE build_qpc(qpc)
> +    !this subroutine reads bands.dat and builds the qp correction vector
> +    USE bse_wannier, ONLY : qpe_imin,qpe_imax
> +    USE kinds, ONLY : DP
> +    USE io_files, ONLY : tmp_dir,prefix
> +    USE wvfct,    ONLY : nbnd
> +    USE constants,   ONLY: RYTOEV
> +
> +    implicit none
> +     
> +    INTEGER, EXTERNAL :: find_free_unit
> +    integer :: ib, iun, idum
> +    logical :: debug
> +    real(kind=DP) :: qpc(qpe_imax)
> +    real(kind=DP)          :: rdum,edft,egw,dumm1,dumm2,dumm3,dumm4
> +        
> +    qpc(1:qpe_imax)=0.d0
> +
> +    iun = find_free_unit()  
> +    if(ionode) then
> +       open(iun,file=trim(tmp_dir)//trim(prefix)//'-bands.dat', status='old', form='formatted')
> +       
> +       read(iun,*) idum
> +       read(iun,*) idum
> +
> +       do ib=1,qpe_imin-1
> +          read(iun,*) idum,dumm1,dumm2,dumm3,dumm4
> +       enddo
> +
> +       do ib=qpe_imin,qpe_imax
> +          read(iun,*) idum, edft,rdum,egw,rdum
> +          qpc(ib)=(egw-edft)/RYTOEV
> +       
> +       enddo
> + 
> +       close(iun)
> +    endif
> +    do ib=qpe_imin,qpe_imax
> +       call mp_bcast(qpc(ib), ionode_id, world_comm )
> +    enddo
> +    return
> +    END SUBROUTINE
> +
> +end MODULE qpe_exc
> 
> Added: trunk/espresso/GWW/bse/read_export.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/read_export.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/read_export.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,305 @@
> +subroutine read_export (pp_file,kunit,uspp_spsi, ascii, single_file, raw)
> +  !-----------------------------------------------------------------------
> +  !
> +  use iotk_module
> +
> +
> +  use kinds,          ONLY : DP 
> +  use pwcom  
> +  use control_flags,  ONLY : gamma_only  
> +  use becmod,         ONLY : bec_type, becp, calbec, &
> +                             allocate_bec_type, deallocate_bec_type
> +!  use symme,          ONLY : nsym, s, invsym, sname, irt, ftau
> +!  use symme,          ONLY : nsym, s, invsym, irt, ftau
> +!  use char,           ONLY : sname
> +! occhio sname is in symme which is now outside pwcom
> +  use  uspp,          ONLY : nkb, vkb
> +  use wavefunctions_module,  ONLY : evc
> +  use io_files,       ONLY : nd_nmbr, outdir, prefix, iunwfc, nwordwfc, iunsat, nwordatwfc
> +  use io_files,       ONLY : pseudo_dir, psfile
> +  use io_global,      ONLY : ionode, stdout
> +  USE ions_base,      ONLY : atm, nat, ityp, tau, nsp
> +  use mp_world,      ONLY : nproc,  mpime
> +  use mp_pools,      ONLY : my_pool_id, intra_pool_comm, inter_pool_comm, nproc_pool
> +  USE mp_world,             ONLY : world_comm
> +  use mp,             ONLY : mp_sum, mp_max
> +!  use ldaU,           ONLY : swfcatom, lda_plus_u
> +  use ldaU,           ONLY :  lda_plus_u
> +  USE gvecw,              ONLY :  ecutwfc
> +
> +  implicit none
> +
> +  CHARACTER(5), PARAMETER :: fmt_name="QEXPT"
> +  CHARACTER(5), PARAMETER :: fmt_version="1.1.0"
> +
> +  integer, intent(in) :: kunit
> +  character(80), intent(in) :: pp_file
> +  logical, intent(in) :: uspp_spsi, ascii, single_file, raw
> +
> +  integer :: i, j, k, ig, ik, ibnd, na, ngg,ig_, ierr
> +  integer, allocatable :: kisort(:)
> +  real(DP) :: xyz(3), tmp(3)
> +  integer :: npool, nkbl, nkl, nkr, npwx_g
> +  integer :: ike, iks, npw_g, ispin, local_pw
> +  integer, allocatable :: ngk_g( : )
> +  integer, allocatable :: itmp_g( :, : )
> +  real(DP),allocatable :: rtmp_g( :, : )
> +  real(DP),allocatable :: rtmp_gg( : )
> +  integer, allocatable :: itmp1( : )
> +  integer, allocatable :: igwk( :, : )
> +  integer, allocatable :: l2g_new( : )
> +  integer, allocatable :: igk_l2g( :, : )
> +
> +
> +  real(DP) :: wfc_scal 
> +  logical :: twf0, twfm
> +  character(iotk_attlenx) :: attr
> +  complex(DP), allocatable :: sevc (:,:)
> +
> +  call start_clock('read_export')
> +  write(stdout,*) "nkstot=", nkstot
> +
> +  IF( nkstot > 0 ) THEN
> +
> +     IF( ( kunit < 1 ) .OR. ( MOD( nkstot, kunit ) /= 0 ) ) &
> +       CALL errore( ' write_export ',' wrong kunit ', 1 )
> +
> +     IF( ( nproc_pool > nproc ) .OR. ( MOD( nproc, nproc_pool ) /= 0 ) ) &
> +       CALL errore( ' write_export ',' nproc_pool ', 1 )
> +
> +     !  find out the number of pools
> +     npool = nproc / nproc_pool
> +
> +     !  find out number of k points blocks
> +     nkbl = nkstot / kunit
> +
> +     !  k points per pool
> +     nkl = kunit * ( nkbl / npool )
> +
> +     !  find out the reminder
> +     nkr = ( nkstot - nkl * npool ) / kunit
> +
> +     !  Assign the reminder to the first nkr pools
> +     IF( my_pool_id < nkr ) nkl = nkl + kunit
> +
> +     !  find out the index of the first k point in this pool
> +     iks = nkl * my_pool_id + 1
> +     IF( my_pool_id >= nkr ) iks = iks + nkr * kunit
> +
> +     !  find out the index of the last k point in this pool
> +     ike = iks + nkl - 1
> +
> +  END IF
> +
> +  write(stdout,*) "after first init"
> +
> +  ! find out the global number of G vectors: ngm_g  
> +  ngm_g = ngm
> +  call mp_sum( ngm_g , world_comm )
> +
> +  ! collect all G vectors across processors within the pools
> +  ! and compute their modules
> +  !
> +  allocate( itmp_g( 3, ngm_g ) )
> +  allocate( rtmp_g( 3, ngm_g ) )
> +  allocate( rtmp_gg( ngm_g ) )
> +
> +  itmp_g = 0
> +  do  ig = 1, ngm
> +    itmp_g( 1, ig_l2g( ig ) ) = mill(1, ig )
> +    itmp_g( 2, ig_l2g( ig ) ) = mill(2, ig )
> +    itmp_g( 3, ig_l2g( ig ) ) = mill(3, ig )
> +  end do
> +  call mp_sum( itmp_g , world_comm )
> +  !
> +  ! here we are in crystal units
> +  rtmp_g(1:3,1:ngm_g) = REAL( itmp_g(1:3,1:ngm_g) )
> +  !
> +  ! go to cartesian units (tpiba)
> +  call cryst_to_cart( ngm_g, rtmp_g, bg , 1 )
> +  !
> +  ! compute squared moduli
> +  do  ig = 1, ngm_g 
> +     rtmp_gg(ig) = rtmp_g(1,ig)**2 + rtmp_g(2,ig)**2 + rtmp_g(3,ig)**2 
> +  enddo
> +  deallocate( rtmp_g )
> +
> +  ! build the G+k array indexes
> +  allocate ( igk_l2g ( npwx, nks ) )
> +  allocate ( kisort( npwx ) )
> +  do ik = 1, nks
> +     kisort = 0
> +     npw = npwx
> +     call gk_sort (xk (1, ik+iks-1), ngm, g, ecutwfc / tpiba2, npw, kisort(1), g2kin)
> +     !
> +     ! mapping between local and global G vector index, for this kpoint
> +     !
> +     DO ig = 1, npw
> +        !
> +        igk_l2g(ig,ik) = ig_l2g( kisort(ig) )
> +        !
> +     END DO
> +     !
> +     igk_l2g( npw+1 : npwx, ik ) = 0
> +     !
> +     ngk (ik) = npw
> +  end do
> +  deallocate (kisort)
> +
> +  ! compute the global number of G+k vectors for each k point
> +  allocate( ngk_g( nkstot ) )
> +  ngk_g = 0
> +  ngk_g( iks:ike ) = ngk( 1:nks )
> +  CALL mp_sum( ngk_g, world_comm )
> +
> +  ! compute the Maximum G vector index among all G+k and processors
> +  npw_g = MAXVAL( igk_l2g(:,:) )
> +  CALL mp_max( npw_g, world_comm )
> +
> +  ! compute the Maximum number of G vector among all k points
> +  npwx_g = MAXVAL( ngk_g( 1:nkstot ) )
> +
> +  deallocate(rtmp_gg)
> +
> +  allocate( igwk( npwx_g,nkstot ) )
> +
> +  write(stdout,*) "after g stuff"
> +
> +! wfc grids
> +
> +  DO ik = 1, nkstot
> +    igwk(:,ik) = 0
> +    !
> +    ALLOCATE( itmp1( npw_g ), STAT= ierr )
> +    IF ( ierr/=0 ) CALL errore('pw_export','allocating itmp1', ABS(ierr) )
> +    itmp1 = 0
> +    ! 
> +    IF( ik >= iks .AND. ik <= ike ) THEN 
> +      DO  ig = 1, ngk( ik-iks+1 )
> +        itmp1( igk_l2g( ig, ik-iks+1 ) ) = igk_l2g( ig, ik-iks+1 ) 
> +      END DO
> +    END IF
> +    !
> +    CALL mp_sum( itmp1, world_comm )
> +    !
> +    ngg = 0
> +    DO  ig = 1, npw_g
> +      IF( itmp1( ig ) == ig ) THEN
> +        ngg = ngg + 1
> +        igwk( ngg , ik) = ig
> +      END IF
> +    END DO
> +    IF( ngg /= ngk_g( ik ) ) THEN
> +      WRITE( stdout,*) ' ik, ngg, ngk_g = ', ik, ngg, ngk_g( ik )
> +    END IF
> +    !
> +    DEALLOCATE( itmp1 )
> +    !
> +  ENDDO
> +  !
> +  deallocate( itmp_g )
> +  
> +  write(stdout,*)"after wfc waves"
> +
> +#ifdef __PARA
> +  call poolrecover (et, nbnd, nkstot, nks)
> +#endif
> + 
> +  wfc_scal = 1.0d0
> +  twf0 = .true.
> +  twfm = .false.
> +
> +  do ik = 1, nkstot
> +     local_pw = 0
> +     IF( (ik >= iks) .AND. (ik <= ike) ) THEN
> +
> +       call davcio (evc, 2*nwordwfc, iunwfc, (ik-iks+1), - 1)
> +!       IF ( lda_plus_u ) CALL davcio( swfcatom, nwordatwfc, iunsat, (ik-iks+1), -1 )
> +       local_pw = ngk(ik-iks+1)
> +
> +     ENDIF
> +
> +
> +     allocate(l2g_new(local_pw))
> +
> +     l2g_new = 0
> +     do ig = 1, local_pw
> +       ngg = igk_l2g(ig,ik-iks+1)
> +       do ig_ = 1, ngk_g(ik)
> +         if(ngg == igwk(ig_,ik)) then
> +           l2g_new(ig) = ig_
> +           exit
> +         end if
> +       end do
> +     end do
> +
> +
> +     ispin = isk( ik )
> +     !  WRITE(0,*) ' ### ', ik,nkstot,iks,ike,kunit,nproc,nproc_pool 
> +     deallocate(l2g_new)
> +  end do
> +  !  
> +
> +  write(stdout,*) "after davcio"
> +
> +  ! If specified and if USPP are used the wfcs S_psi are written  
> +  ! | spsi_nk > = \hat S | psi_nk >  
> +  ! where S is the overlap operator of US PP 
> +  !  
> +  IF ( uspp_spsi .AND. nkb > 0 ) THEN
> +
> +       ALLOCATE( sevc(npwx,nbnd), STAT=ierr )
> +       IF (ierr/=0) CALL errore( ' read_export ',' Unable to allocate SEVC ', ABS(ierr) )
> +
> +       CALL init_us_1
> +       CALL init_at_1
> +
> +       CALL allocate_bec_type (nkb,nbnd,becp)
> +
> +       do ik = 1, nkstot
> + 
> +           local_pw = 0
> +           IF( (ik >= iks) .AND. (ik <= ike) ) THEN
> +               
> +               CALL gk_sort (xk (1, ik+iks-1), ngm, g, ecutwfc / tpiba2, npw, igk, g2kin)
> +               CALL davcio (evc, 2*nwordwfc, iunwfc, (ik-iks+1), - 1)
> +
> +               CALL init_us_2(npw, igk, xk(1, ik), vkb)
> +               local_pw = ngk(ik-iks+1)
> +                            
> +               IF ( gamma_only ) THEN
> +                  if(nkb>0) CALL calbec ( ngk_g(ik), vkb, evc, becp )
> +               ELSE
> +                  CALL calbec ( npw, vkb, evc, becp )
> +               ENDIF
> +               CALL s_psi(npwx, npw, nbnd, evc, sevc)
> +           ENDIF
> +
> +           ALLOCATE(l2g_new(local_pw))
> +
> +           l2g_new = 0
> +           DO ig = 1, local_pw
> +             ngg = igk_l2g(ig,ik-iks+1)
> +             DO ig_ = 1, ngk_g(ik)
> +               IF(ngg == igwk(ig_,ik)) THEN
> +                 l2g_new(ig) = ig_
> +                 EXIT
> +               ENDIF
> +             ENDDO
> +           ENDDO
> +
> +           ispin = isk( ik )
> +           DEALLOCATE(l2g_new)
> +       ENDDO
> +      
> +       DEALLOCATE( sevc, STAT=ierr )
> +       IF ( ierr/= 0 ) CALL errore('read_export','Unable to deallocate SEVC',ABS(ierr))
> +       CALL deallocate_bec_type ( becp )
> +  ENDIF
> +
> +  DEALLOCATE( igk_l2g )
> +  DEALLOCATE( igwk )
> +  DEALLOCATE ( ngk_g )
> +  call stop_clock('read_export')
> +
> +end subroutine read_export
> 
> Added: trunk/espresso/GWW/bse/rotate_wannier.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/rotate_wannier.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/rotate_wannier.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,97 @@
> +!-----------------------------------------
> +subroutine rotate_wannier_gamma_bse( rot_u,a_in,a_out,ispin, itrasp)
> +!----------------------------------------
> +!
> +! (GAMMA-ONLY CALCULATIONS) and rotate the wavefunctions
> +! according to rot_u
> +! only ispin states used (not implemented ye
> +! ONLY -NORMCONSERVING
> +
> +
> +  USE kinds,    ONLY : DP
> +  USE us
> +  USE wvfct,    ONLY : igk, g2kin, npwx, npw, nbndx,nbnd
> +  USE gvect
> +  USE basis
> +  USE klist
> +  USE constants, ONLY : e2, pi, tpi, fpi
> +  USE io_files, ONLY: nwordwfc
> +  USE cell_base, ONLY: at, alat, tpiba, omega, tpiba2
> +!  USE wavefunctions_module, ONLY: evc
> +  use exciton
> +  USE io_global, ONLY : stdout 
> +
> +
> +  implicit none
> +
> +  INTEGER, INTENT(in) :: ispin!+1 or -1
> +   type(exc):: a_in, a_out
> +  REAL(kind=DP) :: rot_u(a_in%numb_v,a_in%numb_v)
> +  INTEGER, INTENT(in) :: itrasp!if 1 takes U^T
> + 
> +
> +
> +
> +  REAL(kind=DP), ALLOCATABLE  :: evc0(:,:),evc_re(:,:),evc_im(:,:)!reads wavefunctions here
> +  integer i,j,k,ig
> +  logical debug
> +
> +  call start_clock('rotate_wannier_gamma_bse')
> +  debug=.true.
> +
> +  allocate( evc0(npw,a_in%numb_v))
> +  allocate( evc_re(npw,a_in%numb_v))
> +  allocate( evc_im(npw,a_in%numb_v))
> +
> +  if(debug) then 
> +     write(stdout,*) 'rotate wannier #1'
> +  endif
> +
> +!now real part
> +  if(itrasp/=1) then
> +     evc0(1:a_in%npw,1:a_in%numb_v)=dble(a_in%a(1:a_in%npw,1:a_in%numb_v))
> +     call dgemm('N','N',npw,a_in%numb_v,a_in%numb_v,1.d0,evc0,npw,rot_u,a_in%numb_v,0.d0,evc_re,npw)
> +!now imaginary part
> +     evc0(1:a_in%npw,1:a_in%numb_v)=dimag(a_in%a(1:a_in%npw,1:a_in%numb_v))
> +     call dgemm('N','N',npw,a_in%numb_v,a_in%numb_v,1.d0,evc0,npw,rot_u,a_in%numb_v,0.d0,evc_im,npw)
> +  else
> +     evc0(1:a_in%npw,1:a_in%numb_v)=dble(a_in%a(1:a_in%npw,1:a_in%numb_v))
> +     call dgemm('N','T',npw,a_in%numb_v,a_in%numb_v,1.d0,evc0,npw,rot_u,a_in%numb_v,0.d0,evc_re,npw)
> +     !now imaginary part                                                                                                
> +     evc0(1:a_in%npw,1:a_in%numb_v)=dimag(a_in%a(1:a_in%npw,1:a_in%numb_v))
> +     call dgemm('N','T',npw,a_in%numb_v,a_in%numb_v,1.d0,evc0,npw,rot_u,a_in%numb_v,0.d0,evc_im,npw)
> +  endif
> +
> +  
> +!  do i=1,nbnd
> +!     do ig=1,npw
> +!        evc(ig,i)=dcmplx(evc_re(ig,i),evc_im(ig,i))
> +!     enddo
> +!  enddo
> +
> +
> + a_out%a(1:a_in%npw,1:a_in%numb_v)=dcmplx(evc_re(1:a_in%npw,1:a_in%numb_v),evc_im(1:a_in%npw,1:a_in%numb_v))
> +  
> +
> +
> +!rotate
> +!  do i=1,nbnd
> +!     do j=1,nbnd
> +!        do ig=1,npw
> +!           evc(ig,i)=evc(ig,i)+rot_u(j,i)*evc0(ig,j)
> +!        enddo
> +!     enddo
> +!  enddo
> +
> +
> +
> +
> +  DEALLOCATE(evc0)
> +  deallocate(evc_re,evc_im)
> +
> +
> +  call stop_clock('rotate_wannier_gamma_bse')
> +  return
> +
> +end subroutine rotate_wannier_gamma_bse
> +
> 
> Added: trunk/espresso/GWW/bse/sdescent.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/sdescent.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/sdescent.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,130 @@
> +subroutine sdescent(i_state,vstatesd,vstate_rsd,cstate,cstate_r,fc,en)
> +
> +use exciton
> +use bse_basic_structures
> +USE fft_custom_gwl
> +USE io_global, ONLY : stdout,ionode
> +USE wvfct,    ONLY : npw
> +use bse_wannier, ONLY:num_nbndv,eps,lambda,eps_eig
> +USE mp,          ONLY :mp_barrier
> +USE mp_world,             ONLY : world_comm
> +USE constants,        ONLY: RYTOEV
> +
> +
> +
> +implicit none
> +
> +type(exc) :: a_in
> +type(exc) :: a_out
> +type(v_state) :: vstatesd
> +type(v_state_r) :: vstate_rsd
> +type(c_state) :: cstate
> +type(c_state_r) :: cstate_r
> +type(fft_cus) :: fc
> +
> +real(kind=DP), intent(out) :: en
> +real(kind=DP) :: eigout
> +real(kind=DP) :: eig
> +real(kind=DP) ::delta,delta_eig,hsquare
> +
> +integer :: it,is,i,i_state
> +
> +call start_clock('sdescent')
> +
> +!create a random excitonic wavefunction vector a_exc
> +!and normalize it
> +call initialize_exc(a_in)
> +a_in%label=50
> +a_in%npw=npw
> +a_in%numb_v=num_nbndv(1)
> +allocate(a_in%a(a_in%npw,a_in%numb_v))
> +
> +call random_exc(a_in)
> +!project into the conduction manifold
> +
> +do is = 1,vstatesd%nspin
> +   call pc_operator_exc(a_in,vstatesd,is)
> +enddo
> +
> +!project out all the previous found state
> +call pout_operator_exc(a_in,i_state)
> +
> +call normalize_exc(a_in)
> +CALL mp_barrier(world_comm)
> +
> +call initialize_exc(a_out)
> +a_out%label=1
> +a_out%npw=npw
> +a_out%numb_v=num_nbndv(1)
> +allocate(a_out%a(a_out%npw,a_out%numb_v))
> +
> +eig=0.d0
> +eigout=100.d0
> +delta_eig=100.d0
> +delta=100.d0
> +
> +if(ionode) write(stdout,*) 'Steepest descent started.'
> +if(ionode) write(stdout,*) 'lambda=',lambda
> +if(ionode) write(stdout,*) 'eps',eps
> +
> +it=0
> +!do while(((abs(delta))>=eps)) 
> +do while(((abs(delta))>=eps).or.(abs(delta_eig)>eps_eig)) 
> +!   write(*,*) 's descent, iteration, delta=',it,delta
> +! |a_out>=H|a_in>
> +   call exc_h_a(a_in,a_out,vstatesd,vstate_rsd,cstate,cstate_r,fc) 
> +   call mp_barrier(world_comm)
> +!   call normalize_exc(a_out)
> +   
> +!  eigout= <psi_(it)|H|psi_(it)>/<psi_(it)|psi_(it)>  
> +   call sproduct_exc(a_out,a_in,eigout)
> +   eigout=eigout*RYTOEV
> +   write(*,*) 'sd. eig# =',i_state, 'it=', it, 'E(eV)=', eigout 
> +  
> +!  check how good it is as an eigenstate 
> +   call sproduct_exc(a_out,a_out,hsquare)
> +   hsquare=hsquare*RYTOEV*RYTOEV
> +
> +   delta_eig=hsquare-eigout**2
> + 
> +   eigout=eigout*RYTOEV
> +! compute |psi_(it+1)>
> +   a_out%a(1:a_out%npw,1:a_out%numb_v)=(1.d0+lambda*eigout)*a_in%a(1:a_in%npw,1:a_in%numb_v)&
> +                          -lambda*a_out%a(1:a_out%npw,1:a_out%numb_v)
> +
> +   !project into the conduction manifold
> +
> +   do is = 1,vstatesd%nspin
> +      call pc_operator_exc(a_out,vstatesd,is)
> +   enddo
> +
> +   !project out all the previous found state
> +   call pout_operator_exc(a_out,i_state)
> +   call normalize_exc(a_out)
> +
> +   a_in%a(1:a_out%npw,1:a_out%numb_v)= a_out%a(1:a_out%npw,1:a_out%numb_v)
> +
> +   call mp_barrier(world_comm)
> +
> +   it=it+1
> +   delta=eig-eigout
> +   eig=eigout
> +enddo
> +
> +bse_spectrum(i_state)%a(1:bse_spectrum(i_state)%npw,1:bse_spectrum(i_state)%numb_v)=&
> +                         a_out%a(1:a_out%npw,1:a_out%numb_v)
> +
> +
> +bse_spectrum(i_state)%e=eigout
> +en=eigout
> +
> +!if(ionode) write(stdout,*) 'Lowest eigenvalue=',eig
> +
> +
> +!free memory
> +call free_memory_exc_a(a_out)
> +call free_memory_exc_a(a_in)
> +
> +call stop_clock('sdescent')
> +return
> +end subroutine sdescent
> 
> Added: trunk/espresso/GWW/bse/spectrum.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/spectrum.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/spectrum.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,136 @@
> +subroutine build_spectrum(ampl,en,ipol)
> +!this subroutine builds up the absorption spectrum
> +!and prints it on file
> +
> +USE exciton
> +USE io_global,   ONLY : stdout,ionode
> +USE bse_wannier, ONLY : spectra_e_min,spectra_e_max,n_eig,spectra_nstep,spectra_broad
> +USE constants,   ONLY : RYTOEV, PI
> +USE cell_base,   ONLY : omega
> +USE io_files,    ONLY : tmp_dir,prefix
> +
> +implicit none
> +
> +REAL(DP), INTENT(in)  :: ampl(n_eig), en(n_eig)
> +
> +INTEGER, EXTERNAL :: find_free_unit
> +
> +REAL(DP), ALLOCATABLE :: omega_g(:),absorption(:),broad_abs(:),excdos(:)
> +COMPLEX(DP), ALLOCATABLE :: den(:,:),cspectrum(:),campl(:)
> +REAL(DP)    :: eta,step,prefac,sumdos,norm
> +COMPLEX(DP) :: lambda_sum
> +INTEGER     :: i,j,iun,ipol 
> +
> +LOGICAL     :: debug
> +
> +call start_clock('build_spectrum')
> +debug=.true.
> +eta=0.001
> +
> +if(debug) then
> +   if(ionode) then
> +     do i=1,n_eig
> +        write(stdout,*) '#',i,'E=',en(i),'A=',ampl(i)
> +     enddo
> +   endif
> +endif
> + 
> +allocate(omega_g(spectra_nstep))
> +allocate(absorption(spectra_nstep))
> +allocate(excdos(spectra_nstep))
> +allocate(broad_abs(spectra_nstep))
> +allocate(cspectrum(spectra_nstep))
> +allocate(den(spectra_nstep,n_eig))
> +allocate(campl(n_eig))
> +
> +if (ipol==1) then
> +!convert energy range in Ry
> +   spectra_e_min=spectra_e_min/RYTOEV
> +   spectra_e_max=spectra_e_max/RYTOEV
> +endif
> +
> +!build the omega grid
> +step=(spectra_e_max-spectra_e_min)/dble(spectra_nstep-1)
> +
> +do i=0, spectra_nstep-1
> +   den(i+1,1:n_eig)=dcmplx(1.d0,0.d0)/(dcmplx(en(1:n_eig),0.d0)-dcmplx(spectra_e_min+dble(i)*step,eta))
> +   omega_g(i+1)=(spectra_e_min+dble(i)*step)*RYTOEV
> +enddo
> +
> +!compute the absorption spectrum
> +campl(1:n_eig)=dcmplx(ampl(1:n_eig),0.d0)
> +!campl(1:n_eig)=dcmplx(1.d0,0.d0)
> +
> +cspectrum(1:spectra_nstep)=(0.d0,0.d0)
> +
> +call zgemm('N','N',spectra_nstep,1,n_eig,(1.d0,0.d0),den,spectra_nstep,campl,n_eig,(0.d0,0.d0),cspectrum,spectra_nstep)
> +
> +prefac=8.d0*PI/omega
> +!prefac=1.d0
> +absorption(1:spectra_nstep)=prefac*aimag(cspectrum(1:spectra_nstep))
> +
> +!add gaussian broadening
> +broad_abs(1:spectra_nstep)=0.d0
> +do i=1,spectra_nstep 
> +     norm=0.d0
> +     do j=1,spectra_nstep
> +        broad_abs(i)=broad_abs(i)+&
> +                     absorption(j)*exp(-((omega_g(i)-omega_g(j))**2)/(2.d0*spectra_broad**2))
> +        norm=norm+exp(-((omega_g(i)-omega_g(j))**2)/(2.d0*spectra_broad**2))
> +     enddo
> +     broad_abs(i)=broad_abs(i)/norm
> +enddo
> +
> +!compute DOS using a lorentzian
> +if (ipol==1) then
> +   excdos(1:spectra_nstep)=0.d0
> +   do i=0,spectra_nstep-1
> +       do j=1,n_eig
> +          excdos(i+1)=excdos(i+1)+2.d0*eta/(PI*((en(j)-spectra_e_min-dble(i)*step)**2+eta**2))
> +       enddo
> +   enddo
> +   excdos(1:spectra_nstep)=excdos(1:spectra_nstep)/(2.d0*n_eig)
> +endif
> +
> +
> +
> +write(*,*) 'Absorption' 
> +write(*,*) 'Energy(eV)   Eps2 Eps2(Nogaussbroad)'
> +
> +!print absorption spectrum on file
> +if(ionode) then
> +   iun = find_free_unit()
> +
> +   if (ipol==1) then
> +      open(unit=iun, file=trim(tmp_dir)//trim(prefix)//'.eps2x.dat', status='unknown', form='formatted')
> +   elseif (ipol==2) then
> +      open(unit=iun, file=trim(tmp_dir)//trim(prefix)//'.eps2y.dat', status='unknown', form='formatted')
> +   elseif (ipol==3) then
> +      open(unit=iun, file=trim(tmp_dir)//trim(prefix)//'.eps2z.dat', status='unknown', form='formatted')
> +   endif
> +
> +   do i=1,spectra_nstep
> +      write(iun,*) omega_g(i),broad_abs(i), absorption (i)
> +      write(*,*) omega_g(i),broad_abs(i), absorption(i)
> +   enddo
> +   close(iun)
> +endif
> +
> +!print excdos spectrum on file
> +if(ionode.and.(ipol==1)) then
> +   sumdos=0.d0
> +   iun = find_free_unit()
> +   open(unit=iun, file=trim(tmp_dir)//trim(prefix)//'.excdos.dat', status='unknown', form='formatted')
> +   do i=1,spectra_nstep
> +      write(iun,*) omega_g(i),excdos(i)
> +      sumdos=sumdos+excdos(i)
> +   enddo
> +   close(iun)
> +   write(*,*) 'sumdos=',sumdos/dble(spectra_nstep)
> +endif
> +
> +deallocate(omega_g,absorption,broad_abs,cspectrum,den,campl,excdos)
> +
> +call stop_clock('build_spectrum')
> +end subroutine
> +
> 
> Added: trunk/espresso/GWW/bse/start_bse.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/start_bse.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/start_bse.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,18 @@
> +subroutine start_bse
> +  !
> +  !  Usage: [mpirun, mpprun, whatever] postproc [-npool N]
> +  !
> +  !  Wrapper routine for postprocessing initialization
> +  !
> +  USE mp_global,     ONLY: mp_startup
> +  USE environment,   ONLY: environment_start
> +  implicit none
> +  character(len=9) :: code = 'BSE'
> +  !
> +#ifdef __PARA
> +  CALL mp_startup ( )
> +#endif
> +  CALL environment_start ( code )
> +  ! 
> +  return
> +end subroutine start_bse
> 
> Added: trunk/espresso/GWW/bse/tmp.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/tmp.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/tmp.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,45 @@
> +subroutine make_v_state(numb_v,v)
> +  USE gvect,                 ONLY : gstart
> +  USE lsda_mod,              ONLY : nspin
> +  use wavefunctions_module,  ONLY : evc
> +  use io_files,  ONLY : prefix, iunwfc
> +  USE wvfct,    ONLY : nbnd, npwx,npw
> +  implicit none
> +
> +  type(v_state) :: v
> +  integer :: numb_v
> +  
> +  integer :: is,ivmax
> +
> +  v%nspin=nspin
> +  v%numb_v(:)=numb_v(:)
> +  v%npw=npw
> +  v%gstart=gstart
> +
> +  allocate( evc( npwx, nbnd ) )
> +  
> +  if (nspin==1) then
> +     ivmax= v%numb_v(1)
> +  else 
> +     ivmax=max(v%numb_v(1),v%numb_v(2))
> +  endif
> +
> +  allocate( v%wfn(v%npw,ivmax,v%nspin)
> +  allocate( v%esp(ivmax,v%nspin)
> +
> +  do is=1,nspin
> +     call davcio(evc,2*nwordwfc,iunwfc,is,-1)
> +     do iv=1,v%numb_v(is)
> +        v%wfn(1:v%npw,1:v%numb_v(is),is)=evc(1:v%npw,1:v%numb_v(is))
> +     enddo  
> +        v%esp(1:v%numb_v(is),is)=et(1:v%numb_v(is),is)
> +  enddo
> +
> +  deallocate(evc)
> +
> +  
> +
> +
> +  return
> +end subroutine
> +
> 
> Added: trunk/espresso/GWW/bse/transitions.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/transitions.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/transitions.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,13 @@
> +! this module contains the variables related to the solution of the BSE 
> +! in transition space
> +
> +MODULE transitions
> +
> +USE kinds, ONLY: DP
> +INTEGER, ALLOCATABLE :: ttab(:,:) !maps the (iv,ic) couple into the it index
> +INTEGER, ALLOCATABLE :: itiv(:)!for the it-th transition gives the
> +INTEGER, ALLOCATABLE :: itic(:)!corresponding valence and conduction band index respectively 
> +REAL(KIND=DP), ALLOCATABLE :: exch(:,:) !excitonic Hamiltonian in transition space
> +
> +
> +END MODULE transitions
> 
> Added: trunk/espresso/GWW/bse/tspace.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/tspace.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/tspace.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,187 @@
> +subroutine tspace_diago(vstate,vstate_r,fc)
> +! this subroutine solves teh BSE equation in transition space
> +
> +USE bse_wannier, ONLY : num_nbndv
> +USE transitions, ONLY : ttab,itiv,itic,exch
> +USE wvfct,       ONLY : nbnd
> +USE io_global,   ONLY : stdout,ionode
> +USE kinds,       ONLY : DP
> +USE constants,   ONLY : RYTOEV
> +USE mp,          ONLY : mp_barrier
> +USE mp_world,             ONLY : world_comm
> +USE bse_basic_structures 
> +USE fft_custom_gwl 
> +
> +
> +implicit none
> +type(v_state) vstate
> +type(v_state_r) vstate_r
> +type(fft_cus) :: fc
> +
> +integer                         :: nt,it ! number of transitions
> +real(kind=DP),    allocatable   :: eig(:)
> +
> +! for zheev
> +integer                         :: info
> +real(kind=DP), allocatable   :: work(:)
> +
> +call start_clock('tspace_diago')
> +! compute the number of transitions
> +nt=num_nbndv(1)*(nbnd-num_nbndv(1))
> +if(ionode) write(stdout,*) 'number of transitions nt:', nt
> +
> +! allocate and build the transition table (c,v)-->it
> +allocate(ttab(nbnd,num_nbndv(1)))
> +allocate(itiv(nt))
> +allocate(itic(nt))
> +
> +call build_ttab
> +
> +! allocate and build the excitonic Hamiltonian in transition space
> +allocate(exch(nt,nt))
> +exch(1:nt,1:nt)=0.d0
> +
> +call mp_barrier(world_comm)
> +call build_exch(vstate,vstate_r,fc)
> +
> +call mp_barrier(world_comm)
> +! diagonalize (not parallel so only one proc will do it)
> +
> +allocate(eig(nt))
> +if(ionode) then
> +   allocate(work(3*nt-1))
> +   call dsyev('V', 'U', nt, exch, nt, eig, work, 3*nt-1 , info)
> +   eig(1:nt)=eig(1:nt)*RYTOEV 
> +   deallocate(work)
> +endif
> +
> +call mp_barrier(world_comm)
> +
> +if(ionode) then
> +   do it=1,nt 
> +      write(stdout,*) 'Eigenvalue number', it, eig(it)
> +   enddo
> +endif
> +
> +deallocate(ttab,itic,itiv,exch,eig)
> +call stop_clock('tspace_diago')
> +return
> +end subroutine
> +
> +!--------------------!
> +subroutine build_exch(vstate,vstate_r,fc)
> +!this subroutine builds the excitonic Hamiltonian in transition space
> +
> +USE bse_wannier,           ONLY : num_nbndv
> +USE wvfct,                 ONLY : nbnd, npwx,npw,et
> +USE lsda_mod,              ONLY : nspin
> +USE wavefunctions_module,  ONLY : evc
> +USE io_files,              ONLY : prefix, iunwfc, nwordwfc
> +USE transitions,           ONLY : ttab,exch
> +USE io_global,             ONLY : stdout,ionode
> +USE bse_basic_structures 
> +USE fft_custom_gwl 
> +USE exciton 
> +
> +
> +implicit none
> +type(v_state),intent(in)    :: vstate
> +type(v_state_r), intent(in) :: vstate_r
> +type(fft_cus), intent(in)   :: fc
> +
> +!internal
> +type(exc) :: phic
> +type(exc) :: phicp
> +type(exc) :: hphic
> +integer   :: iv,ivp,ic,icp,is
> +
> +call start_clock('build_exch')
> +
> +call initialize_exc(phic)
> +phic%label=1
> +phic%npw=npw
> +phic%numb_v=num_nbndv(1)
> +allocate(phic%a(phic%npw,phic%numb_v))
> +
> +call initialize_exc(hphic)
> +hphic%label=1
> +hphic%npw=npw
> +hphic%numb_v=num_nbndv(1)
> +allocate(hphic%a(hphic%npw,hphic%numb_v))
> +
> +call initialize_exc(phicp)
> +phicp%label=1
> +phicp%npw=npw
> +phicp%numb_v=num_nbndv(1)
> +allocate(phicp%a(phicp%npw,phicp%numb_v))
> +
> +allocate( evc( npwx, nbnd ) )
> +
> +!read wavefunctions
> +do is=1,nspin
> +   call davcio(evc,2*nwordwfc,iunwfc,is,-1)
> +enddo
> +
> +if(ionode) write(stdout,*) 'wfns read from disk'
> +
> +do iv=1,num_nbndv(1)
> +   do ic=num_nbndv(1)+1,nbnd
> +      
> +      if(ionode) write(stdout,*) 'iv ic', iv, ic
> +      phic%a(1:phic%npw,1:phic%numb_v)=dcmplx(0.d0,0.d0)
> +      phic%a(1:phic%npw,iv)=evc(1:npw,ic)
> +      call normalize_exc(phic)
> +      !apply tyhe excitonic Hamiltonian to this ic,iv state
> +      call exc_h_a(phic,hphic,vstate,vstate_r,fc) 
> +     
> +      do ivp=iv,num_nbndv(1)
> +          do icp= ic,nbnd
> +             if(ionode) write(stdout,*) 'ivp icp', ivp, icp
> +             phicp%a(1:phicp%npw,1:phicp%numb_v)=dcmplx(0.d0,0.d0)
> +             phicp%a(1:phicp%npw,ivp)=evc(1:npw,icp)
> +             call normalize_exc(phicp)
> +             call sproduct_exc(phicp,hphic,exch(ttab(ic,iv),ttab(icp,ivp)))
> +             if(ionode) write(stdout,*) 'Exc. Hamiltonian built for transition:', ttab(ic,iv),ttab(icp,ivp) 
> +          enddo !icp
> +      enddo !ivp
> +   enddo !ic
> +enddo !iv
> +
> +call free_memory_exc_a(phic)
> +deallocate(evc)
> +
> +call stop_clock('build_exch')
> +return
> +endsubroutine
> +
> +!--------------------!
> +subroutine build_ttab
> +!this subroutine builds the transition table
> +
> +USE wvfct,    ONLY : nbnd
> +USE bse_wannier, ONLY:num_nbndv
> +use transitions, ONLY:ttab,itiv,itic
> +USE io_global, ONLY : stdout,ionode
> +
> +implicit none
> +integer       :: it,iv,ic
> +
> +it=1
> +
> +do iv=1,num_nbndv(1)
> +   do ic=num_nbndv(1)+1,nbnd
> +      ttab(ic,iv)=it
> +      itiv(it)=iv
> +      itic(it)=ic
> +      it=it+1
> +   enddo
> +enddo
> +
> +it=it-1
> +
> +if(ionode) write(stdout,*) 'ttab built, number of transitions found:', it
> +if(ionode) write(stdout,*) 'total number of bands', nbnd
> +if(ionode) write(stdout,*) 'number of valence bands', num_nbndv(1)
> +return
> +end subroutine
> +!--------------------!
> 
> Added: trunk/espresso/GWW/bse/write_wannier_matrix.f90
> ===================================================================
> --- trunk/espresso/GWW/bse/write_wannier_matrix.f90	                        (rev 0)
> +++ trunk/espresso/GWW/bse/write_wannier_matrix.f90	2016-03-21 11:09:18 UTC (rev 12231)
> @@ -0,0 +1,59 @@
> + subroutine read_wannier_matrix
> +!this read the inverse transfromation matrix from KS eigenstates
> +!to ML wanniers on file, to be read by GWW code
> +!the INVERSE matrix is calculated here
> +
> +  USE kinds, ONLY : DP
> +!  USE wannier_gw, ONLY : u_trans, num_nbndv
> +  USE wvfct,    ONLY : et,nbnd
> +  USE io_global, ONLY : stdout,ionode,ionode_id
> +!  USE io_files, ONLY : find_free_unit, prefix
> +  USE io_files, ONLY : prefix, tmp_dir
> +  USE mp, ONLY : mp_bcast
> +  USE mp_world,             ONLY : world_comm
> +  USE lsda_mod, ONLY :nspin
> +  USE bse_basic_structures, ONLY : u_trans
> +  use bse_wannier, ONLY:num_nbndv
> + 
> +  implicit none
> +  INTEGER, EXTERNAL :: find_free_unit
> +
> +
> +
> +  INTEGER :: iunu, iw, is
> +  INTEGER :: idumm
> +  REAL(kind=DP), ALLOCATABLE :: rdummv(:)
> +  
> +  call start_clock('read_wannier_matrix')
> +  allocate(rdummv(nbnd))
> +
> +
> +  if(ionode) then
> +     iunu = find_free_unit()
> +     open(unit=iunu,file=trim(tmp_dir)//trim(prefix)//'.wannier',status='old',form='unformatted')
> +  
> +     read(iunu) idumm
> +     read(iunu) idumm
> +  endif
> +  do is=1,nspin
> +     if(ionode) then
> +        read(iunu) idumm
> +        read(iunu) rdummv(1:nbnd)
> +        read(iunu) rdummv(1:nbnd)
> +        read(iunu) rdummv(1:nbnd)
> +     endif
> + 
> +
> +     do iw=1,nbnd
> +        if(ionode) read(iunu) u_trans(1:nbnd,iw,is)
> +        call mp_bcast(u_trans(1:nbnd,iw,is),ionode_id, world_comm)       
> +     enddo
> +  enddo
> +  if(ionode) close(iunu)
> +  
> +  deallocate(rdummv)
> +
> +
> +  call stop_clock('read_wannier_matrix')
> +  return
> +end subroutine read_wannier_matrix
> 
> _______________________________________________
> Q-e-commits mailing list
> Q-e-commits at qe-forge.org
> http://qe-forge.org/mailman/listinfo/q-e-commits

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.quantum-espresso.org/pipermail/developers/attachments/20160324/ec58b584/attachment.html>