program CG
  ! Conjugate Gradient
  ! Solve the 1D heat equation
  ! Solve A T = 0, with A the matrix corresponding to the Laplacian
  !
  ! Heat equation: dT/dt = alpha d^2 T/dx^2
  ! When converged, dT/dt = 0 and the equation becomes: 0 = alpha d^2 T/dx^2 <--> 0 = d^2 T/dx^2
  ! Numerically, this means: (T(i+1)-2.0*T(i)+T(i-1)) * 1/(Dx*Dx) = 0 <--> A T = 0,
  ! with A a tridiagonal, symmetric, negative-definite matrix.
  ! The matrix looks like:
  !   -2/(dx*dx)  1/(dx*dx)          0          0          0          0  ...
  !    1/(dx*dx) -2/(dx*dx)  1/(dx*dx)          0          0          0  ...
  !            0  1/(dx*dx) -2/(dx*dx)  1/(dx*dx)          0          0  ...
  !            0          0  1/(dx*dx) -2/(dx*dx)  1/(dx*dx)          0  ...
  !            0          0          0  1/(dx*dx) -2/(dx*dx)  1/(dx*dx)  ...
  ! etc.

  use mpi

  implicit none

  integer, parameter :: ndim = 1
  real(8), dimension(1:ndim) :: Lx, Dx, InvDxDx
  integer, dimension(1:ndim) :: Nx, Global_Nx
  real(8) :: beta, rho, rho0, gam, sum_rho, sum_gam, max_Residue
  integer :: n, i
  real(8), allocatable, dimension(:) :: T, Residue, p, q
  integer :: rank, nb_mpi_processes, ierror, tag, statu(MPI_STATUS_SIZE), niter
  real(8), dimension(:), allocatable :: Field, Field_buff
  character(len=4) :: charbuff

  integer, dimension(1) :: nb_process_axe     ! number of processes per axis of the Cartesian topology
  logical, dimension(1) :: cart_boundaries    ! type of boundaries: .true. -> periodic, .false. -> non-periodic
  integer :: MPI_COMM_CART                    ! our new communicator
  integer, dimension(1) :: cart_position, pos ! position of the process in the Cartesian topology (pos is a buffer)
  integer, dimension(-1:1) :: cart_neigh      ! neighbour ranks in the Cartesian topology

  tag = 7777

  call MPI_INIT( ierror )
  call MPI_COMM_SIZE( MPI_COMM_WORLD , nb_mpi_processes , ierror )
  call MPI_COMM_RANK( MPI_COMM_WORLD , rank , ierror )

  if(nb_mpi_processes /= 4) stop 'This program is designed to be run with 4 processes only'

  nb_process_axe(1) = 4
  cart_boundaries(1) = .false.

  call MPI_CART_CREATE( MPI_COMM_WORLD , ndim , nb_process_axe(1:ndim) , &
      & cart_boundaries(1:ndim) , .true. , MPI_COMM_CART , ierror )
  call MPI_CART_COORDS( MPI_COMM_CART , rank , ndim , cart_position(1:ndim) , ierror )

  ! Starting from 2009, MPI no longer returns MPI_PROC_NULL when MPI_CART_RANK is called
  ! with a position outside a non-periodic domain. You have to handle it yourself, which
  ! removes one of the greatest aspects of this type of communicator...
  do i=-1,1
    if( (cart_boundaries(1) .eqv. .false.) .AND. &
      & ( (cart_position(1) == 0 .AND. i == -1) .OR. &
      &   (cart_position(1) == nb_process_axe(1)-1 .AND. i == 1) ) ) then
      cart_neigh(i) = MPI_PROC_NULL
    else
      pos(1) = cart_position(1) + i
      call MPI_CART_RANK( MPI_COMM_CART , pos(1:ndim) , cart_neigh(i) , ierror )
    end if
  end do

  ! Global variables
  Global_Nx(1) = 100
  Lx(:) = 0.1
  Dx(:) = Lx(:)/((Global_Nx(:)-1)*1.0)
  InvDxDx(:) = 1.0/(Dx(:)*Dx(:))

  ! Local variables
  if(modulo(Global_Nx(1),nb_mpi_processes) /= 0) stop "Error, the number of points cannot be divided by the number of processes. STOP."
  Nx(1) = Global_Nx(1)/nb_mpi_processes

  allocate(T(0:Nx(1)+1),Residue(0:Nx(1)+1),p(0:Nx(1)+1),q(0:Nx(1)+1))

  ! Initial solution
  T(1:Nx(1)) = 0.0
  if(rank == 0) T(0) = 10.0
  if(rank == nb_mpi_processes-1) T(Nx(1)+1) = -1.0

  ! Initialisation
  ! Recv left, send right
  call MPI_SENDRECV ( T(Nx(1)) , 1 , MPI_DOUBLE_PRECISION , cart_neigh(1) , tag , &
      & T(0) , 1 , MPI_DOUBLE_PRECISION , cart_neigh(-1) , tag , MPI_COMM_CART , statu , ierror )
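  ! Added note: the call above and the one below form the ghost-cell (halo) exchange.
  ! The first call sends this rank's last interior point to the right neighbour and fills
  ! the left ghost cell T(0) from the left neighbour; the second call does the mirror-image
  ! exchange for the right ghost cell T(Nx(1)+1). At the physical boundaries the neighbour
  ! is MPI_PROC_NULL, so the receive leaves the Dirichlet values set above untouched.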
  ! Recv right, send left
  call MPI_SENDRECV ( T(1) , 1 , MPI_DOUBLE_PRECISION , cart_neigh(-1) , tag , &
      & T(Nx(1)+1) , 1 , MPI_DOUBLE_PRECISION , cart_neigh(1) , tag , MPI_COMM_CART , statu , ierror )

  do i=1,Nx(1)
    Residue(i) = - (T(i+1)-2.0*T(i)+T(i-1)) * InvDxDx(1)
  end do

  p(:) = 0.0
  beta = 0.0

  rho = 0.0
  do i=1,Nx(1)
    rho = rho + Residue(i)*Residue(i)
  end do
  call MPI_ALLREDUCE ( rho , sum_rho , 1 , MPI_DOUBLE_PRECISION , MPI_SUM , MPI_COMM_WORLD , ierror )
  ! Prefer not to use the reduction buffers (sum_rho, sum_gam, etc.) directly in the
  ! calculations, to avoid mistakes. Copying them back costs nothing and is safer.
  rho = sum_rho

  ! Iterations
  n = 0
  do
    n = n + 1

    do i=1,Nx(1)
      p(i) = Residue(i) + beta * p(i)
    end do

    ! Matrix operation: q = A p
    ! Recv left, send right
    call MPI_SENDRECV ( p(Nx(1)) , 1 , MPI_DOUBLE_PRECISION , cart_neigh(1) , tag , &
        & p(0) , 1 , MPI_DOUBLE_PRECISION , cart_neigh(-1) , tag , MPI_COMM_CART , statu , ierror )
    ! Recv right, send left
    call MPI_SENDRECV ( p(1) , 1 , MPI_DOUBLE_PRECISION , cart_neigh(-1) , tag , &
        & p(Nx(1)+1) , 1 , MPI_DOUBLE_PRECISION , cart_neigh(1) , tag , MPI_COMM_CART , statu , ierror )
    do i=1,Nx(1)
      q(i) = (p(i+1)-2.0*p(i)+p(i-1)) * InvDxDx(1)
    end do

    gam = 0.0
    do i=1,Nx(1)
      gam = gam + p(i)*q(i)
    end do
    call MPI_ALLREDUCE ( gam , sum_gam , 1 , MPI_DOUBLE_PRECISION , MPI_SUM , MPI_COMM_WORLD , ierror )
    gam = sum_gam
    gam = rho / gam

    do i=1,Nx(1)
      T(i) = T(i) + gam * p(i)
    end do

    do i=1,Nx(1)
      Residue(i) = Residue(i) - gam * q(i)
    end do

    ! Test whether the residue is < 1.e-7, i.e. the solution has converged
    call MPI_ALLREDUCE ( maxval(abs(Residue(1:Nx(1)))) , max_Residue , 1 , MPI_DOUBLE_PRECISION , MPI_MAX , MPI_COMM_WORLD , ierror )
    if(max_Residue < 1e-7) exit ! Converged solution

    if(rank==0) write(*,*) "CG step ",n," Residue ",max_Residue

    rho0 = rho ! Save the old rho value
    rho = 0.0
    do i=1,Nx(1)
      rho = rho + Residue(i)*Residue(i)
    end do
    call MPI_ALLREDUCE ( rho , sum_rho , 1 , MPI_DOUBLE_PRECISION , MPI_SUM , MPI_COMM_WORLD , ierror )
    rho = sum_rho

    beta = rho / rho0
  end do

  write(*,*) "Solution converged in ", n ," iterations, with a residue of ", max_Residue

  ! Solution output
  ! Last write: each rank dumps its local part of the global solution
  write(charbuff,'(I4.4)') rank
  open(10,file=trim(adjustl('OUT-CG'//'.'//trim(adjustl(charbuff))//'.dat')))
  do i=1,Nx(1)
    write(10,*) i+(rank*Nx(1)),T(i)
  end do
  close(10)

  deallocate(T,Residue,p,q)

  call MPI_FINALIZE( ierror )

end program CG
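
! Build-and-run sketch (assuming a standard MPI toolchain; the wrapper and launcher names,
! and the source file name cg.f90, are illustrative and may differ on your system):
!   mpif90 -O2 -o cg cg.f90
!   mpirun -np 4 ./cg
! The program stops unless exactly 4 MPI processes are used; each rank writes its part of
! the converged temperature profile to OUT-CG.<rank>.dat.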