LCOV - code coverage report
Current view: top level - mpi - setupMPI.F90 (source / functions)
Test: FLEUR test coverage
Date: 2024-04-25 04:21:55
Coverage:   Lines: 72 / 146 (49.3 %)   Functions: 3 / 6 (50.0 %)

          Line data    Source code
       1             : !--------------------------------------------------------------------------------
       2             : ! Copyright (c) 2016 Peter Grünberg Institut, Forschungszentrum Jülich, Germany
       3             : ! This file is part of FLEUR and available as free software under the conditions
       4             : ! of the MIT license as expressed in the LICENSE file in more detail.
       5             : !--------------------------------------------------------------------------------
       6             : 
       7             : MODULE m_setupMPI
       8             : #ifdef CPP_MPI
       9             :   use mpi
      10             : #endif
      11             :   use m_juDFT
      12             :   IMPLICIT NONE
      13             : 
      14             : CONTAINS
      15         160 :   SUBROUTINE setupMPI(nkpt,neigd,nbasfcn,fmpi)
      16             : !$  use omp_lib
      17             : 
      18             :     use m_omp_checker
      19             :     USE m_types
      20             :     USE m_available_solvers,ONLY:parallel_solver_available,print_solver
      21             :     INTEGER,INTENT(in)           :: nkpt,neigd,nbasfcn
      22             :     TYPE(t_mpi),INTENT(inout)    :: fmpi
      23             : 
      24             :     INTEGER :: omp=-1,i,isize,localrank,gpus,ii, me, nk,ierr
      25             :     logical :: finished
      26             : 
      27         160 :     TYPE(t_log_message) :: log
      28             :     
      29             : #ifdef CPP_MPI
      30         160 :     CALL juDFT_COMM_SPLIT_TYPE(fmpi%mpi_comm,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,fmpi%mpi_comm_same_node)
      31             : #endif
      32         160 :     call omp_checker()
      33         160 :     !$ omp=omp_get_max_threads()
      34         160 :     if (fmpi%irank==0) THEN
      35             :        !print INFO on parallelization
      36          80 :        WRITE(*,*) "------------Calculation Setup---------------------------"
      37             : #ifdef CPP_MPI
      38          80 :        write(*,*) "Number of MPI-tasks  : ",fmpi%isize
      39          80 :        CALL MPI_COMM_SIZE(fmpi%mpi_comm_same_node,isize,i)
      40          80 :        write(*,*) "Number of PE/node    : ",isize
      41          80 :        CALL add_usage_data("MPI-PE",fmpi%isize)
      42          80 :        call log%add("MPI-Ranks",int2str(fmpi%isize))
      43             : #else
      44             :        CALL add_usage_data("MPI-PE",1)
      45             :        call log%add("MPI-Ranks","noMPI")
      46             : #endif
      47          80 :        IF (omp==-1) THEN
      48           0 :          WRITE(*,*) "Number of OMP-threads:            No OpenMP"
      49           0 :           CALL add_usage_data("OMP",0)
      50           0 :           call log%add("OMP","NoOpenMP")
      51             :        ELSE
      52          80 :          WRITE(*,*) "Number of OMP-threads: ",omp
      53          80 :           call log%add("OMP-Tasks",int2str(omp))
      54          80 :           IF(omp.EQ.1.AND.fmpi%isize.GE.6.AND.&
      55             :              ABS(NINT(REAL(nkpt)/REAL(fmpi%isize))*fmpi%isize-nkpt).GT.1.0e-7) THEN
      56           0 :              WRITE(*,*) ''
      57           0 :              WRITE(*,*) '========================================'
      58           0 :              WRITE(*,*) 'WARNING:'
      59           0 :              WRITE(*,*) 'You are making use of multiple MPI processes but no OpenMP parallelization.'
      60           0 :              WRITE(*,*) 'The chosen parallelization scheme is also not a pure k-point parallelization.'
      61           0 :              WRITE(*,*) 'The performance of your calculation may benefit from also employing some'
      62           0 :              WRITE(*,*) 'OpenMP parallelization.'
      63           0 :              WRITE(*,*) ''
      64           0 :              WRITE(*,*) 'Fleur will proceed with the calculation.'
      65           0 :              WRITE(*,*) '========================================'
      66           0 :              WRITE(*,*) ''
      67             :           END IF
      68             : 
      69          80 :           CALL add_usage_data("OMP",omp)
      70             :        ENDIF
      71             :     endif
      72             :     call priv_distribute_gpu(fmpi,log)
      73         160 :     call log%report(logmode_info)
      74         160 :     IF (fmpi%isize==1) THEN
      75             :        !give some info on available parallelisation
      76           0 :        CALL priv_dist_info(nkpt)
      77           0 :        fmpi%n_rank=0
      78           0 :        fmpi%n_size=1
      79           0 :        fmpi%sub_comm=fmpi%mpi_comm
      80           0 :        fmpi%diag_sub_comm=fmpi%mpi_comm
      81           0 :        IF (ALLOCATED(fmpi%k_list)) DEALLOCATE(fmpi%k_List)
      82           0 :        IF (ALLOCATED(fmpi%ev_list)) DEALLOCATE(fmpi%ev_list)
      83           0 :        ALLOCATE(fmpi%ev_list(neigd))
      84           0 :        ALLOCATE(fmpi%k_list(nkpt))
      85           0 :        fmpi%k_list=[(i,i=1,nkpt)]
      86           0 :        fmpi%coulomb_owner=[(0,i=1,nkpt)]
      87           0 :        fmpi%ev_list=[(i,i=1,neigd)]
      88           0 :        WRITE(*,*) "--------------------------------------------------------"
      89           0 :        RETURN
      90             :     END IF
      91             : #ifdef CPP_MPI
      92             :     !Distribute the work
      93         160 :     CALL priv_distribute_k(nkpt,nbasfcn,fmpi)
      94             :     !generate the MPI communicators
      95         160 :     CALL priv_create_comm(nkpt,neigd,fmpi)
      96             :     !Now check if parallelization is possible
      97         160 :     IF (fmpi%n_size>1) THEN
      98          92 :       if (judft_was_argument("-serial_diag")) THEN
      99         160 :         call priv_redist_for_diag(fmpi)
     100          92 :       else if (.NOT.parallel_solver_available()) then
     101           0 :         call juDFT_error("MPI parallelization failed",hint="You have to either compile FLEUR with a parallel diagonalization library (ELPA,SCALAPACK...) or run with a number of processes such that the k-points can be distributed over the PEs")
     102             :       endif
     103             :     endif
     104             : #endif
     105         160 :     if (fmpi%irank==0) call print_solver(fmpi%n_size>0)
     106             : 
     107        2432 :     ALLOCATE(fmpi%k_list(SIZE([(i, i=INT(fmpi%irank/fmpi%n_size)+1,nkpt,fmpi%isize/fmpi%n_size )])))
     108             :     ! this corresponds to the compact = .true. switch in priv_create_comm
     109        3168 :     fmpi%k_list=[(i, i=INT(fmpi%irank/fmpi%n_size)+1,nkpt,fmpi%isize/fmpi%n_size )]
     110             : 
     111         160 :     fmpi%max_length_k_list=size(fmpi%k_list)
     112             : #ifdef CPP_MPI    
     113         160 :     CALL MPI_ALLREDUCE(MPI_IN_PLACE,fmpi%max_length_k_list,1,MPI_INTEGER,MPI_MAX,fmpi%mpi_comm,ierr)
     114             : #endif
     115             :     ! create an array with the owners of the correct coulomb matrix
     116        1756 :     allocate(fmpi%coulomb_owner(nkpt), source=-1)
     117        1436 :     do nk =1,nkpt
     118             :       me = 0
     119             :       finished = .False.
     120        3092 :       do while(.not. finished)
     121       61232 :          if(any(nk == [(i, i=INT(me/fmpi%n_size)+1,nkpt,fmpi%isize/fmpi%n_size)] )) then
     122        1276 :             fmpi%coulomb_owner(nk) = me
     123        1276 :             finished = .True.
     124             :          endif
     125        1656 :          me = me + 1
     126        2932 :          if(me > fmpi%isize .and. .not. finished) then
     127           0 :             call judft_error("somehow I cannot locate this k-point")
     128             :          endif
     129             :       enddo
     130             :    enddo
     131             : 
     132             : 
     133         160 :     call fmpi%set_errhandler()
     134         160 :     if (fmpi%irank==0) WRITE(*,*) "--------------------------------------------------------"
     135             : 
     136         320 :   END SUBROUTINE setupMPI
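  !----------------------------------------------------------------------------------
  ! Editor's note: the block below is an illustrative sketch, not part of the FLEUR
  ! sources and not called anywhere. The subroutine name and the example values
  ! (nkpt=8, isize=6, n_size=3, taken from the comments in priv_distribute_k) are
  ! hypothetical. It mirrors the implied-do loops used in setupMPI above to build
  ! fmpi%k_list and fmpi%coulomb_owner, so the resulting distribution can be inspected.
  !----------------------------------------------------------------------------------
  SUBROUTINE sketch_k_list_distribution()
    INTEGER, PARAMETER :: nkpt=8, isize=6, n_size=3
    INTEGER :: irank, nk, me, i
    INTEGER, ALLOCATABLE :: k_list(:)
    INTEGER :: coulomb_owner(nkpt)

    DO irank = 0, isize-1
       ! same compact stride as in setupMPI: start at the k-point group of this
       ! rank and step by the number of k-point groups (isize/n_size)
       k_list = [(i, i=INT(irank/n_size)+1, nkpt, isize/n_size)]
       WRITE(*,*) "irank", irank, " k_list:", k_list   ! e.g. irank 0 -> 1 3 5 7
    END DO

    ! owner of the Coulomb matrix of k-point nk: the lowest rank whose k_list
    ! contains nk (same search as the do-while loop in setupMPI)
    DO nk = 1, nkpt
       DO me = 0, isize-1
          IF (ANY(nk == [(i, i=INT(me/n_size)+1, nkpt, isize/n_size)])) THEN
             coulomb_owner(nk) = me
             EXIT
          END IF
       END DO
    END DO
    WRITE(*,*) "coulomb_owner:", coulomb_owner   ! e.g. k-point 2 is owned by rank 3
  END SUBROUTINE sketch_k_list_distribution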
     137             : 
     138             : 
     139         160 :   SUBROUTINE priv_distribute_k(nkpt,nbasfcn,fmpi)
     140             :     use m_types
     141             :     implicit none
     142             :     INTEGER,INTENT(in)      :: nkpt, nbasfcn
     143             :     TYPE(t_mpi),INTENT(inout)    :: fmpi
     144             : 
     145             :     !-------------------------------------------------------------------------------------------
     146             :     !
     147             :     ! Distribute the k-point / eigenvector parallelisation so that
     148             :     ! all PEs have approximately equal load. Maximize the k-point
     149             :     ! parallelisation. The naming conventions are as follows:
     150             :     !
     151             :     ! groups             1               2               3             4      (n_groups = 4)
     152             :     !                 /     \         /     \          /   \         /   \
     153             :     ! k-points:      1       2       3       4       5       6      7     8     (nkpts = 8)
     154             :     !               /|\     /|\     /|\     /|\     /|\     /|\    /|\   /|\
     155             :     ! irank        0 1 2   3 4 5   0 1 2   3 4 5   0 1 2   3 4 5  0 1 2  3 4 5  (fmpi%isize = 6)
     156             :     !
     157             :     ! n_rank       0 1 2   0 1 2   0 1 2   0 1 2   0 1 2   0 1 2  0 1 2  0 1 2  (fmpi%n_size = 3)
     158             :     !
     159             :     !
     160             :     ! In the above example, 6 PEs should work on 8 k-points and distribute
     161             :     ! their load such that 3 PEs work on each k-point, so 2 k-points
     162             :     ! are done in parallel (n_members=2) and each processor runs a loop over
     163             :     ! 4 k-points (fmpi%n_groups = 4).
     164             :     ! n_rank and n_size are the equivalents of irank and isize. The former
     165             :     ! belong to the communicator SUB_COMM, the latter to MPI_COMM.
     166             :     !
     167             :     !          G.B. `99
     168             :     !
     169             :     !-------------------------------------------------------------------------------------------
     170             :     INTEGER:: n_members,n_size_min,nk
     171             :     CHARACTER(len=1000)::txt
     172             : 
     173         160 :     IF (judft_was_argument("-pe_per_kpt")) THEN
     174           0 :       txt=judft_string_for_argument("-pe_per_kpt")
     175           0 :       READ(txt,*) fmpi%n_size
     176           0 :       WRITE(*,*) "Using exactly ",fmpi%n_size," PE per kpt"
     177           0 :       if (mod(fmpi%isize,fmpi%n_size).ne.0) call judft_error("Parallelization error",&
     178           0 :            hint="If you specify -pe_per_kpt, the total number of processes needs to be a multiple of this value.")
     179           0 :       n_members=fmpi%isize/fmpi%n_size     
     180             :     ELSE
     181         160 :       n_members = MIN(nkpt,fmpi%isize)
     182         160 :       IF (judft_was_argument("-min_pe_per_kpt")) THEN
     183           0 :          txt=judft_string_for_argument("-min_pe_per_kpt")
     184           0 :          READ(txt,*) n_size_min
     185           0 :          WRITE(*,*) "Trying to use ",n_size_min," PE per kpt"
     186           0 :          n_members = MIN(n_members , CEILING(REAL(fmpi%isize)/n_size_min) )
     187             :       ENDIF
     188          56 :       DO
     189         216 :          IF ((MOD(fmpi%isize,n_members) == 0).AND.(MOD(nkpt,n_members) == 0) ) EXIT
     190          56 :          n_members = n_members - 1
     191             :       ENDDO
     192             :     endif  
     193             : 
     194             :     !fmpi%n_groups = nkpt/n_members
     195         160 :     fmpi%n_size   = fmpi%isize/n_members
     196             :     !fmpi%n_stride = n_members
     197         160 :     IF (fmpi%irank == 0) THEN
     198          80 :        WRITE(*,*) 'k-points in parallel : ',n_members
     199          80 :        WRITE(*,*) "pe's per k-point     : ",fmpi%n_size
     200          80 :        WRITE(*,*) 'No of k-point loops  : ',nkpt/n_members
     201          80 :        if (mod(nkpt,n_members).ne.0) then
     202           0 :          Write(*,*) 'Info/Warning         : your k-point parallelism is not fully load-balanced'
     203             :        endif
     204             : 
     205          80 :        IF((REAL(nbasfcn) / REAL(fmpi%n_size)).LE.20) THEN
     206           0 :           WRITE(*,*) ''
     207           0 :           WRITE(*,*) '========================================'
     208           0 :           WRITE(*,*) 'WARNING:'
     209           0 :           WRITE(*,*) 'The chosen parallelization scheme implies very few LAPW basis functions'
     210           0 :           WRITE(*,*) 'per MPI process. This may lead to poor performance and other problems.'
     211           0 :           IF((nkpt/n_members).GE.4*fmpi%isize) THEN
     212           0 :              WRITE(*,*) ''
     213           0 :              WRITE(*,*) 'You may want to adjust the number of MPI processes such that the k-point'
     214           0 :              WRITE(*,*) 'parallelization is increased.'
     215           0 :              WRITE(*,*) ''
     216             :           END IF
     217           0 :           WRITE(*,*) 'Fleur will proceed with the calculation.'
     218           0 :           WRITE(*,*) '========================================'
     219           0 :           WRITE(*,*) ''
     220             :        END IF
     221             : 
     222             :     ENDIF
     223         160 :   END SUBROUTINE priv_distribute_k
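  !----------------------------------------------------------------------------------
  ! Editor's note: illustrative sketch only (hypothetical name, not part of FLEUR and
  ! not called anywhere). It reproduces the default choice made in priv_distribute_k
  ! above: n_members is the largest number of k-point groups that divides both the
  ! number of MPI processes and the number of k-points.
  !----------------------------------------------------------------------------------
  SUBROUTINE sketch_choose_n_members(nkpt, isize)
    INTEGER, INTENT(in) :: nkpt, isize
    INTEGER :: n_members, n_size

    n_members = MIN(nkpt, isize)
    DO
       IF (MOD(isize, n_members) == 0 .AND. MOD(nkpt, n_members) == 0) EXIT
       n_members = n_members - 1
    END DO
    n_size = isize/n_members

    ! e.g. nkpt=8, isize=6 gives n_members=2 k-points in parallel and n_size=3
    ! PEs per k-point, matching the diagram in priv_distribute_k
    WRITE(*,*) "k-points in parallel:", n_members, " PEs per k-point:", n_size
  END SUBROUTINE sketch_choose_n_members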
     224             : 
     225         160 :   SUBROUTINE priv_create_comm(nkpt,neigd,fmpi)
     226             :     use m_types
     227             :     implicit none
     228             :     INTEGER,INTENT(in)      :: nkpt,neigd
     229             :     TYPE(t_mpi),INTENT(inout)    :: fmpi
     230             : #ifdef CPP_MPI
     231             :     INTEGER :: n_members,n,i,ierr,sub_group,world_group,n_start
     232         160 :     INTEGER :: i_mygroup(fmpi%n_size)
     233             :     LOGICAL :: compact ! Decide how to distribute k-points
     234             : 
     235         160 :     compact = .true.
     236         160 :     n_members = fmpi%isize/fmpi%n_size
     237             : 
     238             :     ! now, we make the groups
     239             : 
     240             : 
     241             :     IF (compact) THEN
     242             : 
     243             :         ! This will distribute sub ranks in a compact manner.
     244             :         ! For example, if nkpt = 8 and fmpi%isize = 6:
     245             : 
     246             :         !  -----------------------------------
     247             :         ! |  0  |  1  |  2  |  3  |  4  |  5  |    fmpi%irank
     248             :         !  -----------------------------------
     249             :         ! |  0  |  1  |  2  |  0  |  1  |  2  |    fmpi%n_rank
     250             :         !  -----------------------------------
     251             :         ! |        1        |        2        |    k - points
     252             :         ! |        3        |        4        |
     253             :         ! |        5        |        6        |
     254             :         ! |        7        |        8        |
     255             :         !  -----------------------------------
     256             : 
     257         160 :         n_start = INT(fmpi%irank/fmpi%n_size) + 1
     258         160 :         i_mygroup(1) = (n_start-1) * fmpi%n_size
     259         252 :         do i = 2, fmpi%n_size
     260         252 :            i_mygroup(i) = i_mygroup(i-1) + 1
     261             :         enddo
     262             : 
     263             :     ELSE
     264             : 
     265             :         ! This will distribute sub ranks in a spread manner.
     266             :         ! For example, if nkpt = 8 and fmpi%isize = 6:
     267             : 
     268             :         !  -----------------------------------
     269             :         ! |  0  |  1  |  2  |  3  |  4  |  5  |    fmpi%irank
     270             :         !  -----------------------------------
     271             :         ! |  0  |  0  |  1  |  1  |  2  |  2  |    fmpi%n_rank
     272             :         !  -----------------------------------
     273             :         ! |  1  |  2  |  1  |  2  |  1  |  2  |    k - points
     274             :         ! |  3  |  4  |  3  |  4  |  3  |  4  |
     275             :         ! |  5  |  6  |  5  |  6  |  5  |  6  |
     276             :         ! |  7  |  8  |  7  |  8  |  7  |  8  |
     277             :         !  -----------------------------------
     278             : 
     279             :         n_start = MOD(fmpi%irank,n_members) + 1
     280             :         !!      n_start = INT(irank/n_size) * n_size
     281             :         n = 0
     282             :         DO i = n_start,fmpi%isize,n_members
     283             :         !!      DO i = n_start+1,n_start+n_size
     284             :            n = n+1
     285             :            i_mygroup(n) = i-1
     286             :         ENDDO
     287             : 
     288             :     ENDIF ! compact
     289             : 
     290         160 :     CALL MPI_COMM_GROUP (fmpi%MPI_COMM,WORLD_GROUP,ierr)
     291         160 :     CALL MPI_GROUP_INCL (WORLD_GROUP,fmpi%n_size,i_mygroup,SUB_GROUP,ierr)
     292         160 :     CALL MPI_COMM_CREATE (fmpi%MPI_COMM,SUB_GROUP,fmpi%SUB_COMM,ierr)
     293             :     !write (*,"(a,i0,100i4)") "MPI:",fmpi%sub_comm,fmpi%irank,fmpi%n_groups,fmpi%n_size,n,i_mygroup
     294             : 
     295         160 :     CALL MPI_COMM_RANK (fmpi%SUB_COMM,fmpi%n_rank,ierr)
     296         480 :     ALLOCATE(fmpi%ev_list(neigd/fmpi%n_size+1))
     297       16285 :     fmpi%ev_list=[(i,i=fmpi%n_rank+1,neigd,fmpi%n_size)]
     298         160 :     fmpi%diag_sub_comm=fmpi%sub_comm !default both are the same...
     299             : #endif
     300         160 :   END SUBROUTINE priv_create_comm
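  !----------------------------------------------------------------------------------
  ! Editor's note: illustrative sketch only (hypothetical name, not part of FLEUR and
  ! not called anywhere). It lists, for each global rank, the members of its compact
  ! sub-communicator group as constructed in priv_create_comm above, together with
  ! the round-robin eigenvalue list; neigd=10 is an assumed example value.
  !----------------------------------------------------------------------------------
  SUBROUTINE sketch_compact_groups(isize, n_size)
    INTEGER, INTENT(in) :: isize, n_size
    INTEGER, PARAMETER :: neigd = 10
    INTEGER :: irank, n_rank, i
    INTEGER :: i_mygroup(n_size)
    INTEGER, ALLOCATABLE :: ev_list(:)

    DO irank = 0, isize-1
       ! lowest global rank of the k-point group this rank belongs to
       i_mygroup(1) = INT(irank/n_size)*n_size
       DO i = 2, n_size
          i_mygroup(i) = i_mygroup(i-1) + 1
       END DO
       ! with consecutive group members, MPI_COMM_RANK in the sub-communicator
       ! returns the offset within the group
       n_rank = irank - i_mygroup(1)
       ev_list = [(i, i=n_rank+1, neigd, n_size)]
       WRITE(*,*) "irank", irank, " group:", i_mygroup, " ev_list:", ev_list
    END DO
  END SUBROUTINE sketch_compact_groups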
     301             : 
     302           0 :   SUBROUTINE priv_dist_info(nkpt)
     303             :     USE m_available_solvers,ONLY:parallel_solver_available
     304             :     IMPLICIT NONE
     305             :     INTEGER,INTENT(in)           :: nkpt
     306             : 
     307           0 :     INTEGER:: n,k_only,pe_k_only(nkpt)
     308             : 
     309             : #ifdef CPP_MPI
     310             :     !Create a list of PE that will lead to k-point parallelization only
     311           0 :     k_only=0
     312           0 :     DO n=1,nkpt
     313           0 :        IF (MOD(nkpt,n)==0) THEN
     314           0 :           k_only=k_only+1
     315           0 :           pe_k_only(k_only)=n
     316             :        ENDIF
     317             :     END DO
     318           0 :     WRITE(*,*) "Most efficient MPI parallelization for:"
     319           0 :     WRITE(*,*) pe_k_only(:k_only)
     320             :     !check if eigenvalue parallelization is possible
     321           0 :     IF (parallel_solver_available()) WRITE(*,*) "Additional eigenvalue parallelization possible"
     322             : #endif
     323           0 :   END SUBROUTINE priv_dist_info
     324             : 
     325           0 :   subroutine priv_redist_for_diag(fmpi)
     326             :     use m_types_mpi
     327             :     type(t_mpi),intent(inout):: fmpi
     328             : #ifdef CPP_MPI
     329           0 :     IF (fmpi%n_rank==0) THEN
     330           0 :       fmpi%diag_sub_comm=MPI_COMM_SELF
     331           0 :       fmpi%pe_diag=.true. !actually the default
     332             :     ELSE
     333           0 :       fmpi%diag_sub_comm=MPI_COMM_NULL
     334           0 :       fmpi%pe_diag=.false.
     335             :     ENDIF
     336             : #endif
     337           0 :     end
     338             : 
     339           0 :    subroutine priv_distribute_gpu(fmpi,log)
     340             : #ifdef _OPENACC
     341             :    use openacc
     342             : #endif    
     343             :     use m_types_mpi
     344             :     type(t_mpi),intent(in):: fmpi
     345             :     type(t_log_message),INTENT(INOUT):: log
     346             :     INTEGER :: i, isize, gpus,localrank
     347             : #ifdef _OPENACC
     348             :     call timestart("Distribute GPUs")
     349             :     gpus=acc_get_num_devices(acc_device_nvidia)
     350             :     
     351             :     call log%add("No GPUs",int2str(gpus))
     352             : #ifdef CPP_MPI
     353             :     if (fmpi%irank==0) write(*,*) "Number of GPU per node/MPI:",gpus
     354             :     CALL MPI_COMM_SIZE(fmpi%mpi_comm_same_node,isize,i)
     355             :     if (isize>1) THEN
     356             :       if (fmpi%irank==0) write(*,*) "Number of MPI/PE per node:",isize
     357             :       if (gpus>isize) call judft_warn("You are using more GPUs per node than MPI PEs per node. This will underutilize the GPUs")
     358             :       CALL MPI_COMM_RANK(fmpi%mpi_comm_same_node,localrank,i)
     359             :       if (gpus>1) THEN 
     360             :          call acc_set_device_num(mod(localrank,gpus),acc_device_nvidia)
     361             :          write(*,*) "Assigning PE:",fmpi%irank," to local GPU:",mod(localrank,gpus)
     362             :       endif   
     363             :     ENDIF
     364             : #else
     365             :     write(*,*) "Number of GPU    :",gpus
     366             : #endif
     367             :    call timestop("Distribute GPUs")
     368             : #endif
     369           0 :     end subroutine
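   !---------------------------------------------------------------------------------
   ! Editor's note: illustrative sketch only (hypothetical name, not part of FLEUR and
   ! not called anywhere). It shows the round-robin mapping of node-local MPI ranks to
   ! GPUs that priv_distribute_gpu applies via acc_set_device_num, without needing
   ! OpenACC or MPI.
   !---------------------------------------------------------------------------------
   subroutine sketch_gpu_assignment(ranks_per_node, gpus)
     integer, intent(in) :: ranks_per_node, gpus
     integer :: localrank

     if (gpus < 1) return   ! nothing to assign without a GPU
     do localrank = 0, ranks_per_node-1
        ! same expression as mod(localrank,gpus) in priv_distribute_gpu
        write(*,*) "local rank", localrank, " -> GPU", mod(localrank, gpus)
     end do
   end subroutine sketch_gpu_assignment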
     370             : 
     371             : END MODULE m_setupMPI

Generated by: LCOV version 1.14