!--------------------------------------------------------------------------------
! Copyright (c) 2016 Peter Grünberg Institut, Forschungszentrum Jülich, Germany
! This file is part of FLEUR and available as free software under the conditions
! of the MIT license as expressed in the LICENSE file in more detail.
!--------------------------------------------------------------------------------

MODULE m_setupMPI
#ifdef CPP_MPI
   use mpi
#endif
   use m_juDFT
   IMPLICIT NONE

CONTAINS
   SUBROUTINE setupMPI(nkpt,neigd,nbasfcn,fmpi)
      !$ use omp_lib

      use m_omp_checker
      USE m_types
      USE m_available_solvers,ONLY:parallel_solver_available,print_solver
      INTEGER,INTENT(in)        :: nkpt,neigd,nbasfcn
      TYPE(t_mpi),INTENT(inout) :: fmpi

      INTEGER :: omp=-1, i, isize, localrank, gpus, ii, me, nk, ierr
      logical :: finished

      TYPE(t_log_message) :: log

#ifdef CPP_MPI
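      ! Split the global communicator into per-node communicators:
      ! MPI_COMM_TYPE_SHARED puts all ranks that can share memory (i.e. run on
      ! the same node) into the same fmpi%mpi_comm_same_node.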
      CALL juDFT_COMM_SPLIT_TYPE(fmpi%mpi_comm,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,fmpi%mpi_comm_same_node)
#endif
      call omp_checker()
      !$ omp=omp_get_max_threads()
      if (fmpi%irank==0) THEN
         !print INFO on parallelization
         WRITE(*,*) "------------Calculation Setup---------------------------"
#ifdef CPP_MPI
         write(*,*) "Number of MPI-tasks  : ",fmpi%isize
         CALL MPI_COMM_SIZE(fmpi%mpi_comm_same_node,isize,i)
         write(*,*) "Number of PE/node    : ",isize
         CALL add_usage_data("MPI-PE",fmpi%isize)
         call log%add("MPI-Ranks",int2str(fmpi%isize))
#else
         CALL add_usage_data("MPI-PE",1)
         call log%add("MPI-Ranks","noMPI")
#endif
         IF (omp==-1) THEN
            WRITE(*,*) "Number of OMP-threads: No OpenMP"
            CALL add_usage_data("OMP",0)
            call log%add("OMP","NoOpenMP")
         ELSE
            WRITE(*,*) "Number of OMP-threads: ",omp
            call log%add("OMP-Tasks",int2str(omp))
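            ! The check below warns if the setup is neither hybrid nor pure
            ! k-point parallel: one OpenMP thread per rank, at least 6 MPI
            ! ranks, and nkpt not a multiple of isize (the NINT expression
            ! measures the distance of nkpt from the nearest multiple of isize).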
            IF(omp.EQ.1.AND.fmpi%isize.GE.6.AND.&
               ABS(NINT(REAL(nkpt)/REAL(fmpi%isize))*fmpi%isize-nkpt).GT.1.0e-7) THEN
               WRITE(*,*) ''
               WRITE(*,*) '========================================'
               WRITE(*,*) 'WARNING:'
               WRITE(*,*) 'You are making use of multiple MPI processes but no OpenMP parallelization.'
               WRITE(*,*) 'The chosen parallelization scheme is also not a pure k-point parallelization.'
               WRITE(*,*) 'The performance of your calculation may benefit from also employing some'
               WRITE(*,*) 'OpenMP parallelization.'
               WRITE(*,*) ''
               WRITE(*,*) 'Fleur will proceed with the calculation.'
               WRITE(*,*) '========================================'
               WRITE(*,*) ''
            END IF

            CALL add_usage_data("OMP",omp)
         ENDIF
      endif
      call priv_distribute_gpu(fmpi,log)
      call log%report(logmode_info)
      IF (fmpi%isize==1) THEN
         !give some info on available parallelisation
         CALL priv_dist_info(nkpt)
         fmpi%n_rank=0
         fmpi%n_size=1
         fmpi%sub_comm=fmpi%mpi_comm
         fmpi%diag_sub_comm=fmpi%mpi_comm
         IF (ALLOCATED(fmpi%k_list)) DEALLOCATE(fmpi%k_list)
         IF (ALLOCATED(fmpi%ev_list)) DEALLOCATE(fmpi%ev_list)
         ALLOCATE(fmpi%ev_list(neigd))
         ALLOCATE(fmpi%k_list(nkpt))
         fmpi%k_list=[(i,i=1,nkpt)]
         fmpi%coulomb_owner=[(0,i=1,nkpt)]
         fmpi%ev_list=[(i,i=1,neigd)]
         WRITE(*,*) "--------------------------------------------------------"
         RETURN
      END IF
#ifdef CPP_MPI
      !Distribute the work
      CALL priv_distribute_k(nkpt,nbasfcn,fmpi)
      !generate the MPI communicators
      CALL priv_create_comm(nkpt,neigd,fmpi)
      !Now check if parallelization is possible
      IF (fmpi%n_size>1) THEN
         if (judft_was_argument("-serial_diag")) THEN
            call priv_redist_for_diag(fmpi)
         else if (.NOT.parallel_solver_available()) then
            call juDFT_error("MPI parallelization failed",hint="Either compile FLEUR with a parallel diagonalization library (ELPA, SCALAPACK, ...) or choose a number of MPI processes such that the k-points can be distributed over the PEs.")
         endif
      endif
#endif
      if (fmpi%irank==0) call print_solver(fmpi%n_size>0)

      ALLOCATE(fmpi%k_list(SIZE([(i, i=INT(fmpi%irank/fmpi%n_size)+1,nkpt,fmpi%isize/fmpi%n_size )])))
      ! this corresponds to the compact = .true. switch in priv_create_comm
      fmpi%k_list=[(i, i=INT(fmpi%irank/fmpi%n_size)+1,nkpt,fmpi%isize/fmpi%n_size )]
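      ! Example (nkpt=8, isize=6, n_size=3): rank 4 belongs to the second
      ! k-point group, so the list starts at INT(4/3)+1 = 2 with stride
      ! isize/n_size = 2, giving k_list = [2,4,6,8].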

      fmpi%max_length_k_list=size(fmpi%k_list)
#ifdef CPP_MPI
      CALL MPI_ALLREDUCE(MPI_IN_PLACE,fmpi%max_length_k_list,1,MPI_INTEGER,MPI_MAX,fmpi%mpi_comm,ierr)
#endif
      ! create an array with the owners of the correct coulomb matrix
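      ! For each k-point, the owner is the lowest global rank whose (compact)
      ! k_list contains it, i.e. the first rank of the corresponding k-point
      ! group; e.g. for nkpt=8, isize=6, n_size=3 the owners are 0,3,0,3,...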
      allocate(fmpi%coulomb_owner(nkpt), source=-1)
      do nk = 1,nkpt
         me = 0
         finished = .False.
         do while(.not. finished)
            if(any(nk == [(i, i=INT(me/fmpi%n_size)+1,nkpt,fmpi%isize/fmpi%n_size)] )) then
               fmpi%coulomb_owner(nk) = me
               finished = .True.
            endif
            me = me + 1
            if(me > fmpi%isize .and. .not. finished) then
               call judft_error("Could not locate the owner of this k-point")
            endif
         enddo
      enddo

      call fmpi%set_errhandler()
      if (fmpi%irank==0) WRITE(*,*) "--------------------------------------------------------"

   END SUBROUTINE setupMPI

   SUBROUTINE priv_distribute_k(nkpt,nbasfcn,fmpi)
      use m_types
      implicit none
      INTEGER,INTENT(in)        :: nkpt, nbasfcn
      TYPE(t_mpi),INTENT(inout) :: fmpi

      !-------------------------------------------------------------------------------------------
      !
      ! Distribute the k-point / eigenvector parallelisation so that all pe's
      ! have approximately equal load. Maximize the k-point parallelisation.
      ! The naming conventions are as follows:
      !
      ! groups      1       2       3       4       (n_groups = 4)
      !            / \     / \     / \     / \
      ! k-points:  1   2   3   4   5   6   7   8    (nkpts = 8)
      !           /|\ /|\ /|\ /|\ /|\ /|\ /|\ /|\
      ! irank     012 345 012 345 012 345 012 345   (fmpi%isize = 6)
      !
      ! n_rank     012 012 012 012 012 012 012 012  (fmpi%n_size = 3)
      !
      !
      ! In the above example, 6 pe's should work on 8 k-points and distribute
      ! their load in a way that 3 pe's work on each k-point, so 2 k-points
      ! are done in parallel (n_members=2) and each processor runs a loop over
      ! 4 k-points (fmpi%n_groups = 4).
      ! n_rank and n_size are the equivalents of irank and isize. The former
      ! belong to the communicator SUB_COMM, the latter to MPI_COMM.
      !
      ! G.B. `99
      !
      !-------------------------------------------------------------------------------------------
      INTEGER             :: n_members,n_size_min,nk
      CHARACTER(len=1000) :: txt

      IF (judft_was_argument("-pe_per_kpt")) THEN
         txt=judft_string_for_argument("-pe_per_kpt")
         READ(txt,*) fmpi%n_size
         WRITE(*,*) "Using exactly ",fmpi%n_size," PE per kpt"
         if (mod(fmpi%isize,fmpi%n_size).ne.0) call judft_error("Parallelization error",&
            hint="If you specify -pe_per_kpt, the total number of MPI processes must be a multiple of that value.")
         n_members=fmpi%isize/fmpi%n_size
      ELSE
         n_members = MIN(nkpt,fmpi%isize)
         IF (judft_was_argument("-min_pe_per_kpt")) THEN
            txt=judft_string_for_argument("-min_pe_per_kpt")
            READ(txt,*) n_size_min
            WRITE(*,*) "Trying to use ",n_size_min," PE per kpt"
            n_members = MIN(n_members , CEILING(REAL(fmpi%isize)/n_size_min) )
         ENDIF
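         ! Decrease n_members until it evenly divides both isize and nkpt, so
         ! that all k-point groups have the same size and every group loops
         ! over the same number of k-points. A minimal trace for the example
         ! in the header (nkpt=8, isize=6):
         !    n_members: 6 -> 5 -> 4 -> 3 -> 2   (2 divides both 6 and 8)
         !    n_size = isize/n_members = 3       (PEs per k-point)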
         DO
            IF ((MOD(fmpi%isize,n_members) == 0).AND.(MOD(nkpt,n_members) == 0) ) EXIT
            n_members = n_members - 1
         ENDDO
      endif

      !fmpi%n_groups = nkpt/n_members
      fmpi%n_size = fmpi%isize/n_members
      !fmpi%n_stride = n_members
      IF (fmpi%irank == 0) THEN
         WRITE(*,*) 'k-points in parallel : ',n_members
         WRITE(*,*) "pe's per k-point     : ",fmpi%n_size
         WRITE(*,*) 'No of k-point loops  : ',nkpt/n_members
         if (mod(nkpt,n_members).ne.0) then
            Write(*,*) 'Info/Warning: your k-point parallelisation is not fully load-balanced'
         endif

         IF((REAL(nbasfcn) / REAL(fmpi%n_size)).LE.20) THEN
            WRITE(*,*) ''
            WRITE(*,*) '========================================'
            WRITE(*,*) 'WARNING:'
            WRITE(*,*) 'The chosen parallelization scheme implies very few LAPW basis functions'
            WRITE(*,*) 'per MPI process. This may lead to poor performance and other problems.'
            IF((nkpt/n_members).GE.4*fmpi%isize) THEN
               WRITE(*,*) ''
               WRITE(*,*) 'You may want to adjust the number of MPI processes such that the k-point'
               WRITE(*,*) 'parallelization is increased.'
               WRITE(*,*) ''
            END IF
            WRITE(*,*) 'Fleur will proceed with the calculation.'
            WRITE(*,*) '========================================'
            WRITE(*,*) ''
         END IF

      ENDIF
   END SUBROUTINE priv_distribute_k

   SUBROUTINE priv_create_comm(nkpt,neigd,fmpi)
      use m_types
      implicit none
      INTEGER,INTENT(in)        :: nkpt,neigd
      TYPE(t_mpi),INTENT(inout) :: fmpi
#ifdef CPP_MPI
      INTEGER :: n_members,n,i,ierr,sub_group,world_group,n_start
      INTEGER :: i_mygroup(fmpi%n_size)
      LOGICAL :: compact ! Decide how to distribute k-points

      compact = .true.
      n_members = fmpi%isize/fmpi%n_size

      ! now, we make the groups

      IF (compact) THEN

         ! This will distribute sub ranks in a compact manner.
         ! For example, if nkpt = 8 and fmpi%isize = 6:

         ! -------------------------------------
         ! |  0  |  1  |  2  |  3  |  4  |  5  |  fmpi%irank
         ! -------------------------------------
         ! |  0  |  1  |  2  |  0  |  1  |  2  |  fmpi%n_rank
         ! -------------------------------------
         ! |        1        |        2        |  k-points
         ! |        3        |        4        |
         ! |        5        |        6        |
         ! |        7        |        8        |
         ! -------------------------------------
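         ! E.g. for fmpi%irank = 4 with fmpi%n_size = 3: n_start = INT(4/3)+1 = 2,
         ! giving i_mygroup = (/3,4,5/), i.e. the second block of n_size
         ! consecutive global ranks.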

         n_start = INT(fmpi%irank/fmpi%n_size) + 1
         i_mygroup(1) = (n_start-1) * fmpi%n_size
         do i = 2, fmpi%n_size
            i_mygroup(i) = i_mygroup(i-1) + 1
         enddo

      ELSE

         ! This will distribute sub ranks in a spread manner.
         ! For example, if nkpt = 8 and fmpi%isize = 6:

         ! -------------------------------------
         ! |  0  |  1  |  2  |  3  |  4  |  5  |  fmpi%irank
         ! -------------------------------------
         ! |  0  |  0  |  1  |  1  |  2  |  2  |  fmpi%n_rank
         ! -------------------------------------
         ! |  1  |  2  |  1  |  2  |  1  |  2  |  k-points
         ! |  3  |  4  |  3  |  4  |  3  |  4  |
         ! |  5  |  6  |  5  |  6  |  5  |  6  |
         ! |  7  |  8  |  7  |  8  |  7  |  8  |
         ! -------------------------------------

         n_start = MOD(fmpi%irank,n_members) + 1
         !!       n_start = INT(irank/n_size) * n_size
         n = 0
         DO i = n_start,fmpi%isize,n_members
            !!       DO i = n_start+1,n_start+n_size
            n = n+1
            i_mygroup(n) = i-1
         ENDDO

      ENDIF ! compact

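      ! MPI_COMM_CREATE is collective over fmpi%MPI_COMM; since the per-k-point
      ! groups are disjoint, this single call creates all sub-communicators at
      ! once and each rank receives the SUB_COMM of its own group.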
      CALL MPI_COMM_GROUP (fmpi%MPI_COMM,WORLD_GROUP,ierr)
      CALL MPI_GROUP_INCL (WORLD_GROUP,fmpi%n_size,i_mygroup,SUB_GROUP,ierr)
      CALL MPI_COMM_CREATE (fmpi%MPI_COMM,SUB_GROUP,fmpi%SUB_COMM,ierr)
      !write (*,"(a,i0,100i4)") "MPI:",fmpi%sub_comm,fmpi%irank,fmpi%n_groups,fmpi%n_size,n,i_mygroup

      CALL MPI_COMM_RANK (fmpi%SUB_COMM,fmpi%n_rank,ierr)
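      ! The eigenvalues are distributed round-robin over the n_size ranks of
      ! SUB_COMM. Since ev_list is allocatable, the array assignment below
      ! resizes it to the actual list length (assuming standard F2003
      ! reallocation on assignment); the ALLOCATE only provides an upper bound.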
      ALLOCATE(fmpi%ev_list(neigd/fmpi%n_size+1))
      fmpi%ev_list=[(i,i=fmpi%n_rank+1,neigd,fmpi%n_size)]
      fmpi%diag_sub_comm=fmpi%sub_comm !default both are the same...
#endif
   END SUBROUTINE priv_create_comm

   SUBROUTINE priv_dist_info(nkpt)
      USE m_available_solvers,ONLY:parallel_solver_available
      IMPLICIT NONE
      INTEGER,INTENT(in) :: nkpt

      INTEGER :: n,k_only,pe_k_only(nkpt)

#ifdef CPP_MPI
      !Create a list of PE counts that lead to pure k-point parallelization
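      ! These PE counts are exactly the divisors of nkpt; e.g. for nkpt=8 the
      ! list printed below reads 1 2 4 8.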
      k_only=0
      DO n=1,nkpt
         IF (MOD(nkpt,n)==0) THEN
            k_only=k_only+1
            pe_k_only(k_only)=n
         ENDIF
      END DO
      WRITE(*,*) "Most efficient MPI parallelization for:"
      WRITE(*,*) pe_k_only(:k_only)
      !check if eigenvalue parallelization is possible
      IF (parallel_solver_available()) WRITE(*,*) "Additional eigenvalue parallelization possible"
#endif
   END SUBROUTINE priv_dist_info

   subroutine priv_redist_for_diag(fmpi)
      use m_types_mpi
      type(t_mpi),intent(inout):: fmpi
#ifdef CPP_MPI
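      ! With -serial_diag only the root of each k-point group takes part in
      ! the diagonalization: it gets the single-rank communicator
      ! MPI_COMM_SELF, while all other ranks get MPI_COMM_NULL.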
      IF (fmpi%n_rank==0) THEN
         fmpi%diag_sub_comm=MPI_COMM_SELF
         fmpi%pe_diag=.true. !actually the default
      ELSE
         fmpi%diag_sub_comm=MPI_COMM_NULL
         fmpi%pe_diag=.false.
      ENDIF
#endif
   end subroutine priv_redist_for_diag

   subroutine priv_distribute_gpu(fmpi,log)
#ifdef _OPENACC
      use openacc
#endif
      use m_types_mpi
      type(t_mpi),intent(in)            :: fmpi
      type(t_log_message),INTENT(INOUT) :: log
      INTEGER :: i, isize, gpus, localrank
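      ! Binding scheme: each MPI rank on a node is assigned one of the
      ! node-local GPUs round-robin, i.e. device number mod(localrank,gpus),
      ! where localrank is the rank within fmpi%mpi_comm_same_node.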
#ifdef _OPENACC
      call timestart("Distribute GPUs")
      gpus=acc_get_num_devices(acc_device_nvidia)

      call log%add("No GPUs",int2str(gpus))
#ifdef CPP_MPI
      if (fmpi%irank==0) write(*,*) "Number of GPUs per node:",gpus
      CALL MPI_COMM_SIZE(fmpi%mpi_comm_same_node,isize,i)
      if (isize>1) THEN
         if (fmpi%irank==0) write(*,*) "Number of MPI PEs per node:",isize
         if (gpus>isize) call judft_warn("You are running more GPUs per node than MPI PEs per node. This will underutilize the GPUs.")
         CALL MPI_COMM_RANK(fmpi%mpi_comm_same_node,localrank,i)
         if (gpus>1) THEN
            call acc_set_device_num(mod(localrank,gpus),acc_device_nvidia)
            write(*,*) "Assigning PE:",fmpi%irank," to local GPU:",mod(localrank,gpus)
         endif
      ENDIF
#else
      write(*,*) "Number of GPUs :",gpus
#endif
      call timestop("Distribute GPUs")
#endif
   end subroutine priv_distribute_gpu

END MODULE m_setupMPI