Line data Source code
1 : !--------------------------------------------------------------------------------
2 : ! Copyright (c) 2016 Peter Grünberg Institut, Forschungszentrum Jülich, Germany
3 : ! This file is part of FLEUR and available as free software under the conditions
4 : ! of the MIT license as expressed in the LICENSE file in more detail.
5 : !--------------------------------------------------------------------------------
6 :
7 : MODULE m_mpimakegroups
8 : use m_juDFT
9 : use mpi
10 : CONTAINS
11 0 : SUBROUTINE mpi_make_groups(&
12 : fmpi,kpts, input,atoms,noco,&
13 : mlotot,mlolotot,&
14 : n_start,n_groups,n,matsize,ne, n_rank,n_size,SUB_COMM)
15 : !------------------------------------------------------------------------
16 : !
17 : ! Distribute the k-point / eigenvector parallelisation so, that
18 : ! all pe's have approximately equal load. Maximize for k-point
19 : ! parallelisation. The naming conventions are as follows:
20 : !
21 : ! groups 1 2 (n_groups = 2)
22 : ! / \ / \
23 : ! k-points: 1 2 3 4 (nkpts = 4)
24 : ! /|\ /|\ /|\ /|\
25 : ! irank 01 2 34 5 01 2 34 5 (isize = 6)
26 : !
27 : ! n_rank 01 2 01 2 01 2 01 2 (n_size = 3)
28 : !
29 : ! nrec 12 3 45 6 78 9 1011 12 ...rec. no. on eig-file
30 : ! * * * * * * * *
31 : !
32 : ! In the above example, 6 pe's should work on 4 k-points and distribute
33 : ! their load in a way, that 3 pe's work on each k-points, so 2 k-points
34 : ! are done in parallel (n_members=2) and there are 2 groups of k-points.
35 : ! n_rank and n_size are the equivalents of irank and isize. The former
36 : ! belong to the communicator SUB_COMM, the latter to MPI_COMM.
37 : !
38 : ! The results (containing a subset of ew & ev's) are written as separate
39 : ! records on the file 'eig' with the vacuum energy parameter of the
40 : ! marked (*) records set to 999.9 to indicate that this is only one part
41 : ! of a k-point's results. This marker is needed in routines fermie and
42 : ! cdnval.
43 : ! G.B. `99
44 : !
45 : ! OUT: n_start  - 1-based index of the first k-point this rank works on
46 : !      n_groups - number of sequential k-point loops
47 : !      n        - number of matrix columns owned by this rank (cyclic distr.)
48 : !      matsize  - local storage size for the Hamiltonian/overlap matrix
49 : !      ne       - number of eigenvalues to compute (>= 5)
50 : !      n_rank/n_size - rank/size within the new communicator SUB_COMM
51 : !------------------------------------------------------------------------
46 : USE m_types
47 : IMPLICIT NONE
48 :
49 : TYPE(t_mpi),INTENT(IN) :: fmpi
50 :
51 : TYPE(t_input),INTENT(IN) :: input
52 : TYPE(t_noco),INTENT(IN) :: noco
53 : TYPE(t_kpts),INTENT(IN) :: kpts
54 : TYPE(t_atoms),INTENT(IN) :: atoms
55 : INTEGER, INTENT (IN) :: mlotot
56 : INTEGER, INTENT (IN) :: mlolotot
57 : INTEGER, INTENT (OUT) :: n_start,n_groups,n,SUB_COMM
58 : INTEGER, INTENT (OUT) :: matsize,n_rank,n_size,ne
59 :
60 : INTEGER i,n_members
61 0 : INTEGER, ALLOCATABLE :: i_mygroup(:) ! global ranks forming this PE's k-point group
62 :
63 : INTEGER WORLD_GROUP,SUB_GROUP
64 : INTEGER ierr(3) ! NOTE(review): only ierr(1) is ever used; return codes are not checked
65 : LOGICAL l_cm ! "common measure" found (n_members divides both isize and nkpt)
66 :
67 : !
68 : ! first determine the number of groups of k-points to process
69 : !
70 0 : n_groups = 0
71 0 : IF (kpts%nkpt.GT.fmpi%isize) THEN ! if there are more k-points than PEs
72 :
73 0 : IF (mod(kpts%nkpt,fmpi%isize).EQ.0) THEN ! maybe kpts%nkpt is a multiple of fmpi%isize
74 0 : n_groups = kpts%nkpt/fmpi%isize
75 0 : n_size = 1
76 : ELSE ! or an integer fraction of fmpi%isize fits
77 0 : DO i=2,fmpi%isize
78 0 : IF (mod(fmpi%isize,i).EQ.0) THEN
79 0 : n_size = i
80 0 : n_groups = kpts%nkpt * i/fmpi%isize
81 0 : IF (mod(kpts%nkpt,fmpi%isize/i).EQ.0) GOTO 990 ! valid split found -> leave search
82 : ENDIF
83 : ENDDO
84 0 : n_groups = kpts%nkpt ! or use all PE's per k-point
85 0 : n_size = fmpi%isize
86 : ENDIF
87 :
88 0 : ELSEIF (kpts%nkpt.LT.fmpi%isize) THEN ! if there are more PEs than k-points
89 :
90 0 : IF (mod(fmpi%isize,kpts%nkpt).EQ.0) THEN ! maybe fmpi%isize is a multiple of kpts%nkpt
91 0 : n_groups = 1
92 0 : n_size = fmpi%isize/kpts%nkpt
93 : ELSE ! or an integer fraction of kpts%nkpt fits
94 0 : DO i=kpts%nkpt-1,2,-1
95 0 : IF (mod(kpts%nkpt,i).EQ.0) THEN
96 0 : n_groups = kpts%nkpt/i
97 0 : n_size = fmpi%isize/i
98 0 : IF (mod(fmpi%isize,i).EQ.0) GOTO 990 ! valid split found -> leave search
99 : ENDIF
100 : ENDDO
101 0 : n_groups = kpts%nkpt ! or use all PE's per k-point
102 0 : n_size = fmpi%isize
103 : ENDIF
104 :
105 : ELSE
106 : !
107 : ! if there are as many pe's as k-points (isize = nkpt), use one PE per
108 : ! kpoint (n_size = 1) and finish in a single k-loop (n_groups = 1)
109 : !
110 0 : n_groups = 1
111 0 : n_size = 1
112 : ENDIF
113 :
114 :
115 : ! 990: common exit of the divisor searches above. n_groups==0 here is a
116 : ! defensive check; all branches above assign it before reaching this point.
116 0 : 990 IF (n_groups==0) CALL juDFT_error("mpi_make_groups:1",calledby ="mpi_make_groups")
117 : n_members = kpts%nkpt/n_groups
118 : !
119 : ! check different algorithm
120 : !
121 0 : CALL check_memory(input,atoms, mlotot,mlolotot,noco,kpts,fmpi, n_size)
122 :
123 0 : write(*,*) n_size ! NOTE(review): unconditional debug output on every rank; presumably leftover
124 0 : n_members = MIN(kpts%nkpt,fmpi%isize)
125 0 : n_members = MIN(n_members , CEILING(REAL(fmpi%isize)/n_size) ) + 1 ! +1: compensated by the decrement before the first test below
126 :
127 : ! search downward for the largest n_members that divides both isize and
128 : ! nkpt; terminates at the latest at n_members = 1, which always divides both.
127 0 : l_cm = .false.
128 : DO WHILE (.not.l_cm)
129 0 : n_members = n_members - 1
130 0 : IF ((mod(fmpi%isize,n_members) == 0).AND.&
131 : & (mod(kpts%nkpt,n_members) == 0) ) THEN
132 : l_cm = .true.
133 : ENDIF
134 : ENDDO
135 0 : n_groups = kpts%nkpt/n_members
136 0 : n_size = fmpi%isize/n_members
137 0 : IF (fmpi%irank == 0) THEN
138 0 : write(*,*) 'k-points in parallel: ',n_members
139 0 : write(*,*) "pe's per k-point: ",n_size
140 0 : write(*,*) '# of k-point loops: ',n_groups
141 : ENDIF
142 : !
143 : ! now, we make the groups
144 : !
145 : ! ranks sharing the same (irank mod n_members) end up in the same group;
146 : ! i_mygroup collects their 0-based global ranks, stride n_members apart.
145 0 : n_start = mod(fmpi%irank,n_members) + 1
146 : !! n_start = INT(irank/n_size) * n_size
147 0 : ALLOCATE ( i_mygroup(n_size) )
148 0 : n = 0
149 0 : DO i = n_start,fmpi%isize,n_members
150 : !! DO i = n_start+1,n_start+n_size
151 0 : n = n+1
152 0 : i_mygroup(n) = i-1
153 : ENDDO
154 :
155 : ! write (*,*) irank,n_groups,n_start,i_mygroup
156 :
157 : ! build SUB_COMM from the collected ranks; every rank belongs to exactly
158 : ! one group, so MPI_COMM_CREATE returns a valid communicator everywhere.
157 0 : CALL MPI_COMM_GROUP (fmpi%MPI_COMM,WORLD_GROUP,ierr(1))
158 0 : CALL MPI_GROUP_INCL (WORLD_GROUP,n_size,i_mygroup, SUB_GROUP,ierr(1))
159 0 : CALL MPI_COMM_CREATE (fmpi%MPI_COMM,SUB_GROUP,SUB_COMM,ierr(1))
160 :
161 0 : CALL MPI_COMM_RANK (SUB_COMM,n_rank,ierr(1))
162 : !
163 : ! determine number of columns per group
164 : !
165 : ! count columns owned by this rank in a round-robin distribution of the
166 : ! lapw_dim_nbasfcn basis functions over the n_size ranks of SUB_COMM
167 : ! (lapw_dim_nbasfcn is presumably a module variable from m_types -- TODO confirm).
165 0 : n=0
166 0 : DO i=1+n_rank, lapw_dim_nbasfcn, n_size
167 0 : n=n+1
168 : ENDDO
169 0 : IF (n_size.GT.1) THEN
170 0 : matsize = lapw_dim_nbasfcn * n ! full columns in the distributed case
171 : ELSE
172 0 : matsize = (lapw_dim_nbasfcn+1)*lapw_dim_nbasfcn/2 ! packed triangular storage on a single PE
173 : ENDIF
174 : !
175 0 : ne = input%neig
176 0 : ne = max(ne,5) ! request at least 5 eigenvalues
177 :
178 0 : DEALLOCATE (i_mygroup)
179 :
180 0 : END SUBROUTINE mpi_make_groups
181 :
182 : !----------------------------------------------------------------------
183 0 : SUBROUTINE check_memory(input,atoms, mlotot,mlolotot, noco,kpts,fmpi, n_size)
184 :
185 : !
186 : ! check the free and the (approximate) required memory ;
187 : ! determine minimal n_size to fit into the memory (hopefully).
188 : !
189 : ! Strategy: estimate the number of array elements needed by eigen()/hssphn()
190 : ! and by the diagonalization (chani), then try a trial ALLOCATE of that many
191 : ! default REALs. On failure, double n_size (fewer columns per PE) and retry.
192 : !
189 : USE m_types
190 : IMPLICIT NONE
191 : type(t_mpi),INTENT(IN) :: fmpi
192 :
193 : TYPE(t_kpts),INTENT(IN) :: kpts
194 : TYPE(t_atoms),INTENT(IN) :: atoms
195 : TYPE(t_noco),INTENT(IN) :: noco
196 : TYPE(t_input),INTENT(IN) :: input
197 :
198 : INTEGER, INTENT (IN) :: mlotot,mlolotot
199 : INTEGER, INTENT (OUT) :: n_size
200 :
201 : INTEGER err, mb
202 : INTEGER*8 mem, matsz, m_h ! element counts; INTEGER*8 to avoid overflow for large bases
203 0 : REAL, ALLOCATABLE :: test(:) ! trial allocation used as the actual memory probe
204 :
205 : ! start with the fewest PEs per k-point that uses all ranks
205 0 : n_size = CEILING( real(fmpi%isize)/min(kpts%nkpt,fmpi%isize) )
206 :
207 : ! retry point: re-estimate with the doubled n_size after a failed trial allocation
207 : 10 CONTINUE
208 : !
209 : ! some basic arrays allocated in eigen()
210 : !
211 :
212 0 : mem = ((atoms%lmaxd*(atoms%lmaxd+2)* (atoms%lmaxd*(atoms%lmaxd+2)+3))/2+1)*atoms%ntype*4 ! tlmplm%tuu,tlmplm%tdd etc.
213 0 : mem = mem + (atoms%lmaxd*(atoms%lmaxd+2)+1)*(2*atoms%llod+1)*max(mlotot,1)*2 ! tlmplm%tuulo ...
214 0 : mem = mem + (2*atoms%llod+1)**2 * max(mlolotot,1) ! tlmplm%tuloulo
215 0 : IF (noco%l_noco) mem = mem * 2 ! both spins
216 0 : mem = mem + 49*(atoms%n_u+atoms%n_hia)*input%jspins*2 ! lda+U, *2 for complex
217 0 : mem = mem+INT((lapw_dim_nbasfcn*2+(atoms%lmaxd*(atoms%lmaxd+2)+1)*atoms%ntype)*0.5)+1 ! tlmplm%ind, *0.5 for integer
218 :
219 0 : matsz = lapw_dim_nbasfcn * CEILING(REAL(lapw_dim_nbasfcn)/n_size) ! size of a, b
220 : #ifdef CPP_INVERSION
221 : mem = mem + 2 * matsz ! real case
222 : #else
223 0 : mem = mem + 2 * matsz * 2 ! complex case
224 : #endif
225 : !
226 : ! now the arrays in hssphn()
227 : !
228 0 : m_h = lapw_dim_nvd*(atoms%lmaxd*(atoms%lmaxd+2)+1)*4 + lapw_dim_nvd*8 + atoms%nlod ! ar, ai ..., cph, rph, vk, gk
229 0 : m_h = m_h + 2 * (2*atoms%llod+1)**2 * atoms%nlod * 3 * 2 ! alo,blo,clo
230 0 : IF (noco%l_ss) m_h = m_h * 2
231 0 : m_h = m_h + lapw_dim_nvd*(5+atoms%lmaxd) ! axr, ... plegend
232 0 : IF (noco%l_ss.OR.any(noco%l_constrained).OR.(noco%l_noco.AND.noco%l_soc)) THEN
233 0 : m_h = m_h + lapw_dim_nvd*(atoms%lmaxd+1)*atoms%ntype*2*2 ! fj,gj
234 : ELSE
235 0 : m_h = m_h + lapw_dim_nvd*(atoms%lmaxd+1)*atoms%ntype*2 ! fj,gj
236 : ENDIF
237 0 : IF (noco%l_noco.AND.noco%l_soc) THEN
238 0 : m_h = m_h + lapw_dim_nvd*(atoms%lmaxd+4)
239 : ENDIF
240 0 : IF (any(noco%l_constrained)) THEN
241 0 : m_h = m_h + (atoms%lmaxd+1)*atoms%ntype
242 : ENDIF
243 0 : IF (noco%l_noco.AND.(.NOT.noco%l_ss)) THEN
244 0 : matsz = (lapw_dim_nvd+mlotot) * CEILING(REAL(lapw_dim_nvd+mlotot)/n_size)
245 0 : m_h = m_h + matsz * 2 * 2 ! aahlp,bbhlp
246 : ENDIF
247 : !
248 : ! see, whether it fits
249 : !
250 : ! mb assumes 8 bytes per element; the probe allocates default REALs,
251 : ! so the probe may be smaller than the printed estimate -- TODO confirm intent
250 0 : mb = (mem+m_h)*8/(1024)**2
251 0 : ALLOCATE ( test(mem+m_h) , stat = err)
252 0 : WRITE(*,*) mb,'Mbytes needed in hssphn!',err,mem
253 0 : IF ( err /= 0 ) THEN
254 0 : n_size = n_size * 2 ! halve the columns per PE and try again
255 0 : IF (n_size > fmpi%isize) THEN
256 : mb = (mem+m_h)*8/(1024)**2
257 0 : WRITE(*,*) mb,'Mbytes needed in hssphn!'
258 0 : CALL juDFT_error("mpi_make_groups: memory too small!",calledby ="mpi_make_groups")
259 : ENDIF
260 : GOTO 10
261 : ENDIF
262 0 : DEALLOCATE (test)
263 : !
264 : ! now, allocate z and jump into chani
265 : !
266 0 : matsz = lapw_dim_nbasfcn * CEILING(REAL(lapw_dim_nbasfcn)/n_size) ! size of z
267 : #ifdef CPP_INVERSION
268 : mem = mem + matsz ! real case
269 : #else
270 0 : mem = mem + matsz * 2 ! complex case
271 : #endif
272 0 : mem = mem + matsz * 2 * 3 ! asca,bsca,eigvec
273 0 : mem = mem + lapw_dim_nbasfcn
274 : #ifdef CPP_INVERSION
275 : mem = mem + matsz ! work
276 : #else
277 0 : mem = mem + matsz * 2 ! work, rwork neglected
278 : #endif
279 : !
280 : ! see, whether it fits
281 : !
282 0 : mb = (mem)*8/(1024)**2
283 0 : ALLOCATE ( test(mem) , stat = err)
284 0 : WRITE(*,*) mb,'Mbytes needed in chani !',err,mem
285 0 : IF ( err /= 0 ) THEN
286 0 : n_size = n_size * 2 ! halve the columns per PE and try again
287 0 : IF (n_size > fmpi%isize) THEN
288 : mb = (mem)*8/(1024)**2
289 0 : WRITE(*,*) mb,'Mbytes needed in chani !'
290 : CALL juDFT_error("mpi_make_groups: memory too small!",calledby&
291 0 : & ="mpi_make_groups")
292 : ENDIF
293 : GOTO 10 ! NOTE(review): restarts the hssphn estimate too, since mem was already incremented above
294 : ENDIF
295 0 : DEALLOCATE (test)
296 :
297 0 : END SUBROUTINE check_memory
298 : !----------------------------------------------------------------------
299 :
300 : END MODULE m_mpimakegroups
|