[QE-users] mysterious kill

Matthew Marcus mamarcus at lbl.gov
Tue Jun 23 20:04:37 CEST 2020


I'm trying to run a long phonon calculation using 3 nodes of a cluster, 
each of which has 16 processors.  I'm using 15 on each node, with this 
command:

mpirun -np 45 --hostfile hostfile.txt ph.x < NiOHF_ordered1H7.PhG.in | 
tee NiOHF_ordered1H7.PhG.out 2> error.txt

The input files are attached.  Before the ph.x calculation, I ran an SCF 
using the attached SCF input file.

My problem is that after 3 days of computation, the job gets killed 
with no indication of why.  The output of dmesg -T shows no kill events 
around the time it died.  How can I figure out what went wrong, and what 
can I do about it?  Would running on fewer processors help?
	mam
-------------- next part --------------
phonons of Ni3F2(OH)4 at Gamma
 &inputph
  tr2_ph=1.0d-14,
  outdir = './out/'
  prefix = 'aiida',
  epsil=.true.,
  amass(1)=58.6934,
  amass(2)=58.6934,
  amass(3)=18.9984
  amass(4)=1.00794,
  amass(5)=15.9994,
  trans  = .true.
  fildyn='NiOHF_ordered1H7_.dynG',
 /
0.0 0.0 0.0
-------------- next part --------------
&CONTROL
  calculation = 'scf'
  etot_conv_thr =   1.3000000000d-04
  forc_conv_thr =   1.0000000000d-04
  outdir = './out/'
  prefix = 'aiida'
  pseudo_dir = './pseudo/'
  tprnfor = .true.
  tstress = .true.
  verbosity = 'low'
/
&SYSTEM
  degauss =   1.4699723600d-02
  ecutrho =   600
  ecutwfc =   100
  ibrav = 0
  nat = 26
  nspin = 2
  ntyp = 5
Hubbard_U(1) = 7.
Hubbard_U(2) = 7.
  occupations = 'fixed'
!  smearing = 'cold'
  lda_plus_u=.true.,
  tot_magnetization = 0.
  starting_magnetization(1) =   0.8
  starting_magnetization(2) =   -0.8
  starting_magnetization(3) =   0.
  starting_magnetization(4) =   0.
  starting_magnetization(5) =   0.
/
&ELECTRONS
  conv_thr =   1.0000000000d-07
  electron_maxstep = 3000
  mixing_beta =   2.0000000000d-01
/
ATOMIC_SPECIES
Ni1     58.6934 ni_pbesol_v1.4.uspp.F.UPF
Ni2     58.6934 ni_pbesol_v1.4.uspp.F.UPF
F      18.9984 F.oncvpsp.upf
H      1.00794 H_ONCV_PBEsol-1.0.upf
O      15.9994 O.pbesol-n-kjpaw_psl.0.1.UPF
ATOMIC_POSITIONS crystal
Ni1           0.9680183807       0.0263107459       0.9783108308
Ni1           0.3045746118       0.0209143866       0.0084271434
Ni1           0.6411601951       0.0326592337       0.0131468998
Ni2           0.0287555271       0.9790835341       0.4915771994
Ni2           0.3653117094       0.9736857834       0.5216841580
Ni2           0.6921696367       0.9673388130       0.4868535613
H            0.3664337110       0.5587673063       0.2207053022
H            0.7711523533       0.6673850125       0.2109430049
H            0.0847701418       0.5318553413       0.7085663827
H            0.4959331063       0.5608752024       0.7208617849
H            0.5621900948       0.3326319169       0.2890537903
H            0.9668980679       0.4412345344       0.2792980517
H            0.2485858688       0.4681576584       0.7914372862
H            0.8373976074       0.4391119548       0.7791359710
F            0.0594468841       0.6215010140       0.1059432366
F            0.7934945419       0.5779511484       0.6060003729
F            0.2738815299       0.3784930960       0.3940541285
F            0.5398339136       0.4220452355       0.8940008576
O            0.4029952628       0.6389325035       0.1163156174
O            0.7496761883       0.6657977104       0.1092517007
O            0.1215216453       0.5881897105       0.6060767314
O            0.4712462387       0.5970002502       0.6197252787
O            0.5836559299       0.3342035007       0.3907472769
O            0.9303359025       0.3610680558       0.3836873758
O            0.2118128870       0.4118165671       0.8939225762
O            0.8620813928       0.4029897812       0.8802737008
K_POINTS automatic
4 11 3 0 0 0
CELL_PARAMETERS angstrom
      9.2705774000       0.0000000000       0.0000000000
     -1.5637153840       2.7010480439       0.0000000000
      0.2403598005       1.2112911718       9.2695561023
-------------- next part --------------
n0000.phasis
n0000.phasis
n0000.phasis
n0000.phasis
n0000.phasis
n0000.phasis
n0000.phasis
n0000.phasis
n0000.phasis
n0000.phasis
n0000.phasis
n0000.phasis
n0000.phasis
n0000.phasis
n0000.phasis
n0001.phasis
n0001.phasis
n0001.phasis
n0001.phasis
n0001.phasis
n0001.phasis
n0001.phasis
n0001.phasis
n0001.phasis
n0001.phasis
n0001.phasis
n0001.phasis
n0001.phasis
n0001.phasis
n0001.phasis
n0004.phasis
n0004.phasis
n0004.phasis
n0004.phasis
n0004.phasis
n0004.phasis
n0004.phasis
n0004.phasis
n0004.phasis
n0004.phasis
n0004.phasis
n0004.phasis
n0004.phasis
n0004.phasis
n0004.phasis


More information about the users mailing list