pyfock.DFT
1# DFT.py 2# Author: Manas Sharma ([email protected]) 3# This is a part of CrysX (https://bragitoff.com/crysx) 4# 5# 6# .d8888b. Y88b d88P 8888888b. 8888888888 888 7# d88P Y88b Y88b d88P 888 Y88b 888 888 8# 888 888 Y88o88P 888 888 888 888 9# 888 888d888 888 888 .d8888b Y888P 888 d88P 888 888 8888888 .d88b. .d8888b 888 888 10# 888 888P" 888 888 88K d888b 8888888P" 888 888 888 d88""88b d88P" 888 .88P 11# 888 888 888 888 888 "Y8888b. d88888b 888888 888 888 888 888 888 888 888 888888K 12# Y88b d88P 888 Y88b 888 X88 d88P Y88b 888 Y88b 888 888 Y88..88P Y88b. 888 "88b 13# "Y8888P" 888 "Y88888 88888P' d88P Y88b 888 "Y88888 888 "Y88P" "Y8888P 888 888 14# 888 888 15# Y8b d88P Y8b d88P 16# "Y88P" "Y88P" 17from re import T 18from pyfock.Utils import print_pyfock_logo 19# Print system information 20from pyfock.Utils import print_sys_info 21import pyfock.Mol as Mol 22import pyfock.Basis as Basis 23# import pyfock.Integrals as Integrals 24import pyfock.Integrals as Integrals 25import pyfock.Grids as Grids 26from threadpoolctl import ThreadpoolController, threadpool_info, threadpool_limits 27# controller = ThreadpoolController() 28import numpy as np 29from numpy.linalg import eig, multi_dot as dot 30import scipy 31 32from timeit import default_timer as timer 33import numba 34from opt_einsum import contract 35import pylibxc 36# import sparse 37# import dask.array as da 38from scipy.sparse import csr_matrix, csc_matrix 39# from memory_profiler import profile 40import os 41from numba import njit, prange, cuda 42import numexpr 43try: 44 import cupy as cp 45 from cupy import fuse 46 import cupyx 47 CUPY_AVAILABLE = True 48except Exception as e: 49 print('Cupy is not installed. GPU acceleration is not availble.') 50 CUPY_AVAILABLE = False 51 pass 52from pyfock.DFT_Helper_Coulomb import density_fitting_prelims_for_DFT_development 53from pyfock.DFT_Helper_Coulomb import Jmat_from_density_fitting 54 55# @njit(parallel=True, cache=True, fastmath=True, error_model="numpy") 56# def compute_B(errVecs): 57# nKS = errVecs.shape[0] 58# B = np.zeros((nKS + 1, nKS + 1)) 59# B[-1, :] = B[:, -1] = -1.0 60# B[-1, -1] = 0.0 61# for i in prange(nKS): 62# # errVec_i_conj_T = errVecs[i].conj().T 63# errVec_i_conj_T = errVecs[i].T 64# for j in range(i + 1): 65# # B[i, j] = B[j, i] = np.real(np.trace(np.dot(errVec_i_conj_T, errVecs[j]))) 66# B[i, j] = B[j, i] = np.trace(np.dot(errVec_i_conj_T, errVecs[j])) 67# return B 68 69class DFT: 70 """ 71 A class for performing Density Functional Theory (DFT) calculations 72 with optional support for density fitting (DF), GPU acceleration, and LibXC. 73 74 Parameters 75 ---------- 76 mol : Molecule 77 Molecular object on which the DFT calculation is to be performed. 78 79 basis : Basis 80 Orbital basis set used for the SCF calculation. 81 82 auxbasis : Basis, optional 83 Auxiliary basis set for density fitting (DF). If None, a default will be assigned. 84 85 conv_crit : float, optional 86 Convergence criterion for the SCF cycle in Hartrees (default is 1e-7). 87 88 dmat_guess_method : str, optional 89 Method for the initial density matrix guess (e.g., 'core', 'huckel'). 90 91 xc : list or str, optional 92 Exchange-correlation functional specification. If None, defaults to LDA (`[1, 7]`). 93 94 grids : object, optional 95 Precomputed numerical integration grids. If None, they will be generated automatically. 96 97 gridsLevel : int, optional 98 Level of numerical integration grid refinement (default is 3). 99 100 blocksize : int, optional 101 Block size for XC grid evaluations. Defaults depend on whether GPU is used. 102 103 save_ao_values : bool, optional 104 If True, saves AO values to reuse during XC evaluation. Increases speed but uses more memory. 105 106 use_gpu : bool, optional 107 Whether to use GPU acceleration. 108 109 ncores : int, optional 110 Number of CPU cores to use (default is 2). 111 112 Attributes 113 ---------- 114 dmat : ndarray 115 Initial guess for the density matrix, will be computed during setup. 116 117 KSmats : list 118 List of Kohn–Sham matrices used in DIIS extrapolation. 119 120 errVecs : list 121 List of error vectors for DIIS. 122 123 max_itr : int 124 Maximum number of SCF iterations (default is 50). 125 126 isDF : bool 127 Whether to use density fitting for Coulomb integrals. 128 129 rys : bool 130 Whether to use Rys quadrature for evaluating electron repulsion integrals. 131 132 DF_algo : int 133 Algorithm selector for DF (reserved for developer use). 134 135 XC_algo : int 136 Algorithm selector for XC evaluation (2 for CPU, 3 for GPU). 137 138 sortGrids : bool 139 Whether to sort DFT integration grids (not recommended). 140 141 xc_bf_screen : bool 142 Enable basis function screening for XC term evaluation. 143 144 threshold_schwarz : float 145 Threshold for Schwarz screening (default is 1e-9). 146 147 strict_schwarz : bool 148 If True, applies stricter Schwarz screening. 149 150 cholesky : bool 151 If True, uses Cholesky decomposition for DF. 152 153 orthogonalize : bool 154 If True, orthogonalizes AO basis functions. 155 156 sao : bool 157 If True, uses SAO basis instead of CAO basis. 158 159 keep_ao_in_gpu : bool 160 Whether to retain AO values in GPU memory during SCF (if `save_ao_values` is True). 161 162 use_libxc : bool 163 Whether to use LibXC for XC functional evaluation (recommended off for GPU). 164 165 n_streams : int 166 Number of CUDA streams to use (if applicable). 167 168 n_gpus : int 169 Number of GPUs to use. 170 171 free_gpu_mem : bool 172 Whether to forcibly free GPU memory after use. 173 174 max_threads_per_block : int 175 Maximum threads per CUDA block supported by the device. 176 177 threads_x : int 178 CUDA thread configuration (X dimension). 179 180 threads_y : int 181 CUDA thread configuration (Y dimension). 182 183 dynamic_precision : bool 184 Whether to use precision switching during XC evaluation for performance gains. 185 186 keep_ints3c2e_in_gpu : bool 187 Whether to keep 3-center 2-electron integrals in GPU memory to avoid transfers. 188 189 debug : bool 190 If True, prints debugging output during DFT calculations. 191 192 Notes 193 ----- 194 - This class supports SCF DFT calculations with density fitting (DF). 195 - GPU support is optional and provides significant speed-up for large systems. 196 - The class is tightly integrated with PyFock and LibXC libraries. 197 198 Examples 199 -------- 200 >>> mol = Molecule(...) 201 >>> basis = Basis(mol, ...) 202 >>> dft = DFT(mol, basis, xc='PBE', use_gpu=True) 203 >>> dft.run_scf() 204 """ 205 def __init__(self, mol, basis, auxbasis=None, conv_crit=1e-7, dmat_guess_method=None, 206 xc=None, grids=None, gridsLevel=3, blocksize=None, 207 save_ao_values=False, use_gpu=False, ncores=1): 208 209 self.mol = mol 210 """ Molecular object for which the DFT calculation will be performed """ 211 if self.mol is None: 212 print('ERROR: A Mol object is required to initialize a DFT object.') 213 214 self.basis = basis 215 """ Basis object for corresponding to the molecule for the DFT calculation """ 216 if self.basis is None: 217 print('ERROR: A Basis object is required to initialize a DFT object.') 218 219 220 self.dmat_guess_method = dmat_guess_method 221 """ Initial guess for the density matrix """ 222 if self.dmat_guess_method is None: 223 self.dmat_guess_method = 'core' 224 225 self.dmat = None 226 """ Initial density matrix guess for SCF. Will get updated at each SCF iteration. """ 227 228 self.xc = xc 229 """ Exchange-Correlation Functional """ 230 if self.xc is None: 231 self.xc = [1, 7] #LDA 232 print("XC not specified, defaulting to LDA functional (LibXC codes: 1, 7)") 233 234 # DIIS 235 self.KSmats = [] 236 self.errVecs = [] 237 self.diisSpace = 6 238 239 self.conv_crit = conv_crit 240 """ Convergence criterion for SCF (in Hartrees) """ 241 # if self.conv_crit is None: 242 # self.conv_crit = 1.0E-7 243 244 self.max_itr = 50 245 """ Maximum number of iterations for SCF """ 246 # if self.max_itr is None: 247 # self.max_itr = 50 248 249 self.ncores = ncores 250 """ Number of cores to be used for DFT calculation """ 251 if self.ncores is None: 252 self.ncores = 2 253 254 self.grids = grids 255 """ Atomic grids for DFT calculation (If None or not supplied, will be generated using NumGrid) """ 256 self.gridsLevel = gridsLevel 257 """ Atomic grids for DFT calculation """ 258 259 self.isDF = True 260 """ Use density fitting (DF) for two-electron Coulomb integrals. This is only for developers. Users should not change it. """ 261 262 self.auxbasis = auxbasis 263 """ Basis object to be used as the auxiliary basis for DF """ 264 if self.auxbasis is None: 265 auxbasis = Basis(mol, {'all':Basis.load(mol=mol, basis_name='def2-universal-jfit')}) 266 267 self.rys = True 268 """ Use rys quadrature for the evaluation of two electron integrals (with and without DF)""" 269 270 self.DF_algo = 10 271 """ This is only for developers. Users should not change it. """ 272 273 self.blocksize = blocksize 274 """ Block size for the evaulation of XC term on grids. For CPUs a value of ~5000 is recommended. For GPUs, a value >20480 is recommended. """ 275 if blocksize is None: 276 if use_gpu: 277 self.blocksize = 20480 278 else: 279 self.blocksize = 5000 280 281 self.XC_algo = None 282 """ This is only for developers. Users should not change it. The algorithm for XC evaluation should be 2 for CPU and 3 for GPU.""" 283 if use_gpu: 284 self.XC_algo = 3 285 else: 286 self.XC_algo = 2 287 288 self.debug = False 289 """ Turn on printing debug statements """ 290 self.sortGrids = False 291 """ Enable/Disable sorting of DFT grids. Doesn't seem to offer any signficant advantage.""" 292 self.save_ao_values = save_ao_values 293 """ Whether to save atomic orbital (AO) values for reuse during XC evaluation. Improves performance but requires more memory. """ 294 self.xc_bf_screen = True 295 """ Enable screening of basis functions for XC term evaluation to reduce computation time drastically. """ 296 self.threshold_schwarz = 1e-09 297 """ Threshold for Schwarz screening of two-electron integrals. Smaller values increase accuracy but reduce sparsity. """ 298 self.strict_schwarz = True 299 """ If True, enforce stricter Schwarz screening to aggressively eliminate small two-electron integrals. """ 300 self.cholesky = True 301 """ Whether to use Cholesky decomposition for DF. Slightly speeds up calculations. """ 302 self.orthogonalize = True 303 """ Apply orthogonalization to the AO basis. Should be True for most standard calculations. """ 304 305 self.mo_occupations = None 306 """ Molecular orbital occupations. Will be computed during SCF. """ 307 self.mo_coefficients = None 308 """ Molecular orbital coefficients. Will be computed during SCF. """ 309 self.mo_energies = None 310 """ Molecular orbital energies. Will be computed during SCF. """ 311 self.Total_energy = None 312 """ Total energy of the system. Will be computed during SCF. """ 313 self.J_energy = None 314 """ Coulomb energy contribution. Will be computed during SCF. """ 315 self.XC_energy = None 316 """ Exchange-correlation energy contribution. Will be computed during SCF. """ 317 self.Nuclear_rep_energy = None 318 """ Nuclear repulsion energy. Will be computed during SCF. """ 319 self.Kinetic_energy = None 320 """ Kinetic energy contribution. Will be computed during SCF. """ 321 self.Nuc_energy = None 322 """ Nuclear potential energy contribution. Will be computed during SCF. """ 323 324 325 # CAO or SAO 326 self.sao = False 327 """ Whether to use SAO basis or CAO basis. Default is CAO basis. """ 328 329 330 # GPU acceleration 331 self.use_gpu = use_gpu 332 """ Whether to use GPU acceleration or not """ 333 self.keep_ao_in_gpu = True 334 """ Whether to keep the atomic orbitals for XC evaluation in GPU memory or CPU memory. Only relevant if save_ao_values = True. """ 335 self.use_libxc = True 336 """ Whether to use LibXC's version of XC functionals or PyFock implementations. 337 Only relevant when GPU is used. For GPU calculations it is recommended to use PyFock 338 implementation as it avoids CPU-GPU transfers.""" 339 self.n_streams = 1 340 self.n_gpus = 1 341 """ Number of GPUs to be used """ 342 self.free_gpu_mem = False 343 """ Whether the GPU memory should be freed by force or not""" 344 try: 345 self.max_threads_per_block = cuda.get_current_device().MAX_THREADS_PER_BLOCK 346 except: 347 self.max_threads_per_block = 1024 348 349 self.threads_x = int(self.max_threads_per_block/16) 350 self.threads_y = int(self.max_threads_per_block/64) 351 self.dynamic_precision = False # Only for the XC term 352 """ Whether to use dynamic precision switching for XC term or not """ 353 self.keep_ints3c2e_in_gpu = True 354 """ Whether to keep the 3c2e integrals in GPU memory or not. 355 Recommended to keep in GPU memory to avoid CPU-GPU transfers at each iteration.""" 356 357 # self.max_threads_per_block = 1024 358 # self.threads_x = 64 359 # self.threads_y = 16 360 361 # if self.use_gpu: 362 # try: 363 # global cp 364 # global cupy_scipy 365 # import cupy as cp 366 # import cupyx.scipy as cupy_scipy 367 # # from cupy.linalg import eig, multi_dot as dot 368 # except ModuleNotFoundError: 369 # print('Cupy was not found!') 370 371 # def removeLinearDep(self, H, S): 372 # return 1 373 374 def nuclear_rep_energy(self, mol=None): 375 """ 376 Compute the nuclear-nuclear repulsion energy. 377 378 Parameters 379 ---------- 380 mol : Molecule, optional 381 Molecule object containing nuclear coordinates and charges. 382 If None, uses `self.mol`. 383 384 Returns 385 ------- 386 float 387 The nuclear repulsion energy in Hartrees. 388 """ 389 # Nuclear-nuclear energy 390 if mol is None: 391 mol = self.mol 392 393 e = 0 394 for i in range(mol.natoms): 395 for j in range(i, mol.natoms): 396 if (i!=j): 397 dist = mol.coordsBohrs[i]-mol.coordsBohrs[j] 398 e = e + mol.Zcharges[i]*mol.Zcharges[j]/np.sqrt(np.sum(dist**2)) 399 400 return e 401 402 # full density matrix for RKS/RHF 403 def gen_dm(self, mo_coeff, mo_occ): 404 """ 405 Generate the density matrix from molecular orbital coefficients and occupations. 406 407 Parameters 408 ---------- 409 mo_coeff : ndarray 410 Molecular orbital coefficient matrix. 411 412 mo_occ : ndarray 413 Array of MO occupation numbers. 414 415 Returns 416 ------- 417 ndarray 418 Density matrix (RHF/RKS type). 419 """ 420 mocc = mo_coeff[:,mo_occ>0] 421 422 return np.dot(mocc*mo_occ[mo_occ>0], mocc.conj().T) 423 424 def getOcc(self, mol=None, energy_mo=None, coeff_mo=None): 425 """ 426 Assign occupation numbers to molecular orbitals based on their energies. 427 428 Parameters 429 ---------- 430 mol : Molecule 431 Molecule object to extract the number of electrons. 432 433 energy_mo : ndarray 434 Array of MO energies. 435 436 coeff_mo : ndarray 437 Array of MO coefficients (not used but kept for compatibility). 438 439 Returns 440 ------- 441 ndarray 442 Array of MO occupations (0 or 2 for RHF). 443 """ 444 e_idx = np.argsort(energy_mo) 445 e_sort = energy_mo[e_idx] 446 nmo = energy_mo.size 447 occ_mo = np.zeros(nmo) 448 nocc = mol.nelectrons // 2 449 occ_mo[e_idx[:nocc]] = 2 450 return occ_mo 451 452 def gen_dm_cupy(self, mo_coeff, mo_occ): 453 """ 454 Generate the density matrix using CuPy for GPU acceleration. 455 456 Parameters 457 ---------- 458 mo_coeff : cp.ndarray 459 Molecular orbital coefficient matrix (on GPU). 460 461 mo_occ : cp.ndarray 462 Array of MO occupation numbers (on GPU). 463 464 Returns 465 ------- 466 cp.ndarray 467 Density matrix (RHF/RKS type) computed on the GPU. 468 """ 469 mocc = mo_coeff[:,mo_occ>0] 470 return cp.dot(mocc*mo_occ[mo_occ>0], mocc.conj().T) 471 472 def getOcc_cupy(self, mol=None, energy_mo=None, coeff_mo=None): 473 """ 474 Assign occupation numbers to MOs using CuPy (for GPU-based calculations). 475 476 Parameters 477 ---------- 478 mol : Molecule 479 Molecule object to determine number of electrons. 480 481 energy_mo : cp.ndarray 482 MO energy array on the GPU. 483 484 coeff_mo : cp.ndarray 485 MO coefficients array on the GPU (not used). 486 487 Returns 488 ------- 489 cp.ndarray 490 Array of MO occupations (on GPU). 491 """ 492 e_idx = cp.argsort(energy_mo) 493 e_sort = energy_mo[e_idx] 494 nmo = energy_mo.size 495 occ_mo = cp.zeros(nmo) 496 nocc = mol.nelectrons // 2 497 occ_mo[e_idx[:nocc]] = 2 498 return occ_mo 499 500 def solve(self, H, S, orthogonalize=False, x=None): 501 """ 502 Solve the generalized or canonical eigenvalue equation. 503 504 Parameters 505 ---------- 506 H : ndarray 507 Hamiltonian matrix. 508 509 S : ndarray 510 Overlap matrix. 511 512 orthogonalize : bool, optional 513 If True, solve using orthogonalized basis. 514 515 x : ndarray, optional 516 Transformation matrix (if already computed). 517 518 Returns 519 ------- 520 tuple of (ndarray, ndarray) 521 Eigenvalues and eigenvectors of the system. 522 """ 523 if not orthogonalize: 524 #Solve the generalized eigenvalue equation HC = SCE 525 eigvalues, eigvectors = scipy.linalg.eigh(H, S) 526 527 else: 528 if x is None: 529 eig_val_s, eig_vec_s = scipy.linalg.eigh(S) 530 # Removing the eigenvectors assoicated to the smallest eigenvalue. 531 x = eig_vec_s[:,eig_val_s>1e-7] / np.sqrt(eig_val_s[eig_val_s>1e-7]) 532 xHx = x.T @ H @ x 533 #Solve the canonical eigenvalue equation HC = CE 534 eigvalues, eigvectors = scipy.linalg.eigh(xHx) 535 eigvectors = np.dot(x, eigvectors) 536 537 idx = np.argmax(np.abs(eigvectors.real), axis=0) 538 eigvectors[:,eigvectors[idx,np.arange(len(eigvalues))].real<0] *= -1 539 return eigvalues, eigvectors # E, C 540 541 def solve_cupy(self, H, S, orthogonalize=True): 542 """ 543 Solve the generalized eigenvalue problem using CuPy (GPU). 544 545 Parameters 546 ---------- 547 H : cp.ndarray 548 Hamiltonian matrix (on GPU). 549 550 S : cp.ndarray 551 Overlap matrix (on GPU). 552 553 orthogonalize : bool, optional 554 If True, solve using orthogonalized basis. 555 556 Returns 557 ------- 558 tuple of (cp.ndarray, cp.ndarray) 559 Eigenvalues and eigenvectors (on GPU). 560 """ 561 eig_val_s, eig_vec_s = cp.linalg.eigh(S) 562 # Removing the eigenvectors assoicated to the smallest eigenvalue. 563 x = eig_vec_s[:,eig_val_s>1e-7] / cp.sqrt(eig_val_s[eig_val_s>1e-7]) 564 xhx = x.T @ H @ x 565 #Solve the canonical eigenvalue equation HC = SCE 566 eigvalues, eigvectors = cp.linalg.eigh(xhx) 567 eigvectors = cp.dot(x, eigvectors) 568 569 idx = cp.argmax(cp.abs(eigvectors.real), axis=0) 570 eigvectors[:,eigvectors[idx,cp.arange(len(eigvalues))].real<0] *= -1 571 return eigvalues, eigvectors # E, C 572 573 574 def getCoreH(self, mol=None, basis=None): 575 """ 576 Compute the core Hamiltonian matrix (T + V). 577 578 Parameters 579 ---------- 580 mol : Molecule, optional 581 Molecule object. 582 583 basis : Basis, optional 584 Basis set object. 585 586 Returns 587 ------- 588 ndarray 589 Core Hamiltonian matrix. 590 """ 591 #Get the core Hamiltonian 592 if mol is None: 593 mol = self.mol 594 if basis is None: 595 basis = self.basis 596 nao = basis.bfs_nao 597 H = np.empty((nao,nao)) 598 Vmat = Integrals.nuc_mat_symm(basis, mol) 599 Tmat = Integrals.kin_mat_symm(basis) 600 H = Vmat + Tmat 601 602 return H 603 604 605 def guessCoreH(self, mol=None, basis=None, Hcore=None, S=None): 606 """ 607 Generate a guess density matrix using the core Hamiltonian. 608 609 Parameters 610 ---------- 611 mol : Molecule, optional 612 Molecule object. 613 614 basis : Basis, optional 615 Basis set. 616 617 Hcore : ndarray, optional 618 Core Hamiltonian. If None, it will be computed. 619 620 S : ndarray, optional 621 Overlap matrix. If None, it will be computed. 622 623 Returns 624 ------- 625 ndarray 626 Initial guess density matrix. 627 """ 628 #Get a guess for the density matrix using the core Hamiltonian 629 if mol is None: 630 mol = self.mol 631 if basis is None: 632 basis = self.basis 633 if Hcore is None: 634 Hcore = self.getCoreH(mol, basis) 635 if S is None: 636 S = Integrals.overlap_mat_symm(basis) 637 638 eigvalues, eigvectors = scipy.linalg.eigh(Hcore, S) 639 # print(eigvalues) 640 idx = np.argmax(abs(eigvectors.real), axis=0) 641 eigvectors[:,eigvectors[idx,np.arange(len(eigvalues))].real<0] *= -1 642 mo_occ = self.getOcc(mol, eigvalues, eigvectors) 643 # print(mo_occ) 644 dmat = self.gen_dm(eigvectors, mo_occ) 645 646 return dmat 647 648 649 def DIIS(self,S,D,F): 650 """ 651 Perform Direct Inversion in the Iterative Subspace (DIIS) to improve SCF convergence. 652 653 Adapted from 654 ---------- 655 McMurchie-Davidson project: 656 https://github.com/jjgoings/McMurchie-Davidson 657 Licensed under the BSD-3-Clause license 658 659 Parameters 660 ---------- 661 S : ndarray 662 Overlap matrix. 663 664 D : ndarray 665 Density matrix. 666 667 F : ndarray 668 Fock matrix. 669 670 Returns 671 ------- 672 ndarray 673 DIIS-extrapolated Fock matrix. 674 """ 675 FDS = dot([F,D,S]) 676 # SDF = np.conjugate(FDS).T 677 errVec = FDS - np.conjugate(FDS).T 678 self.KSmats.append(F) 679 self.errVecs.append(errVec) 680 nKS = len(self.KSmats) 681 if nKS > self.diisSpace: 682 self.KSmats.pop(0) 683 self.errVecs.pop(0) 684 nKS = nKS - 1 685 B = np.zeros((nKS + 1,nKS + 1)) 686 B[-1,:] = B[:,-1] = -1.0 687 B[-1,-1] = 0.0 688 # B is symmetric 689 for i in range(nKS): 690 for j in range(i+1): 691 B[i,j] = B[j,i] = \ 692 np.real(np.trace(np.dot(np.conjugate(self.errVecs[i]).T, self.errVecs[j]))) 693 # for i in range(nKS): 694 # for j in range(i+1): 695 # print(self.errVecs[i].shape) 696 # B[i,j] = np.real(np.dot(np.conjugate(self.errVecs[i]).T, self.errVecs[j])) 697 # B[j,i] = B[i,j] 698 699 residual = np.zeros((nKS + 1, 1)) 700 residual[-1] = -1.0 701 weights = scipy.linalg.solve(B,residual) 702 703 # weights is 1 x numFock + 1, but first numFock values 704 # should sum to one if we are doing DIIS correctly 705 assert np.isclose(sum(weights[:-1]),1.0) 706 707 F = np.zeros(F.shape) 708 for i, KS in enumerate(self.KSmats): 709 weight = weights[i] 710 F += numexpr.evaluate('(weight * KS)') 711 712 return F 713 714 def DIIS_cupy(self,S,D,F): 715 """ 716 Perform DIIS on GPU using CuPy to accelerate SCF convergence. 717 718 Adapted from 719 ---------- 720 McMurchie-Davidson project: 721 https://github.com/jjgoings/McMurchie-Davidson 722 Licensed under the BSD-3-Clause license 723 724 Parameters 725 ---------- 726 S : cp.ndarray 727 Overlap matrix (on GPU). 728 729 D : cp.ndarray 730 Density matrix (on GPU). 731 732 F : cp.ndarray 733 Fock matrix (on GPU). 734 735 Returns 736 ------- 737 cp.ndarray 738 DIIS-extrapolated Fock matrix (on GPU). 739 """ 740 FDS = F @ D @ S 741 errVec = FDS - cp.conjugate(FDS).T 742 self.KSmats.append(F) 743 self.errVecs.append(errVec) 744 nKS = len(self.KSmats) 745 if nKS > self.diisSpace: 746 self.KSmats.pop(0) 747 self.errVecs.pop(0) 748 nKS = nKS - 1 749 B = cp.zeros((nKS + 1,nKS + 1)) 750 B[-1,:] = B[:,-1] = -1.0 751 B[-1,-1] = 0.0 752 # B is symmetric 753 for i in range(nKS): 754 for j in range(i+1): 755 B[i,j] = B[j,i] = \ 756 cp.real(cp.trace(cp.dot(cp.conjugate(self.errVecs[i]).T, self.errVecs[j]))) 757 758 residual = cp.zeros((nKS + 1, 1)) 759 residual[-1] = -1.0 760 weights = cp.linalg.solve(B,residual) 761 762 # weights is 1 x numFock + 1, but first numFock values 763 # should sum to one if we are doing DIIS correctly 764 assert cp.isclose(sum(weights[:-1]),1.0) 765 766 F = cp.zeros(F.shape) 767 for i, KS in enumerate(self.KSmats): 768 weight = weights[i] 769 F += weight * KS 770 771 return F 772 773 # @profile 774 def scf(self): 775 """ 776 Perform Self-Consistent Field (SCF) calculation for Density Functional Theory (DFT). 777 778 This method implements a complete DFT SCF procedure including: 779 - One-electron integral calculation (overlap, kinetic, nuclear attraction) 780 - Two-electron integral calculation (4-center ERIs or density fitting) 781 - Grid generation and pruning for exchange-correlation evaluation 782 - Iterative SCF cycles with DIIS convergence acceleration 783 - Exchange-correlation energy and potential evaluation using LibXC 784 - GPU acceleration support for performance-critical operations 785 786 The implementation supports multiple algorithmic variants: 787 - Density fitting (DF) 788 - Schwarz screening for integral sparsity 789 - Multiple XC evaluation algorithms (CPU/GPU optimized) 790 - Dynamic precision switching for GPU calculations 791 792 SCF Procedure: 793 1. Initialize one-electron integrals (S, T, V_nuc) 794 2. Calculate/prepare two-electron integrals with optional screening 795 3. Generate and prune integration grids for XC evaluation 796 4. Iterative SCF loop: 797 - Build Coulomb matrix J from density matrix 798 - Evaluate exchange-correlation energy/potential on grids 799 - Form Kohn-Sham matrix: H_KS = H_core + J + V_xc 800 - Apply DIIS convergence acceleration 801 - Diagonalize KS matrix to get new orbitals 802 - Generate new density matrix from occupied orbitals 803 - Check energy convergence 804 5. Return converged total energy and density matrix 805 806 Computational Features: 807 - Multi-threading support via Numba and configurable core count 808 - GPU acceleration using CuPy for grid-based operations 809 - Memory-efficient batched evaluation of XC terms 810 - Multiple density fitting algorithms (DF_algo 1-10) 811 - Basis function screening for XC evaluation efficiency 812 - Optional Cholesky decomposition for 2-center integrals 813 814 Returns: 815 -------- 816 tuple[float, numpy.ndarray] 817 Etot : float 818 Converged total electronic energy in atomic units 819 dmat : numpy.ndarray, shape (nbf, nbf) 820 Converged density matrix in atomic orbital basis 821 822 Raises: 823 ------- 824 ConvergenceError 825 If SCF fails to converge within max_itr iterations 826 827 Notes: 828 ------ 829 - Uses class attributes for all computational parameters (basis, xc, conv_crit, etc.) 830 - Extensive timing and profiling information printed during execution 831 - Memory usage information displayed for large arrays 832 - GPU memory automatically freed after completion 833 - Supports both Cartesian (CAO) and Spherical (SAO) atomic orbital bases 834 835 The function performs comprehensive error checking and provides detailed 836 timing breakdowns for performance analysis. GPU acceleration is automatically 837 enabled when CuPy is available and use_gpu=True. 838 839 Example Energy Components: 840 - Electron-nuclear attraction energy 841 - Nuclear repulsion energy 842 - Kinetic energy 843 - Coulomb (electron-electron repulsion) energy 844 - Exchange-correlation energy 845 """ 846 #### Timings 847 duration1e = 0 848 durationDIIS = 0 849 durationAO_values = 0 850 durationCoulomb = 0 851 durationgrids = 0 852 durationItr = 0 853 durationxc = 0 854 durationXCpreprocessing = 0 855 durationDF = 0 856 durationKS = 0 857 durationSCF = 0 858 durationgrids_prune_rho = 0 859 durationSchwarz = 0 860 duration2c2e = 0 861 durationDF_coeff = 0 862 durationDF_gamma = 0 863 durationDF_Jtri = 0 864 durationDF_cholesky = 0 865 startSCF = timer() 866 867 mol = self.mol 868 basis = self.basis 869 dmat = self.dmat 870 xc = self.xc 871 conv_crit = self.conv_crit 872 max_itr = self.max_itr 873 ncores = self.ncores 874 grids = self.grids 875 gridsLevel = self.gridsLevel 876 isDF = self.isDF 877 auxbasis = self.auxbasis 878 rys = self.rys 879 DF_algo = self.DF_algo 880 blocksize = self.blocksize 881 XC_algo = self.XC_algo 882 debug = self.debug 883 sortGrids = self.sortGrids 884 save_ao_values = self.save_ao_values 885 xc_bf_screen = self.xc_bf_screen 886 threshold_schwarz = self.threshold_schwarz 887 strict_schwarz = self.strict_schwarz 888 cholesky = self.cholesky 889 orthogonalize = self.orthogonalize 890 891 print_pyfock_logo() 892 print('\n\nNumber of atoms:', mol.natoms) 893 print('\n\nNumber of basis functions (atomic orbitals):', basis.bfs_nao) 894 print('\n\nNumber of auxiliary basis functions:', auxbasis.bfs_nao) 895 print('\n\n================================================================================\n\n') 896 897 print_sys_info() 898 899 #### Set number of cores 900 numba.set_num_threads(ncores) 901 os.environ['RAYON_NUM_THREADS'] = str(ncores) 902 903 print('Running DFT using '+str(numba.get_num_threads())+' threads for Numba.\n\n', flush=True) 904 # if basis is None: 905 # basis = self.basis 906 # if mol is None: 907 # mol = self.mol 908 # if xc is None: 909 # xc = self.xc 910 # if gridsLevel is None: 911 # gridsLevel = self.gridsLevel 912 # if auxbasis is None: 913 # auxbasis = Basis(mol, {'all':Basis.load(mol=mol, basis_name='def2-universal-jfit')}) 914 if grids is None: 915 sortGrids = True 916 if xc_bf_screen==False: 917 if save_ao_values==True: 918 print('Warning! AO screening is set to False, but AO values are requested to be saved. \ 919 AO values can only be saved when XC_BF_SCREEN=TRUE. AO values will not be saved.', flush=True) 920 save_ao_values = False 921 if CUPY_AVAILABLE: 922 if self.use_gpu: 923 print('GPU acceleration is enabled. Currently this only accelerates AO values and XC term evaluation.', flush=True) 924 print('GPU(s) information:') 925 # print(cp.cuda.Device.mem_info()) 926 print(cuda.detect()) 927 print('Max threads per block supported by the GPU: ', cuda.get_current_device().MAX_THREADS_PER_BLOCK, flush=True) 928 print('The user has specified to use '+str(self.n_gpus)+' GPU(s).') 929 930 # For CUDA computations 931 threads_per_block = (self.threads_x, self.threads_y) 932 print('Threads per block configuration for the XC term: ', threads_per_block, flush=True) 933 print('Threads per block configuration for the all other calculations: ', (32, 32), flush=True) 934 if self.dynamic_precision: 935 print('\n\nWill use dynamic precision. ') 936 print('This means that the XC term will be evaluated in single precision until the ') 937 print('relative energy difference b/w successive iterations is less than 5.0E-7.') 938 precision_XC = cp.float32 939 else: 940 precision_XC = cp.float64 941 942 if XC_algo is None: 943 XC_algo = 3 944 if blocksize is None: 945 blocksize = 20480 946 streams = [] 947 nb_streams = [] 948 for i in range(self.n_gpus): 949 cp.cuda.Device(i).use() 950 cp_stream = cp.cuda.Stream(non_blocking = True) 951 nb_stream = cuda.external_stream(cp_stream.ptr) 952 streams.append(cp_stream) 953 nb_streams.append(nb_stream) 954 # Switch back to main GPU 955 cp.cuda.Device(0).use() 956 streams[0].use() 957 # # Set some basis function data as cupy arrays to avoid redoing it during XC term evaluation at every SCF iteration 958 # bfs_coords = cp.asarray([basis.bfs_coords], dtype=precision_XC) 959 # bfs_contr_prim_norms = cp.asarray([basis.bfs_contr_prim_norms], dtype=precision_XC) 960 # bfs_lmn = cp.asarray([basis.bfs_lmn]) 961 # bfs_nprim = cp.asarray([basis.bfs_nprim]) 962 # #The remaining properties like bfs_coeffs are a list of lists of unequal sizes. 963 # #Numba won't be able to work with these efficiently. 964 # #So, we convert them to a numpy 2d array by applying a trick, 965 # #that the second dimension is that of the largest list. So that 966 # #it can accomadate all the lists. 967 # maxnprim = max(basis.bfs_nprim) 968 # bfs_coeffs = cp.zeros([basis.bfs_nao, maxnprim], dtype=precision_XC) 969 # bfs_expnts = cp.zeros([basis.bfs_nao, maxnprim], dtype=precision_XC) 970 # bfs_prim_norms = cp.zeros([basis.bfs_nao, maxnprim], dtype=precision_XC) 971 # bfs_radius_cutoff = cp.zeros([basis.bfs_nao], dtype=precision_XC) 972 # for i in range(basis.bfs_nao): 973 # for j in range(basis.bfs_nprim[i]): 974 # bfs_coeffs[i,j] = basis.bfs_coeffs[i][j] 975 # bfs_expnts[i,j] = basis.bfs_expnts[i][j] 976 # bfs_prim_norms[i,j] = basis.bfs_prim_norms[i][j] 977 # bfs_radius_cutoff[i] = basis.bfs_radius_cutoff[i] 978 # # Now bf/ao values can be evaluated by calling the following 979 # # bf_values = Integrals.bf_val_helpers.eval_bfs(bfs_coords[0], bfs_contr_prim_norms[0], bfs_nprim[0], bfs_lmn[0], bfs_coeffs, bfs_prim_norms, bfs_expnts, bfs_radius_cutoff, coord) 980 # bfs_data_as_np_arrays = [bfs_coords[0], bfs_contr_prim_norms[0], bfs_nprim[0], bfs_lmn[0], bfs_coeffs, bfs_prim_norms, bfs_expnts, bfs_radius_cutoff] 981 982 983 else: 984 if self.use_gpu: 985 print('GPU acceleration requested but cannot be enabled as Cupy is not installed.', flush=True) 986 self.use_gpu = False 987 988 if XC_algo is None: 989 XC_algo = 2 990 if blocksize is None: 991 blocksize = 5000 992 if isDF: 993 isSchwarz = True 994 else: 995 isSchwarz = False # Schwarz screening is not yet implemented for 4c2e integrals 996 if strict_schwarz: 997 if not (DF_algo==6 or DF_algo==10): 998 print('Warning: The stricter variation of Schwarz screening is only compatible with DF algo #6 or #10 so turning it off.') 999 strict_schwarz = False 1000 if cholesky: 1001 if not (DF_algo==6 or DF_algo==10): 1002 print('Warning: The Cholesky decomposition of 2c2e integrls is only compatible with DF algo #6 or #10 so turning it off.') 1003 cholesky = False 1004 1005 if self.sao: 1006 print('\n\nSpherical Atomic Orbitals are being used!\n\n') 1007 # Get the CAO to SAO transformation matrix 1008 c2sph_mat = basis.cart2sph_basis() # CAO --> SAO 1009 # Calculate the pseudoinverse transformation matrix (for back transformation of SAO dmat to CAO dmat) 1010 sph2c_mat_pseudo = basis.sph2cart_basis() # SAO --> CAO 1011 1012 1013 eigvectors = None 1014 # DF_algo = 1 # Worst algorithm for more than 500 bfs/auxbfs (requires 2x mem of 3c2e integrals and a large prefactor) 1015 # DF_algo = 2 # Sligthly better (2x memory efficient) algorithm than above (requires 1x mem of 3c2e integrals and a large prefactor) 1016 # DF_algo = 3 # Memory effcient without any prefactor. (Can easily be converted into a sparse version, unlike the others) (https://aip.scitation.org/doi/pdf/10.1063/1.1567253) 1017 # DF_algo = 4 # Same as 3, except now we use triangular version of ints3c2e to save on memory 1018 # DF_algo = 5 # Same as 4 in terms of memory requirements, however faster in performance due to the use of Schwarz screening. 1019 # DF_algo = 6 # Much cheaper than 4 and 5 in terms of memory requirements because the indices of significant (ij|P) are efficiently calculated without duplicates/temporary arrays. 1020 # The speed maybe same or just slightly slower. 1021 # DF_algo = 7 # The significant indices (ij|P) are stored even more efficiently by using shell indices instead of bf indices. 1022 # DF_algo = 8 # Similar to 6, except that here the significant indices are not stored resulting in 50% memory savings. The drawback is that it only works in serial which is useful for Google colab or Kaggle perhaps. 1023 # DF_algo = 9 # 1024 1025 if not strict_schwarz: # If a stricter variant of Schwarz screening is not requested 1026 start1e = timer() 1027 print('\nCalculating one electron integrals...\n\n', flush=True) 1028 # One electron integrals 1029 if not self.use_gpu: 1030 S = Integrals.overlap_mat_symm(basis) 1031 V = Integrals.nuc_mat_symm(basis, mol) 1032 T = Integrals.kin_mat_symm(basis) 1033 # Core hamiltonian 1034 H = T + V 1035 else: 1036 S = Integrals.overlap_mat_symm_cupy(basis, cp_stream=streams[0]) 1037 V = Integrals.nuc_mat_symm_cupy(basis, mol, cp_stream = streams[0]) 1038 T = Integrals.kin_mat_symm_cupy(basis, cp_stream = streams[0]) 1039 # Core hamiltonian 1040 H = T + V 1041 1042 print('Core H size in GB ',H.nbytes/1e9, flush=True) 1043 print('done!', flush=True) 1044 duration1e = timer() - start1e 1045 print('Time taken '+str(duration1e)+' seconds.\n', flush=True) 1046 else: 1047 start1e = timer() 1048 print('\nCalculating overlap and kinetic integrals...\n\n', flush=True) 1049 # One electron integrals 1050 if not self.use_gpu: 1051 S = Integrals.overlap_mat_symm(basis) 1052 T = Integrals.kin_mat_symm(basis) 1053 # Core hamiltonian 1054 H = T 1055 else: 1056 S = Integrals.overlap_mat_symm_cupy(basis, cp_stream = streams[0]) 1057 T = Integrals.kin_mat_symm_cupy(basis, cp_stream = streams[0]) 1058 # Core hamiltonian 1059 H = T 1060 1061 print('Core H size in GB ',(H.nbytes/1e9)*2, flush=True) # Factor of 2 because nuclear matrix will also be included here later 1062 print('done!', flush=True) 1063 duration1e = timer() - start1e 1064 print('Time taken '+str(duration1e)+' seconds.\n', flush=True) 1065 1066 1067 if dmat is None: 1068 if self.dmat_guess_method=='core': 1069 dmat = self.guessCoreH(mol, basis, Hcore=H, S=S) 1070 1071 if self.use_gpu: 1072 dmat_cp = cp.asarray(dmat, dtype=cp.float64) 1073 streams[0].synchronize() 1074 cp.cuda.Stream.null.synchronize() 1075 1076 1077 if self.sao: 1078 # It is possible that the supplied density matrix to the SCF was in SAO format already. 1079 # In such a case we need to transform this density matrix to CAO basis so that the J and XC term evaluations can be done properly 1080 if not dmat.shape==S.shape: 1081 dmat = np.dot(sph2c_mat_pseudo, np.dot(dmat, sph2c_mat_pseudo.T)) # Convert to CAO from SAO (SAO --> CAO) 1082 # Later the dmat will be converted back to SAO after J and XC term evaluations 1083 if not isDF: # 4c2e ERI case 1084 1085 print('\nCalculating four centered two electron integrals (ERIs)...\n\n', flush=True) 1086 if not isSchwarz: 1087 # Four centered two electron integrals (ERIs) 1088 if rys: 1089 ints4c2e = Integrals.rys_4c2e_symm(basis) 1090 else: 1091 ints4c2e = Integrals.conv_4c2e_symm(basis) 1092 print('Four Center Two electron ERI size in GB ',ints4c2e.nbytes/1e9, flush=True) 1093 print('done!') 1094 else: 1095 print('\n\nPerforming Schwarz screening...') 1096 # threshold_schwarz = 1e-09 1097 print('Threshold ', threshold_schwarz) 1098 startSchwarz = timer() 1099 nints4c2e_sym8 = int(basis.bfs_nao**4/8) 1100 nints4c2e = int(basis.bfs_nao**4) 1101 duration_4c2e_diag = 0.0 1102 start_4c2e_diag = timer() 1103 # Diagonal elements of ERI 4c2e array 1104 ints4c2e_diag = Integrals.schwarz_helpers.eri_4c2e_diag(basis) 1105 duration_4c2e_diag = timer() - start_4c2e_diag 1106 print('Time taken to evaluate the "diagonal" of 4c2e ERI tensor: ', duration_4c2e_diag) 1107 # Calculate the square roots required for 1108 duration_square_roots = 0.0 1109 start_square_roots = timer() 1110 sqrt_ints4c2e_diag = np.sqrt(np.abs(ints4c2e_diag)) 1111 duration_square_roots = timer() - start_square_roots 1112 print('Time taken to evaluate the square roots needed: ', duration_square_roots) 1113 chunksize = int(1e9) # Results in 2 GB chunks 1114 duration_indices_calc = 0.0 1115 duration_concatenation = 0.0 1116 start_indices_calc = timer() 1117 # indices_temp = [] 1118 ijkl = [0, 0, 0, 0] 1119 if chunksize<nints4c2e_sym8: 1120 nchunks = nints4c2e_sym8//chunksize + 1 1121 else: 1122 nchunks=1 1123 indicesA = None 1124 for ichunk in range(nchunks): 1125 indices_temp = Integrals.schwarz_helpers.calc_indices_4c2e_schwarz(sqrt_ints4c2e_diag, min(chunksize, nints4c2e_sym8), basis.bfs_nao, ijkl[0], ijkl[1], ijkl[2], ijkl[3], threshold_schwarz) 1126 1127 ijkl = indices_temp[4] 1128 count = indices_temp[5] 1129 # start_concatenation = timer() 1130 if indicesA is not None and count>0: 1131 indicesA = np.concatenate([indicesA, indices_temp[0][0:count]]) 1132 indicesB = np.concatenate([indicesB, indices_temp[1][0:count]]) 1133 indicesC = np.concatenate([indicesC, indices_temp[2][0:count]]) 1134 indicesD = np.concatenate([indicesD, indices_temp[3][0:count]]) 1135 else: 1136 1137 indicesA = indices_temp[0][0:count] 1138 indicesB = indices_temp[1][0:count] 1139 indicesC = indices_temp[2][0:count] 1140 indicesD = indices_temp[3][0:count] 1141 # duration_concatenation += timer() - start_concatenation 1142 # Break out of the for loop if the nol. of significant triplets found is less than the chunksize 1143 # This is because, it means that there are no more significant triplets to be found from all possible configurations. 1144 if count<chunksize: 1145 break 1146 1147 duration_indices_calc += timer() - start_indices_calc 1148 print('Time for significant indices evaluation: ', duration_indices_calc) 1149 # print('Time for array concatenation: ', duration_concatenation) 1150 1151 # Get rid of temp variables 1152 indices_temp=0 1153 ijk = 0 1154 1155 print('Size of permanent array storing the significant indices of 4c2e ERI in GB ', indicesA.nbytes/1e9+indicesB.nbytes/1e9+indicesC.nbytes/1e9+indicesD.nbytes/1e9, flush=True) 1156 1157 nsignificant = len(indicesA) 1158 print('No. of elements in the standard four-centered two electron ERI tensor: ', nints4c2e, flush=True) 1159 print('No. of elements after factoring in the 8-fold symmetry in the four-centered two electron ERI tensor: ', nints4c2e_sym8, flush=True) 1160 print('No. of significant quadruplets based on Schwarz inequality and 8-fold symmetry: ' + str(nsignificant) + ' or '+str(np.round(nsignificant/nints4c2e*100,1)) + '% of original', flush=True) 1161 print('Schwarz screening done!') 1162 durationSchwarz = timer() - startSchwarz 1163 print('Total time taken for Schwarz screening '+str(durationSchwarz)+' seconds.\n', flush=True) 1164 1165 ints4c2e = Integrals.schwarz_helpers.rys_4c2e_tri_schwarz_sparse(basis, auxbasis, indicesA, indicesB, indicesC, indicesD) 1166 1167 else: # Density fitting case (3c2e, and 2c2e will be calculated) 1168 H_temp, V_temp, ints3c2e, ints2c2e, nsignificant, indicesA, indicesB, indicesC, offsets_3c2e, indices, ints4c2e_diag, sqrt_ints4c2e_diag, sqrt_diag_ints2c2e, indices_dmat_tri, indices_dmat_tri_2, df_coeff0, Qpq, cho_decomp_ints2c2e, durationDF_cholesky, durationCoulomb = density_fitting_prelims_for_DFT_development(mol, basis, auxbasis, T, dmat, self.use_gpu, self.keep_ints3c2e_in_gpu, threshold_schwarz, strict_schwarz, rys, DF_algo, cholesky) 1169 if not H_temp is None: 1170 H = H_temp 1171 if not V_temp is None: 1172 V = V_temp 1173 if cholesky: 1174 durationDF += durationDF_cholesky 1175 1176 1177 1178 1179 1180 scf_converged = False 1181 durationgrids = 0 1182 1183 if grids is None: 1184 startGrids = timer() 1185 print('\nGenerating grids...\n\n', flush=True) 1186 # To get the same energies as PySCF (level=5) upto 1e-7 au, use the following settings 1187 # radial_precision = 1.0e-13 1188 # level=3 1189 # pruning by density with threshold = 1e-011 1190 # alpha_min and alpha_max corresponding to QZVP 1191 print('Grids level: ', gridsLevel) 1192 # Generate grids for XC term 1193 basisGrids = Basis(mol, {'all':Basis.load(mol=mol, basis_name='def2-QZVP')}) 1194 grids = Grids(mol, basis=basisGrids, level = gridsLevel, ncores=ncores) 1195 1196 print('done!', flush=True) 1197 durationgrids = timer() - startGrids 1198 print('Time taken '+str(durationgrids)+' seconds.\n', flush=True) 1199 1200 # Begin pruning the grids based on density (rho) 1201 # Evaluate ao_values to calculate rho 1202 print('\nPruning generated grids by rho...\n\n', flush=True) 1203 startGrids_prune_rho = timer() 1204 threshold_rho = 1e-011 1205 ngrids_temp = grids.coords.shape[0] 1206 ndeleted = 0 1207 blocksize_temp = 50000 1208 nblocks_temp = ngrids_temp//blocksize_temp 1209 weightsNew = None 1210 coordsNew = None 1211 for iblock in range(nblocks_temp+1): 1212 offset = iblock*blocksize_temp 1213 weights_block = grids.weights[offset : min(offset+blocksize_temp,ngrids_temp)] 1214 coords_block = grids.coords[offset : min(offset+blocksize_temp,ngrids_temp)] 1215 ao_value_block = Integrals.bf_val_helpers.eval_bfs(basis, coords_block) 1216 rho_block = contract('ij,mi,mj->m', dmat, ao_value_block, ao_value_block) 1217 zero_indices = np.where(np.abs(rho_block*weights_block) < threshold_rho)[0] 1218 ndeleted += len(zero_indices) 1219 weightsNew_block = np.delete(weights_block, zero_indices) 1220 coordsNew_block = np.delete(coords_block, zero_indices, 0) 1221 if weightsNew_block.shape[0]>0: 1222 if weightsNew is None: 1223 weightsNew = weightsNew_block 1224 coordsNew = coordsNew_block 1225 else: 1226 weightsNew = np.concatenate((weightsNew, weightsNew_block)) 1227 coordsNew = np.concatenate([coordsNew, coordsNew_block], axis=0) 1228 1229 grids.coords = coordsNew 1230 grids.weights = weightsNew 1231 print('done!', flush=True) 1232 durationgrids_prune_rho = timer() - startGrids_prune_rho 1233 print('Time taken '+str(durationgrids_prune_rho)+' seconds.\n', flush=True) 1234 print('\nDeleted '+ str(ndeleted) + ' grid points.', flush=True) 1235 1236 else: 1237 print('\nUsing the user supplied grids!\n\n', flush=True) 1238 1239 1240 # Grid information initial 1241 print('\nNo. of supplied/generated grid points: ', grids.coords.shape[0], flush=True) 1242 1243 # Prune grids based on weights 1244 # start_pruning_weights = timer() 1245 # print('\nPruning grids based on weights....', flush=True) 1246 # zero_indices = np.where(np.logical_and(grids.weights>=-1.0e-12, grids.weights<=1.e-12)) 1247 # grids.weights = np.delete(grids.weights, zero_indices) 1248 # grids.coords = np.delete(grids.coords, zero_indices, 0) 1249 # print('done!', flush=True) 1250 # duration_pruning_weights = timer() - start_pruning_weights 1251 # print('\nTime taken '+str(duration_pruning_weights)+' seconds.\n', flush=True) 1252 # print('\nNo. of grid points after screening by weights: ', grids.coords.shape[0], flush=True) 1253 1254 print('Size (in GB) for storing the coordinates of grid: ', grids.coords.nbytes/1e9, flush=True) 1255 print('Size (in GB) for storing the weights of grid: ', grids.weights.nbytes/1e9, flush=True) 1256 print('Size (in GB) for storing the density at gridpoints: ', grids.weights.nbytes/1e9, flush=True) 1257 1258 # Sort the grids for slightly better performance with batching (doesn't seem to make much difference) 1259 if sortGrids: 1260 print('\nSorting grids ....', flush=True) 1261 # Function to sort grids 1262 def get_ordered_list(points, x, y, z): 1263 points.sort(key = lambda p: (p[0] - x)**2 + (p[1] - y)**2 + (p[2] - z)**2) 1264 # print(points[0:10]) 1265 return points 1266 # Make a single array of coords and weights 1267 coords_weights = np.c_[grids.coords, grids.weights] 1268 coords_weights = np.array(get_ordered_list(coords_weights.tolist(), min(grids.coords[:,0]), min(grids.coords[:,1]), min(grids.coords[:,2]))) 1269 # Now go back to two arrays for coords and weights 1270 grids.weights = coords_weights[:,3] 1271 grids.coords = coords_weights[:,0:3] 1272 coords_weights = 0#None 1273 print('done!', flush=True) 1274 1275 # blocksize = 10000 1276 ngrids = grids.coords.shape[0] 1277 nblocks = ngrids//blocksize 1278 print('\nWill use batching to evaluate the XC term for memory efficiency.', flush=True) 1279 print('Batch size: ', blocksize, flush=True) 1280 print('No. of batches: ', nblocks+1, flush=True) 1281 1282 1283 #### Some preliminary stuff for XC evaluation 1284 durationXCpreprocessing = 0 1285 list_nonzero_indices = None 1286 count_nonzero_indices = None 1287 list_ao_values = None 1288 list_ao_grad_values = None 1289 if xc_bf_screen: 1290 xc_family_dict = {1:'LDA',2:'GGA',4:'MGGA'} 1291 # Create a LibXC object 1292 funcx = pylibxc.LibXCFunctional(xc[0], "unpolarized") 1293 funcc = pylibxc.LibXCFunctional(xc[1], "unpolarized") 1294 x_family_code = funcx.get_family() 1295 c_family_code = funcc.get_family() 1296 ### Find the list of significanlty contributing bfs for xc evaluations 1297 startXCpreprocessing = timer() 1298 print('\nPreliminary processing for XC term evaluations...', flush=True) 1299 print('Calculating the value of basis functions (atomic orbitals) and get the indices of siginificantly contributing functions...', flush=True) 1300 # Calculate the value of basis functions for all grid points in batches 1301 # and find the indices of basis functions that have a significant contribution to those batches for each batch 1302 if not self.use_gpu: 1303 list_nonzero_indices, count_nonzero_indices = Integrals.bf_val_helpers.nonzero_ao_indices(basis, grids.coords, blocksize, nblocks, ngrids) 1304 else: 1305 list_nonzero_indices, count_nonzero_indices = Integrals.bf_val_helpers.nonzero_ao_indices_cupy(basis, grids.coords, blocksize, nblocks, ngrids, streams[0]) 1306 print('done!', flush=True) 1307 durationXCpreprocessing = timer() - startXCpreprocessing 1308 print('Time taken '+str(durationXCpreprocessing)+' seconds.\n', flush=True) 1309 print('Maximum no. of basis functions contributing to a batch of grid points: ', max(count_nonzero_indices)) 1310 print('Average no. of basis functions contributing to a batch of grid points: ', int(np.mean(count_nonzero_indices))) 1311 1312 1313 durationAO_values = 0 1314 if save_ao_values: 1315 startAO_values = timer() 1316 if xc_family_dict[x_family_code]=='LDA' and xc_family_dict[c_family_code]=='LDA': 1317 print('\nYou have asked to save the values of significant basis functions on grid points so as to avoid recalculation for each SCF cycle.', flush=True) 1318 memory_required = sum(count_nonzero_indices*blocksize)*8/1024/1024/1024 1319 print('Please note: This will require addtional memory that is approximately: '+ str(np.round(memory_required,1))+ ' GB', flush=True) 1320 print('Calculating the value of significantly contributing basis functions (atomic orbitals)...', flush=True) 1321 list_ao_values = [] 1322 # Loop over batches 1323 for iblock in range(nblocks+1): 1324 offset = iblock*blocksize 1325 coords_block = grids.coords[offset : min(offset+blocksize,ngrids)] 1326 ao_values_block = Integrals.bf_val_helpers.eval_bfs(basis, coords_block, parallel=True, non_zero_indices=list_nonzero_indices[iblock][0:count_nonzero_indices[iblock]]) 1327 if self.use_gpu and self.keep_ao_in_gpu: 1328 list_ao_values.append(cp.asarray(ao_values_block)) 1329 else: 1330 list_ao_values.append(ao_values_block) 1331 #Free memory 1332 ao_values_block = 0 1333 if xc_family_dict[x_family_code]!='LDA' or xc_family_dict[c_family_code]!='LDA': 1334 print('\nYou have asked to save the values of significant basis functions and their gradients on grid points so as to avoid recalculation for each SCF cycle.', flush=True) 1335 memory_required = 4*sum(count_nonzero_indices*blocksize)*8/1024/1024/1024 1336 print('Please note: This will require addtional memory that is approximately :'+ str(np.round(memory_required,1))+ ' GB', flush=True) 1337 print('Calculating the value of significantly contributing basis functions (atomic orbitals)...', flush=True) 1338 list_ao_values = [] 1339 list_ao_grad_values = [] 1340 # Loop over batches 1341 for iblock in range(nblocks+1): 1342 offset = iblock*blocksize 1343 coords_block = grids.coords[offset : min(offset+blocksize,ngrids)] 1344 # ao_values_block = Integrals.bf_val_helpers.eval_bfs(basis, coords_block, parallel=True, non_zero_indices=list_nonzero_indices[iblock][0:count_nonzero_indices[iblock]]) 1345 ao_values_block, ao_grad_values_block = Integrals.bf_val_helpers.eval_bfs_and_grad(basis, coords_block, parallel=True, non_zero_indices=list_nonzero_indices[iblock][0:count_nonzero_indices[iblock]]) 1346 if self.use_gpu and self.keep_ao_in_gpu: 1347 list_ao_values.append(cp.asarray(ao_values_block)) 1348 list_ao_grad_values.append(cp.asarray(ao_grad_values_block)) 1349 else: 1350 list_ao_values.append(ao_values_block) 1351 list_ao_grad_values.append(ao_grad_values_block) 1352 #Free memory 1353 ao_values_block = 0 1354 ao_grad_values_block =0 1355 print('done!', flush=True) 1356 durationAO_values = timer() - startAO_values 1357 print('Time taken '+str(durationAO_values)+' seconds.\n', flush=True) 1358 1359 if self.use_gpu: 1360 grids.coords = cp.asarray(grids.coords, dtype=cp.float64) 1361 grids.weights = cp.asarray(grids.weights, dtype=cp.float64) 1362 1363 #-------XC Stuff start---------------------- 1364 1365 funcid = self.xc 1366 1367 xc_family_dict = {1:'LDA',2:'GGA',4:'MGGA'} 1368 1369 # Create a LibXC object 1370 funcx = pylibxc.LibXCFunctional(funcid[0], "unpolarized") 1371 funcc = pylibxc.LibXCFunctional(funcid[1], "unpolarized") 1372 x_family_code = funcx.get_family() 1373 c_family_code = funcc.get_family() 1374 1375 1376 print('\n\n------------------------------------------------------', flush=True) 1377 print('Exchange-Correlation Functional') 1378 print('------------------------------------------------------\n', flush=True) 1379 print('XC Functional IDs supplied: ', funcid, flush=True) 1380 print('\n\nDescription of exchange functional: \n') 1381 print('The Exchange function belongs to the family:', xc_family_dict[x_family_code], flush=True) 1382 print(funcx.describe()) 1383 print('\n\nDescription of correlation functional: \n', flush=True) 1384 print(' The Correlation function belongs to the family:', xc_family_dict[c_family_code], flush=True) 1385 print(funcc.describe()) 1386 print('------------------------------------------------------\n', flush=True) 1387 print('\n\n', flush=True) 1388 #-------XC Stuff end---------------------- 1389 1390 Etot = 0 1391 1392 itr = 1 1393 Enn = self.nuclear_rep_energy(mol) 1394 #diis = scf.CDIIS() 1395 dmat_old = 0 1396 J_diff = 0 1397 Ecoul = 0.0 1398 Ecoul_temp = 0.0 1399 1400 durationxc = 0 1401 1402 startKS = timer() 1403 if self.sao: 1404 ######### 1405 # I use a trick to calculate the DFT energy in SAO basis. 1406 # The trick is to get rid of the extra information in the CAO basis matrices. 1407 # The following does just that. 1408 # Going from CAO --> SAO we lose the extra information then we go back to from SAO --> CAO 1409 # This way all the integrals (potential matrices and density matrices) can still be calculated 1410 # in the CAO basis. In fact even the KS matrix is diagonalized in the CAO basis with the extra information 1411 # removed by forward and backward transformation: CAO --> SAO followed by SAO --> CAO. 1412 # This also helps in reducing the number of transformations needed for calculation of various energy contributions. 1413 ######### 1414 # Convert the overlap matrix from CAO to SAO basis 1415 S = np.dot(c2sph_mat, np.dot(S, c2sph_mat.T)) # CAO --> SAO 1416 # Convert back to SAO so that now we lose the extra information that the CAO basis had 1417 S = np.dot(sph2c_mat_pseudo, np.dot(S, sph2c_mat_pseudo.T)) 1418 if orthogonalize: 1419 if not self.use_gpu: 1420 eig_val_s, eig_vec_s = scipy.linalg.eigh(S) 1421 # Removing the eigenvectors assoicated to the smallest eigenvalue. 1422 x = eig_vec_s[:,eig_val_s>1e-7] / np.sqrt(eig_val_s[eig_val_s>1e-7]) 1423 else: 1424 x = None 1425 durationKS = durationKS + timer() - startKS 1426 1427 1428 while not (scf_converged or itr==max_itr+1): 1429 startIter = timer() 1430 if itr==1: 1431 dmat_diff = dmat # This is in CAO basis 1432 # Coulomb (Hartree) matrix 1433 if not isDF: 1434 J = contract('ijkl,ij',ints4c2e,dmat) # This is in CAO basis 1435 else: 1436 J, durationDF, durationDF_coeff, durationDF_gamma, durationDF_Jtri, Ecoul_temp = Jmat_from_density_fitting(dmat, DF_algo, cholesky, cho_decomp_ints2c2e, df_coeff0, Qpq, ints3c2e, ints2c2e, indices_dmat_tri, indices_dmat_tri_2, indicesA, indicesB, indicesC, offsets_3c2e, sqrt_ints4c2e_diag, sqrt_diag_ints2c2e, threshold_schwarz, strict_schwarz, basis, auxbasis, self.use_gpu, self.keep_ints3c2e_in_gpu, durationDF_gamma, ncores, durationDF_coeff, durationDF_Jtri, durationDF) 1437 1438 1439 1440 1441 else: 1442 dmat_diff = dmat-dmat_old 1443 if not isDF: 1444 # J_diff = contract('ijkl,ij',ints4c2e,dmat_diff) 1445 # J += J_diff 1446 J = contract('ijkl,ij', ints4c2e, dmat) 1447 else: 1448 J, durationDF, durationDF_coeff, durationDF_gamma, durationDF_Jtri, Ecoul_temp = Jmat_from_density_fitting(dmat, DF_algo, cholesky, cho_decomp_ints2c2e, df_coeff0, Qpq, ints3c2e, ints2c2e, indices_dmat_tri, indices_dmat_tri_2, indicesA, indicesB, indicesC, offsets_3c2e, sqrt_ints4c2e_diag, sqrt_diag_ints2c2e, threshold_schwarz, strict_schwarz, basis, auxbasis, self.use_gpu, self.keep_ints3c2e_in_gpu, durationDF_gamma, ncores, durationDF_coeff, durationDF_Jtri, durationDF) 1449 # J += J_diff 1450 if self.use_gpu: 1451 J = cp.asarray(J, dtype=cp.float64) 1452 streams[0].synchronize() 1453 cp.cuda.Stream.null.synchronize() 1454 1455 1456 # XC energy and potential 1457 startxc = timer() 1458 1459 if not self.use_gpu: 1460 if XC_algo==1: 1461 # Much slower than JOBLIB version 1462 # Still keeping it because, it can be useful when using GPUs 1463 Exc, Vxc = Integrals.eval_xc_1(basis, dmat, grids.weights, grids.coords, funcid, blocksize=blocksize, debug=debug, \ 1464 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1465 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values) 1466 if XC_algo==2: 1467 # Much faster than above and stable too, therefore this should be default now. 1468 # Used to unstable and had memory leaks, 1469 # But now all that is fixed by using threadpoolctl, garbage collection or freeing up memory after XC evaluation at each iteration 1470 1471 Exc, Vxc = Integrals.eval_xc_2(basis, dmat, grids.weights, grids.coords, funcid, ncores=ncores, blocksize=blocksize, \ 1472 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1473 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values, debug=debug) 1474 1475 if XC_algo==3: 1476 with threadpool_limits(limits=1, user_api='blas'): 1477 Exc, Vxc = Integrals.eval_xc_3(basis, dmat, grids.weights, grids.coords, funcid, ncores=ncores, blocksize=blocksize, \ 1478 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1479 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values, debug=debug) 1480 else: # GPU 1481 if XC_algo==1: 1482 Exc, Vxc = Integrals.eval_xc_1_cupy(basis, dmat_cp, grids.weights, grids.coords, funcid, blocksize=blocksize, debug=debug, \ 1483 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1484 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values, use_libxc=self.use_libxc,\ 1485 nstreams=self.n_streams, ngpus=self.n_gpus, freemem=self.free_gpu_mem, threads_per_block=threads_per_block, 1486 type=precision_XC) 1487 if XC_algo==2: 1488 Exc, Vxc = Integrals.eval_xc_2_cupy(basis, dmat_cp, grids.weights, cp.asnumpy(grids.coords), funcid, ncores=ncores, blocksize=blocksize, \ 1489 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1490 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values, debug=debug) 1491 if XC_algo==3: 1492 # Default for GPUs 1493 Exc, Vxc = Integrals.eval_xc_3_cupy(basis, dmat_cp, grids.weights, grids.coords, funcid, blocksize=blocksize, debug=debug, \ 1494 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1495 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values, use_libxc=self.use_libxc,\ 1496 nstreams=self.n_streams, ngpus=self.n_gpus, freemem=self.free_gpu_mem, threads_per_block=threads_per_block, 1497 type=precision_XC, streams=streams, nb_streams=nb_streams) 1498 1499 Vxc = cp.asarray(Vxc, dtype=cp.float64) 1500 1501 durationxc = durationxc + timer() - startxc 1502 1503 1504 dmat_old = dmat # This is in CAO basis 1505 1506 1507 if self.use_gpu: 1508 Enuc = contract('ij,ji->', dmat_cp, V) 1509 Ekin = contract('ij,ji->', dmat_cp, T) 1510 Ecoul = contract('ij,ji->', dmat_cp, J)*0.5 1511 else: 1512 with threadpool_limits(limits=ncores, user_api='blas'): 1513 # print('Energy contractions', controller.info()) 1514 Enuc = contract('ij,ji->', dmat, V) 1515 Ekin = contract('ij,ji->', dmat, T) 1516 Ecoul = contract('ij,ji->', dmat, J)*0.5 1517 if isDF and (DF_algo==6 or DF_algo==10): 1518 Ecoul = Ecoul*2 - 0.5*Ecoul_temp # This is the correct formula for Coulomb energy with DF 1519 1520 Etot_new = Exc + Enuc + Ekin + Enn + Ecoul 1521 self.Total_energy = Etot_new 1522 self.XC_energy = Exc 1523 self.Kinetic_energy = Ekin 1524 self.Nuclear_repulsion_energy = Enn 1525 self.J_energy = Ecoul 1526 self.Nuc_energy = Enuc 1527 1528 # Set label width and numeric format 1529 label_w = 30 1530 num_fmt = "{:>20.13f}" # 20-wide, 10 decimal places 1531 1532 print(f"\n\n\n------Iteration {itr}--------\n\n", flush=True) 1533 print("Energies\n") 1534 print(f"{'Electron-Nuclear Energy':<{label_w}}{num_fmt.format(Enuc)}") 1535 print(f"{'Nuclear repulsion Energy':<{label_w}}{num_fmt.format(Enn)}") 1536 print(f"{'Kinetic Energy':<{label_w}}{num_fmt.format(Ekin)}") 1537 print(f"{'Coulomb Energy':<{label_w}}{num_fmt.format(Ecoul)}") 1538 print(f"{'Exchange-Correlation Energy':<{label_w}}{num_fmt.format(Exc)}") 1539 print('-' * (label_w + 20)) 1540 print(f"{'Total Energy':<{label_w}}{num_fmt.format(Etot_new)}") 1541 print('-' * (label_w + 20) + "\n\n\n", flush=True) 1542 1543 1544 1545 print('Energy difference : ',abs(Etot_new-Etot), flush=True) 1546 1547 # Check convergence criteria 1548 if abs(Etot_new-Etot)<conv_crit: 1549 scf_converged = True 1550 print('\nSCF Converged after '+str(itr) +' iterations!', flush=True) 1551 Etot = Etot_new 1552 print('\n-------------------------------------', flush=True) 1553 print('Total Energy = ', Etot, flush=True) 1554 print('-------------------------------------\n\n', flush=True) 1555 break 1556 1557 Etot = Etot_new 1558 itr = itr + 1 1559 1560 1561 if itr>=max_itr and not scf_converged: 1562 print('\nSCF NOT Converged after '+str(itr-1) +' iterations!', flush=True) 1563 1564 1565 1566 if not scf_converged: 1567 KS = H + J + Vxc 1568 if self.sao: 1569 # The following gets rid of the extra information in the CAO basis KS matrix by going to SAO and then back to CAO. 1570 # This way even though the matrix dimensions would be that of CAO but the information would be the same as SAO 1571 # leading to the same energy as SAO basis PySCF or TURBOMOLE calculations 1572 KS = np.dot(c2sph_mat, np.dot(KS, c2sph_mat.T)) # CAO --> SAO 1573 KS = np.dot(sph2c_mat_pseudo, np.dot(KS, sph2c_mat_pseudo.T)) #SAO --> CAO 1574 1575 1576 #### DIIS 1577 startDIIS = timer() 1578 diis_start_itr = 1 1579 if itr >= diis_start_itr: 1580 if not self.use_gpu: 1581 with threadpool_limits(limits=ncores, user_api='blas'): 1582 # print('DIIS ', controller.info()) 1583 KS = self.DIIS(S, dmat, KS) 1584 else: 1585 KS = self.DIIS_cupy(S, dmat_cp, KS) 1586 streams[0].synchronize() 1587 cp.cuda.Stream.null.synchronize() 1588 durationDIIS = durationDIIS + timer() - startDIIS 1589 1590 #### Solve KS equation (Diagonalize KS matrix) 1591 startKS = timer() 1592 if self.use_gpu: 1593 eigvalues, eigvectors = self.solve_cupy(KS, S, orthogonalize=True) # Orthogonalization is necessary with CUDA 1594 mo_occ = self.getOcc_cupy(mol, eigvalues, eigvectors) 1595 dmat = self.gen_dm_cupy(eigvectors, mo_occ) 1596 dmat_cp = dmat 1597 dmat = cp.asnumpy(dmat) 1598 streams[0].synchronize() 1599 cp.cuda.Stream.null.synchronize() 1600 # HOMO-LUMO gap 1601 occupied = cp.where(mo_occ > 1e-8)[0] 1602 if len(occupied) < len(eigvalues): 1603 homo_idx = occupied[-1] 1604 lumo_idx = homo_idx + 1 1605 gap = (eigvalues[lumo_idx] - eigvalues[homo_idx]) 1606 print(f"\n\nHOMO-LUMO gap: {gap} au", flush=True) 1607 print(f"HOMO-LUMO gap: {gap*27.211324570273:.3f} eV", flush=True) 1608 else: 1609 with threadpool_limits(limits=ncores, user_api='blas'): 1610 # print('KS eigh', controller.info()) 1611 eigvalues, eigvectors = self.solve(KS, S, orthogonalize=orthogonalize, x=x) 1612 mo_occ = self.getOcc(mol, eigvalues, eigvectors) 1613 dmat = self.gen_dm(eigvectors, mo_occ) 1614 # HOMO-LUMO gap 1615 occupied = np.where(mo_occ > 1e-8)[0] 1616 if len(occupied) < len(eigvalues): 1617 homo_idx = occupied[-1] 1618 lumo_idx = homo_idx + 1 1619 gap = (eigvalues[lumo_idx] - eigvalues[homo_idx]) 1620 print(f"\n\nHOMO-LUMO gap: {gap} au", flush=True) 1621 print(f"HOMO-LUMO gap: {gap*27.211324570273:.3f} eV", flush=True) 1622 durationKS = durationKS + timer() - startKS 1623 1624 self.dmat = dmat 1625 self.mo_coefficients = eigvectors 1626 self.mo_energies = eigvalues 1627 self.mo_occupations = mo_occ 1628 1629 # Check when to switch to double precision for XC 1630 if self.use_gpu: 1631 if precision_XC is cp.float32: 1632 if abs(Etot_new-Etot)/abs(Etot_new)<5e-7: 1633 precision_XC = cp.float64 1634 print('\nSwitching to double precision for XC evaluation after '+str(itr) +' iterations!', flush=True) 1635 1636 1637 1638 durationItr = timer() - startIter 1639 print('\n\nTime taken for the previous iteration: '+str(durationItr)+'\n\n', flush=True) 1640 1641 # # Check convergence criteria 1642 # if abs(Etot_new-Etot)<conv_crit: 1643 # scf_converged = True 1644 # print('\nSCF Converged after '+str(itr) +' iterations!', flush=True) 1645 # Etot = Etot_new 1646 # print('\n-------------------------------------', flush=True) 1647 # print('Total Energy = ', Etot, flush=True) 1648 # print('-------------------------------------\n\n', flush=True) 1649 # break 1650 1651 # Etot = Etot_new 1652 # itr = itr + 1 1653 1654 1655 # if itr>=max_itr and not scf_converged: 1656 # print('\nSCF NOT Converged after '+str(itr-1) +' iterations!', flush=True) 1657 1658 1659 1660 durationSCF = timer() - startSCF 1661 # print(dmat) 1662 print('\nTime taken : '+str(durationSCF) +' seconds.', flush=True) 1663 print('\n\n', flush=True) 1664 print('-------------------------------------', flush=True) 1665 print('Profiling', flush=True) 1666 print('-------------------------------------', flush=True) 1667 print('Preprocessing ', durationXCpreprocessing + durationAO_values + durationgrids_prune_rho + durationSchwarz) 1668 if isDF: 1669 print('Density Fitting ', durationDF, flush=True) 1670 if DF_algo==6 or DF_algo==10: 1671 print(' DF (gamma) ', durationDF_gamma, flush=True) 1672 print(' DF (coeff) ', durationDF_coeff, flush=True) 1673 print(' DF (Jtri) ', durationDF_Jtri, flush=True) 1674 if cholesky: 1675 print(' DF (Cholesky) ', durationDF_cholesky, flush=True) 1676 print('DIIS ', durationDIIS, flush=True) 1677 print('KS matrix diagonalization ', durationKS, flush=True) 1678 print('One electron Integrals (S, T, Vnuc) ', duration1e, flush=True) 1679 if isDF: 1680 print('Coulomb Integrals (2c2e + 3c2e) ', durationCoulomb-durationSchwarz-durationDF_cholesky, flush=True) 1681 if not isDF: 1682 print('Coulomb Integrals (4c2e) ', durationCoulomb-durationSchwarz, flush=True) 1683 print('Grids construction ', durationgrids, flush=True) 1684 print('Exchange-Correlation Term ', durationxc, flush=True) 1685 totalTime = durationXCpreprocessing + durationAO_values + duration1e + durationCoulomb - durationDF_cholesky + \ 1686 durationgrids + durationxc + durationDF + durationKS + durationDIIS + durationgrids_prune_rho 1687 print('Misc. ', durationSCF - totalTime, flush=True) 1688 print('Complete SCF ', durationSCF, flush=True) 1689 1690 if self.use_gpu: 1691 # Free memory of all GPUs 1692 for igpu in range(self.n_gpus): 1693 cp.cuda.Device(igpu).use() 1694 cp._default_memory_pool.free_all_blocks() 1695 1696 # Switch back to main GPU 1697 cp.cuda.Device(0).use() 1698 1699 return Etot, dmat 1700 1701 1702 1703 1704 1705 1706 1707 1708
70class DFT: 71 """ 72 A class for performing Density Functional Theory (DFT) calculations 73 with optional support for density fitting (DF), GPU acceleration, and LibXC. 74 75 Parameters 76 ---------- 77 mol : Molecule 78 Molecular object on which the DFT calculation is to be performed. 79 80 basis : Basis 81 Orbital basis set used for the SCF calculation. 82 83 auxbasis : Basis, optional 84 Auxiliary basis set for density fitting (DF). If None, a default will be assigned. 85 86 conv_crit : float, optional 87 Convergence criterion for the SCF cycle in Hartrees (default is 1e-7). 88 89 dmat_guess_method : str, optional 90 Method for the initial density matrix guess (e.g., 'core', 'huckel'). 91 92 xc : list or str, optional 93 Exchange-correlation functional specification. If None, defaults to LDA (`[1, 7]`). 94 95 grids : object, optional 96 Precomputed numerical integration grids. If None, they will be generated automatically. 97 98 gridsLevel : int, optional 99 Level of numerical integration grid refinement (default is 3). 100 101 blocksize : int, optional 102 Block size for XC grid evaluations. Defaults depend on whether GPU is used. 103 104 save_ao_values : bool, optional 105 If True, saves AO values to reuse during XC evaluation. Increases speed but uses more memory. 106 107 use_gpu : bool, optional 108 Whether to use GPU acceleration. 109 110 ncores : int, optional 111 Number of CPU cores to use (default is 2). 112 113 Attributes 114 ---------- 115 dmat : ndarray 116 Initial guess for the density matrix, will be computed during setup. 117 118 KSmats : list 119 List of Kohn–Sham matrices used in DIIS extrapolation. 120 121 errVecs : list 122 List of error vectors for DIIS. 123 124 max_itr : int 125 Maximum number of SCF iterations (default is 50). 126 127 isDF : bool 128 Whether to use density fitting for Coulomb integrals. 129 130 rys : bool 131 Whether to use Rys quadrature for evaluating electron repulsion integrals. 132 133 DF_algo : int 134 Algorithm selector for DF (reserved for developer use). 135 136 XC_algo : int 137 Algorithm selector for XC evaluation (2 for CPU, 3 for GPU). 138 139 sortGrids : bool 140 Whether to sort DFT integration grids (not recommended). 141 142 xc_bf_screen : bool 143 Enable basis function screening for XC term evaluation. 144 145 threshold_schwarz : float 146 Threshold for Schwarz screening (default is 1e-9). 147 148 strict_schwarz : bool 149 If True, applies stricter Schwarz screening. 150 151 cholesky : bool 152 If True, uses Cholesky decomposition for DF. 153 154 orthogonalize : bool 155 If True, orthogonalizes AO basis functions. 156 157 sao : bool 158 If True, uses SAO basis instead of CAO basis. 159 160 keep_ao_in_gpu : bool 161 Whether to retain AO values in GPU memory during SCF (if `save_ao_values` is True). 162 163 use_libxc : bool 164 Whether to use LibXC for XC functional evaluation (recommended off for GPU). 165 166 n_streams : int 167 Number of CUDA streams to use (if applicable). 168 169 n_gpus : int 170 Number of GPUs to use. 171 172 free_gpu_mem : bool 173 Whether to forcibly free GPU memory after use. 174 175 max_threads_per_block : int 176 Maximum threads per CUDA block supported by the device. 177 178 threads_x : int 179 CUDA thread configuration (X dimension). 180 181 threads_y : int 182 CUDA thread configuration (Y dimension). 183 184 dynamic_precision : bool 185 Whether to use precision switching during XC evaluation for performance gains. 186 187 keep_ints3c2e_in_gpu : bool 188 Whether to keep 3-center 2-electron integrals in GPU memory to avoid transfers. 189 190 debug : bool 191 If True, prints debugging output during DFT calculations. 192 193 Notes 194 ----- 195 - This class supports SCF DFT calculations with density fitting (DF). 196 - GPU support is optional and provides significant speed-up for large systems. 197 - The class is tightly integrated with PyFock and LibXC libraries. 198 199 Examples 200 -------- 201 >>> mol = Molecule(...) 202 >>> basis = Basis(mol, ...) 203 >>> dft = DFT(mol, basis, xc='PBE', use_gpu=True) 204 >>> dft.run_scf() 205 """ 206 def __init__(self, mol, basis, auxbasis=None, conv_crit=1e-7, dmat_guess_method=None, 207 xc=None, grids=None, gridsLevel=3, blocksize=None, 208 save_ao_values=False, use_gpu=False, ncores=1): 209 210 self.mol = mol 211 """ Molecular object for which the DFT calculation will be performed """ 212 if self.mol is None: 213 print('ERROR: A Mol object is required to initialize a DFT object.') 214 215 self.basis = basis 216 """ Basis object for corresponding to the molecule for the DFT calculation """ 217 if self.basis is None: 218 print('ERROR: A Basis object is required to initialize a DFT object.') 219 220 221 self.dmat_guess_method = dmat_guess_method 222 """ Initial guess for the density matrix """ 223 if self.dmat_guess_method is None: 224 self.dmat_guess_method = 'core' 225 226 self.dmat = None 227 """ Initial density matrix guess for SCF. Will get updated at each SCF iteration. """ 228 229 self.xc = xc 230 """ Exchange-Correlation Functional """ 231 if self.xc is None: 232 self.xc = [1, 7] #LDA 233 print("XC not specified, defaulting to LDA functional (LibXC codes: 1, 7)") 234 235 # DIIS 236 self.KSmats = [] 237 self.errVecs = [] 238 self.diisSpace = 6 239 240 self.conv_crit = conv_crit 241 """ Convergence criterion for SCF (in Hartrees) """ 242 # if self.conv_crit is None: 243 # self.conv_crit = 1.0E-7 244 245 self.max_itr = 50 246 """ Maximum number of iterations for SCF """ 247 # if self.max_itr is None: 248 # self.max_itr = 50 249 250 self.ncores = ncores 251 """ Number of cores to be used for DFT calculation """ 252 if self.ncores is None: 253 self.ncores = 2 254 255 self.grids = grids 256 """ Atomic grids for DFT calculation (If None or not supplied, will be generated using NumGrid) """ 257 self.gridsLevel = gridsLevel 258 """ Atomic grids for DFT calculation """ 259 260 self.isDF = True 261 """ Use density fitting (DF) for two-electron Coulomb integrals. This is only for developers. Users should not change it. """ 262 263 self.auxbasis = auxbasis 264 """ Basis object to be used as the auxiliary basis for DF """ 265 if self.auxbasis is None: 266 auxbasis = Basis(mol, {'all':Basis.load(mol=mol, basis_name='def2-universal-jfit')}) 267 268 self.rys = True 269 """ Use rys quadrature for the evaluation of two electron integrals (with and without DF)""" 270 271 self.DF_algo = 10 272 """ This is only for developers. Users should not change it. """ 273 274 self.blocksize = blocksize 275 """ Block size for the evaulation of XC term on grids. For CPUs a value of ~5000 is recommended. For GPUs, a value >20480 is recommended. """ 276 if blocksize is None: 277 if use_gpu: 278 self.blocksize = 20480 279 else: 280 self.blocksize = 5000 281 282 self.XC_algo = None 283 """ This is only for developers. Users should not change it. The algorithm for XC evaluation should be 2 for CPU and 3 for GPU.""" 284 if use_gpu: 285 self.XC_algo = 3 286 else: 287 self.XC_algo = 2 288 289 self.debug = False 290 """ Turn on printing debug statements """ 291 self.sortGrids = False 292 """ Enable/Disable sorting of DFT grids. Doesn't seem to offer any signficant advantage.""" 293 self.save_ao_values = save_ao_values 294 """ Whether to save atomic orbital (AO) values for reuse during XC evaluation. Improves performance but requires more memory. """ 295 self.xc_bf_screen = True 296 """ Enable screening of basis functions for XC term evaluation to reduce computation time drastically. """ 297 self.threshold_schwarz = 1e-09 298 """ Threshold for Schwarz screening of two-electron integrals. Smaller values increase accuracy but reduce sparsity. """ 299 self.strict_schwarz = True 300 """ If True, enforce stricter Schwarz screening to aggressively eliminate small two-electron integrals. """ 301 self.cholesky = True 302 """ Whether to use Cholesky decomposition for DF. Slightly speeds up calculations. """ 303 self.orthogonalize = True 304 """ Apply orthogonalization to the AO basis. Should be True for most standard calculations. """ 305 306 self.mo_occupations = None 307 """ Molecular orbital occupations. Will be computed during SCF. """ 308 self.mo_coefficients = None 309 """ Molecular orbital coefficients. Will be computed during SCF. """ 310 self.mo_energies = None 311 """ Molecular orbital energies. Will be computed during SCF. """ 312 self.Total_energy = None 313 """ Total energy of the system. Will be computed during SCF. """ 314 self.J_energy = None 315 """ Coulomb energy contribution. Will be computed during SCF. """ 316 self.XC_energy = None 317 """ Exchange-correlation energy contribution. Will be computed during SCF. """ 318 self.Nuclear_rep_energy = None 319 """ Nuclear repulsion energy. Will be computed during SCF. """ 320 self.Kinetic_energy = None 321 """ Kinetic energy contribution. Will be computed during SCF. """ 322 self.Nuc_energy = None 323 """ Nuclear potential energy contribution. Will be computed during SCF. """ 324 325 326 # CAO or SAO 327 self.sao = False 328 """ Whether to use SAO basis or CAO basis. Default is CAO basis. """ 329 330 331 # GPU acceleration 332 self.use_gpu = use_gpu 333 """ Whether to use GPU acceleration or not """ 334 self.keep_ao_in_gpu = True 335 """ Whether to keep the atomic orbitals for XC evaluation in GPU memory or CPU memory. Only relevant if save_ao_values = True. """ 336 self.use_libxc = True 337 """ Whether to use LibXC's version of XC functionals or PyFock implementations. 338 Only relevant when GPU is used. For GPU calculations it is recommended to use PyFock 339 implementation as it avoids CPU-GPU transfers.""" 340 self.n_streams = 1 341 self.n_gpus = 1 342 """ Number of GPUs to be used """ 343 self.free_gpu_mem = False 344 """ Whether the GPU memory should be freed by force or not""" 345 try: 346 self.max_threads_per_block = cuda.get_current_device().MAX_THREADS_PER_BLOCK 347 except: 348 self.max_threads_per_block = 1024 349 350 self.threads_x = int(self.max_threads_per_block/16) 351 self.threads_y = int(self.max_threads_per_block/64) 352 self.dynamic_precision = False # Only for the XC term 353 """ Whether to use dynamic precision switching for XC term or not """ 354 self.keep_ints3c2e_in_gpu = True 355 """ Whether to keep the 3c2e integrals in GPU memory or not. 356 Recommended to keep in GPU memory to avoid CPU-GPU transfers at each iteration.""" 357 358 # self.max_threads_per_block = 1024 359 # self.threads_x = 64 360 # self.threads_y = 16 361 362 # if self.use_gpu: 363 # try: 364 # global cp 365 # global cupy_scipy 366 # import cupy as cp 367 # import cupyx.scipy as cupy_scipy 368 # # from cupy.linalg import eig, multi_dot as dot 369 # except ModuleNotFoundError: 370 # print('Cupy was not found!') 371 372 # def removeLinearDep(self, H, S): 373 # return 1 374 375 def nuclear_rep_energy(self, mol=None): 376 """ 377 Compute the nuclear-nuclear repulsion energy. 378 379 Parameters 380 ---------- 381 mol : Molecule, optional 382 Molecule object containing nuclear coordinates and charges. 383 If None, uses `self.mol`. 384 385 Returns 386 ------- 387 float 388 The nuclear repulsion energy in Hartrees. 389 """ 390 # Nuclear-nuclear energy 391 if mol is None: 392 mol = self.mol 393 394 e = 0 395 for i in range(mol.natoms): 396 for j in range(i, mol.natoms): 397 if (i!=j): 398 dist = mol.coordsBohrs[i]-mol.coordsBohrs[j] 399 e = e + mol.Zcharges[i]*mol.Zcharges[j]/np.sqrt(np.sum(dist**2)) 400 401 return e 402 403 # full density matrix for RKS/RHF 404 def gen_dm(self, mo_coeff, mo_occ): 405 """ 406 Generate the density matrix from molecular orbital coefficients and occupations. 407 408 Parameters 409 ---------- 410 mo_coeff : ndarray 411 Molecular orbital coefficient matrix. 412 413 mo_occ : ndarray 414 Array of MO occupation numbers. 415 416 Returns 417 ------- 418 ndarray 419 Density matrix (RHF/RKS type). 420 """ 421 mocc = mo_coeff[:,mo_occ>0] 422 423 return np.dot(mocc*mo_occ[mo_occ>0], mocc.conj().T) 424 425 def getOcc(self, mol=None, energy_mo=None, coeff_mo=None): 426 """ 427 Assign occupation numbers to molecular orbitals based on their energies. 428 429 Parameters 430 ---------- 431 mol : Molecule 432 Molecule object to extract the number of electrons. 433 434 energy_mo : ndarray 435 Array of MO energies. 436 437 coeff_mo : ndarray 438 Array of MO coefficients (not used but kept for compatibility). 439 440 Returns 441 ------- 442 ndarray 443 Array of MO occupations (0 or 2 for RHF). 444 """ 445 e_idx = np.argsort(energy_mo) 446 e_sort = energy_mo[e_idx] 447 nmo = energy_mo.size 448 occ_mo = np.zeros(nmo) 449 nocc = mol.nelectrons // 2 450 occ_mo[e_idx[:nocc]] = 2 451 return occ_mo 452 453 def gen_dm_cupy(self, mo_coeff, mo_occ): 454 """ 455 Generate the density matrix using CuPy for GPU acceleration. 456 457 Parameters 458 ---------- 459 mo_coeff : cp.ndarray 460 Molecular orbital coefficient matrix (on GPU). 461 462 mo_occ : cp.ndarray 463 Array of MO occupation numbers (on GPU). 464 465 Returns 466 ------- 467 cp.ndarray 468 Density matrix (RHF/RKS type) computed on the GPU. 469 """ 470 mocc = mo_coeff[:,mo_occ>0] 471 return cp.dot(mocc*mo_occ[mo_occ>0], mocc.conj().T) 472 473 def getOcc_cupy(self, mol=None, energy_mo=None, coeff_mo=None): 474 """ 475 Assign occupation numbers to MOs using CuPy (for GPU-based calculations). 476 477 Parameters 478 ---------- 479 mol : Molecule 480 Molecule object to determine number of electrons. 481 482 energy_mo : cp.ndarray 483 MO energy array on the GPU. 484 485 coeff_mo : cp.ndarray 486 MO coefficients array on the GPU (not used). 487 488 Returns 489 ------- 490 cp.ndarray 491 Array of MO occupations (on GPU). 492 """ 493 e_idx = cp.argsort(energy_mo) 494 e_sort = energy_mo[e_idx] 495 nmo = energy_mo.size 496 occ_mo = cp.zeros(nmo) 497 nocc = mol.nelectrons // 2 498 occ_mo[e_idx[:nocc]] = 2 499 return occ_mo 500 501 def solve(self, H, S, orthogonalize=False, x=None): 502 """ 503 Solve the generalized or canonical eigenvalue equation. 504 505 Parameters 506 ---------- 507 H : ndarray 508 Hamiltonian matrix. 509 510 S : ndarray 511 Overlap matrix. 512 513 orthogonalize : bool, optional 514 If True, solve using orthogonalized basis. 515 516 x : ndarray, optional 517 Transformation matrix (if already computed). 518 519 Returns 520 ------- 521 tuple of (ndarray, ndarray) 522 Eigenvalues and eigenvectors of the system. 523 """ 524 if not orthogonalize: 525 #Solve the generalized eigenvalue equation HC = SCE 526 eigvalues, eigvectors = scipy.linalg.eigh(H, S) 527 528 else: 529 if x is None: 530 eig_val_s, eig_vec_s = scipy.linalg.eigh(S) 531 # Removing the eigenvectors assoicated to the smallest eigenvalue. 532 x = eig_vec_s[:,eig_val_s>1e-7] / np.sqrt(eig_val_s[eig_val_s>1e-7]) 533 xHx = x.T @ H @ x 534 #Solve the canonical eigenvalue equation HC = CE 535 eigvalues, eigvectors = scipy.linalg.eigh(xHx) 536 eigvectors = np.dot(x, eigvectors) 537 538 idx = np.argmax(np.abs(eigvectors.real), axis=0) 539 eigvectors[:,eigvectors[idx,np.arange(len(eigvalues))].real<0] *= -1 540 return eigvalues, eigvectors # E, C 541 542 def solve_cupy(self, H, S, orthogonalize=True): 543 """ 544 Solve the generalized eigenvalue problem using CuPy (GPU). 545 546 Parameters 547 ---------- 548 H : cp.ndarray 549 Hamiltonian matrix (on GPU). 550 551 S : cp.ndarray 552 Overlap matrix (on GPU). 553 554 orthogonalize : bool, optional 555 If True, solve using orthogonalized basis. 556 557 Returns 558 ------- 559 tuple of (cp.ndarray, cp.ndarray) 560 Eigenvalues and eigenvectors (on GPU). 561 """ 562 eig_val_s, eig_vec_s = cp.linalg.eigh(S) 563 # Removing the eigenvectors assoicated to the smallest eigenvalue. 564 x = eig_vec_s[:,eig_val_s>1e-7] / cp.sqrt(eig_val_s[eig_val_s>1e-7]) 565 xhx = x.T @ H @ x 566 #Solve the canonical eigenvalue equation HC = SCE 567 eigvalues, eigvectors = cp.linalg.eigh(xhx) 568 eigvectors = cp.dot(x, eigvectors) 569 570 idx = cp.argmax(cp.abs(eigvectors.real), axis=0) 571 eigvectors[:,eigvectors[idx,cp.arange(len(eigvalues))].real<0] *= -1 572 return eigvalues, eigvectors # E, C 573 574 575 def getCoreH(self, mol=None, basis=None): 576 """ 577 Compute the core Hamiltonian matrix (T + V). 578 579 Parameters 580 ---------- 581 mol : Molecule, optional 582 Molecule object. 583 584 basis : Basis, optional 585 Basis set object. 586 587 Returns 588 ------- 589 ndarray 590 Core Hamiltonian matrix. 591 """ 592 #Get the core Hamiltonian 593 if mol is None: 594 mol = self.mol 595 if basis is None: 596 basis = self.basis 597 nao = basis.bfs_nao 598 H = np.empty((nao,nao)) 599 Vmat = Integrals.nuc_mat_symm(basis, mol) 600 Tmat = Integrals.kin_mat_symm(basis) 601 H = Vmat + Tmat 602 603 return H 604 605 606 def guessCoreH(self, mol=None, basis=None, Hcore=None, S=None): 607 """ 608 Generate a guess density matrix using the core Hamiltonian. 609 610 Parameters 611 ---------- 612 mol : Molecule, optional 613 Molecule object. 614 615 basis : Basis, optional 616 Basis set. 617 618 Hcore : ndarray, optional 619 Core Hamiltonian. If None, it will be computed. 620 621 S : ndarray, optional 622 Overlap matrix. If None, it will be computed. 623 624 Returns 625 ------- 626 ndarray 627 Initial guess density matrix. 628 """ 629 #Get a guess for the density matrix using the core Hamiltonian 630 if mol is None: 631 mol = self.mol 632 if basis is None: 633 basis = self.basis 634 if Hcore is None: 635 Hcore = self.getCoreH(mol, basis) 636 if S is None: 637 S = Integrals.overlap_mat_symm(basis) 638 639 eigvalues, eigvectors = scipy.linalg.eigh(Hcore, S) 640 # print(eigvalues) 641 idx = np.argmax(abs(eigvectors.real), axis=0) 642 eigvectors[:,eigvectors[idx,np.arange(len(eigvalues))].real<0] *= -1 643 mo_occ = self.getOcc(mol, eigvalues, eigvectors) 644 # print(mo_occ) 645 dmat = self.gen_dm(eigvectors, mo_occ) 646 647 return dmat 648 649 650 def DIIS(self,S,D,F): 651 """ 652 Perform Direct Inversion in the Iterative Subspace (DIIS) to improve SCF convergence. 653 654 Adapted from 655 ---------- 656 McMurchie-Davidson project: 657 https://github.com/jjgoings/McMurchie-Davidson 658 Licensed under the BSD-3-Clause license 659 660 Parameters 661 ---------- 662 S : ndarray 663 Overlap matrix. 664 665 D : ndarray 666 Density matrix. 667 668 F : ndarray 669 Fock matrix. 670 671 Returns 672 ------- 673 ndarray 674 DIIS-extrapolated Fock matrix. 675 """ 676 FDS = dot([F,D,S]) 677 # SDF = np.conjugate(FDS).T 678 errVec = FDS - np.conjugate(FDS).T 679 self.KSmats.append(F) 680 self.errVecs.append(errVec) 681 nKS = len(self.KSmats) 682 if nKS > self.diisSpace: 683 self.KSmats.pop(0) 684 self.errVecs.pop(0) 685 nKS = nKS - 1 686 B = np.zeros((nKS + 1,nKS + 1)) 687 B[-1,:] = B[:,-1] = -1.0 688 B[-1,-1] = 0.0 689 # B is symmetric 690 for i in range(nKS): 691 for j in range(i+1): 692 B[i,j] = B[j,i] = \ 693 np.real(np.trace(np.dot(np.conjugate(self.errVecs[i]).T, self.errVecs[j]))) 694 # for i in range(nKS): 695 # for j in range(i+1): 696 # print(self.errVecs[i].shape) 697 # B[i,j] = np.real(np.dot(np.conjugate(self.errVecs[i]).T, self.errVecs[j])) 698 # B[j,i] = B[i,j] 699 700 residual = np.zeros((nKS + 1, 1)) 701 residual[-1] = -1.0 702 weights = scipy.linalg.solve(B,residual) 703 704 # weights is 1 x numFock + 1, but first numFock values 705 # should sum to one if we are doing DIIS correctly 706 assert np.isclose(sum(weights[:-1]),1.0) 707 708 F = np.zeros(F.shape) 709 for i, KS in enumerate(self.KSmats): 710 weight = weights[i] 711 F += numexpr.evaluate('(weight * KS)') 712 713 return F 714 715 def DIIS_cupy(self,S,D,F): 716 """ 717 Perform DIIS on GPU using CuPy to accelerate SCF convergence. 718 719 Adapted from 720 ---------- 721 McMurchie-Davidson project: 722 https://github.com/jjgoings/McMurchie-Davidson 723 Licensed under the BSD-3-Clause license 724 725 Parameters 726 ---------- 727 S : cp.ndarray 728 Overlap matrix (on GPU). 729 730 D : cp.ndarray 731 Density matrix (on GPU). 732 733 F : cp.ndarray 734 Fock matrix (on GPU). 735 736 Returns 737 ------- 738 cp.ndarray 739 DIIS-extrapolated Fock matrix (on GPU). 740 """ 741 FDS = F @ D @ S 742 errVec = FDS - cp.conjugate(FDS).T 743 self.KSmats.append(F) 744 self.errVecs.append(errVec) 745 nKS = len(self.KSmats) 746 if nKS > self.diisSpace: 747 self.KSmats.pop(0) 748 self.errVecs.pop(0) 749 nKS = nKS - 1 750 B = cp.zeros((nKS + 1,nKS + 1)) 751 B[-1,:] = B[:,-1] = -1.0 752 B[-1,-1] = 0.0 753 # B is symmetric 754 for i in range(nKS): 755 for j in range(i+1): 756 B[i,j] = B[j,i] = \ 757 cp.real(cp.trace(cp.dot(cp.conjugate(self.errVecs[i]).T, self.errVecs[j]))) 758 759 residual = cp.zeros((nKS + 1, 1)) 760 residual[-1] = -1.0 761 weights = cp.linalg.solve(B,residual) 762 763 # weights is 1 x numFock + 1, but first numFock values 764 # should sum to one if we are doing DIIS correctly 765 assert cp.isclose(sum(weights[:-1]),1.0) 766 767 F = cp.zeros(F.shape) 768 for i, KS in enumerate(self.KSmats): 769 weight = weights[i] 770 F += weight * KS 771 772 return F 773 774 # @profile 775 def scf(self): 776 """ 777 Perform Self-Consistent Field (SCF) calculation for Density Functional Theory (DFT). 778 779 This method implements a complete DFT SCF procedure including: 780 - One-electron integral calculation (overlap, kinetic, nuclear attraction) 781 - Two-electron integral calculation (4-center ERIs or density fitting) 782 - Grid generation and pruning for exchange-correlation evaluation 783 - Iterative SCF cycles with DIIS convergence acceleration 784 - Exchange-correlation energy and potential evaluation using LibXC 785 - GPU acceleration support for performance-critical operations 786 787 The implementation supports multiple algorithmic variants: 788 - Density fitting (DF) 789 - Schwarz screening for integral sparsity 790 - Multiple XC evaluation algorithms (CPU/GPU optimized) 791 - Dynamic precision switching for GPU calculations 792 793 SCF Procedure: 794 1. Initialize one-electron integrals (S, T, V_nuc) 795 2. Calculate/prepare two-electron integrals with optional screening 796 3. Generate and prune integration grids for XC evaluation 797 4. Iterative SCF loop: 798 - Build Coulomb matrix J from density matrix 799 - Evaluate exchange-correlation energy/potential on grids 800 - Form Kohn-Sham matrix: H_KS = H_core + J + V_xc 801 - Apply DIIS convergence acceleration 802 - Diagonalize KS matrix to get new orbitals 803 - Generate new density matrix from occupied orbitals 804 - Check energy convergence 805 5. Return converged total energy and density matrix 806 807 Computational Features: 808 - Multi-threading support via Numba and configurable core count 809 - GPU acceleration using CuPy for grid-based operations 810 - Memory-efficient batched evaluation of XC terms 811 - Multiple density fitting algorithms (DF_algo 1-10) 812 - Basis function screening for XC evaluation efficiency 813 - Optional Cholesky decomposition for 2-center integrals 814 815 Returns: 816 -------- 817 tuple[float, numpy.ndarray] 818 Etot : float 819 Converged total electronic energy in atomic units 820 dmat : numpy.ndarray, shape (nbf, nbf) 821 Converged density matrix in atomic orbital basis 822 823 Raises: 824 ------- 825 ConvergenceError 826 If SCF fails to converge within max_itr iterations 827 828 Notes: 829 ------ 830 - Uses class attributes for all computational parameters (basis, xc, conv_crit, etc.) 831 - Extensive timing and profiling information printed during execution 832 - Memory usage information displayed for large arrays 833 - GPU memory automatically freed after completion 834 - Supports both Cartesian (CAO) and Spherical (SAO) atomic orbital bases 835 836 The function performs comprehensive error checking and provides detailed 837 timing breakdowns for performance analysis. GPU acceleration is automatically 838 enabled when CuPy is available and use_gpu=True. 839 840 Example Energy Components: 841 - Electron-nuclear attraction energy 842 - Nuclear repulsion energy 843 - Kinetic energy 844 - Coulomb (electron-electron repulsion) energy 845 - Exchange-correlation energy 846 """ 847 #### Timings 848 duration1e = 0 849 durationDIIS = 0 850 durationAO_values = 0 851 durationCoulomb = 0 852 durationgrids = 0 853 durationItr = 0 854 durationxc = 0 855 durationXCpreprocessing = 0 856 durationDF = 0 857 durationKS = 0 858 durationSCF = 0 859 durationgrids_prune_rho = 0 860 durationSchwarz = 0 861 duration2c2e = 0 862 durationDF_coeff = 0 863 durationDF_gamma = 0 864 durationDF_Jtri = 0 865 durationDF_cholesky = 0 866 startSCF = timer() 867 868 mol = self.mol 869 basis = self.basis 870 dmat = self.dmat 871 xc = self.xc 872 conv_crit = self.conv_crit 873 max_itr = self.max_itr 874 ncores = self.ncores 875 grids = self.grids 876 gridsLevel = self.gridsLevel 877 isDF = self.isDF 878 auxbasis = self.auxbasis 879 rys = self.rys 880 DF_algo = self.DF_algo 881 blocksize = self.blocksize 882 XC_algo = self.XC_algo 883 debug = self.debug 884 sortGrids = self.sortGrids 885 save_ao_values = self.save_ao_values 886 xc_bf_screen = self.xc_bf_screen 887 threshold_schwarz = self.threshold_schwarz 888 strict_schwarz = self.strict_schwarz 889 cholesky = self.cholesky 890 orthogonalize = self.orthogonalize 891 892 print_pyfock_logo() 893 print('\n\nNumber of atoms:', mol.natoms) 894 print('\n\nNumber of basis functions (atomic orbitals):', basis.bfs_nao) 895 print('\n\nNumber of auxiliary basis functions:', auxbasis.bfs_nao) 896 print('\n\n================================================================================\n\n') 897 898 print_sys_info() 899 900 #### Set number of cores 901 numba.set_num_threads(ncores) 902 os.environ['RAYON_NUM_THREADS'] = str(ncores) 903 904 print('Running DFT using '+str(numba.get_num_threads())+' threads for Numba.\n\n', flush=True) 905 # if basis is None: 906 # basis = self.basis 907 # if mol is None: 908 # mol = self.mol 909 # if xc is None: 910 # xc = self.xc 911 # if gridsLevel is None: 912 # gridsLevel = self.gridsLevel 913 # if auxbasis is None: 914 # auxbasis = Basis(mol, {'all':Basis.load(mol=mol, basis_name='def2-universal-jfit')}) 915 if grids is None: 916 sortGrids = True 917 if xc_bf_screen==False: 918 if save_ao_values==True: 919 print('Warning! AO screening is set to False, but AO values are requested to be saved. \ 920 AO values can only be saved when XC_BF_SCREEN=TRUE. AO values will not be saved.', flush=True) 921 save_ao_values = False 922 if CUPY_AVAILABLE: 923 if self.use_gpu: 924 print('GPU acceleration is enabled. Currently this only accelerates AO values and XC term evaluation.', flush=True) 925 print('GPU(s) information:') 926 # print(cp.cuda.Device.mem_info()) 927 print(cuda.detect()) 928 print('Max threads per block supported by the GPU: ', cuda.get_current_device().MAX_THREADS_PER_BLOCK, flush=True) 929 print('The user has specified to use '+str(self.n_gpus)+' GPU(s).') 930 931 # For CUDA computations 932 threads_per_block = (self.threads_x, self.threads_y) 933 print('Threads per block configuration for the XC term: ', threads_per_block, flush=True) 934 print('Threads per block configuration for the all other calculations: ', (32, 32), flush=True) 935 if self.dynamic_precision: 936 print('\n\nWill use dynamic precision. ') 937 print('This means that the XC term will be evaluated in single precision until the ') 938 print('relative energy difference b/w successive iterations is less than 5.0E-7.') 939 precision_XC = cp.float32 940 else: 941 precision_XC = cp.float64 942 943 if XC_algo is None: 944 XC_algo = 3 945 if blocksize is None: 946 blocksize = 20480 947 streams = [] 948 nb_streams = [] 949 for i in range(self.n_gpus): 950 cp.cuda.Device(i).use() 951 cp_stream = cp.cuda.Stream(non_blocking = True) 952 nb_stream = cuda.external_stream(cp_stream.ptr) 953 streams.append(cp_stream) 954 nb_streams.append(nb_stream) 955 # Switch back to main GPU 956 cp.cuda.Device(0).use() 957 streams[0].use() 958 # # Set some basis function data as cupy arrays to avoid redoing it during XC term evaluation at every SCF iteration 959 # bfs_coords = cp.asarray([basis.bfs_coords], dtype=precision_XC) 960 # bfs_contr_prim_norms = cp.asarray([basis.bfs_contr_prim_norms], dtype=precision_XC) 961 # bfs_lmn = cp.asarray([basis.bfs_lmn]) 962 # bfs_nprim = cp.asarray([basis.bfs_nprim]) 963 # #The remaining properties like bfs_coeffs are a list of lists of unequal sizes. 964 # #Numba won't be able to work with these efficiently. 965 # #So, we convert them to a numpy 2d array by applying a trick, 966 # #that the second dimension is that of the largest list. So that 967 # #it can accomadate all the lists. 968 # maxnprim = max(basis.bfs_nprim) 969 # bfs_coeffs = cp.zeros([basis.bfs_nao, maxnprim], dtype=precision_XC) 970 # bfs_expnts = cp.zeros([basis.bfs_nao, maxnprim], dtype=precision_XC) 971 # bfs_prim_norms = cp.zeros([basis.bfs_nao, maxnprim], dtype=precision_XC) 972 # bfs_radius_cutoff = cp.zeros([basis.bfs_nao], dtype=precision_XC) 973 # for i in range(basis.bfs_nao): 974 # for j in range(basis.bfs_nprim[i]): 975 # bfs_coeffs[i,j] = basis.bfs_coeffs[i][j] 976 # bfs_expnts[i,j] = basis.bfs_expnts[i][j] 977 # bfs_prim_norms[i,j] = basis.bfs_prim_norms[i][j] 978 # bfs_radius_cutoff[i] = basis.bfs_radius_cutoff[i] 979 # # Now bf/ao values can be evaluated by calling the following 980 # # bf_values = Integrals.bf_val_helpers.eval_bfs(bfs_coords[0], bfs_contr_prim_norms[0], bfs_nprim[0], bfs_lmn[0], bfs_coeffs, bfs_prim_norms, bfs_expnts, bfs_radius_cutoff, coord) 981 # bfs_data_as_np_arrays = [bfs_coords[0], bfs_contr_prim_norms[0], bfs_nprim[0], bfs_lmn[0], bfs_coeffs, bfs_prim_norms, bfs_expnts, bfs_radius_cutoff] 982 983 984 else: 985 if self.use_gpu: 986 print('GPU acceleration requested but cannot be enabled as Cupy is not installed.', flush=True) 987 self.use_gpu = False 988 989 if XC_algo is None: 990 XC_algo = 2 991 if blocksize is None: 992 blocksize = 5000 993 if isDF: 994 isSchwarz = True 995 else: 996 isSchwarz = False # Schwarz screening is not yet implemented for 4c2e integrals 997 if strict_schwarz: 998 if not (DF_algo==6 or DF_algo==10): 999 print('Warning: The stricter variation of Schwarz screening is only compatible with DF algo #6 or #10 so turning it off.') 1000 strict_schwarz = False 1001 if cholesky: 1002 if not (DF_algo==6 or DF_algo==10): 1003 print('Warning: The Cholesky decomposition of 2c2e integrls is only compatible with DF algo #6 or #10 so turning it off.') 1004 cholesky = False 1005 1006 if self.sao: 1007 print('\n\nSpherical Atomic Orbitals are being used!\n\n') 1008 # Get the CAO to SAO transformation matrix 1009 c2sph_mat = basis.cart2sph_basis() # CAO --> SAO 1010 # Calculate the pseudoinverse transformation matrix (for back transformation of SAO dmat to CAO dmat) 1011 sph2c_mat_pseudo = basis.sph2cart_basis() # SAO --> CAO 1012 1013 1014 eigvectors = None 1015 # DF_algo = 1 # Worst algorithm for more than 500 bfs/auxbfs (requires 2x mem of 3c2e integrals and a large prefactor) 1016 # DF_algo = 2 # Sligthly better (2x memory efficient) algorithm than above (requires 1x mem of 3c2e integrals and a large prefactor) 1017 # DF_algo = 3 # Memory effcient without any prefactor. (Can easily be converted into a sparse version, unlike the others) (https://aip.scitation.org/doi/pdf/10.1063/1.1567253) 1018 # DF_algo = 4 # Same as 3, except now we use triangular version of ints3c2e to save on memory 1019 # DF_algo = 5 # Same as 4 in terms of memory requirements, however faster in performance due to the use of Schwarz screening. 1020 # DF_algo = 6 # Much cheaper than 4 and 5 in terms of memory requirements because the indices of significant (ij|P) are efficiently calculated without duplicates/temporary arrays. 1021 # The speed maybe same or just slightly slower. 1022 # DF_algo = 7 # The significant indices (ij|P) are stored even more efficiently by using shell indices instead of bf indices. 1023 # DF_algo = 8 # Similar to 6, except that here the significant indices are not stored resulting in 50% memory savings. The drawback is that it only works in serial which is useful for Google colab or Kaggle perhaps. 1024 # DF_algo = 9 # 1025 1026 if not strict_schwarz: # If a stricter variant of Schwarz screening is not requested 1027 start1e = timer() 1028 print('\nCalculating one electron integrals...\n\n', flush=True) 1029 # One electron integrals 1030 if not self.use_gpu: 1031 S = Integrals.overlap_mat_symm(basis) 1032 V = Integrals.nuc_mat_symm(basis, mol) 1033 T = Integrals.kin_mat_symm(basis) 1034 # Core hamiltonian 1035 H = T + V 1036 else: 1037 S = Integrals.overlap_mat_symm_cupy(basis, cp_stream=streams[0]) 1038 V = Integrals.nuc_mat_symm_cupy(basis, mol, cp_stream = streams[0]) 1039 T = Integrals.kin_mat_symm_cupy(basis, cp_stream = streams[0]) 1040 # Core hamiltonian 1041 H = T + V 1042 1043 print('Core H size in GB ',H.nbytes/1e9, flush=True) 1044 print('done!', flush=True) 1045 duration1e = timer() - start1e 1046 print('Time taken '+str(duration1e)+' seconds.\n', flush=True) 1047 else: 1048 start1e = timer() 1049 print('\nCalculating overlap and kinetic integrals...\n\n', flush=True) 1050 # One electron integrals 1051 if not self.use_gpu: 1052 S = Integrals.overlap_mat_symm(basis) 1053 T = Integrals.kin_mat_symm(basis) 1054 # Core hamiltonian 1055 H = T 1056 else: 1057 S = Integrals.overlap_mat_symm_cupy(basis, cp_stream = streams[0]) 1058 T = Integrals.kin_mat_symm_cupy(basis, cp_stream = streams[0]) 1059 # Core hamiltonian 1060 H = T 1061 1062 print('Core H size in GB ',(H.nbytes/1e9)*2, flush=True) # Factor of 2 because nuclear matrix will also be included here later 1063 print('done!', flush=True) 1064 duration1e = timer() - start1e 1065 print('Time taken '+str(duration1e)+' seconds.\n', flush=True) 1066 1067 1068 if dmat is None: 1069 if self.dmat_guess_method=='core': 1070 dmat = self.guessCoreH(mol, basis, Hcore=H, S=S) 1071 1072 if self.use_gpu: 1073 dmat_cp = cp.asarray(dmat, dtype=cp.float64) 1074 streams[0].synchronize() 1075 cp.cuda.Stream.null.synchronize() 1076 1077 1078 if self.sao: 1079 # It is possible that the supplied density matrix to the SCF was in SAO format already. 1080 # In such a case we need to transform this density matrix to CAO basis so that the J and XC term evaluations can be done properly 1081 if not dmat.shape==S.shape: 1082 dmat = np.dot(sph2c_mat_pseudo, np.dot(dmat, sph2c_mat_pseudo.T)) # Convert to CAO from SAO (SAO --> CAO) 1083 # Later the dmat will be converted back to SAO after J and XC term evaluations 1084 if not isDF: # 4c2e ERI case 1085 1086 print('\nCalculating four centered two electron integrals (ERIs)...\n\n', flush=True) 1087 if not isSchwarz: 1088 # Four centered two electron integrals (ERIs) 1089 if rys: 1090 ints4c2e = Integrals.rys_4c2e_symm(basis) 1091 else: 1092 ints4c2e = Integrals.conv_4c2e_symm(basis) 1093 print('Four Center Two electron ERI size in GB ',ints4c2e.nbytes/1e9, flush=True) 1094 print('done!') 1095 else: 1096 print('\n\nPerforming Schwarz screening...') 1097 # threshold_schwarz = 1e-09 1098 print('Threshold ', threshold_schwarz) 1099 startSchwarz = timer() 1100 nints4c2e_sym8 = int(basis.bfs_nao**4/8) 1101 nints4c2e = int(basis.bfs_nao**4) 1102 duration_4c2e_diag = 0.0 1103 start_4c2e_diag = timer() 1104 # Diagonal elements of ERI 4c2e array 1105 ints4c2e_diag = Integrals.schwarz_helpers.eri_4c2e_diag(basis) 1106 duration_4c2e_diag = timer() - start_4c2e_diag 1107 print('Time taken to evaluate the "diagonal" of 4c2e ERI tensor: ', duration_4c2e_diag) 1108 # Calculate the square roots required for 1109 duration_square_roots = 0.0 1110 start_square_roots = timer() 1111 sqrt_ints4c2e_diag = np.sqrt(np.abs(ints4c2e_diag)) 1112 duration_square_roots = timer() - start_square_roots 1113 print('Time taken to evaluate the square roots needed: ', duration_square_roots) 1114 chunksize = int(1e9) # Results in 2 GB chunks 1115 duration_indices_calc = 0.0 1116 duration_concatenation = 0.0 1117 start_indices_calc = timer() 1118 # indices_temp = [] 1119 ijkl = [0, 0, 0, 0] 1120 if chunksize<nints4c2e_sym8: 1121 nchunks = nints4c2e_sym8//chunksize + 1 1122 else: 1123 nchunks=1 1124 indicesA = None 1125 for ichunk in range(nchunks): 1126 indices_temp = Integrals.schwarz_helpers.calc_indices_4c2e_schwarz(sqrt_ints4c2e_diag, min(chunksize, nints4c2e_sym8), basis.bfs_nao, ijkl[0], ijkl[1], ijkl[2], ijkl[3], threshold_schwarz) 1127 1128 ijkl = indices_temp[4] 1129 count = indices_temp[5] 1130 # start_concatenation = timer() 1131 if indicesA is not None and count>0: 1132 indicesA = np.concatenate([indicesA, indices_temp[0][0:count]]) 1133 indicesB = np.concatenate([indicesB, indices_temp[1][0:count]]) 1134 indicesC = np.concatenate([indicesC, indices_temp[2][0:count]]) 1135 indicesD = np.concatenate([indicesD, indices_temp[3][0:count]]) 1136 else: 1137 1138 indicesA = indices_temp[0][0:count] 1139 indicesB = indices_temp[1][0:count] 1140 indicesC = indices_temp[2][0:count] 1141 indicesD = indices_temp[3][0:count] 1142 # duration_concatenation += timer() - start_concatenation 1143 # Break out of the for loop if the nol. of significant triplets found is less than the chunksize 1144 # This is because, it means that there are no more significant triplets to be found from all possible configurations. 1145 if count<chunksize: 1146 break 1147 1148 duration_indices_calc += timer() - start_indices_calc 1149 print('Time for significant indices evaluation: ', duration_indices_calc) 1150 # print('Time for array concatenation: ', duration_concatenation) 1151 1152 # Get rid of temp variables 1153 indices_temp=0 1154 ijk = 0 1155 1156 print('Size of permanent array storing the significant indices of 4c2e ERI in GB ', indicesA.nbytes/1e9+indicesB.nbytes/1e9+indicesC.nbytes/1e9+indicesD.nbytes/1e9, flush=True) 1157 1158 nsignificant = len(indicesA) 1159 print('No. of elements in the standard four-centered two electron ERI tensor: ', nints4c2e, flush=True) 1160 print('No. of elements after factoring in the 8-fold symmetry in the four-centered two electron ERI tensor: ', nints4c2e_sym8, flush=True) 1161 print('No. of significant quadruplets based on Schwarz inequality and 8-fold symmetry: ' + str(nsignificant) + ' or '+str(np.round(nsignificant/nints4c2e*100,1)) + '% of original', flush=True) 1162 print('Schwarz screening done!') 1163 durationSchwarz = timer() - startSchwarz 1164 print('Total time taken for Schwarz screening '+str(durationSchwarz)+' seconds.\n', flush=True) 1165 1166 ints4c2e = Integrals.schwarz_helpers.rys_4c2e_tri_schwarz_sparse(basis, auxbasis, indicesA, indicesB, indicesC, indicesD) 1167 1168 else: # Density fitting case (3c2e, and 2c2e will be calculated) 1169 H_temp, V_temp, ints3c2e, ints2c2e, nsignificant, indicesA, indicesB, indicesC, offsets_3c2e, indices, ints4c2e_diag, sqrt_ints4c2e_diag, sqrt_diag_ints2c2e, indices_dmat_tri, indices_dmat_tri_2, df_coeff0, Qpq, cho_decomp_ints2c2e, durationDF_cholesky, durationCoulomb = density_fitting_prelims_for_DFT_development(mol, basis, auxbasis, T, dmat, self.use_gpu, self.keep_ints3c2e_in_gpu, threshold_schwarz, strict_schwarz, rys, DF_algo, cholesky) 1170 if not H_temp is None: 1171 H = H_temp 1172 if not V_temp is None: 1173 V = V_temp 1174 if cholesky: 1175 durationDF += durationDF_cholesky 1176 1177 1178 1179 1180 1181 scf_converged = False 1182 durationgrids = 0 1183 1184 if grids is None: 1185 startGrids = timer() 1186 print('\nGenerating grids...\n\n', flush=True) 1187 # To get the same energies as PySCF (level=5) upto 1e-7 au, use the following settings 1188 # radial_precision = 1.0e-13 1189 # level=3 1190 # pruning by density with threshold = 1e-011 1191 # alpha_min and alpha_max corresponding to QZVP 1192 print('Grids level: ', gridsLevel) 1193 # Generate grids for XC term 1194 basisGrids = Basis(mol, {'all':Basis.load(mol=mol, basis_name='def2-QZVP')}) 1195 grids = Grids(mol, basis=basisGrids, level = gridsLevel, ncores=ncores) 1196 1197 print('done!', flush=True) 1198 durationgrids = timer() - startGrids 1199 print('Time taken '+str(durationgrids)+' seconds.\n', flush=True) 1200 1201 # Begin pruning the grids based on density (rho) 1202 # Evaluate ao_values to calculate rho 1203 print('\nPruning generated grids by rho...\n\n', flush=True) 1204 startGrids_prune_rho = timer() 1205 threshold_rho = 1e-011 1206 ngrids_temp = grids.coords.shape[0] 1207 ndeleted = 0 1208 blocksize_temp = 50000 1209 nblocks_temp = ngrids_temp//blocksize_temp 1210 weightsNew = None 1211 coordsNew = None 1212 for iblock in range(nblocks_temp+1): 1213 offset = iblock*blocksize_temp 1214 weights_block = grids.weights[offset : min(offset+blocksize_temp,ngrids_temp)] 1215 coords_block = grids.coords[offset : min(offset+blocksize_temp,ngrids_temp)] 1216 ao_value_block = Integrals.bf_val_helpers.eval_bfs(basis, coords_block) 1217 rho_block = contract('ij,mi,mj->m', dmat, ao_value_block, ao_value_block) 1218 zero_indices = np.where(np.abs(rho_block*weights_block) < threshold_rho)[0] 1219 ndeleted += len(zero_indices) 1220 weightsNew_block = np.delete(weights_block, zero_indices) 1221 coordsNew_block = np.delete(coords_block, zero_indices, 0) 1222 if weightsNew_block.shape[0]>0: 1223 if weightsNew is None: 1224 weightsNew = weightsNew_block 1225 coordsNew = coordsNew_block 1226 else: 1227 weightsNew = np.concatenate((weightsNew, weightsNew_block)) 1228 coordsNew = np.concatenate([coordsNew, coordsNew_block], axis=0) 1229 1230 grids.coords = coordsNew 1231 grids.weights = weightsNew 1232 print('done!', flush=True) 1233 durationgrids_prune_rho = timer() - startGrids_prune_rho 1234 print('Time taken '+str(durationgrids_prune_rho)+' seconds.\n', flush=True) 1235 print('\nDeleted '+ str(ndeleted) + ' grid points.', flush=True) 1236 1237 else: 1238 print('\nUsing the user supplied grids!\n\n', flush=True) 1239 1240 1241 # Grid information initial 1242 print('\nNo. of supplied/generated grid points: ', grids.coords.shape[0], flush=True) 1243 1244 # Prune grids based on weights 1245 # start_pruning_weights = timer() 1246 # print('\nPruning grids based on weights....', flush=True) 1247 # zero_indices = np.where(np.logical_and(grids.weights>=-1.0e-12, grids.weights<=1.e-12)) 1248 # grids.weights = np.delete(grids.weights, zero_indices) 1249 # grids.coords = np.delete(grids.coords, zero_indices, 0) 1250 # print('done!', flush=True) 1251 # duration_pruning_weights = timer() - start_pruning_weights 1252 # print('\nTime taken '+str(duration_pruning_weights)+' seconds.\n', flush=True) 1253 # print('\nNo. of grid points after screening by weights: ', grids.coords.shape[0], flush=True) 1254 1255 print('Size (in GB) for storing the coordinates of grid: ', grids.coords.nbytes/1e9, flush=True) 1256 print('Size (in GB) for storing the weights of grid: ', grids.weights.nbytes/1e9, flush=True) 1257 print('Size (in GB) for storing the density at gridpoints: ', grids.weights.nbytes/1e9, flush=True) 1258 1259 # Sort the grids for slightly better performance with batching (doesn't seem to make much difference) 1260 if sortGrids: 1261 print('\nSorting grids ....', flush=True) 1262 # Function to sort grids 1263 def get_ordered_list(points, x, y, z): 1264 points.sort(key = lambda p: (p[0] - x)**2 + (p[1] - y)**2 + (p[2] - z)**2) 1265 # print(points[0:10]) 1266 return points 1267 # Make a single array of coords and weights 1268 coords_weights = np.c_[grids.coords, grids.weights] 1269 coords_weights = np.array(get_ordered_list(coords_weights.tolist(), min(grids.coords[:,0]), min(grids.coords[:,1]), min(grids.coords[:,2]))) 1270 # Now go back to two arrays for coords and weights 1271 grids.weights = coords_weights[:,3] 1272 grids.coords = coords_weights[:,0:3] 1273 coords_weights = 0#None 1274 print('done!', flush=True) 1275 1276 # blocksize = 10000 1277 ngrids = grids.coords.shape[0] 1278 nblocks = ngrids//blocksize 1279 print('\nWill use batching to evaluate the XC term for memory efficiency.', flush=True) 1280 print('Batch size: ', blocksize, flush=True) 1281 print('No. of batches: ', nblocks+1, flush=True) 1282 1283 1284 #### Some preliminary stuff for XC evaluation 1285 durationXCpreprocessing = 0 1286 list_nonzero_indices = None 1287 count_nonzero_indices = None 1288 list_ao_values = None 1289 list_ao_grad_values = None 1290 if xc_bf_screen: 1291 xc_family_dict = {1:'LDA',2:'GGA',4:'MGGA'} 1292 # Create a LibXC object 1293 funcx = pylibxc.LibXCFunctional(xc[0], "unpolarized") 1294 funcc = pylibxc.LibXCFunctional(xc[1], "unpolarized") 1295 x_family_code = funcx.get_family() 1296 c_family_code = funcc.get_family() 1297 ### Find the list of significanlty contributing bfs for xc evaluations 1298 startXCpreprocessing = timer() 1299 print('\nPreliminary processing for XC term evaluations...', flush=True) 1300 print('Calculating the value of basis functions (atomic orbitals) and get the indices of siginificantly contributing functions...', flush=True) 1301 # Calculate the value of basis functions for all grid points in batches 1302 # and find the indices of basis functions that have a significant contribution to those batches for each batch 1303 if not self.use_gpu: 1304 list_nonzero_indices, count_nonzero_indices = Integrals.bf_val_helpers.nonzero_ao_indices(basis, grids.coords, blocksize, nblocks, ngrids) 1305 else: 1306 list_nonzero_indices, count_nonzero_indices = Integrals.bf_val_helpers.nonzero_ao_indices_cupy(basis, grids.coords, blocksize, nblocks, ngrids, streams[0]) 1307 print('done!', flush=True) 1308 durationXCpreprocessing = timer() - startXCpreprocessing 1309 print('Time taken '+str(durationXCpreprocessing)+' seconds.\n', flush=True) 1310 print('Maximum no. of basis functions contributing to a batch of grid points: ', max(count_nonzero_indices)) 1311 print('Average no. of basis functions contributing to a batch of grid points: ', int(np.mean(count_nonzero_indices))) 1312 1313 1314 durationAO_values = 0 1315 if save_ao_values: 1316 startAO_values = timer() 1317 if xc_family_dict[x_family_code]=='LDA' and xc_family_dict[c_family_code]=='LDA': 1318 print('\nYou have asked to save the values of significant basis functions on grid points so as to avoid recalculation for each SCF cycle.', flush=True) 1319 memory_required = sum(count_nonzero_indices*blocksize)*8/1024/1024/1024 1320 print('Please note: This will require addtional memory that is approximately: '+ str(np.round(memory_required,1))+ ' GB', flush=True) 1321 print('Calculating the value of significantly contributing basis functions (atomic orbitals)...', flush=True) 1322 list_ao_values = [] 1323 # Loop over batches 1324 for iblock in range(nblocks+1): 1325 offset = iblock*blocksize 1326 coords_block = grids.coords[offset : min(offset+blocksize,ngrids)] 1327 ao_values_block = Integrals.bf_val_helpers.eval_bfs(basis, coords_block, parallel=True, non_zero_indices=list_nonzero_indices[iblock][0:count_nonzero_indices[iblock]]) 1328 if self.use_gpu and self.keep_ao_in_gpu: 1329 list_ao_values.append(cp.asarray(ao_values_block)) 1330 else: 1331 list_ao_values.append(ao_values_block) 1332 #Free memory 1333 ao_values_block = 0 1334 if xc_family_dict[x_family_code]!='LDA' or xc_family_dict[c_family_code]!='LDA': 1335 print('\nYou have asked to save the values of significant basis functions and their gradients on grid points so as to avoid recalculation for each SCF cycle.', flush=True) 1336 memory_required = 4*sum(count_nonzero_indices*blocksize)*8/1024/1024/1024 1337 print('Please note: This will require addtional memory that is approximately :'+ str(np.round(memory_required,1))+ ' GB', flush=True) 1338 print('Calculating the value of significantly contributing basis functions (atomic orbitals)...', flush=True) 1339 list_ao_values = [] 1340 list_ao_grad_values = [] 1341 # Loop over batches 1342 for iblock in range(nblocks+1): 1343 offset = iblock*blocksize 1344 coords_block = grids.coords[offset : min(offset+blocksize,ngrids)] 1345 # ao_values_block = Integrals.bf_val_helpers.eval_bfs(basis, coords_block, parallel=True, non_zero_indices=list_nonzero_indices[iblock][0:count_nonzero_indices[iblock]]) 1346 ao_values_block, ao_grad_values_block = Integrals.bf_val_helpers.eval_bfs_and_grad(basis, coords_block, parallel=True, non_zero_indices=list_nonzero_indices[iblock][0:count_nonzero_indices[iblock]]) 1347 if self.use_gpu and self.keep_ao_in_gpu: 1348 list_ao_values.append(cp.asarray(ao_values_block)) 1349 list_ao_grad_values.append(cp.asarray(ao_grad_values_block)) 1350 else: 1351 list_ao_values.append(ao_values_block) 1352 list_ao_grad_values.append(ao_grad_values_block) 1353 #Free memory 1354 ao_values_block = 0 1355 ao_grad_values_block =0 1356 print('done!', flush=True) 1357 durationAO_values = timer() - startAO_values 1358 print('Time taken '+str(durationAO_values)+' seconds.\n', flush=True) 1359 1360 if self.use_gpu: 1361 grids.coords = cp.asarray(grids.coords, dtype=cp.float64) 1362 grids.weights = cp.asarray(grids.weights, dtype=cp.float64) 1363 1364 #-------XC Stuff start---------------------- 1365 1366 funcid = self.xc 1367 1368 xc_family_dict = {1:'LDA',2:'GGA',4:'MGGA'} 1369 1370 # Create a LibXC object 1371 funcx = pylibxc.LibXCFunctional(funcid[0], "unpolarized") 1372 funcc = pylibxc.LibXCFunctional(funcid[1], "unpolarized") 1373 x_family_code = funcx.get_family() 1374 c_family_code = funcc.get_family() 1375 1376 1377 print('\n\n------------------------------------------------------', flush=True) 1378 print('Exchange-Correlation Functional') 1379 print('------------------------------------------------------\n', flush=True) 1380 print('XC Functional IDs supplied: ', funcid, flush=True) 1381 print('\n\nDescription of exchange functional: \n') 1382 print('The Exchange function belongs to the family:', xc_family_dict[x_family_code], flush=True) 1383 print(funcx.describe()) 1384 print('\n\nDescription of correlation functional: \n', flush=True) 1385 print(' The Correlation function belongs to the family:', xc_family_dict[c_family_code], flush=True) 1386 print(funcc.describe()) 1387 print('------------------------------------------------------\n', flush=True) 1388 print('\n\n', flush=True) 1389 #-------XC Stuff end---------------------- 1390 1391 Etot = 0 1392 1393 itr = 1 1394 Enn = self.nuclear_rep_energy(mol) 1395 #diis = scf.CDIIS() 1396 dmat_old = 0 1397 J_diff = 0 1398 Ecoul = 0.0 1399 Ecoul_temp = 0.0 1400 1401 durationxc = 0 1402 1403 startKS = timer() 1404 if self.sao: 1405 ######### 1406 # I use a trick to calculate the DFT energy in SAO basis. 1407 # The trick is to get rid of the extra information in the CAO basis matrices. 1408 # The following does just that. 1409 # Going from CAO --> SAO we lose the extra information then we go back to from SAO --> CAO 1410 # This way all the integrals (potential matrices and density matrices) can still be calculated 1411 # in the CAO basis. In fact even the KS matrix is diagonalized in the CAO basis with the extra information 1412 # removed by forward and backward transformation: CAO --> SAO followed by SAO --> CAO. 1413 # This also helps in reducing the number of transformations needed for calculation of various energy contributions. 1414 ######### 1415 # Convert the overlap matrix from CAO to SAO basis 1416 S = np.dot(c2sph_mat, np.dot(S, c2sph_mat.T)) # CAO --> SAO 1417 # Convert back to SAO so that now we lose the extra information that the CAO basis had 1418 S = np.dot(sph2c_mat_pseudo, np.dot(S, sph2c_mat_pseudo.T)) 1419 if orthogonalize: 1420 if not self.use_gpu: 1421 eig_val_s, eig_vec_s = scipy.linalg.eigh(S) 1422 # Removing the eigenvectors assoicated to the smallest eigenvalue. 1423 x = eig_vec_s[:,eig_val_s>1e-7] / np.sqrt(eig_val_s[eig_val_s>1e-7]) 1424 else: 1425 x = None 1426 durationKS = durationKS + timer() - startKS 1427 1428 1429 while not (scf_converged or itr==max_itr+1): 1430 startIter = timer() 1431 if itr==1: 1432 dmat_diff = dmat # This is in CAO basis 1433 # Coulomb (Hartree) matrix 1434 if not isDF: 1435 J = contract('ijkl,ij',ints4c2e,dmat) # This is in CAO basis 1436 else: 1437 J, durationDF, durationDF_coeff, durationDF_gamma, durationDF_Jtri, Ecoul_temp = Jmat_from_density_fitting(dmat, DF_algo, cholesky, cho_decomp_ints2c2e, df_coeff0, Qpq, ints3c2e, ints2c2e, indices_dmat_tri, indices_dmat_tri_2, indicesA, indicesB, indicesC, offsets_3c2e, sqrt_ints4c2e_diag, sqrt_diag_ints2c2e, threshold_schwarz, strict_schwarz, basis, auxbasis, self.use_gpu, self.keep_ints3c2e_in_gpu, durationDF_gamma, ncores, durationDF_coeff, durationDF_Jtri, durationDF) 1438 1439 1440 1441 1442 else: 1443 dmat_diff = dmat-dmat_old 1444 if not isDF: 1445 # J_diff = contract('ijkl,ij',ints4c2e,dmat_diff) 1446 # J += J_diff 1447 J = contract('ijkl,ij', ints4c2e, dmat) 1448 else: 1449 J, durationDF, durationDF_coeff, durationDF_gamma, durationDF_Jtri, Ecoul_temp = Jmat_from_density_fitting(dmat, DF_algo, cholesky, cho_decomp_ints2c2e, df_coeff0, Qpq, ints3c2e, ints2c2e, indices_dmat_tri, indices_dmat_tri_2, indicesA, indicesB, indicesC, offsets_3c2e, sqrt_ints4c2e_diag, sqrt_diag_ints2c2e, threshold_schwarz, strict_schwarz, basis, auxbasis, self.use_gpu, self.keep_ints3c2e_in_gpu, durationDF_gamma, ncores, durationDF_coeff, durationDF_Jtri, durationDF) 1450 # J += J_diff 1451 if self.use_gpu: 1452 J = cp.asarray(J, dtype=cp.float64) 1453 streams[0].synchronize() 1454 cp.cuda.Stream.null.synchronize() 1455 1456 1457 # XC energy and potential 1458 startxc = timer() 1459 1460 if not self.use_gpu: 1461 if XC_algo==1: 1462 # Much slower than JOBLIB version 1463 # Still keeping it because, it can be useful when using GPUs 1464 Exc, Vxc = Integrals.eval_xc_1(basis, dmat, grids.weights, grids.coords, funcid, blocksize=blocksize, debug=debug, \ 1465 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1466 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values) 1467 if XC_algo==2: 1468 # Much faster than above and stable too, therefore this should be default now. 1469 # Used to unstable and had memory leaks, 1470 # But now all that is fixed by using threadpoolctl, garbage collection or freeing up memory after XC evaluation at each iteration 1471 1472 Exc, Vxc = Integrals.eval_xc_2(basis, dmat, grids.weights, grids.coords, funcid, ncores=ncores, blocksize=blocksize, \ 1473 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1474 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values, debug=debug) 1475 1476 if XC_algo==3: 1477 with threadpool_limits(limits=1, user_api='blas'): 1478 Exc, Vxc = Integrals.eval_xc_3(basis, dmat, grids.weights, grids.coords, funcid, ncores=ncores, blocksize=blocksize, \ 1479 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1480 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values, debug=debug) 1481 else: # GPU 1482 if XC_algo==1: 1483 Exc, Vxc = Integrals.eval_xc_1_cupy(basis, dmat_cp, grids.weights, grids.coords, funcid, blocksize=blocksize, debug=debug, \ 1484 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1485 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values, use_libxc=self.use_libxc,\ 1486 nstreams=self.n_streams, ngpus=self.n_gpus, freemem=self.free_gpu_mem, threads_per_block=threads_per_block, 1487 type=precision_XC) 1488 if XC_algo==2: 1489 Exc, Vxc = Integrals.eval_xc_2_cupy(basis, dmat_cp, grids.weights, cp.asnumpy(grids.coords), funcid, ncores=ncores, blocksize=blocksize, \ 1490 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1491 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values, debug=debug) 1492 if XC_algo==3: 1493 # Default for GPUs 1494 Exc, Vxc = Integrals.eval_xc_3_cupy(basis, dmat_cp, grids.weights, grids.coords, funcid, blocksize=blocksize, debug=debug, \ 1495 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1496 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values, use_libxc=self.use_libxc,\ 1497 nstreams=self.n_streams, ngpus=self.n_gpus, freemem=self.free_gpu_mem, threads_per_block=threads_per_block, 1498 type=precision_XC, streams=streams, nb_streams=nb_streams) 1499 1500 Vxc = cp.asarray(Vxc, dtype=cp.float64) 1501 1502 durationxc = durationxc + timer() - startxc 1503 1504 1505 dmat_old = dmat # This is in CAO basis 1506 1507 1508 if self.use_gpu: 1509 Enuc = contract('ij,ji->', dmat_cp, V) 1510 Ekin = contract('ij,ji->', dmat_cp, T) 1511 Ecoul = contract('ij,ji->', dmat_cp, J)*0.5 1512 else: 1513 with threadpool_limits(limits=ncores, user_api='blas'): 1514 # print('Energy contractions', controller.info()) 1515 Enuc = contract('ij,ji->', dmat, V) 1516 Ekin = contract('ij,ji->', dmat, T) 1517 Ecoul = contract('ij,ji->', dmat, J)*0.5 1518 if isDF and (DF_algo==6 or DF_algo==10): 1519 Ecoul = Ecoul*2 - 0.5*Ecoul_temp # This is the correct formula for Coulomb energy with DF 1520 1521 Etot_new = Exc + Enuc + Ekin + Enn + Ecoul 1522 self.Total_energy = Etot_new 1523 self.XC_energy = Exc 1524 self.Kinetic_energy = Ekin 1525 self.Nuclear_repulsion_energy = Enn 1526 self.J_energy = Ecoul 1527 self.Nuc_energy = Enuc 1528 1529 # Set label width and numeric format 1530 label_w = 30 1531 num_fmt = "{:>20.13f}" # 20-wide, 10 decimal places 1532 1533 print(f"\n\n\n------Iteration {itr}--------\n\n", flush=True) 1534 print("Energies\n") 1535 print(f"{'Electron-Nuclear Energy':<{label_w}}{num_fmt.format(Enuc)}") 1536 print(f"{'Nuclear repulsion Energy':<{label_w}}{num_fmt.format(Enn)}") 1537 print(f"{'Kinetic Energy':<{label_w}}{num_fmt.format(Ekin)}") 1538 print(f"{'Coulomb Energy':<{label_w}}{num_fmt.format(Ecoul)}") 1539 print(f"{'Exchange-Correlation Energy':<{label_w}}{num_fmt.format(Exc)}") 1540 print('-' * (label_w + 20)) 1541 print(f"{'Total Energy':<{label_w}}{num_fmt.format(Etot_new)}") 1542 print('-' * (label_w + 20) + "\n\n\n", flush=True) 1543 1544 1545 1546 print('Energy difference : ',abs(Etot_new-Etot), flush=True) 1547 1548 # Check convergence criteria 1549 if abs(Etot_new-Etot)<conv_crit: 1550 scf_converged = True 1551 print('\nSCF Converged after '+str(itr) +' iterations!', flush=True) 1552 Etot = Etot_new 1553 print('\n-------------------------------------', flush=True) 1554 print('Total Energy = ', Etot, flush=True) 1555 print('-------------------------------------\n\n', flush=True) 1556 break 1557 1558 Etot = Etot_new 1559 itr = itr + 1 1560 1561 1562 if itr>=max_itr and not scf_converged: 1563 print('\nSCF NOT Converged after '+str(itr-1) +' iterations!', flush=True) 1564 1565 1566 1567 if not scf_converged: 1568 KS = H + J + Vxc 1569 if self.sao: 1570 # The following gets rid of the extra information in the CAO basis KS matrix by going to SAO and then back to CAO. 1571 # This way even though the matrix dimensions would be that of CAO but the information would be the same as SAO 1572 # leading to the same energy as SAO basis PySCF or TURBOMOLE calculations 1573 KS = np.dot(c2sph_mat, np.dot(KS, c2sph_mat.T)) # CAO --> SAO 1574 KS = np.dot(sph2c_mat_pseudo, np.dot(KS, sph2c_mat_pseudo.T)) #SAO --> CAO 1575 1576 1577 #### DIIS 1578 startDIIS = timer() 1579 diis_start_itr = 1 1580 if itr >= diis_start_itr: 1581 if not self.use_gpu: 1582 with threadpool_limits(limits=ncores, user_api='blas'): 1583 # print('DIIS ', controller.info()) 1584 KS = self.DIIS(S, dmat, KS) 1585 else: 1586 KS = self.DIIS_cupy(S, dmat_cp, KS) 1587 streams[0].synchronize() 1588 cp.cuda.Stream.null.synchronize() 1589 durationDIIS = durationDIIS + timer() - startDIIS 1590 1591 #### Solve KS equation (Diagonalize KS matrix) 1592 startKS = timer() 1593 if self.use_gpu: 1594 eigvalues, eigvectors = self.solve_cupy(KS, S, orthogonalize=True) # Orthogonalization is necessary with CUDA 1595 mo_occ = self.getOcc_cupy(mol, eigvalues, eigvectors) 1596 dmat = self.gen_dm_cupy(eigvectors, mo_occ) 1597 dmat_cp = dmat 1598 dmat = cp.asnumpy(dmat) 1599 streams[0].synchronize() 1600 cp.cuda.Stream.null.synchronize() 1601 # HOMO-LUMO gap 1602 occupied = cp.where(mo_occ > 1e-8)[0] 1603 if len(occupied) < len(eigvalues): 1604 homo_idx = occupied[-1] 1605 lumo_idx = homo_idx + 1 1606 gap = (eigvalues[lumo_idx] - eigvalues[homo_idx]) 1607 print(f"\n\nHOMO-LUMO gap: {gap} au", flush=True) 1608 print(f"HOMO-LUMO gap: {gap*27.211324570273:.3f} eV", flush=True) 1609 else: 1610 with threadpool_limits(limits=ncores, user_api='blas'): 1611 # print('KS eigh', controller.info()) 1612 eigvalues, eigvectors = self.solve(KS, S, orthogonalize=orthogonalize, x=x) 1613 mo_occ = self.getOcc(mol, eigvalues, eigvectors) 1614 dmat = self.gen_dm(eigvectors, mo_occ) 1615 # HOMO-LUMO gap 1616 occupied = np.where(mo_occ > 1e-8)[0] 1617 if len(occupied) < len(eigvalues): 1618 homo_idx = occupied[-1] 1619 lumo_idx = homo_idx + 1 1620 gap = (eigvalues[lumo_idx] - eigvalues[homo_idx]) 1621 print(f"\n\nHOMO-LUMO gap: {gap} au", flush=True) 1622 print(f"HOMO-LUMO gap: {gap*27.211324570273:.3f} eV", flush=True) 1623 durationKS = durationKS + timer() - startKS 1624 1625 self.dmat = dmat 1626 self.mo_coefficients = eigvectors 1627 self.mo_energies = eigvalues 1628 self.mo_occupations = mo_occ 1629 1630 # Check when to switch to double precision for XC 1631 if self.use_gpu: 1632 if precision_XC is cp.float32: 1633 if abs(Etot_new-Etot)/abs(Etot_new)<5e-7: 1634 precision_XC = cp.float64 1635 print('\nSwitching to double precision for XC evaluation after '+str(itr) +' iterations!', flush=True) 1636 1637 1638 1639 durationItr = timer() - startIter 1640 print('\n\nTime taken for the previous iteration: '+str(durationItr)+'\n\n', flush=True) 1641 1642 # # Check convergence criteria 1643 # if abs(Etot_new-Etot)<conv_crit: 1644 # scf_converged = True 1645 # print('\nSCF Converged after '+str(itr) +' iterations!', flush=True) 1646 # Etot = Etot_new 1647 # print('\n-------------------------------------', flush=True) 1648 # print('Total Energy = ', Etot, flush=True) 1649 # print('-------------------------------------\n\n', flush=True) 1650 # break 1651 1652 # Etot = Etot_new 1653 # itr = itr + 1 1654 1655 1656 # if itr>=max_itr and not scf_converged: 1657 # print('\nSCF NOT Converged after '+str(itr-1) +' iterations!', flush=True) 1658 1659 1660 1661 durationSCF = timer() - startSCF 1662 # print(dmat) 1663 print('\nTime taken : '+str(durationSCF) +' seconds.', flush=True) 1664 print('\n\n', flush=True) 1665 print('-------------------------------------', flush=True) 1666 print('Profiling', flush=True) 1667 print('-------------------------------------', flush=True) 1668 print('Preprocessing ', durationXCpreprocessing + durationAO_values + durationgrids_prune_rho + durationSchwarz) 1669 if isDF: 1670 print('Density Fitting ', durationDF, flush=True) 1671 if DF_algo==6 or DF_algo==10: 1672 print(' DF (gamma) ', durationDF_gamma, flush=True) 1673 print(' DF (coeff) ', durationDF_coeff, flush=True) 1674 print(' DF (Jtri) ', durationDF_Jtri, flush=True) 1675 if cholesky: 1676 print(' DF (Cholesky) ', durationDF_cholesky, flush=True) 1677 print('DIIS ', durationDIIS, flush=True) 1678 print('KS matrix diagonalization ', durationKS, flush=True) 1679 print('One electron Integrals (S, T, Vnuc) ', duration1e, flush=True) 1680 if isDF: 1681 print('Coulomb Integrals (2c2e + 3c2e) ', durationCoulomb-durationSchwarz-durationDF_cholesky, flush=True) 1682 if not isDF: 1683 print('Coulomb Integrals (4c2e) ', durationCoulomb-durationSchwarz, flush=True) 1684 print('Grids construction ', durationgrids, flush=True) 1685 print('Exchange-Correlation Term ', durationxc, flush=True) 1686 totalTime = durationXCpreprocessing + durationAO_values + duration1e + durationCoulomb - durationDF_cholesky + \ 1687 durationgrids + durationxc + durationDF + durationKS + durationDIIS + durationgrids_prune_rho 1688 print('Misc. ', durationSCF - totalTime, flush=True) 1689 print('Complete SCF ', durationSCF, flush=True) 1690 1691 if self.use_gpu: 1692 # Free memory of all GPUs 1693 for igpu in range(self.n_gpus): 1694 cp.cuda.Device(igpu).use() 1695 cp._default_memory_pool.free_all_blocks() 1696 1697 # Switch back to main GPU 1698 cp.cuda.Device(0).use() 1699 1700 return Etot, dmat
A class for performing Density Functional Theory (DFT) calculations with optional support for density fitting (DF), GPU acceleration, and LibXC.
Parameters
mol : Molecule Molecular object on which the DFT calculation is to be performed.
basis : Basis Orbital basis set used for the SCF calculation.
auxbasis : Basis, optional Auxiliary basis set for density fitting (DF). If None, a default will be assigned.
conv_crit : float, optional Convergence criterion for the SCF cycle in Hartrees (default is 1e-7).
dmat_guess_method : str, optional Method for the initial density matrix guess (e.g., 'core', 'huckel').
xc : list or str, optional
Exchange-correlation functional specification. If None, defaults to LDA ([1, 7]).
grids : object, optional Precomputed numerical integration grids. If None, they will be generated automatically.
gridsLevel : int, optional Level of numerical integration grid refinement (default is 3).
blocksize : int, optional Block size for XC grid evaluations. Defaults depend on whether GPU is used.
save_ao_values : bool, optional If True, saves AO values to reuse during XC evaluation. Increases speed but uses more memory.
use_gpu : bool, optional Whether to use GPU acceleration.
ncores : int, optional Number of CPU cores to use (default is 2).
Attributes
dmat : ndarray Initial guess for the density matrix, will be computed during setup.
KSmats : list List of Kohn–Sham matrices used in DIIS extrapolation.
errVecs : list List of error vectors for DIIS.
max_itr : int Maximum number of SCF iterations (default is 50).
isDF : bool Whether to use density fitting for Coulomb integrals.
rys : bool Whether to use Rys quadrature for evaluating electron repulsion integrals.
DF_algo : int Algorithm selector for DF (reserved for developer use).
XC_algo : int Algorithm selector for XC evaluation (2 for CPU, 3 for GPU).
sortGrids : bool Whether to sort DFT integration grids (not recommended).
xc_bf_screen : bool Enable basis function screening for XC term evaluation.
threshold_schwarz : float Threshold for Schwarz screening (default is 1e-9).
strict_schwarz : bool If True, applies stricter Schwarz screening.
cholesky : bool If True, uses Cholesky decomposition for DF.
orthogonalize : bool If True, orthogonalizes AO basis functions.
sao : bool If True, uses SAO basis instead of CAO basis.
keep_ao_in_gpu : bool
Whether to retain AO values in GPU memory during SCF (if save_ao_values is True).
use_libxc : bool Whether to use LibXC for XC functional evaluation (recommended off for GPU).
n_streams : int Number of CUDA streams to use (if applicable).
n_gpus : int Number of GPUs to use.
free_gpu_mem : bool Whether to forcibly free GPU memory after use.
max_threads_per_block : int Maximum threads per CUDA block supported by the device.
threads_x : int CUDA thread configuration (X dimension).
threads_y : int CUDA thread configuration (Y dimension).
dynamic_precision : bool Whether to use precision switching during XC evaluation for performance gains.
keep_ints3c2e_in_gpu : bool Whether to keep 3-center 2-electron integrals in GPU memory to avoid transfers.
debug : bool If True, prints debugging output during DFT calculations.
Notes
- This class supports SCF DFT calculations with density fitting (DF).
- GPU support is optional and provides significant speed-up for large systems.
- The class is tightly integrated with PyFock and LibXC libraries.
Examples
>>> mol = Molecule(...)
>>> basis = Basis(mol, ...)
>>> dft = DFT(mol, basis, xc='PBE', use_gpu=True)
>>> dft.run_scf()
206 def __init__(self, mol, basis, auxbasis=None, conv_crit=1e-7, dmat_guess_method=None, 207 xc=None, grids=None, gridsLevel=3, blocksize=None, 208 save_ao_values=False, use_gpu=False, ncores=1): 209 210 self.mol = mol 211 """ Molecular object for which the DFT calculation will be performed """ 212 if self.mol is None: 213 print('ERROR: A Mol object is required to initialize a DFT object.') 214 215 self.basis = basis 216 """ Basis object for corresponding to the molecule for the DFT calculation """ 217 if self.basis is None: 218 print('ERROR: A Basis object is required to initialize a DFT object.') 219 220 221 self.dmat_guess_method = dmat_guess_method 222 """ Initial guess for the density matrix """ 223 if self.dmat_guess_method is None: 224 self.dmat_guess_method = 'core' 225 226 self.dmat = None 227 """ Initial density matrix guess for SCF. Will get updated at each SCF iteration. """ 228 229 self.xc = xc 230 """ Exchange-Correlation Functional """ 231 if self.xc is None: 232 self.xc = [1, 7] #LDA 233 print("XC not specified, defaulting to LDA functional (LibXC codes: 1, 7)") 234 235 # DIIS 236 self.KSmats = [] 237 self.errVecs = [] 238 self.diisSpace = 6 239 240 self.conv_crit = conv_crit 241 """ Convergence criterion for SCF (in Hartrees) """ 242 # if self.conv_crit is None: 243 # self.conv_crit = 1.0E-7 244 245 self.max_itr = 50 246 """ Maximum number of iterations for SCF """ 247 # if self.max_itr is None: 248 # self.max_itr = 50 249 250 self.ncores = ncores 251 """ Number of cores to be used for DFT calculation """ 252 if self.ncores is None: 253 self.ncores = 2 254 255 self.grids = grids 256 """ Atomic grids for DFT calculation (If None or not supplied, will be generated using NumGrid) """ 257 self.gridsLevel = gridsLevel 258 """ Atomic grids for DFT calculation """ 259 260 self.isDF = True 261 """ Use density fitting (DF) for two-electron Coulomb integrals. This is only for developers. Users should not change it. """ 262 263 self.auxbasis = auxbasis 264 """ Basis object to be used as the auxiliary basis for DF """ 265 if self.auxbasis is None: 266 auxbasis = Basis(mol, {'all':Basis.load(mol=mol, basis_name='def2-universal-jfit')}) 267 268 self.rys = True 269 """ Use rys quadrature for the evaluation of two electron integrals (with and without DF)""" 270 271 self.DF_algo = 10 272 """ This is only for developers. Users should not change it. """ 273 274 self.blocksize = blocksize 275 """ Block size for the evaulation of XC term on grids. For CPUs a value of ~5000 is recommended. For GPUs, a value >20480 is recommended. """ 276 if blocksize is None: 277 if use_gpu: 278 self.blocksize = 20480 279 else: 280 self.blocksize = 5000 281 282 self.XC_algo = None 283 """ This is only for developers. Users should not change it. The algorithm for XC evaluation should be 2 for CPU and 3 for GPU.""" 284 if use_gpu: 285 self.XC_algo = 3 286 else: 287 self.XC_algo = 2 288 289 self.debug = False 290 """ Turn on printing debug statements """ 291 self.sortGrids = False 292 """ Enable/Disable sorting of DFT grids. Doesn't seem to offer any signficant advantage.""" 293 self.save_ao_values = save_ao_values 294 """ Whether to save atomic orbital (AO) values for reuse during XC evaluation. Improves performance but requires more memory. """ 295 self.xc_bf_screen = True 296 """ Enable screening of basis functions for XC term evaluation to reduce computation time drastically. """ 297 self.threshold_schwarz = 1e-09 298 """ Threshold for Schwarz screening of two-electron integrals. Smaller values increase accuracy but reduce sparsity. """ 299 self.strict_schwarz = True 300 """ If True, enforce stricter Schwarz screening to aggressively eliminate small two-electron integrals. """ 301 self.cholesky = True 302 """ Whether to use Cholesky decomposition for DF. Slightly speeds up calculations. """ 303 self.orthogonalize = True 304 """ Apply orthogonalization to the AO basis. Should be True for most standard calculations. """ 305 306 self.mo_occupations = None 307 """ Molecular orbital occupations. Will be computed during SCF. """ 308 self.mo_coefficients = None 309 """ Molecular orbital coefficients. Will be computed during SCF. """ 310 self.mo_energies = None 311 """ Molecular orbital energies. Will be computed during SCF. """ 312 self.Total_energy = None 313 """ Total energy of the system. Will be computed during SCF. """ 314 self.J_energy = None 315 """ Coulomb energy contribution. Will be computed during SCF. """ 316 self.XC_energy = None 317 """ Exchange-correlation energy contribution. Will be computed during SCF. """ 318 self.Nuclear_rep_energy = None 319 """ Nuclear repulsion energy. Will be computed during SCF. """ 320 self.Kinetic_energy = None 321 """ Kinetic energy contribution. Will be computed during SCF. """ 322 self.Nuc_energy = None 323 """ Nuclear potential energy contribution. Will be computed during SCF. """ 324 325 326 # CAO or SAO 327 self.sao = False 328 """ Whether to use SAO basis or CAO basis. Default is CAO basis. """ 329 330 331 # GPU acceleration 332 self.use_gpu = use_gpu 333 """ Whether to use GPU acceleration or not """ 334 self.keep_ao_in_gpu = True 335 """ Whether to keep the atomic orbitals for XC evaluation in GPU memory or CPU memory. Only relevant if save_ao_values = True. """ 336 self.use_libxc = True 337 """ Whether to use LibXC's version of XC functionals or PyFock implementations. 338 Only relevant when GPU is used. For GPU calculations it is recommended to use PyFock 339 implementation as it avoids CPU-GPU transfers.""" 340 self.n_streams = 1 341 self.n_gpus = 1 342 """ Number of GPUs to be used """ 343 self.free_gpu_mem = False 344 """ Whether the GPU memory should be freed by force or not""" 345 try: 346 self.max_threads_per_block = cuda.get_current_device().MAX_THREADS_PER_BLOCK 347 except: 348 self.max_threads_per_block = 1024 349 350 self.threads_x = int(self.max_threads_per_block/16) 351 self.threads_y = int(self.max_threads_per_block/64) 352 self.dynamic_precision = False # Only for the XC term 353 """ Whether to use dynamic precision switching for XC term or not """ 354 self.keep_ints3c2e_in_gpu = True 355 """ Whether to keep the 3c2e integrals in GPU memory or not. 356 Recommended to keep in GPU memory to avoid CPU-GPU transfers at each iteration.""" 357 358 # self.max_threads_per_block = 1024 359 # self.threads_x = 64 360 # self.threads_y = 16 361 362 # if self.use_gpu: 363 # try: 364 # global cp 365 # global cupy_scipy 366 # import cupy as cp 367 # import cupyx.scipy as cupy_scipy 368 # # from cupy.linalg import eig, multi_dot as dot 369 # except ModuleNotFoundError: 370 # print('Cupy was not found!')
Use density fitting (DF) for two-electron Coulomb integrals. This is only for developers. Users should not change it.
Block size for the evaulation of XC term on grids. For CPUs a value of ~5000 is recommended. For GPUs, a value >20480 is recommended.
This is only for developers. Users should not change it. The algorithm for XC evaluation should be 2 for CPU and 3 for GPU.
Whether to save atomic orbital (AO) values for reuse during XC evaluation. Improves performance but requires more memory.
Enable screening of basis functions for XC term evaluation to reduce computation time drastically.
Threshold for Schwarz screening of two-electron integrals. Smaller values increase accuracy but reduce sparsity.
If True, enforce stricter Schwarz screening to aggressively eliminate small two-electron integrals.
Apply orthogonalization to the AO basis. Should be True for most standard calculations.
Whether to keep the atomic orbitals for XC evaluation in GPU memory or CPU memory. Only relevant if save_ao_values = True.
Whether to use LibXC's version of XC functionals or PyFock implementations. Only relevant when GPU is used. For GPU calculations it is recommended to use PyFock implementation as it avoids CPU-GPU transfers.
Whether to keep the 3c2e integrals in GPU memory or not. Recommended to keep in GPU memory to avoid CPU-GPU transfers at each iteration.
375 def nuclear_rep_energy(self, mol=None): 376 """ 377 Compute the nuclear-nuclear repulsion energy. 378 379 Parameters 380 ---------- 381 mol : Molecule, optional 382 Molecule object containing nuclear coordinates and charges. 383 If None, uses `self.mol`. 384 385 Returns 386 ------- 387 float 388 The nuclear repulsion energy in Hartrees. 389 """ 390 # Nuclear-nuclear energy 391 if mol is None: 392 mol = self.mol 393 394 e = 0 395 for i in range(mol.natoms): 396 for j in range(i, mol.natoms): 397 if (i!=j): 398 dist = mol.coordsBohrs[i]-mol.coordsBohrs[j] 399 e = e + mol.Zcharges[i]*mol.Zcharges[j]/np.sqrt(np.sum(dist**2)) 400 401 return e
Compute the nuclear-nuclear repulsion energy.
Parameters
mol : Molecule, optional
Molecule object containing nuclear coordinates and charges.
If None, uses self.mol.
Returns
float The nuclear repulsion energy in Hartrees.
404 def gen_dm(self, mo_coeff, mo_occ): 405 """ 406 Generate the density matrix from molecular orbital coefficients and occupations. 407 408 Parameters 409 ---------- 410 mo_coeff : ndarray 411 Molecular orbital coefficient matrix. 412 413 mo_occ : ndarray 414 Array of MO occupation numbers. 415 416 Returns 417 ------- 418 ndarray 419 Density matrix (RHF/RKS type). 420 """ 421 mocc = mo_coeff[:,mo_occ>0] 422 423 return np.dot(mocc*mo_occ[mo_occ>0], mocc.conj().T)
Generate the density matrix from molecular orbital coefficients and occupations.
Parameters
mo_coeff : ndarray Molecular orbital coefficient matrix.
mo_occ : ndarray Array of MO occupation numbers.
Returns
ndarray Density matrix (RHF/RKS type).
425 def getOcc(self, mol=None, energy_mo=None, coeff_mo=None): 426 """ 427 Assign occupation numbers to molecular orbitals based on their energies. 428 429 Parameters 430 ---------- 431 mol : Molecule 432 Molecule object to extract the number of electrons. 433 434 energy_mo : ndarray 435 Array of MO energies. 436 437 coeff_mo : ndarray 438 Array of MO coefficients (not used but kept for compatibility). 439 440 Returns 441 ------- 442 ndarray 443 Array of MO occupations (0 or 2 for RHF). 444 """ 445 e_idx = np.argsort(energy_mo) 446 e_sort = energy_mo[e_idx] 447 nmo = energy_mo.size 448 occ_mo = np.zeros(nmo) 449 nocc = mol.nelectrons // 2 450 occ_mo[e_idx[:nocc]] = 2 451 return occ_mo
Assign occupation numbers to molecular orbitals based on their energies.
Parameters
mol : Molecule Molecule object to extract the number of electrons.
energy_mo : ndarray Array of MO energies.
coeff_mo : ndarray Array of MO coefficients (not used but kept for compatibility).
Returns
ndarray Array of MO occupations (0 or 2 for RHF).
453 def gen_dm_cupy(self, mo_coeff, mo_occ): 454 """ 455 Generate the density matrix using CuPy for GPU acceleration. 456 457 Parameters 458 ---------- 459 mo_coeff : cp.ndarray 460 Molecular orbital coefficient matrix (on GPU). 461 462 mo_occ : cp.ndarray 463 Array of MO occupation numbers (on GPU). 464 465 Returns 466 ------- 467 cp.ndarray 468 Density matrix (RHF/RKS type) computed on the GPU. 469 """ 470 mocc = mo_coeff[:,mo_occ>0] 471 return cp.dot(mocc*mo_occ[mo_occ>0], mocc.conj().T)
Generate the density matrix using CuPy for GPU acceleration.
Parameters
mo_coeff : cp.ndarray Molecular orbital coefficient matrix (on GPU).
mo_occ : cp.ndarray Array of MO occupation numbers (on GPU).
Returns
cp.ndarray Density matrix (RHF/RKS type) computed on the GPU.
473 def getOcc_cupy(self, mol=None, energy_mo=None, coeff_mo=None): 474 """ 475 Assign occupation numbers to MOs using CuPy (for GPU-based calculations). 476 477 Parameters 478 ---------- 479 mol : Molecule 480 Molecule object to determine number of electrons. 481 482 energy_mo : cp.ndarray 483 MO energy array on the GPU. 484 485 coeff_mo : cp.ndarray 486 MO coefficients array on the GPU (not used). 487 488 Returns 489 ------- 490 cp.ndarray 491 Array of MO occupations (on GPU). 492 """ 493 e_idx = cp.argsort(energy_mo) 494 e_sort = energy_mo[e_idx] 495 nmo = energy_mo.size 496 occ_mo = cp.zeros(nmo) 497 nocc = mol.nelectrons // 2 498 occ_mo[e_idx[:nocc]] = 2 499 return occ_mo
Assign occupation numbers to MOs using CuPy (for GPU-based calculations).
Parameters
mol : Molecule Molecule object to determine number of electrons.
energy_mo : cp.ndarray MO energy array on the GPU.
coeff_mo : cp.ndarray MO coefficients array on the GPU (not used).
Returns
cp.ndarray Array of MO occupations (on GPU).
501 def solve(self, H, S, orthogonalize=False, x=None): 502 """ 503 Solve the generalized or canonical eigenvalue equation. 504 505 Parameters 506 ---------- 507 H : ndarray 508 Hamiltonian matrix. 509 510 S : ndarray 511 Overlap matrix. 512 513 orthogonalize : bool, optional 514 If True, solve using orthogonalized basis. 515 516 x : ndarray, optional 517 Transformation matrix (if already computed). 518 519 Returns 520 ------- 521 tuple of (ndarray, ndarray) 522 Eigenvalues and eigenvectors of the system. 523 """ 524 if not orthogonalize: 525 #Solve the generalized eigenvalue equation HC = SCE 526 eigvalues, eigvectors = scipy.linalg.eigh(H, S) 527 528 else: 529 if x is None: 530 eig_val_s, eig_vec_s = scipy.linalg.eigh(S) 531 # Removing the eigenvectors assoicated to the smallest eigenvalue. 532 x = eig_vec_s[:,eig_val_s>1e-7] / np.sqrt(eig_val_s[eig_val_s>1e-7]) 533 xHx = x.T @ H @ x 534 #Solve the canonical eigenvalue equation HC = CE 535 eigvalues, eigvectors = scipy.linalg.eigh(xHx) 536 eigvectors = np.dot(x, eigvectors) 537 538 idx = np.argmax(np.abs(eigvectors.real), axis=0) 539 eigvectors[:,eigvectors[idx,np.arange(len(eigvalues))].real<0] *= -1 540 return eigvalues, eigvectors # E, C
Solve the generalized or canonical eigenvalue equation.
Parameters
H : ndarray Hamiltonian matrix.
S : ndarray Overlap matrix.
orthogonalize : bool, optional If True, solve using orthogonalized basis.
x : ndarray, optional Transformation matrix (if already computed).
Returns
tuple of (ndarray, ndarray) Eigenvalues and eigenvectors of the system.
542 def solve_cupy(self, H, S, orthogonalize=True): 543 """ 544 Solve the generalized eigenvalue problem using CuPy (GPU). 545 546 Parameters 547 ---------- 548 H : cp.ndarray 549 Hamiltonian matrix (on GPU). 550 551 S : cp.ndarray 552 Overlap matrix (on GPU). 553 554 orthogonalize : bool, optional 555 If True, solve using orthogonalized basis. 556 557 Returns 558 ------- 559 tuple of (cp.ndarray, cp.ndarray) 560 Eigenvalues and eigenvectors (on GPU). 561 """ 562 eig_val_s, eig_vec_s = cp.linalg.eigh(S) 563 # Removing the eigenvectors assoicated to the smallest eigenvalue. 564 x = eig_vec_s[:,eig_val_s>1e-7] / cp.sqrt(eig_val_s[eig_val_s>1e-7]) 565 xhx = x.T @ H @ x 566 #Solve the canonical eigenvalue equation HC = SCE 567 eigvalues, eigvectors = cp.linalg.eigh(xhx) 568 eigvectors = cp.dot(x, eigvectors) 569 570 idx = cp.argmax(cp.abs(eigvectors.real), axis=0) 571 eigvectors[:,eigvectors[idx,cp.arange(len(eigvalues))].real<0] *= -1 572 return eigvalues, eigvectors # E, C
Solve the generalized eigenvalue problem using CuPy (GPU).
Parameters
H : cp.ndarray Hamiltonian matrix (on GPU).
S : cp.ndarray Overlap matrix (on GPU).
orthogonalize : bool, optional If True, solve using orthogonalized basis.
Returns
tuple of (cp.ndarray, cp.ndarray) Eigenvalues and eigenvectors (on GPU).
575 def getCoreH(self, mol=None, basis=None): 576 """ 577 Compute the core Hamiltonian matrix (T + V). 578 579 Parameters 580 ---------- 581 mol : Molecule, optional 582 Molecule object. 583 584 basis : Basis, optional 585 Basis set object. 586 587 Returns 588 ------- 589 ndarray 590 Core Hamiltonian matrix. 591 """ 592 #Get the core Hamiltonian 593 if mol is None: 594 mol = self.mol 595 if basis is None: 596 basis = self.basis 597 nao = basis.bfs_nao 598 H = np.empty((nao,nao)) 599 Vmat = Integrals.nuc_mat_symm(basis, mol) 600 Tmat = Integrals.kin_mat_symm(basis) 601 H = Vmat + Tmat 602 603 return H
Compute the core Hamiltonian matrix (T + V).
Parameters
mol : Molecule, optional Molecule object.
basis : Basis, optional Basis set object.
Returns
ndarray Core Hamiltonian matrix.
606 def guessCoreH(self, mol=None, basis=None, Hcore=None, S=None): 607 """ 608 Generate a guess density matrix using the core Hamiltonian. 609 610 Parameters 611 ---------- 612 mol : Molecule, optional 613 Molecule object. 614 615 basis : Basis, optional 616 Basis set. 617 618 Hcore : ndarray, optional 619 Core Hamiltonian. If None, it will be computed. 620 621 S : ndarray, optional 622 Overlap matrix. If None, it will be computed. 623 624 Returns 625 ------- 626 ndarray 627 Initial guess density matrix. 628 """ 629 #Get a guess for the density matrix using the core Hamiltonian 630 if mol is None: 631 mol = self.mol 632 if basis is None: 633 basis = self.basis 634 if Hcore is None: 635 Hcore = self.getCoreH(mol, basis) 636 if S is None: 637 S = Integrals.overlap_mat_symm(basis) 638 639 eigvalues, eigvectors = scipy.linalg.eigh(Hcore, S) 640 # print(eigvalues) 641 idx = np.argmax(abs(eigvectors.real), axis=0) 642 eigvectors[:,eigvectors[idx,np.arange(len(eigvalues))].real<0] *= -1 643 mo_occ = self.getOcc(mol, eigvalues, eigvectors) 644 # print(mo_occ) 645 dmat = self.gen_dm(eigvectors, mo_occ) 646 647 return dmat
Generate a guess density matrix using the core Hamiltonian.
Parameters
mol : Molecule, optional Molecule object.
basis : Basis, optional Basis set.
Hcore : ndarray, optional Core Hamiltonian. If None, it will be computed.
S : ndarray, optional Overlap matrix. If None, it will be computed.
Returns
ndarray Initial guess density matrix.
650 def DIIS(self,S,D,F): 651 """ 652 Perform Direct Inversion in the Iterative Subspace (DIIS) to improve SCF convergence. 653 654 Adapted from 655 ---------- 656 McMurchie-Davidson project: 657 https://github.com/jjgoings/McMurchie-Davidson 658 Licensed under the BSD-3-Clause license 659 660 Parameters 661 ---------- 662 S : ndarray 663 Overlap matrix. 664 665 D : ndarray 666 Density matrix. 667 668 F : ndarray 669 Fock matrix. 670 671 Returns 672 ------- 673 ndarray 674 DIIS-extrapolated Fock matrix. 675 """ 676 FDS = dot([F,D,S]) 677 # SDF = np.conjugate(FDS).T 678 errVec = FDS - np.conjugate(FDS).T 679 self.KSmats.append(F) 680 self.errVecs.append(errVec) 681 nKS = len(self.KSmats) 682 if nKS > self.diisSpace: 683 self.KSmats.pop(0) 684 self.errVecs.pop(0) 685 nKS = nKS - 1 686 B = np.zeros((nKS + 1,nKS + 1)) 687 B[-1,:] = B[:,-1] = -1.0 688 B[-1,-1] = 0.0 689 # B is symmetric 690 for i in range(nKS): 691 for j in range(i+1): 692 B[i,j] = B[j,i] = \ 693 np.real(np.trace(np.dot(np.conjugate(self.errVecs[i]).T, self.errVecs[j]))) 694 # for i in range(nKS): 695 # for j in range(i+1): 696 # print(self.errVecs[i].shape) 697 # B[i,j] = np.real(np.dot(np.conjugate(self.errVecs[i]).T, self.errVecs[j])) 698 # B[j,i] = B[i,j] 699 700 residual = np.zeros((nKS + 1, 1)) 701 residual[-1] = -1.0 702 weights = scipy.linalg.solve(B,residual) 703 704 # weights is 1 x numFock + 1, but first numFock values 705 # should sum to one if we are doing DIIS correctly 706 assert np.isclose(sum(weights[:-1]),1.0) 707 708 F = np.zeros(F.shape) 709 for i, KS in enumerate(self.KSmats): 710 weight = weights[i] 711 F += numexpr.evaluate('(weight * KS)') 712 713 return F
Perform Direct Inversion in the Iterative Subspace (DIIS) to improve SCF convergence.
Adapted from
McMurchie-Davidson project: https://github.com/jjgoings/McMurchie-Davidson Licensed under the BSD-3-Clause license
Parameters
S : ndarray Overlap matrix.
D : ndarray Density matrix.
F : ndarray Fock matrix.
Returns
ndarray DIIS-extrapolated Fock matrix.
715 def DIIS_cupy(self,S,D,F): 716 """ 717 Perform DIIS on GPU using CuPy to accelerate SCF convergence. 718 719 Adapted from 720 ---------- 721 McMurchie-Davidson project: 722 https://github.com/jjgoings/McMurchie-Davidson 723 Licensed under the BSD-3-Clause license 724 725 Parameters 726 ---------- 727 S : cp.ndarray 728 Overlap matrix (on GPU). 729 730 D : cp.ndarray 731 Density matrix (on GPU). 732 733 F : cp.ndarray 734 Fock matrix (on GPU). 735 736 Returns 737 ------- 738 cp.ndarray 739 DIIS-extrapolated Fock matrix (on GPU). 740 """ 741 FDS = F @ D @ S 742 errVec = FDS - cp.conjugate(FDS).T 743 self.KSmats.append(F) 744 self.errVecs.append(errVec) 745 nKS = len(self.KSmats) 746 if nKS > self.diisSpace: 747 self.KSmats.pop(0) 748 self.errVecs.pop(0) 749 nKS = nKS - 1 750 B = cp.zeros((nKS + 1,nKS + 1)) 751 B[-1,:] = B[:,-1] = -1.0 752 B[-1,-1] = 0.0 753 # B is symmetric 754 for i in range(nKS): 755 for j in range(i+1): 756 B[i,j] = B[j,i] = \ 757 cp.real(cp.trace(cp.dot(cp.conjugate(self.errVecs[i]).T, self.errVecs[j]))) 758 759 residual = cp.zeros((nKS + 1, 1)) 760 residual[-1] = -1.0 761 weights = cp.linalg.solve(B,residual) 762 763 # weights is 1 x numFock + 1, but first numFock values 764 # should sum to one if we are doing DIIS correctly 765 assert cp.isclose(sum(weights[:-1]),1.0) 766 767 F = cp.zeros(F.shape) 768 for i, KS in enumerate(self.KSmats): 769 weight = weights[i] 770 F += weight * KS 771 772 return F
Perform DIIS on GPU using CuPy to accelerate SCF convergence.
Adapted from
McMurchie-Davidson project: https://github.com/jjgoings/McMurchie-Davidson Licensed under the BSD-3-Clause license
Parameters
S : cp.ndarray Overlap matrix (on GPU).
D : cp.ndarray Density matrix (on GPU).
F : cp.ndarray Fock matrix (on GPU).
Returns
cp.ndarray DIIS-extrapolated Fock matrix (on GPU).
775 def scf(self): 776 """ 777 Perform Self-Consistent Field (SCF) calculation for Density Functional Theory (DFT). 778 779 This method implements a complete DFT SCF procedure including: 780 - One-electron integral calculation (overlap, kinetic, nuclear attraction) 781 - Two-electron integral calculation (4-center ERIs or density fitting) 782 - Grid generation and pruning for exchange-correlation evaluation 783 - Iterative SCF cycles with DIIS convergence acceleration 784 - Exchange-correlation energy and potential evaluation using LibXC 785 - GPU acceleration support for performance-critical operations 786 787 The implementation supports multiple algorithmic variants: 788 - Density fitting (DF) 789 - Schwarz screening for integral sparsity 790 - Multiple XC evaluation algorithms (CPU/GPU optimized) 791 - Dynamic precision switching for GPU calculations 792 793 SCF Procedure: 794 1. Initialize one-electron integrals (S, T, V_nuc) 795 2. Calculate/prepare two-electron integrals with optional screening 796 3. Generate and prune integration grids for XC evaluation 797 4. Iterative SCF loop: 798 - Build Coulomb matrix J from density matrix 799 - Evaluate exchange-correlation energy/potential on grids 800 - Form Kohn-Sham matrix: H_KS = H_core + J + V_xc 801 - Apply DIIS convergence acceleration 802 - Diagonalize KS matrix to get new orbitals 803 - Generate new density matrix from occupied orbitals 804 - Check energy convergence 805 5. Return converged total energy and density matrix 806 807 Computational Features: 808 - Multi-threading support via Numba and configurable core count 809 - GPU acceleration using CuPy for grid-based operations 810 - Memory-efficient batched evaluation of XC terms 811 - Multiple density fitting algorithms (DF_algo 1-10) 812 - Basis function screening for XC evaluation efficiency 813 - Optional Cholesky decomposition for 2-center integrals 814 815 Returns: 816 -------- 817 tuple[float, numpy.ndarray] 818 Etot : float 819 Converged total electronic energy in atomic units 820 dmat : numpy.ndarray, shape (nbf, nbf) 821 Converged density matrix in atomic orbital basis 822 823 Raises: 824 ------- 825 ConvergenceError 826 If SCF fails to converge within max_itr iterations 827 828 Notes: 829 ------ 830 - Uses class attributes for all computational parameters (basis, xc, conv_crit, etc.) 831 - Extensive timing and profiling information printed during execution 832 - Memory usage information displayed for large arrays 833 - GPU memory automatically freed after completion 834 - Supports both Cartesian (CAO) and Spherical (SAO) atomic orbital bases 835 836 The function performs comprehensive error checking and provides detailed 837 timing breakdowns for performance analysis. GPU acceleration is automatically 838 enabled when CuPy is available and use_gpu=True. 839 840 Example Energy Components: 841 - Electron-nuclear attraction energy 842 - Nuclear repulsion energy 843 - Kinetic energy 844 - Coulomb (electron-electron repulsion) energy 845 - Exchange-correlation energy 846 """ 847 #### Timings 848 duration1e = 0 849 durationDIIS = 0 850 durationAO_values = 0 851 durationCoulomb = 0 852 durationgrids = 0 853 durationItr = 0 854 durationxc = 0 855 durationXCpreprocessing = 0 856 durationDF = 0 857 durationKS = 0 858 durationSCF = 0 859 durationgrids_prune_rho = 0 860 durationSchwarz = 0 861 duration2c2e = 0 862 durationDF_coeff = 0 863 durationDF_gamma = 0 864 durationDF_Jtri = 0 865 durationDF_cholesky = 0 866 startSCF = timer() 867 868 mol = self.mol 869 basis = self.basis 870 dmat = self.dmat 871 xc = self.xc 872 conv_crit = self.conv_crit 873 max_itr = self.max_itr 874 ncores = self.ncores 875 grids = self.grids 876 gridsLevel = self.gridsLevel 877 isDF = self.isDF 878 auxbasis = self.auxbasis 879 rys = self.rys 880 DF_algo = self.DF_algo 881 blocksize = self.blocksize 882 XC_algo = self.XC_algo 883 debug = self.debug 884 sortGrids = self.sortGrids 885 save_ao_values = self.save_ao_values 886 xc_bf_screen = self.xc_bf_screen 887 threshold_schwarz = self.threshold_schwarz 888 strict_schwarz = self.strict_schwarz 889 cholesky = self.cholesky 890 orthogonalize = self.orthogonalize 891 892 print_pyfock_logo() 893 print('\n\nNumber of atoms:', mol.natoms) 894 print('\n\nNumber of basis functions (atomic orbitals):', basis.bfs_nao) 895 print('\n\nNumber of auxiliary basis functions:', auxbasis.bfs_nao) 896 print('\n\n================================================================================\n\n') 897 898 print_sys_info() 899 900 #### Set number of cores 901 numba.set_num_threads(ncores) 902 os.environ['RAYON_NUM_THREADS'] = str(ncores) 903 904 print('Running DFT using '+str(numba.get_num_threads())+' threads for Numba.\n\n', flush=True) 905 # if basis is None: 906 # basis = self.basis 907 # if mol is None: 908 # mol = self.mol 909 # if xc is None: 910 # xc = self.xc 911 # if gridsLevel is None: 912 # gridsLevel = self.gridsLevel 913 # if auxbasis is None: 914 # auxbasis = Basis(mol, {'all':Basis.load(mol=mol, basis_name='def2-universal-jfit')}) 915 if grids is None: 916 sortGrids = True 917 if xc_bf_screen==False: 918 if save_ao_values==True: 919 print('Warning! AO screening is set to False, but AO values are requested to be saved. \ 920 AO values can only be saved when XC_BF_SCREEN=TRUE. AO values will not be saved.', flush=True) 921 save_ao_values = False 922 if CUPY_AVAILABLE: 923 if self.use_gpu: 924 print('GPU acceleration is enabled. Currently this only accelerates AO values and XC term evaluation.', flush=True) 925 print('GPU(s) information:') 926 # print(cp.cuda.Device.mem_info()) 927 print(cuda.detect()) 928 print('Max threads per block supported by the GPU: ', cuda.get_current_device().MAX_THREADS_PER_BLOCK, flush=True) 929 print('The user has specified to use '+str(self.n_gpus)+' GPU(s).') 930 931 # For CUDA computations 932 threads_per_block = (self.threads_x, self.threads_y) 933 print('Threads per block configuration for the XC term: ', threads_per_block, flush=True) 934 print('Threads per block configuration for the all other calculations: ', (32, 32), flush=True) 935 if self.dynamic_precision: 936 print('\n\nWill use dynamic precision. ') 937 print('This means that the XC term will be evaluated in single precision until the ') 938 print('relative energy difference b/w successive iterations is less than 5.0E-7.') 939 precision_XC = cp.float32 940 else: 941 precision_XC = cp.float64 942 943 if XC_algo is None: 944 XC_algo = 3 945 if blocksize is None: 946 blocksize = 20480 947 streams = [] 948 nb_streams = [] 949 for i in range(self.n_gpus): 950 cp.cuda.Device(i).use() 951 cp_stream = cp.cuda.Stream(non_blocking = True) 952 nb_stream = cuda.external_stream(cp_stream.ptr) 953 streams.append(cp_stream) 954 nb_streams.append(nb_stream) 955 # Switch back to main GPU 956 cp.cuda.Device(0).use() 957 streams[0].use() 958 # # Set some basis function data as cupy arrays to avoid redoing it during XC term evaluation at every SCF iteration 959 # bfs_coords = cp.asarray([basis.bfs_coords], dtype=precision_XC) 960 # bfs_contr_prim_norms = cp.asarray([basis.bfs_contr_prim_norms], dtype=precision_XC) 961 # bfs_lmn = cp.asarray([basis.bfs_lmn]) 962 # bfs_nprim = cp.asarray([basis.bfs_nprim]) 963 # #The remaining properties like bfs_coeffs are a list of lists of unequal sizes. 964 # #Numba won't be able to work with these efficiently. 965 # #So, we convert them to a numpy 2d array by applying a trick, 966 # #that the second dimension is that of the largest list. So that 967 # #it can accomadate all the lists. 968 # maxnprim = max(basis.bfs_nprim) 969 # bfs_coeffs = cp.zeros([basis.bfs_nao, maxnprim], dtype=precision_XC) 970 # bfs_expnts = cp.zeros([basis.bfs_nao, maxnprim], dtype=precision_XC) 971 # bfs_prim_norms = cp.zeros([basis.bfs_nao, maxnprim], dtype=precision_XC) 972 # bfs_radius_cutoff = cp.zeros([basis.bfs_nao], dtype=precision_XC) 973 # for i in range(basis.bfs_nao): 974 # for j in range(basis.bfs_nprim[i]): 975 # bfs_coeffs[i,j] = basis.bfs_coeffs[i][j] 976 # bfs_expnts[i,j] = basis.bfs_expnts[i][j] 977 # bfs_prim_norms[i,j] = basis.bfs_prim_norms[i][j] 978 # bfs_radius_cutoff[i] = basis.bfs_radius_cutoff[i] 979 # # Now bf/ao values can be evaluated by calling the following 980 # # bf_values = Integrals.bf_val_helpers.eval_bfs(bfs_coords[0], bfs_contr_prim_norms[0], bfs_nprim[0], bfs_lmn[0], bfs_coeffs, bfs_prim_norms, bfs_expnts, bfs_radius_cutoff, coord) 981 # bfs_data_as_np_arrays = [bfs_coords[0], bfs_contr_prim_norms[0], bfs_nprim[0], bfs_lmn[0], bfs_coeffs, bfs_prim_norms, bfs_expnts, bfs_radius_cutoff] 982 983 984 else: 985 if self.use_gpu: 986 print('GPU acceleration requested but cannot be enabled as Cupy is not installed.', flush=True) 987 self.use_gpu = False 988 989 if XC_algo is None: 990 XC_algo = 2 991 if blocksize is None: 992 blocksize = 5000 993 if isDF: 994 isSchwarz = True 995 else: 996 isSchwarz = False # Schwarz screening is not yet implemented for 4c2e integrals 997 if strict_schwarz: 998 if not (DF_algo==6 or DF_algo==10): 999 print('Warning: The stricter variation of Schwarz screening is only compatible with DF algo #6 or #10 so turning it off.') 1000 strict_schwarz = False 1001 if cholesky: 1002 if not (DF_algo==6 or DF_algo==10): 1003 print('Warning: The Cholesky decomposition of 2c2e integrls is only compatible with DF algo #6 or #10 so turning it off.') 1004 cholesky = False 1005 1006 if self.sao: 1007 print('\n\nSpherical Atomic Orbitals are being used!\n\n') 1008 # Get the CAO to SAO transformation matrix 1009 c2sph_mat = basis.cart2sph_basis() # CAO --> SAO 1010 # Calculate the pseudoinverse transformation matrix (for back transformation of SAO dmat to CAO dmat) 1011 sph2c_mat_pseudo = basis.sph2cart_basis() # SAO --> CAO 1012 1013 1014 eigvectors = None 1015 # DF_algo = 1 # Worst algorithm for more than 500 bfs/auxbfs (requires 2x mem of 3c2e integrals and a large prefactor) 1016 # DF_algo = 2 # Sligthly better (2x memory efficient) algorithm than above (requires 1x mem of 3c2e integrals and a large prefactor) 1017 # DF_algo = 3 # Memory effcient without any prefactor. (Can easily be converted into a sparse version, unlike the others) (https://aip.scitation.org/doi/pdf/10.1063/1.1567253) 1018 # DF_algo = 4 # Same as 3, except now we use triangular version of ints3c2e to save on memory 1019 # DF_algo = 5 # Same as 4 in terms of memory requirements, however faster in performance due to the use of Schwarz screening. 1020 # DF_algo = 6 # Much cheaper than 4 and 5 in terms of memory requirements because the indices of significant (ij|P) are efficiently calculated without duplicates/temporary arrays. 1021 # The speed maybe same or just slightly slower. 1022 # DF_algo = 7 # The significant indices (ij|P) are stored even more efficiently by using shell indices instead of bf indices. 1023 # DF_algo = 8 # Similar to 6, except that here the significant indices are not stored resulting in 50% memory savings. The drawback is that it only works in serial which is useful for Google colab or Kaggle perhaps. 1024 # DF_algo = 9 # 1025 1026 if not strict_schwarz: # If a stricter variant of Schwarz screening is not requested 1027 start1e = timer() 1028 print('\nCalculating one electron integrals...\n\n', flush=True) 1029 # One electron integrals 1030 if not self.use_gpu: 1031 S = Integrals.overlap_mat_symm(basis) 1032 V = Integrals.nuc_mat_symm(basis, mol) 1033 T = Integrals.kin_mat_symm(basis) 1034 # Core hamiltonian 1035 H = T + V 1036 else: 1037 S = Integrals.overlap_mat_symm_cupy(basis, cp_stream=streams[0]) 1038 V = Integrals.nuc_mat_symm_cupy(basis, mol, cp_stream = streams[0]) 1039 T = Integrals.kin_mat_symm_cupy(basis, cp_stream = streams[0]) 1040 # Core hamiltonian 1041 H = T + V 1042 1043 print('Core H size in GB ',H.nbytes/1e9, flush=True) 1044 print('done!', flush=True) 1045 duration1e = timer() - start1e 1046 print('Time taken '+str(duration1e)+' seconds.\n', flush=True) 1047 else: 1048 start1e = timer() 1049 print('\nCalculating overlap and kinetic integrals...\n\n', flush=True) 1050 # One electron integrals 1051 if not self.use_gpu: 1052 S = Integrals.overlap_mat_symm(basis) 1053 T = Integrals.kin_mat_symm(basis) 1054 # Core hamiltonian 1055 H = T 1056 else: 1057 S = Integrals.overlap_mat_symm_cupy(basis, cp_stream = streams[0]) 1058 T = Integrals.kin_mat_symm_cupy(basis, cp_stream = streams[0]) 1059 # Core hamiltonian 1060 H = T 1061 1062 print('Core H size in GB ',(H.nbytes/1e9)*2, flush=True) # Factor of 2 because nuclear matrix will also be included here later 1063 print('done!', flush=True) 1064 duration1e = timer() - start1e 1065 print('Time taken '+str(duration1e)+' seconds.\n', flush=True) 1066 1067 1068 if dmat is None: 1069 if self.dmat_guess_method=='core': 1070 dmat = self.guessCoreH(mol, basis, Hcore=H, S=S) 1071 1072 if self.use_gpu: 1073 dmat_cp = cp.asarray(dmat, dtype=cp.float64) 1074 streams[0].synchronize() 1075 cp.cuda.Stream.null.synchronize() 1076 1077 1078 if self.sao: 1079 # It is possible that the supplied density matrix to the SCF was in SAO format already. 1080 # In such a case we need to transform this density matrix to CAO basis so that the J and XC term evaluations can be done properly 1081 if not dmat.shape==S.shape: 1082 dmat = np.dot(sph2c_mat_pseudo, np.dot(dmat, sph2c_mat_pseudo.T)) # Convert to CAO from SAO (SAO --> CAO) 1083 # Later the dmat will be converted back to SAO after J and XC term evaluations 1084 if not isDF: # 4c2e ERI case 1085 1086 print('\nCalculating four centered two electron integrals (ERIs)...\n\n', flush=True) 1087 if not isSchwarz: 1088 # Four centered two electron integrals (ERIs) 1089 if rys: 1090 ints4c2e = Integrals.rys_4c2e_symm(basis) 1091 else: 1092 ints4c2e = Integrals.conv_4c2e_symm(basis) 1093 print('Four Center Two electron ERI size in GB ',ints4c2e.nbytes/1e9, flush=True) 1094 print('done!') 1095 else: 1096 print('\n\nPerforming Schwarz screening...') 1097 # threshold_schwarz = 1e-09 1098 print('Threshold ', threshold_schwarz) 1099 startSchwarz = timer() 1100 nints4c2e_sym8 = int(basis.bfs_nao**4/8) 1101 nints4c2e = int(basis.bfs_nao**4) 1102 duration_4c2e_diag = 0.0 1103 start_4c2e_diag = timer() 1104 # Diagonal elements of ERI 4c2e array 1105 ints4c2e_diag = Integrals.schwarz_helpers.eri_4c2e_diag(basis) 1106 duration_4c2e_diag = timer() - start_4c2e_diag 1107 print('Time taken to evaluate the "diagonal" of 4c2e ERI tensor: ', duration_4c2e_diag) 1108 # Calculate the square roots required for 1109 duration_square_roots = 0.0 1110 start_square_roots = timer() 1111 sqrt_ints4c2e_diag = np.sqrt(np.abs(ints4c2e_diag)) 1112 duration_square_roots = timer() - start_square_roots 1113 print('Time taken to evaluate the square roots needed: ', duration_square_roots) 1114 chunksize = int(1e9) # Results in 2 GB chunks 1115 duration_indices_calc = 0.0 1116 duration_concatenation = 0.0 1117 start_indices_calc = timer() 1118 # indices_temp = [] 1119 ijkl = [0, 0, 0, 0] 1120 if chunksize<nints4c2e_sym8: 1121 nchunks = nints4c2e_sym8//chunksize + 1 1122 else: 1123 nchunks=1 1124 indicesA = None 1125 for ichunk in range(nchunks): 1126 indices_temp = Integrals.schwarz_helpers.calc_indices_4c2e_schwarz(sqrt_ints4c2e_diag, min(chunksize, nints4c2e_sym8), basis.bfs_nao, ijkl[0], ijkl[1], ijkl[2], ijkl[3], threshold_schwarz) 1127 1128 ijkl = indices_temp[4] 1129 count = indices_temp[5] 1130 # start_concatenation = timer() 1131 if indicesA is not None and count>0: 1132 indicesA = np.concatenate([indicesA, indices_temp[0][0:count]]) 1133 indicesB = np.concatenate([indicesB, indices_temp[1][0:count]]) 1134 indicesC = np.concatenate([indicesC, indices_temp[2][0:count]]) 1135 indicesD = np.concatenate([indicesD, indices_temp[3][0:count]]) 1136 else: 1137 1138 indicesA = indices_temp[0][0:count] 1139 indicesB = indices_temp[1][0:count] 1140 indicesC = indices_temp[2][0:count] 1141 indicesD = indices_temp[3][0:count] 1142 # duration_concatenation += timer() - start_concatenation 1143 # Break out of the for loop if the nol. of significant triplets found is less than the chunksize 1144 # This is because, it means that there are no more significant triplets to be found from all possible configurations. 1145 if count<chunksize: 1146 break 1147 1148 duration_indices_calc += timer() - start_indices_calc 1149 print('Time for significant indices evaluation: ', duration_indices_calc) 1150 # print('Time for array concatenation: ', duration_concatenation) 1151 1152 # Get rid of temp variables 1153 indices_temp=0 1154 ijk = 0 1155 1156 print('Size of permanent array storing the significant indices of 4c2e ERI in GB ', indicesA.nbytes/1e9+indicesB.nbytes/1e9+indicesC.nbytes/1e9+indicesD.nbytes/1e9, flush=True) 1157 1158 nsignificant = len(indicesA) 1159 print('No. of elements in the standard four-centered two electron ERI tensor: ', nints4c2e, flush=True) 1160 print('No. of elements after factoring in the 8-fold symmetry in the four-centered two electron ERI tensor: ', nints4c2e_sym8, flush=True) 1161 print('No. of significant quadruplets based on Schwarz inequality and 8-fold symmetry: ' + str(nsignificant) + ' or '+str(np.round(nsignificant/nints4c2e*100,1)) + '% of original', flush=True) 1162 print('Schwarz screening done!') 1163 durationSchwarz = timer() - startSchwarz 1164 print('Total time taken for Schwarz screening '+str(durationSchwarz)+' seconds.\n', flush=True) 1165 1166 ints4c2e = Integrals.schwarz_helpers.rys_4c2e_tri_schwarz_sparse(basis, auxbasis, indicesA, indicesB, indicesC, indicesD) 1167 1168 else: # Density fitting case (3c2e, and 2c2e will be calculated) 1169 H_temp, V_temp, ints3c2e, ints2c2e, nsignificant, indicesA, indicesB, indicesC, offsets_3c2e, indices, ints4c2e_diag, sqrt_ints4c2e_diag, sqrt_diag_ints2c2e, indices_dmat_tri, indices_dmat_tri_2, df_coeff0, Qpq, cho_decomp_ints2c2e, durationDF_cholesky, durationCoulomb = density_fitting_prelims_for_DFT_development(mol, basis, auxbasis, T, dmat, self.use_gpu, self.keep_ints3c2e_in_gpu, threshold_schwarz, strict_schwarz, rys, DF_algo, cholesky) 1170 if not H_temp is None: 1171 H = H_temp 1172 if not V_temp is None: 1173 V = V_temp 1174 if cholesky: 1175 durationDF += durationDF_cholesky 1176 1177 1178 1179 1180 1181 scf_converged = False 1182 durationgrids = 0 1183 1184 if grids is None: 1185 startGrids = timer() 1186 print('\nGenerating grids...\n\n', flush=True) 1187 # To get the same energies as PySCF (level=5) upto 1e-7 au, use the following settings 1188 # radial_precision = 1.0e-13 1189 # level=3 1190 # pruning by density with threshold = 1e-011 1191 # alpha_min and alpha_max corresponding to QZVP 1192 print('Grids level: ', gridsLevel) 1193 # Generate grids for XC term 1194 basisGrids = Basis(mol, {'all':Basis.load(mol=mol, basis_name='def2-QZVP')}) 1195 grids = Grids(mol, basis=basisGrids, level = gridsLevel, ncores=ncores) 1196 1197 print('done!', flush=True) 1198 durationgrids = timer() - startGrids 1199 print('Time taken '+str(durationgrids)+' seconds.\n', flush=True) 1200 1201 # Begin pruning the grids based on density (rho) 1202 # Evaluate ao_values to calculate rho 1203 print('\nPruning generated grids by rho...\n\n', flush=True) 1204 startGrids_prune_rho = timer() 1205 threshold_rho = 1e-011 1206 ngrids_temp = grids.coords.shape[0] 1207 ndeleted = 0 1208 blocksize_temp = 50000 1209 nblocks_temp = ngrids_temp//blocksize_temp 1210 weightsNew = None 1211 coordsNew = None 1212 for iblock in range(nblocks_temp+1): 1213 offset = iblock*blocksize_temp 1214 weights_block = grids.weights[offset : min(offset+blocksize_temp,ngrids_temp)] 1215 coords_block = grids.coords[offset : min(offset+blocksize_temp,ngrids_temp)] 1216 ao_value_block = Integrals.bf_val_helpers.eval_bfs(basis, coords_block) 1217 rho_block = contract('ij,mi,mj->m', dmat, ao_value_block, ao_value_block) 1218 zero_indices = np.where(np.abs(rho_block*weights_block) < threshold_rho)[0] 1219 ndeleted += len(zero_indices) 1220 weightsNew_block = np.delete(weights_block, zero_indices) 1221 coordsNew_block = np.delete(coords_block, zero_indices, 0) 1222 if weightsNew_block.shape[0]>0: 1223 if weightsNew is None: 1224 weightsNew = weightsNew_block 1225 coordsNew = coordsNew_block 1226 else: 1227 weightsNew = np.concatenate((weightsNew, weightsNew_block)) 1228 coordsNew = np.concatenate([coordsNew, coordsNew_block], axis=0) 1229 1230 grids.coords = coordsNew 1231 grids.weights = weightsNew 1232 print('done!', flush=True) 1233 durationgrids_prune_rho = timer() - startGrids_prune_rho 1234 print('Time taken '+str(durationgrids_prune_rho)+' seconds.\n', flush=True) 1235 print('\nDeleted '+ str(ndeleted) + ' grid points.', flush=True) 1236 1237 else: 1238 print('\nUsing the user supplied grids!\n\n', flush=True) 1239 1240 1241 # Grid information initial 1242 print('\nNo. of supplied/generated grid points: ', grids.coords.shape[0], flush=True) 1243 1244 # Prune grids based on weights 1245 # start_pruning_weights = timer() 1246 # print('\nPruning grids based on weights....', flush=True) 1247 # zero_indices = np.where(np.logical_and(grids.weights>=-1.0e-12, grids.weights<=1.e-12)) 1248 # grids.weights = np.delete(grids.weights, zero_indices) 1249 # grids.coords = np.delete(grids.coords, zero_indices, 0) 1250 # print('done!', flush=True) 1251 # duration_pruning_weights = timer() - start_pruning_weights 1252 # print('\nTime taken '+str(duration_pruning_weights)+' seconds.\n', flush=True) 1253 # print('\nNo. of grid points after screening by weights: ', grids.coords.shape[0], flush=True) 1254 1255 print('Size (in GB) for storing the coordinates of grid: ', grids.coords.nbytes/1e9, flush=True) 1256 print('Size (in GB) for storing the weights of grid: ', grids.weights.nbytes/1e9, flush=True) 1257 print('Size (in GB) for storing the density at gridpoints: ', grids.weights.nbytes/1e9, flush=True) 1258 1259 # Sort the grids for slightly better performance with batching (doesn't seem to make much difference) 1260 if sortGrids: 1261 print('\nSorting grids ....', flush=True) 1262 # Function to sort grids 1263 def get_ordered_list(points, x, y, z): 1264 points.sort(key = lambda p: (p[0] - x)**2 + (p[1] - y)**2 + (p[2] - z)**2) 1265 # print(points[0:10]) 1266 return points 1267 # Make a single array of coords and weights 1268 coords_weights = np.c_[grids.coords, grids.weights] 1269 coords_weights = np.array(get_ordered_list(coords_weights.tolist(), min(grids.coords[:,0]), min(grids.coords[:,1]), min(grids.coords[:,2]))) 1270 # Now go back to two arrays for coords and weights 1271 grids.weights = coords_weights[:,3] 1272 grids.coords = coords_weights[:,0:3] 1273 coords_weights = 0#None 1274 print('done!', flush=True) 1275 1276 # blocksize = 10000 1277 ngrids = grids.coords.shape[0] 1278 nblocks = ngrids//blocksize 1279 print('\nWill use batching to evaluate the XC term for memory efficiency.', flush=True) 1280 print('Batch size: ', blocksize, flush=True) 1281 print('No. of batches: ', nblocks+1, flush=True) 1282 1283 1284 #### Some preliminary stuff for XC evaluation 1285 durationXCpreprocessing = 0 1286 list_nonzero_indices = None 1287 count_nonzero_indices = None 1288 list_ao_values = None 1289 list_ao_grad_values = None 1290 if xc_bf_screen: 1291 xc_family_dict = {1:'LDA',2:'GGA',4:'MGGA'} 1292 # Create a LibXC object 1293 funcx = pylibxc.LibXCFunctional(xc[0], "unpolarized") 1294 funcc = pylibxc.LibXCFunctional(xc[1], "unpolarized") 1295 x_family_code = funcx.get_family() 1296 c_family_code = funcc.get_family() 1297 ### Find the list of significanlty contributing bfs for xc evaluations 1298 startXCpreprocessing = timer() 1299 print('\nPreliminary processing for XC term evaluations...', flush=True) 1300 print('Calculating the value of basis functions (atomic orbitals) and get the indices of siginificantly contributing functions...', flush=True) 1301 # Calculate the value of basis functions for all grid points in batches 1302 # and find the indices of basis functions that have a significant contribution to those batches for each batch 1303 if not self.use_gpu: 1304 list_nonzero_indices, count_nonzero_indices = Integrals.bf_val_helpers.nonzero_ao_indices(basis, grids.coords, blocksize, nblocks, ngrids) 1305 else: 1306 list_nonzero_indices, count_nonzero_indices = Integrals.bf_val_helpers.nonzero_ao_indices_cupy(basis, grids.coords, blocksize, nblocks, ngrids, streams[0]) 1307 print('done!', flush=True) 1308 durationXCpreprocessing = timer() - startXCpreprocessing 1309 print('Time taken '+str(durationXCpreprocessing)+' seconds.\n', flush=True) 1310 print('Maximum no. of basis functions contributing to a batch of grid points: ', max(count_nonzero_indices)) 1311 print('Average no. of basis functions contributing to a batch of grid points: ', int(np.mean(count_nonzero_indices))) 1312 1313 1314 durationAO_values = 0 1315 if save_ao_values: 1316 startAO_values = timer() 1317 if xc_family_dict[x_family_code]=='LDA' and xc_family_dict[c_family_code]=='LDA': 1318 print('\nYou have asked to save the values of significant basis functions on grid points so as to avoid recalculation for each SCF cycle.', flush=True) 1319 memory_required = sum(count_nonzero_indices*blocksize)*8/1024/1024/1024 1320 print('Please note: This will require addtional memory that is approximately: '+ str(np.round(memory_required,1))+ ' GB', flush=True) 1321 print('Calculating the value of significantly contributing basis functions (atomic orbitals)...', flush=True) 1322 list_ao_values = [] 1323 # Loop over batches 1324 for iblock in range(nblocks+1): 1325 offset = iblock*blocksize 1326 coords_block = grids.coords[offset : min(offset+blocksize,ngrids)] 1327 ao_values_block = Integrals.bf_val_helpers.eval_bfs(basis, coords_block, parallel=True, non_zero_indices=list_nonzero_indices[iblock][0:count_nonzero_indices[iblock]]) 1328 if self.use_gpu and self.keep_ao_in_gpu: 1329 list_ao_values.append(cp.asarray(ao_values_block)) 1330 else: 1331 list_ao_values.append(ao_values_block) 1332 #Free memory 1333 ao_values_block = 0 1334 if xc_family_dict[x_family_code]!='LDA' or xc_family_dict[c_family_code]!='LDA': 1335 print('\nYou have asked to save the values of significant basis functions and their gradients on grid points so as to avoid recalculation for each SCF cycle.', flush=True) 1336 memory_required = 4*sum(count_nonzero_indices*blocksize)*8/1024/1024/1024 1337 print('Please note: This will require addtional memory that is approximately :'+ str(np.round(memory_required,1))+ ' GB', flush=True) 1338 print('Calculating the value of significantly contributing basis functions (atomic orbitals)...', flush=True) 1339 list_ao_values = [] 1340 list_ao_grad_values = [] 1341 # Loop over batches 1342 for iblock in range(nblocks+1): 1343 offset = iblock*blocksize 1344 coords_block = grids.coords[offset : min(offset+blocksize,ngrids)] 1345 # ao_values_block = Integrals.bf_val_helpers.eval_bfs(basis, coords_block, parallel=True, non_zero_indices=list_nonzero_indices[iblock][0:count_nonzero_indices[iblock]]) 1346 ao_values_block, ao_grad_values_block = Integrals.bf_val_helpers.eval_bfs_and_grad(basis, coords_block, parallel=True, non_zero_indices=list_nonzero_indices[iblock][0:count_nonzero_indices[iblock]]) 1347 if self.use_gpu and self.keep_ao_in_gpu: 1348 list_ao_values.append(cp.asarray(ao_values_block)) 1349 list_ao_grad_values.append(cp.asarray(ao_grad_values_block)) 1350 else: 1351 list_ao_values.append(ao_values_block) 1352 list_ao_grad_values.append(ao_grad_values_block) 1353 #Free memory 1354 ao_values_block = 0 1355 ao_grad_values_block =0 1356 print('done!', flush=True) 1357 durationAO_values = timer() - startAO_values 1358 print('Time taken '+str(durationAO_values)+' seconds.\n', flush=True) 1359 1360 if self.use_gpu: 1361 grids.coords = cp.asarray(grids.coords, dtype=cp.float64) 1362 grids.weights = cp.asarray(grids.weights, dtype=cp.float64) 1363 1364 #-------XC Stuff start---------------------- 1365 1366 funcid = self.xc 1367 1368 xc_family_dict = {1:'LDA',2:'GGA',4:'MGGA'} 1369 1370 # Create a LibXC object 1371 funcx = pylibxc.LibXCFunctional(funcid[0], "unpolarized") 1372 funcc = pylibxc.LibXCFunctional(funcid[1], "unpolarized") 1373 x_family_code = funcx.get_family() 1374 c_family_code = funcc.get_family() 1375 1376 1377 print('\n\n------------------------------------------------------', flush=True) 1378 print('Exchange-Correlation Functional') 1379 print('------------------------------------------------------\n', flush=True) 1380 print('XC Functional IDs supplied: ', funcid, flush=True) 1381 print('\n\nDescription of exchange functional: \n') 1382 print('The Exchange function belongs to the family:', xc_family_dict[x_family_code], flush=True) 1383 print(funcx.describe()) 1384 print('\n\nDescription of correlation functional: \n', flush=True) 1385 print(' The Correlation function belongs to the family:', xc_family_dict[c_family_code], flush=True) 1386 print(funcc.describe()) 1387 print('------------------------------------------------------\n', flush=True) 1388 print('\n\n', flush=True) 1389 #-------XC Stuff end---------------------- 1390 1391 Etot = 0 1392 1393 itr = 1 1394 Enn = self.nuclear_rep_energy(mol) 1395 #diis = scf.CDIIS() 1396 dmat_old = 0 1397 J_diff = 0 1398 Ecoul = 0.0 1399 Ecoul_temp = 0.0 1400 1401 durationxc = 0 1402 1403 startKS = timer() 1404 if self.sao: 1405 ######### 1406 # I use a trick to calculate the DFT energy in SAO basis. 1407 # The trick is to get rid of the extra information in the CAO basis matrices. 1408 # The following does just that. 1409 # Going from CAO --> SAO we lose the extra information then we go back to from SAO --> CAO 1410 # This way all the integrals (potential matrices and density matrices) can still be calculated 1411 # in the CAO basis. In fact even the KS matrix is diagonalized in the CAO basis with the extra information 1412 # removed by forward and backward transformation: CAO --> SAO followed by SAO --> CAO. 1413 # This also helps in reducing the number of transformations needed for calculation of various energy contributions. 1414 ######### 1415 # Convert the overlap matrix from CAO to SAO basis 1416 S = np.dot(c2sph_mat, np.dot(S, c2sph_mat.T)) # CAO --> SAO 1417 # Convert back to SAO so that now we lose the extra information that the CAO basis had 1418 S = np.dot(sph2c_mat_pseudo, np.dot(S, sph2c_mat_pseudo.T)) 1419 if orthogonalize: 1420 if not self.use_gpu: 1421 eig_val_s, eig_vec_s = scipy.linalg.eigh(S) 1422 # Removing the eigenvectors assoicated to the smallest eigenvalue. 1423 x = eig_vec_s[:,eig_val_s>1e-7] / np.sqrt(eig_val_s[eig_val_s>1e-7]) 1424 else: 1425 x = None 1426 durationKS = durationKS + timer() - startKS 1427 1428 1429 while not (scf_converged or itr==max_itr+1): 1430 startIter = timer() 1431 if itr==1: 1432 dmat_diff = dmat # This is in CAO basis 1433 # Coulomb (Hartree) matrix 1434 if not isDF: 1435 J = contract('ijkl,ij',ints4c2e,dmat) # This is in CAO basis 1436 else: 1437 J, durationDF, durationDF_coeff, durationDF_gamma, durationDF_Jtri, Ecoul_temp = Jmat_from_density_fitting(dmat, DF_algo, cholesky, cho_decomp_ints2c2e, df_coeff0, Qpq, ints3c2e, ints2c2e, indices_dmat_tri, indices_dmat_tri_2, indicesA, indicesB, indicesC, offsets_3c2e, sqrt_ints4c2e_diag, sqrt_diag_ints2c2e, threshold_schwarz, strict_schwarz, basis, auxbasis, self.use_gpu, self.keep_ints3c2e_in_gpu, durationDF_gamma, ncores, durationDF_coeff, durationDF_Jtri, durationDF) 1438 1439 1440 1441 1442 else: 1443 dmat_diff = dmat-dmat_old 1444 if not isDF: 1445 # J_diff = contract('ijkl,ij',ints4c2e,dmat_diff) 1446 # J += J_diff 1447 J = contract('ijkl,ij', ints4c2e, dmat) 1448 else: 1449 J, durationDF, durationDF_coeff, durationDF_gamma, durationDF_Jtri, Ecoul_temp = Jmat_from_density_fitting(dmat, DF_algo, cholesky, cho_decomp_ints2c2e, df_coeff0, Qpq, ints3c2e, ints2c2e, indices_dmat_tri, indices_dmat_tri_2, indicesA, indicesB, indicesC, offsets_3c2e, sqrt_ints4c2e_diag, sqrt_diag_ints2c2e, threshold_schwarz, strict_schwarz, basis, auxbasis, self.use_gpu, self.keep_ints3c2e_in_gpu, durationDF_gamma, ncores, durationDF_coeff, durationDF_Jtri, durationDF) 1450 # J += J_diff 1451 if self.use_gpu: 1452 J = cp.asarray(J, dtype=cp.float64) 1453 streams[0].synchronize() 1454 cp.cuda.Stream.null.synchronize() 1455 1456 1457 # XC energy and potential 1458 startxc = timer() 1459 1460 if not self.use_gpu: 1461 if XC_algo==1: 1462 # Much slower than JOBLIB version 1463 # Still keeping it because, it can be useful when using GPUs 1464 Exc, Vxc = Integrals.eval_xc_1(basis, dmat, grids.weights, grids.coords, funcid, blocksize=blocksize, debug=debug, \ 1465 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1466 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values) 1467 if XC_algo==2: 1468 # Much faster than above and stable too, therefore this should be default now. 1469 # Used to unstable and had memory leaks, 1470 # But now all that is fixed by using threadpoolctl, garbage collection or freeing up memory after XC evaluation at each iteration 1471 1472 Exc, Vxc = Integrals.eval_xc_2(basis, dmat, grids.weights, grids.coords, funcid, ncores=ncores, blocksize=blocksize, \ 1473 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1474 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values, debug=debug) 1475 1476 if XC_algo==3: 1477 with threadpool_limits(limits=1, user_api='blas'): 1478 Exc, Vxc = Integrals.eval_xc_3(basis, dmat, grids.weights, grids.coords, funcid, ncores=ncores, blocksize=blocksize, \ 1479 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1480 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values, debug=debug) 1481 else: # GPU 1482 if XC_algo==1: 1483 Exc, Vxc = Integrals.eval_xc_1_cupy(basis, dmat_cp, grids.weights, grids.coords, funcid, blocksize=blocksize, debug=debug, \ 1484 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1485 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values, use_libxc=self.use_libxc,\ 1486 nstreams=self.n_streams, ngpus=self.n_gpus, freemem=self.free_gpu_mem, threads_per_block=threads_per_block, 1487 type=precision_XC) 1488 if XC_algo==2: 1489 Exc, Vxc = Integrals.eval_xc_2_cupy(basis, dmat_cp, grids.weights, cp.asnumpy(grids.coords), funcid, ncores=ncores, blocksize=blocksize, \ 1490 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1491 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values, debug=debug) 1492 if XC_algo==3: 1493 # Default for GPUs 1494 Exc, Vxc = Integrals.eval_xc_3_cupy(basis, dmat_cp, grids.weights, grids.coords, funcid, blocksize=blocksize, debug=debug, \ 1495 list_nonzero_indices=list_nonzero_indices, count_nonzero_indices=count_nonzero_indices, \ 1496 list_ao_values=list_ao_values, list_ao_grad_values=list_ao_grad_values, use_libxc=self.use_libxc,\ 1497 nstreams=self.n_streams, ngpus=self.n_gpus, freemem=self.free_gpu_mem, threads_per_block=threads_per_block, 1498 type=precision_XC, streams=streams, nb_streams=nb_streams) 1499 1500 Vxc = cp.asarray(Vxc, dtype=cp.float64) 1501 1502 durationxc = durationxc + timer() - startxc 1503 1504 1505 dmat_old = dmat # This is in CAO basis 1506 1507 1508 if self.use_gpu: 1509 Enuc = contract('ij,ji->', dmat_cp, V) 1510 Ekin = contract('ij,ji->', dmat_cp, T) 1511 Ecoul = contract('ij,ji->', dmat_cp, J)*0.5 1512 else: 1513 with threadpool_limits(limits=ncores, user_api='blas'): 1514 # print('Energy contractions', controller.info()) 1515 Enuc = contract('ij,ji->', dmat, V) 1516 Ekin = contract('ij,ji->', dmat, T) 1517 Ecoul = contract('ij,ji->', dmat, J)*0.5 1518 if isDF and (DF_algo==6 or DF_algo==10): 1519 Ecoul = Ecoul*2 - 0.5*Ecoul_temp # This is the correct formula for Coulomb energy with DF 1520 1521 Etot_new = Exc + Enuc + Ekin + Enn + Ecoul 1522 self.Total_energy = Etot_new 1523 self.XC_energy = Exc 1524 self.Kinetic_energy = Ekin 1525 self.Nuclear_repulsion_energy = Enn 1526 self.J_energy = Ecoul 1527 self.Nuc_energy = Enuc 1528 1529 # Set label width and numeric format 1530 label_w = 30 1531 num_fmt = "{:>20.13f}" # 20-wide, 10 decimal places 1532 1533 print(f"\n\n\n------Iteration {itr}--------\n\n", flush=True) 1534 print("Energies\n") 1535 print(f"{'Electron-Nuclear Energy':<{label_w}}{num_fmt.format(Enuc)}") 1536 print(f"{'Nuclear repulsion Energy':<{label_w}}{num_fmt.format(Enn)}") 1537 print(f"{'Kinetic Energy':<{label_w}}{num_fmt.format(Ekin)}") 1538 print(f"{'Coulomb Energy':<{label_w}}{num_fmt.format(Ecoul)}") 1539 print(f"{'Exchange-Correlation Energy':<{label_w}}{num_fmt.format(Exc)}") 1540 print('-' * (label_w + 20)) 1541 print(f"{'Total Energy':<{label_w}}{num_fmt.format(Etot_new)}") 1542 print('-' * (label_w + 20) + "\n\n\n", flush=True) 1543 1544 1545 1546 print('Energy difference : ',abs(Etot_new-Etot), flush=True) 1547 1548 # Check convergence criteria 1549 if abs(Etot_new-Etot)<conv_crit: 1550 scf_converged = True 1551 print('\nSCF Converged after '+str(itr) +' iterations!', flush=True) 1552 Etot = Etot_new 1553 print('\n-------------------------------------', flush=True) 1554 print('Total Energy = ', Etot, flush=True) 1555 print('-------------------------------------\n\n', flush=True) 1556 break 1557 1558 Etot = Etot_new 1559 itr = itr + 1 1560 1561 1562 if itr>=max_itr and not scf_converged: 1563 print('\nSCF NOT Converged after '+str(itr-1) +' iterations!', flush=True) 1564 1565 1566 1567 if not scf_converged: 1568 KS = H + J + Vxc 1569 if self.sao: 1570 # The following gets rid of the extra information in the CAO basis KS matrix by going to SAO and then back to CAO. 1571 # This way even though the matrix dimensions would be that of CAO but the information would be the same as SAO 1572 # leading to the same energy as SAO basis PySCF or TURBOMOLE calculations 1573 KS = np.dot(c2sph_mat, np.dot(KS, c2sph_mat.T)) # CAO --> SAO 1574 KS = np.dot(sph2c_mat_pseudo, np.dot(KS, sph2c_mat_pseudo.T)) #SAO --> CAO 1575 1576 1577 #### DIIS 1578 startDIIS = timer() 1579 diis_start_itr = 1 1580 if itr >= diis_start_itr: 1581 if not self.use_gpu: 1582 with threadpool_limits(limits=ncores, user_api='blas'): 1583 # print('DIIS ', controller.info()) 1584 KS = self.DIIS(S, dmat, KS) 1585 else: 1586 KS = self.DIIS_cupy(S, dmat_cp, KS) 1587 streams[0].synchronize() 1588 cp.cuda.Stream.null.synchronize() 1589 durationDIIS = durationDIIS + timer() - startDIIS 1590 1591 #### Solve KS equation (Diagonalize KS matrix) 1592 startKS = timer() 1593 if self.use_gpu: 1594 eigvalues, eigvectors = self.solve_cupy(KS, S, orthogonalize=True) # Orthogonalization is necessary with CUDA 1595 mo_occ = self.getOcc_cupy(mol, eigvalues, eigvectors) 1596 dmat = self.gen_dm_cupy(eigvectors, mo_occ) 1597 dmat_cp = dmat 1598 dmat = cp.asnumpy(dmat) 1599 streams[0].synchronize() 1600 cp.cuda.Stream.null.synchronize() 1601 # HOMO-LUMO gap 1602 occupied = cp.where(mo_occ > 1e-8)[0] 1603 if len(occupied) < len(eigvalues): 1604 homo_idx = occupied[-1] 1605 lumo_idx = homo_idx + 1 1606 gap = (eigvalues[lumo_idx] - eigvalues[homo_idx]) 1607 print(f"\n\nHOMO-LUMO gap: {gap} au", flush=True) 1608 print(f"HOMO-LUMO gap: {gap*27.211324570273:.3f} eV", flush=True) 1609 else: 1610 with threadpool_limits(limits=ncores, user_api='blas'): 1611 # print('KS eigh', controller.info()) 1612 eigvalues, eigvectors = self.solve(KS, S, orthogonalize=orthogonalize, x=x) 1613 mo_occ = self.getOcc(mol, eigvalues, eigvectors) 1614 dmat = self.gen_dm(eigvectors, mo_occ) 1615 # HOMO-LUMO gap 1616 occupied = np.where(mo_occ > 1e-8)[0] 1617 if len(occupied) < len(eigvalues): 1618 homo_idx = occupied[-1] 1619 lumo_idx = homo_idx + 1 1620 gap = (eigvalues[lumo_idx] - eigvalues[homo_idx]) 1621 print(f"\n\nHOMO-LUMO gap: {gap} au", flush=True) 1622 print(f"HOMO-LUMO gap: {gap*27.211324570273:.3f} eV", flush=True) 1623 durationKS = durationKS + timer() - startKS 1624 1625 self.dmat = dmat 1626 self.mo_coefficients = eigvectors 1627 self.mo_energies = eigvalues 1628 self.mo_occupations = mo_occ 1629 1630 # Check when to switch to double precision for XC 1631 if self.use_gpu: 1632 if precision_XC is cp.float32: 1633 if abs(Etot_new-Etot)/abs(Etot_new)<5e-7: 1634 precision_XC = cp.float64 1635 print('\nSwitching to double precision for XC evaluation after '+str(itr) +' iterations!', flush=True) 1636 1637 1638 1639 durationItr = timer() - startIter 1640 print('\n\nTime taken for the previous iteration: '+str(durationItr)+'\n\n', flush=True) 1641 1642 # # Check convergence criteria 1643 # if abs(Etot_new-Etot)<conv_crit: 1644 # scf_converged = True 1645 # print('\nSCF Converged after '+str(itr) +' iterations!', flush=True) 1646 # Etot = Etot_new 1647 # print('\n-------------------------------------', flush=True) 1648 # print('Total Energy = ', Etot, flush=True) 1649 # print('-------------------------------------\n\n', flush=True) 1650 # break 1651 1652 # Etot = Etot_new 1653 # itr = itr + 1 1654 1655 1656 # if itr>=max_itr and not scf_converged: 1657 # print('\nSCF NOT Converged after '+str(itr-1) +' iterations!', flush=True) 1658 1659 1660 1661 durationSCF = timer() - startSCF 1662 # print(dmat) 1663 print('\nTime taken : '+str(durationSCF) +' seconds.', flush=True) 1664 print('\n\n', flush=True) 1665 print('-------------------------------------', flush=True) 1666 print('Profiling', flush=True) 1667 print('-------------------------------------', flush=True) 1668 print('Preprocessing ', durationXCpreprocessing + durationAO_values + durationgrids_prune_rho + durationSchwarz) 1669 if isDF: 1670 print('Density Fitting ', durationDF, flush=True) 1671 if DF_algo==6 or DF_algo==10: 1672 print(' DF (gamma) ', durationDF_gamma, flush=True) 1673 print(' DF (coeff) ', durationDF_coeff, flush=True) 1674 print(' DF (Jtri) ', durationDF_Jtri, flush=True) 1675 if cholesky: 1676 print(' DF (Cholesky) ', durationDF_cholesky, flush=True) 1677 print('DIIS ', durationDIIS, flush=True) 1678 print('KS matrix diagonalization ', durationKS, flush=True) 1679 print('One electron Integrals (S, T, Vnuc) ', duration1e, flush=True) 1680 if isDF: 1681 print('Coulomb Integrals (2c2e + 3c2e) ', durationCoulomb-durationSchwarz-durationDF_cholesky, flush=True) 1682 if not isDF: 1683 print('Coulomb Integrals (4c2e) ', durationCoulomb-durationSchwarz, flush=True) 1684 print('Grids construction ', durationgrids, flush=True) 1685 print('Exchange-Correlation Term ', durationxc, flush=True) 1686 totalTime = durationXCpreprocessing + durationAO_values + duration1e + durationCoulomb - durationDF_cholesky + \ 1687 durationgrids + durationxc + durationDF + durationKS + durationDIIS + durationgrids_prune_rho 1688 print('Misc. ', durationSCF - totalTime, flush=True) 1689 print('Complete SCF ', durationSCF, flush=True) 1690 1691 if self.use_gpu: 1692 # Free memory of all GPUs 1693 for igpu in range(self.n_gpus): 1694 cp.cuda.Device(igpu).use() 1695 cp._default_memory_pool.free_all_blocks() 1696 1697 # Switch back to main GPU 1698 cp.cuda.Device(0).use() 1699 1700 return Etot, dmat
Perform Self-Consistent Field (SCF) calculation for Density Functional Theory (DFT).
This method implements a complete DFT SCF procedure including:
- One-electron integral calculation (overlap, kinetic, nuclear attraction)
- Two-electron integral calculation (4-center ERIs or density fitting)
- Grid generation and pruning for exchange-correlation evaluation
- Iterative SCF cycles with DIIS convergence acceleration
- Exchange-correlation energy and potential evaluation using LibXC
- GPU acceleration support for performance-critical operations
The implementation supports multiple algorithmic variants:
- Density fitting (DF)
- Schwarz screening for integral sparsity
- Multiple XC evaluation algorithms (CPU/GPU optimized)
- Dynamic precision switching for GPU calculations
SCF Procedure:
- Initialize one-electron integrals (S, T, V_nuc)
- Calculate/prepare two-electron integrals with optional screening
- Generate and prune integration grids for XC evaluation
- Iterative SCF loop:
- Build Coulomb matrix J from density matrix
- Evaluate exchange-correlation energy/potential on grids
- Form Kohn-Sham matrix: H_KS = H_core + J + V_xc
- Apply DIIS convergence acceleration
- Diagonalize KS matrix to get new orbitals
- Generate new density matrix from occupied orbitals
- Check energy convergence
- Return converged total energy and density matrix
Computational Features:
- Multi-threading support via Numba and configurable core count
- GPU acceleration using CuPy for grid-based operations
- Memory-efficient batched evaluation of XC terms
- Multiple density fitting algorithms (DF_algo 1-10)
- Basis function screening for XC evaluation efficiency
- Optional Cholesky decomposition for 2-center integrals
Returns:
tuple[float, numpy.ndarray] Etot : float Converged total electronic energy in atomic units dmat : numpy.ndarray, shape (nbf, nbf) Converged density matrix in atomic orbital basis
Raises:
ConvergenceError If SCF fails to converge within max_itr iterations
Notes:
- Uses class attributes for all computational parameters (basis, xc, conv_crit, etc.)
- Extensive timing and profiling information printed during execution
- Memory usage information displayed for large arrays
- GPU memory automatically freed after completion
- Supports both Cartesian (CAO) and Spherical (SAO) atomic orbital bases
The function performs comprehensive error checking and provides detailed timing breakdowns for performance analysis. GPU acceleration is automatically enabled when CuPy is available and use_gpu=True.
Example Energy Components:
- Electron-nuclear attraction energy
- Nuclear repulsion energy
- Kinetic energy
- Coulomb (electron-electron repulsion) energy
- Exchange-correlation energy