# Source code for dgamore.memory_estimator

# SPDX-FileCopyrightText: 2025-2026 Julian Peil <julian.peil@tuwien.ac.at>
# SPDX-License-Identifier: MIT
#
# DGAmore — Multi-Orbital Ladder Dynamical Vertex Approximation (LDGA) &
#           Eliashberg Equation Solver for Strongly Correlated Electron Systems
"""
Pure, side-effect-free estimator of the peak host-memory of the memory-sensitive DGAmore operations. Each
``save_memory_*`` switch in :class:`dgamore.config.MemoryConfig` selects between a fast (flag off) and a lean
(flag on) code path; this module estimates the peak bytes of the dominant arrays of both paths so the driver can set
the flags automatically. Apart from the global storage precision :data:`dgamore.n_point_base.DTYPE` (the single
source of truth for the per-element size), it pulls in no run-state from the package -- no MPI, no ``psutil``, no
config singleton: every input is passed as an argument, which keeps the formulas unit-testable in isolation.

All heavy quantities are backed by a single :data:`~dgamore.n_point_base.DTYPE` array, and q-points are distributed
across MPI ranks, so per-rank arrays scale with the per-rank q-count rather than the total. Only the dominant large
arrays of each branch are modeled; a single global ``OVERHEAD_FACTOR`` accounts for the un-modeled transients.
"""

from dataclasses import dataclass

import numpy as np

from dgamore.n_point_base import DTYPE

# Bytes per stored array element, taken from the global storage precision so this stays correct if DTYPE is switched
# (e.g. to complex128).
DTYPE_BYTES: int = np.dtype(DTYPE).itemsize
OVERHEAD_FACTOR: float = 1.1

# chiq_aux builds its two-fermion block via ``(gchi0_inv + gamma) - (v + u)``; each add/sub returns a new full block
# while its input block is still live, and the following invert_and_sum_over_last_vn keeps its input block resident
# while looping over q (the per-q inversion transient is single-q, hence negligible). The peak is therefore
# ~2x the rank-local block, not 1x.
CHIQ_AUX_INVERT_FACTOR: int = 2

# fq builds the block and combines it with whole-block compound-index matmuls (gchi0_q_inv @ f @ gchi0_q_inv) plus a
# second accumulated term, holding ~3 full blocks live at once.
FQ_MATMUL_FACTOR: int = 3


@dataclass(frozen=True)
class BranchPeak:
    """
    Per-rank transient peak bytes of one memory-sensitive branch, split by how the peak is distributed across the MPI
    ranks of a node (the persistent baseline is reported separately and is *not* included here).

    For the node-total budget the memory on a node with ``r`` ranks at this branch's peak is
    ``r * (baseline + distributed) + single``: a *distributed* transient is held by every rank simultaneously (so it
    scales with ``r``), while a *single-rank* transient is built on one rank while the others idle (so it is counted
    once). Both the fast (``off``) and lean (``on``) code paths are described.

    The instance is immutable (``frozen=True``), so it is hashable and safe to share between callers.

    :ivar off_distributed: Per-rank transient bytes held by every rank in the fast (flag-off) path.
    :ivar off_single: Transient bytes held by a single rank in the fast (flag-off) path.
    :ivar on_distributed: Per-rank transient bytes held by every rank in the lean (flag-on) path.
    :ivar on_single: Transient bytes held by a single rank in the lean (flag-on) path.
    """

    # Fast (flag-off) path.
    off_distributed: float
    off_single: float
    # Lean (flag-on) path.
    on_distributed: float
    on_single: float


def _ceil_div(a: int, b: int) -> int:
    """
    Returns the ceiling of ``a / b`` for non-negative integers (per-rank task counts).

    :param a: Numerator (e.g. the total number of q-points).
    :param b: Denominator (e.g. the number of MPI ranks); must be positive.
    :return: ``ceil(a / b)`` as an int (at least 1 if ``a > 0``).
    """
    # Floor division rounds towards -inf, so negating twice turns the floor into a ceiling without any float
    # round-trip.
    return -(-a // b)


def _two_fermion_block(q: int, nb: int, nw: int, nv: int) -> int:
    """
    Returns the element count of a full two-fermion four-point block ``[q, nb^4, nw, nv, nv]``.

    :param q: Number of (rank-local) momentum points.
    :param nb: Number of bands.
    :param nw: Number of bosonic frequencies.
    :param nv: Number of fermionic frequencies (single axis length).
    :return: The number of complex elements.
    """
    return q * nb**4 * nw * nv * nv


def _bubble_block(q: int, nb: int, nw: int, nv: int) -> int:
    """
    Returns the element count of a bubble / kernel block with a single fermionic axis ``[q, nb^4, nw, nv]``.

    :param q: Number of (rank-local) momentum points.
    :param nb: Number of bands.
    :param nw: Number of bosonic frequencies.
    :param nv: Number of fermionic frequencies (single axis length).
    :return: The number of complex elements.
    """
    return q * nb**4 * nw * nv


def _giwk_rspace(nk_tot: int, nb: int, nv: int) -> int:
    """
    Returns the element count of a momentum-space Green's function replicated over the full grid
    ``[nk_tot, nb^2, nv]`` (the FFT paths and the persistent baseline hold such replicated buffers).

    :param nk_tot: Total number of momentum points (full BZ).
    :param nb: Number of bands.
    :param nv: Number of fermionic frequencies (single axis length).
    :return: The number of complex elements.
    """
    return nk_tot * nb**2 * nv


def estimate_peaks(
    *,
    n_bands: int,
    nk_tot: int,
    nk_irr: int,
    niw_core: int,
    niv_core: int,
    niv_full: int,
    niv_cut: int,
    niv_pp: int,
    n_ranks: int,
    with_eliashberg: bool,
    save_fq: bool = False,
    construct_fq_cheap: bool = False,
    overhead: float = OVERHEAD_FACTOR,
) -> tuple[float, dict[str, BranchPeak]]:
    r"""
    Estimates the per-rank transient peak host-memory (in bytes) of the fast and lean code path of each
    memory-sensitive branch, split by whether each transient is distributed across the ranks of a node or built on a
    single rank, together with the per-rank persistent baseline.

    The returned dict maps a branch key to a :class:`BranchPeak`; the branch keys mirror the ``save_memory_for_*``
    switches: ``"chi0q"``, ``"chiq_aux"``, ``"sde"`` are always present; ``"fq"`` and ``"lanczos"`` are added only
    when ``with_eliashberg`` is True. The first tuple element is the per-rank persistent baseline (the replicated
    full-grid Green's function and self-energies that stay live throughout the non-local routine); the caller adds it
    to the node total (every rank holds it). For a node with ``r`` ranks the memory at a branch's peak is
    ``r * (baseline + distributed) + single``.

    :param n_bands: Number of bands :math:`B`.
    :param nk_tot: Total number of momentum points (full BZ).
    :param nk_irr: Number of momentum points in the irreducible BZ.
    :param niw_core: Number of positive bosonic core frequencies.
    :param niv_core: Number of positive fermionic core frequencies.
    :param niv_full: Number of positive fermionic full-region frequencies.
    :param niv_cut: Number of positive fermionic frequencies the full-grid ``giwk_full`` is kept at through the
        kernel/SDE section (``min(niw_core + niv_full + 10, niv_dmft)`` in
        :func:`dgamore.nonlocal_sde.calculate_self_energy_q`); the SDE self-energy contraction needs the shell
        window, so giwk is not shrunk to the core box here.
    :param niv_pp: Number of positive fermionic frequencies of the pp (Eliashberg) box.
    :param n_ranks: Number of MPI ranks the q-points are distributed over.
    :param with_eliashberg: Whether the Eliashberg step runs (adds the ``"fq"`` and ``"lanczos"`` branches).
    :param save_fq: Whether the full ladder vertex is kept in the full ph box (``config.eliashberg.save_fq``); when
        True the per-rank ``fq`` accumulator spans the full ``[wn, vc, vc]`` block instead of the small pp box.
    :param construct_fq_cheap: Whether the ``fq`` per-q blocks are built on the smaller pp frequency box
        (``config.eliashberg.construct_fq_cheap``), shrinking every per-q two-fermion block from ``vc`` to ``vpp``.
    :param overhead: Global multiplicative factor accounting for un-modeled transient arrays.
    :return: A tuple ``(baseline_bytes, peaks)`` of the per-rank baseline and a dict mapping each branch key to its
        :class:`BranchPeak`.
    """
    nb = n_bands
    wp = niw_core + 1  # half bosonic range, as the heavy objects are constructed
    vc = 2 * niv_core
    vf = 2 * niv_full
    vpp = 2 * niv_pp
    qi = _ceil_div(nk_irr, n_ranks)  # per-rank irreducible-BZ q-count
    qt = _ceil_div(nk_tot, n_ranks)  # per-rank full-BZ q-count
    scale = DTYPE_BYTES * overhead  # bytes per modeled element, including the un-modeled-transient margin

    # Element count of one replicated full-grid two-point object at niv_cut; reused by the baseline and the sde branch.
    giwk_at_cut = _giwk_rspace(nk_tot, nb, 2 * niv_cut)

    # Per-rank persistent baseline at the heavy-section (kernel/SDE) peak: the two full-grid two-point objects that
    # stay live on every rank -- giwk_full and sigma_old, both kept at niv_cut through the SDE (giwk's shell window is
    # needed by the self-energy contraction; sigma_old keeps its DMFT shell for the mixing/residual). See
    # nonlocal_sde.calculate_self_energy_q. The remaining self-energies (sigma_dmft, sigma_dmft_full, delta_sigma) are
    # local (a single k-point) and negligible.
    baseline = scale * 2 * giwk_at_cut

    peaks: dict[str, BranchPeak] = {}

    # chi0q: fast path (FFT, create_generalized_chi0_q_fft) builds the WHOLE irreducible-BZ bubble on rank 0
    # (nk_irr, not the per-rank q-count) plus TWO full-grid B^4 buffers, each with one fermionic axis over nk_tot --
    # the preallocated ``chi_r_v_buffer`` multiply target and the equally large array returned by ``xp.fft.ifftn``
    # each iw -- and ~2 replicated real-space Green's functions over the (niv_full + niw_core) window. All on rank 0,
    # so the whole fast path is a SINGLE-rank transient. The lean per-q einsum builds only this rank's q-slice of
    # the bubble plus its own ~2 Green's-function buffers, so it is DISTRIBUTED.
    gf_copies = 2 * _giwk_rspace(nk_tot, nb, 2 * (niv_full + niw_core))
    peaks["chi0q"] = BranchPeak(
        off_distributed=0.0,
        off_single=scale * (_bubble_block(nk_irr, nb, wp, vf) + 2 * _bubble_block(nk_tot, nb, 1, vf) + gf_copies),
        on_distributed=scale * (_bubble_block(qi, nb, wp, vf) + gf_copies),
        on_single=0.0,
    )

    # chiq_aux: fast path (v1) materializes the whole rank-local two-fermion block on every rank (DISTRIBUTED) and
    # inverts it one q at a time, plus the full-BZ kernel gathered on a SINGLE rank (rank 0). The lean path (v3)
    # builds one q at a time and accumulates the (1-fermion) summed result, all DISTRIBUTED.
    peaks["chiq_aux"] = BranchPeak(
        off_distributed=scale * CHIQ_AUX_INVERT_FACTOR * _two_fermion_block(qi, nb, wp, vc),
        off_single=scale * _bubble_block(nk_tot, nb, wp, vc),
        on_distributed=scale
        * (CHIQ_AUX_INVERT_FACTOR * _two_fermion_block(1, nb, wp, vc) + _bubble_block(qi, nb, wp, vc)),
        on_single=0.0,
    )

    # sde: both paths are DISTRIBUTED. The fast path (FFT) materializes the full-BZ kernel on every rank plus a
    # replicated R-space Green's function (giwk is kept at niv_cut by this point) and the kernel's own R-space buffer.
    # The lean q-loop (calculate_sigma_from_kernel_cpu) maps the kernel once but then makes a full
    # ``np.asfortranarray(giwk.mat)`` copy of the Green's function (and a Fortran copy of the kernel of comparable
    # size to the mapped one), so beyond the mapped kernel it holds a replicated full-grid Green's function at
    # niv_cut. Both paths carry a comparable kernel transient (the FFT path redistributes it, the q-loop copies it),
    # so only the giwk copy is counted here.
    sde_kernel_full = _bubble_block(qt, nb, wp, vc)
    peaks["sde"] = BranchPeak(
        off_distributed=scale * (sde_kernel_full + 2 * giwk_at_cut),
        off_single=0.0,
        on_distributed=scale * (sde_kernel_full + giwk_at_cut),
        on_single=0.0,
    )

    if with_eliashberg:
        # fq: both paths are DISTRIBUTED. The fast path builds the whole rank-local two-fermion ph block and combines
        # it with whole-block compound matmuls, holding ~FQ_MATMUL_FACTOR full blocks live; the lean path does the
        # same one q at a time but additionally writes into a rank-local accumulator (f_q_r_mat) spanning ALL
        # rank-local q-points. ``construct_fq_cheap`` shrinks every per-q construction block from vc to vpp; the
        # accumulator keeps the full ph box [wn, vc, vc] when ``save_fq`` is set, otherwise the small pp box
        # [vpp, vpp].
        vc_fq = vpp if construct_fq_cheap else vc
        fq_accumulator = _two_fermion_block(qi, nb, wp, vc) if save_fq else _two_fermion_block(qi, nb, 1, vpp)
        peaks["fq"] = BranchPeak(
            off_distributed=scale * FQ_MATMUL_FACTOR * _two_fermion_block(qi, nb, wp, vc_fq),
            off_single=0.0,
            on_distributed=scale * (FQ_MATMUL_FACTOR * _two_fermion_block(1, nb, wp, vc_fq) + fq_accumulator),
            on_single=0.0,
        )

        # lanczos: the fast (in-memory) path assembles the entire BZ pairing vertex on ONE rank AND a momentum-flipped
        # copy of it (flip_momentum_axis allocates a fresh full array), so two full-BZ vertices are live on a SINGLE
        # rank. The lean path (gather_full_ibz_for_vslice + map_to_full_bz) hands every rank the FULL BZ with only a
        # slice of the second fermionic frequency, so its per-rank share scales with the full-BZ per-rank q-count
        # (nk_tot/n_ranks), NOT the irreducible one -- also two copies (vertex + flipped), DISTRIBUTED.
        peaks["lanczos"] = BranchPeak(
            off_distributed=0.0,
            off_single=scale * 2 * _two_fermion_block(nk_tot, nb, 1, vpp),
            on_distributed=scale * 2 * _two_fermion_block(qt, nb, 1, vpp),
            on_single=0.0,
        )

    return baseline, peaks