# Source code for dgamore.memory_estimator

# SPDX-FileCopyrightText: 2025-2026 Julian Peil <julian.peil@tuwien.ac.at>
# SPDX-License-Identifier: MIT
#
# DGAmore — Multi-Orbital Ladder Dynamical Vertex Approximation (LDGA) &
#           Eliashberg Equation Solver for Strongly Correlated Electron Systems
"""
Pure, side-effect-free estimator of the peak host-memory of the memory-sensitive DGAmore operations. Each
``save_memory_*`` switch in :class:`dgamore.config.MemoryConfig` selects between a fast (flag off) and a lean
(flag on) code path; this module estimates the peak bytes of the dominant arrays of both paths so the driver can set
the flags automatically. Apart from the global storage precision :data:`dgamore.n_point_base.DTYPE` (the single
source of truth for the per-element size), it pulls in no run-state from the package -- no MPI, no ``psutil``, no
config singleton: every input is passed as an argument, which keeps the formulas unit-testable in isolation.

All heavy quantities are backed by a single :data:`~dgamore.n_point_base.DTYPE` array, and q-points are distributed
across MPI ranks, so per-rank arrays scale with the per-rank q-count rather than the total. Only the dominant large
arrays of each branch are modeled; a single global ``OVERHEAD_FACTOR`` accounts for the un-modeled transients.
"""

from dataclasses import dataclass

import numpy as np

from dgamore.n_point_base import DTYPE

# Bytes per stored array element, taken from the global storage precision so this stays correct if DTYPE is switched
# (e.g. to complex128).
DTYPE_BYTES: int = np.dtype(DTYPE).itemsize
# Global multiplicative safety margin applied to every modeled byte count to account for the un-modeled transient
# arrays (only the dominant large arrays of each branch are modeled); used as the default ``overhead`` argument of
# ``estimate_peaks``.
OVERHEAD_FACTOR: float = 1.1

# chiq_aux builds its two-fermion block via ``(gchi0_inv + gamma) - (v + u)``; each add/sub returns a new full block
# while its input block is still live, and the following invert_and_sum_over_last_vn keeps its input block resident
# while looping over q (the per-q inversion transient is single-q, hence negligible). The peak is therefore
# ~2x the rank-local block, not 1x.
CHIQ_AUX_INVERT_FACTOR: int = 2

# fq builds the block and combines it with whole-block compound-index matmuls (gchi0_q_inv @ f @ gchi0_q_inv) plus a
# second accumulated term, holding ~3 full blocks live at once.
FQ_MATMUL_FACTOR: int = 3



[docs]
@dataclass(frozen=True)
class BranchPeak:
    """
    Immutable record of the per-rank transient peak bytes of one memory-sensitive branch, split by how the peak is
    distributed across the MPI ranks of a node (the persistent baseline is reported separately and is *not* included
    here).

    For the node-total budget the memory on a node with ``r`` ranks at this branch's peak is
    ``r * (baseline + distributed) + single``: a *distributed* transient is held by every rank simultaneously (so it
    scales with ``r``), while a *single-rank* transient is built on one rank while the others idle (so it is counted
    once). Both the fast (``off``) and lean (``on``) code paths are described.

    :ivar off_distributed: Per-rank transient bytes held by every rank in the fast (flag-off) path.
    :ivar off_single: Transient bytes held by a single rank in the fast (flag-off) path.
    :ivar on_distributed: Per-rank transient bytes held by every rank in the lean (flag-on) path.
    :ivar on_single: Transient bytes held by a single rank in the lean (flag-on) path.
    """

    # Fast (flag-off) path: bytes every rank holds at the same time (multiplied by the rank count in the node total).
    off_distributed: float
    # Fast (flag-off) path: bytes held by one rank only (counted once in the node total).
    off_single: float
    # Lean (flag-on) path: bytes every rank holds at the same time (multiplied by the rank count in the node total).
    on_distributed: float
    # Lean (flag-on) path: bytes held by one rank only (counted once in the node total).
    on_single: float



def _ceil_div(a: int, b: int) -> int:
    """
    Returns the ceiling of ``a / b`` for non-negative integers (per-rank task counts).

    :param a: Numerator (e.g. the total number of q-points).
    :param b: Denominator (e.g. the number of MPI ranks).
    :return: ``ceil(a / b)`` as an int (at least 1 if ``a > 0``).
    """
    return -(-a // b)


def _two_fermion_block(q: int, nb: int, nw: int, nv: int) -> int:
    """
    Returns the element count of a full two-fermion four-point block ``[q, nb^4, nw, nv, nv]``.

    :param q: Number of (rank-local) momentum points.
    :param nb: Number of bands.
    :param nw: Number of bosonic frequencies.
    :param nv: Number of fermionic frequencies (single axis length).
    :return: The number of complex elements.
    """
    return q * nb**4 * nw * nv * nv


def _bubble_block(q: int, nb: int, nw: int, nv: int) -> int:
    """
    Returns the element count of a bubble / kernel block with a single fermionic axis ``[q, nb^4, nw, nv]``.

    :param q: Number of (rank-local) momentum points.
    :param nb: Number of bands.
    :param nw: Number of bosonic frequencies.
    :param nv: Number of fermionic frequencies (single axis length).
    :return: The number of complex elements.
    """
    return q * nb**4 * nw * nv


def _giwk_rspace(nk_tot: int, nb: int, nv: int) -> int:
    """
    Returns the element count of a momentum-space Green's function replicated over the full grid ``[nk_tot, nb^2, nv]``
    (the FFT paths and the persistent baseline hold such replicated buffers).

    :param nk_tot: Total number of momentum points (full BZ).
    :param nb: Number of bands.
    :param nv: Number of fermionic frequencies (single axis length).
    :return: The number of complex elements.
    """
    return nk_tot * nb**2 * nv



[docs]
def estimate_peaks(
    *,
    n_bands: int,
    nk_tot: int,
    nk_irr: int,
    niw_core: int,
    niv_core: int,
    niv_full: int,
    niv_cut: int,
    niv_pp: int,
    n_ranks: int,
    with_eliashberg: bool,
    save_fq: bool = False,
    construct_fq_cheap: bool = False,
    overhead: float = OVERHEAD_FACTOR,
) -> tuple[float, dict[str, BranchPeak]]:
    r"""
    Computes, for every memory-sensitive branch, the per-rank transient peak host-memory (in bytes) of both its fast
    and its lean code path, split into the part every rank of a node holds simultaneously and the part built on a
    single rank, and additionally returns the per-rank persistent baseline.

    The branch keys of the returned dict mirror the ``save_memory_for_*`` switches: ``"chi0q"``, ``"chiq_aux"`` and
    ``"sde"`` are always present, while ``"fq"`` and ``"lanczos"`` are only added when ``with_eliashberg`` is True.
    The baseline (first tuple element) covers the replicated full-grid Green's function and self-energy that stay
    live throughout the non-local routine; since every rank holds it, the caller adds it to the node total. For a
    node with ``r`` ranks the memory at a branch's peak is ``r * (baseline + distributed) + single``.

    :param n_bands: Number of bands :math:`B`.
    :param nk_tot: Total number of momentum points (full BZ).
    :param nk_irr: Number of momentum points in the irreducible BZ.
    :param niw_core: Number of positive bosonic core frequencies.
    :param niv_core: Number of positive fermionic core frequencies.
    :param niv_full: Number of positive fermionic full-region frequencies.
    :param niv_cut: Number of positive fermionic frequencies the full-grid ``giwk_full`` is kept at through the
        kernel/SDE section (``min(niw_core + niv_full + 10, niv_dmft)`` in
        :func:`dgamore.nonlocal_sde.calculate_self_energy_q`); the SDE self-energy contraction needs the shell window,
        so giwk is not shrunk to the core box here.
    :param niv_pp: Number of positive fermionic frequencies of the pp (Eliashberg) box.
    :param n_ranks: Number of MPI ranks the q-points are distributed over.
    :param with_eliashberg: Whether the Eliashberg step runs (adds the ``"fq"`` and ``"lanczos"`` branches).
    :param save_fq: Whether the full ladder vertex is kept in the full ph box (``config.eliashberg.save_fq``); when
        True the per-rank ``fq`` accumulator spans the full ``[wn, vc, vc]`` block instead of the small pp box.
    :param construct_fq_cheap: Whether the ``fq`` per-q blocks are built on the smaller pp frequency box
        (``config.eliashberg.construct_fq_cheap``), shrinking every per-q two-fermion block from ``vc`` to ``vpp``.
    :param overhead: Global multiplicative factor accounting for un-modeled transient arrays.
    :return: A tuple ``(baseline_bytes, peaks)`` of the per-rank baseline and a dict mapping each branch key to its
        :class:`BranchPeak`.
    """
    # Axis lengths as the heavy objects are constructed: only half of the bosonic range, but both signs of every
    # fermionic frequency.
    n_bosonic = niw_core + 1
    n_core = 2 * niv_core
    n_full = 2 * niv_full
    n_pp = 2 * niv_pp
    n_cut = 2 * niv_cut

    # Largest q-share a single rank can receive, for the irreducible and for the full BZ.
    q_irr_local = _ceil_div(nk_irr, n_ranks)
    q_tot_local = _ceil_div(nk_tot, n_ranks)

    # Bytes charged per modeled element (storage size inflated by the overhead for un-modeled transients).
    bytes_per_element = DTYPE_BYTES * overhead
    two_copies = bytes_per_element * 2

    # Persistent per-rank baseline at the heavy-section (kernel/SDE) peak: giwk_full and sigma_old are the two
    # full-grid two-point objects alive on every rank, both kept at niv_cut through the SDE (the self-energy
    # contraction needs giwk's shell window; sigma_old keeps its DMFT shell for the mixing/residual). See
    # nonlocal_sde.calculate_self_energy_q. The other self-energies (sigma_dmft, sigma_dmft_full, delta_sigma) are
    # local (a single k-point) and therefore neglected.
    giwk_at_cut = _giwk_rspace(nk_tot, n_bands, n_cut)
    baseline = two_copies * giwk_at_cut

    # chi0q -- the fast FFT path (create_generalized_chi0_q_fft) assembles the WHOLE irreducible-BZ bubble on rank 0
    # (nk_irr rather than the per-rank q-count), TWO full-grid B^4 buffers with one fermionic axis over nk_tot (the
    # preallocated ``chi_r_v_buffer`` multiply target and the equally large array ``xp.fft.ifftn`` returns each iw),
    # and ~2 replicated real-space Green's functions over the (niv_full + niw_core) window. Everything lives on
    # rank 0, i.e. a SINGLE-rank transient. The lean per-q einsum only needs this rank's q-slice of the bubble plus
    # its own ~2 Green's-function buffers and is hence DISTRIBUTED.
    greens_buffers = 2 * _giwk_rspace(nk_tot, n_bands, 2 * (niv_full + niw_core))
    chi0q_fast_elements = (
        _bubble_block(nk_irr, n_bands, n_bosonic, n_full)
        + 2 * _bubble_block(nk_tot, n_bands, 1, n_full)
        + greens_buffers
    )
    chi0q_lean_elements = _bubble_block(q_irr_local, n_bands, n_bosonic, n_full) + greens_buffers
    chi0q_peak = BranchPeak(
        off_distributed=0.0,
        off_single=bytes_per_element * chi0q_fast_elements,
        on_distributed=bytes_per_element * chi0q_lean_elements,
        on_single=0.0,
    )

    # chiq_aux -- the fast path (v1) holds the whole rank-local two-fermion block on every rank (DISTRIBUTED) and
    # inverts it one q at a time; the full-BZ kernel is additionally gathered on a SINGLE rank (rank 0). The lean
    # path (v3) processes one q at a time and accumulates the summed (1-fermion) result, all DISTRIBUTED.
    invert_weight = bytes_per_element * CHIQ_AUX_INVERT_FACTOR
    chiq_aux_lean_elements = CHIQ_AUX_INVERT_FACTOR * _two_fermion_block(
        1, n_bands, n_bosonic, n_core
    ) + _bubble_block(q_irr_local, n_bands, n_bosonic, n_core)
    chiq_aux_peak = BranchPeak(
        off_distributed=invert_weight * _two_fermion_block(q_irr_local, n_bands, n_bosonic, n_core),
        off_single=bytes_per_element * _bubble_block(nk_tot, n_bands, n_bosonic, n_core),
        on_distributed=bytes_per_element * chiq_aux_lean_elements,
        on_single=0.0,
    )

    # sde -- DISTRIBUTED in both paths. The fast FFT path materializes the full-BZ kernel on every rank together
    # with a replicated R-space Green's function (giwk is kept at niv_cut by this point) and the kernel's own
    # R-space buffer. The lean q-loop (calculate_sigma_from_kernel_cpu) maps the kernel once but then takes a full
    # ``np.asfortranarray(giwk.mat)`` copy of the Green's function (and a Fortran copy of the kernel of comparable
    # size to the mapped one), so beyond the mapped kernel it holds a replicated full-grid Green's function at
    # niv_cut. The kernel transient is comparable in both paths (the FFT path redistributes it, the q-loop copies
    # it), so the paths differ only in the counted giwk copies.
    sde_kernel_elements = _bubble_block(q_tot_local, n_bands, n_bosonic, n_core)
    sde_peak = BranchPeak(
        off_distributed=bytes_per_element * (sde_kernel_elements + 2 * giwk_at_cut),
        off_single=0.0,
        on_distributed=bytes_per_element * (sde_kernel_elements + giwk_at_cut),
        on_single=0.0,
    )

    peaks: dict[str, BranchPeak] = {
        "chi0q": chi0q_peak,
        "chiq_aux": chiq_aux_peak,
        "sde": sde_peak,
    }

    # Without the Eliashberg step there are no further branches to report.
    if not with_eliashberg:
        return baseline, peaks

    # fq -- DISTRIBUTED in both paths. The fast path builds the whole rank-local two-fermion ph block and combines it
    # with whole-block compound matmuls, keeping ~FQ_MATMUL_FACTOR full blocks alive; the lean path does the same for
    # one q at a time but also writes into a rank-local accumulator (f_q_r_mat) covering ALL rank-local q-points.
    # ``construct_fq_cheap`` shrinks every per-q construction block from vc to vpp; the accumulator keeps the full ph
    # box [wn, vc, vc] when ``save_fq`` is set and the small pp box [vpp, vpp] otherwise.
    if construct_fq_cheap:
        n_construct = n_pp
    else:
        n_construct = n_core
    if save_fq:
        accumulator_elements = _two_fermion_block(q_irr_local, n_bands, n_bosonic, n_core)
    else:
        accumulator_elements = _two_fermion_block(q_irr_local, n_bands, 1, n_pp)
    matmul_weight = bytes_per_element * FQ_MATMUL_FACTOR
    fq_lean_elements = (
        FQ_MATMUL_FACTOR * _two_fermion_block(1, n_bands, n_bosonic, n_construct) + accumulator_elements
    )
    peaks["fq"] = BranchPeak(
        off_distributed=matmul_weight * _two_fermion_block(q_irr_local, n_bands, n_bosonic, n_construct),
        off_single=0.0,
        on_distributed=bytes_per_element * fq_lean_elements,
        on_single=0.0,
    )

    # lanczos -- the fast (in-memory) path assembles the entire BZ pairing vertex on ONE rank together with a
    # momentum-flipped copy (flip_momentum_axis allocates a fresh full array), so two full-BZ vertices are alive on a
    # SINGLE rank. The lean path (gather_full_ibz_for_vslice + map_to_full_bz) gives every rank the FULL BZ with
    # only a slice of the second fermionic frequency, so its per-rank share follows the full-BZ per-rank q-count
    # (nk_tot/n_ranks), NOT the irreducible one -- again two copies (vertex + flipped), DISTRIBUTED.
    peaks["lanczos"] = BranchPeak(
        off_distributed=0.0,
        off_single=two_copies * _two_fermion_block(nk_tot, n_bands, 1, n_pp),
        on_distributed=two_copies * _two_fermion_block(q_tot_local, n_bands, 1, n_pp),
        on_single=0.0,
    )

    return baseline, peaks