Source code for torchani.neurochem

r"""
This module is part of the *Legacy API* of TorchANI 2 and should not be used in new
code. It should only be used if you need to interface with previously trained ANI models
in the NeuroChem format. It contains tools for loading files in the NeuroChem format,
the original file format used in the first `ANI`_ article.

.. _ANI:
    http://pubs.rsc.org/en/Content/ArticleLanding/2017/SC/C6SC05720A#!divAbstract
"""

import itertools
from pathlib import Path
from dataclasses import dataclass
import typing as tp
import struct
import bz2
import shutil
from collections import OrderedDict

import torch
import typing_extensions as tpx

from torchani.paths import neurochem_dir
from torchani.arch import ANI
from torchani.aev import AEVComputer
from torchani.nn import (
    ANINetworks,
    Ensemble,
    AtomicNetwork,
    AtomicContainer,
    TightCELU,
)
from torchani.cutoffs import CutoffArg
from torchani.neighbors import NeighborlistArg
from torchani.sae import SelfEnergy
from torchani.utils import download_and_extract
from torchani.annotations import StrPath


def model_dir_from_prefix(prefix: Path, idx: int) -> Path:
    network_path = (prefix.parent / f"{prefix.name}{idx}") / "networks"
    return network_path


class NeurochemParseError(RuntimeError):
    pass


@dataclass
class NeurochemLayerSpec:
    nodes: int
    activation: int
    kind: int
    blocksize: int
    dropout: int
    dropset: float
    maskupdate: float
    maxnorm: int
    norm: float
    normupdate: float
    l2norm: int
    l2value: float
    batchnorm: int
    weights: str
    weight_numel: int
    biases: str
    bias_numel: int


def load_aev_computer_and_symbols(
    consts_file: StrPath,
    strategy: str = "pyaev",
    neighborlist: NeighborlistArg = "all_pairs",
    cutoff_fn: CutoffArg = "cosine",
) -> tp.Tuple[AEVComputer, tp.Tuple[str, ...]]:
    consts, symbols = load_aev_constants_and_symbols(consts_file)
    aev_computer = AEVComputer.from_constants(
        radial_cutoff=consts.radial_cutoff,
        angular_cutoff=consts.angular_cutoff,
        radial_eta=consts.radial_eta,
        radial_shifts=consts.radial_shifts,
        angular_eta=consts.angular_eta,
        angular_zeta=consts.angular_zeta,
        angular_shifts=consts.angular_shifts,
        sections=consts.sections,
        num_species=len(symbols),
        strategy=strategy,
        cutoff_fn=cutoff_fn,
        neighborlist=neighborlist,
    )
    return aev_computer, symbols


@dataclass
class AEVConstants:
    radial_cutoff: float
    radial_eta: float
    radial_shifts: tp.Tuple[float, ...]
    angular_cutoff: float
    angular_eta: float
    angular_zeta: float
    angular_shifts: tp.Tuple[float, ...]
    sections: tp.Tuple[float, ...]


def load_aev_constants_and_symbols(
    consts_file: StrPath,
) -> tp.Tuple[AEVConstants, tp.Tuple[str, ...]]:
    aev_floats: tp.Dict[str, float] = {}
    aev_seqs: tp.Dict[str, tp.Tuple[float, ...]] = {}
    file_name_mapping = {
        "Rcr": "radial_cutoff",
        "Rca": "angular_cutoff",
        "EtaR": "radial_eta",
        "ShfR": "radial_shifts",
        "ShfA": "angular_shifts",
        "ShfZ": "sections",
        "EtaA": "angular_eta",
        "Zeta": "angular_zeta",
    }
    with open(consts_file) as f:
        for i in f:
            try:
                line = [x.strip() for x in i.split("=")]
                name = line[0]
                value = line[1]
                if name in ["Rcr", "Rca"]:
                    aev_floats[file_name_mapping[name]] = float(value)
                elif name in ["EtaR", "ShfR", "Zeta", "ShfZ", "EtaA", "ShfA"]:
                    float_values = tuple(
                        float(x.strip())
                        for x in value.replace("[", "").replace("]", "").split(",")
                    )
                    if name in ["EtaR", "Zeta", "EtaA"]:
                        assert len(float_values) == 1
                        aev_floats[file_name_mapping[name]] = float_values[0]
                    else:
                        aev_seqs[file_name_mapping[name]] = float_values
                elif name == "Atyp":
                    symbols = tuple(
                        x.strip()
                        for x in value.replace("[", "").replace("]", "").split(",")
                    )
            except Exception:
                raise NeurochemParseError(
                    f"Unable to parse const file {consts_file}"
                ) from None
    constants = AEVConstants(
        radial_cutoff=aev_floats["radial_cutoff"],
        angular_cutoff=aev_floats["angular_cutoff"],
        radial_eta=aev_floats["radial_eta"],
        angular_eta=aev_floats["angular_eta"],
        angular_zeta=aev_floats["angular_zeta"],
        radial_shifts=aev_seqs["radial_shifts"],
        angular_shifts=aev_seqs["angular_shifts"],
        sections=aev_seqs["sections"],
    )
    return constants, symbols


[docs] def load_sae(filename: StrPath) -> SelfEnergy: r"""Returns a self-energy calculator, with self energies from NeuroChem sae file The constructed object is of class `torchani.sae.SelfEnergy`, and can be used to calculate the self atomic energies of a group of molecules. """ _self_energies = [] _symbols = [] with open(Path(filename).resolve(), mode="rt", encoding="utf-8") as f: for i in f: line = [x.strip() for x in i.split("=")] symbol = line[0].split(",")[0].strip() idx = int(line[0].split(",")[1].strip()) energy = float(line[1]) _symbols.append((idx, symbol)) _self_energies.append((idx, energy)) self_energies = [e for _, e in sorted(_self_energies)] symbols = [s for _, s in sorted(_symbols)] return SelfEnergy(symbols, self_energies)
def _get_activation(activation_index: int) -> torch.nn.Module: # Activation defined in: # https://github.com/Jussmith01/NeuroChem/blob # /stable1/src-atomicnnplib/cunetwork/cuannlayer_t.cu#L920 if activation_index == 9: # CELU return TightCELU() elif activation_index == 5: # Gaussian raise NeurochemParseError( "Activation index 5 corresponds to a Gaussian which is not supported" ) raise NeurochemParseError(f"Unsupported activation index {activation_index}") def _decompress_nnf(buffer_: bytes) -> str: while buffer_[0] != ord("="): buffer_ = buffer_[1:] buffer_ = buffer_[2:] return bz2.decompress(buffer_)[:-1].decode("ascii").strip() def _parse_nnf(nnf_str: str) -> tp.List[NeurochemLayerSpec]: # Hack: replace tokens so the file can be evale'd as a list of python dicts # This is unsafe and hacky but since neurochem is legacy it is not a problem layers = [ layer.strip() for layer in nnf_str.replace("\n", "") .replace("$", "") .replace("FILE:", "'") .replace("];]", ")") .replace("]", "") .replace(";", ",") .replace("wparam[", "wparam',weight_numel=") .replace("bparam[", "bparam',bias_numel=") .replace("type", "kind") .replace("l2valu", "l2value") .replace("btchnorm", "batchnorm") .replace("[", "dict(") .replace("=-nan", "=float('nan')") .replace("=nan", "=float('nan')") .split("layer")[1:] ] return [NeurochemLayerSpec(**eval(layer)) for layer in layers] def load_atomic_network(filename: StrPath) -> AtomicNetwork: """Returns a `torchani.nn.AtomicNetwork` Hyperparams and params are loaded from NeuroChem's ``.nnf``, ``.wparam`` and ``.bparam`` files. """ filename = Path(filename).resolve() with open(filename, "rb") as f: nnf_compressed_buffer = f.read() try: nnf_str = _decompress_nnf(nnf_compressed_buffer) except Exception: raise NeurochemParseError(f"Could not decompress nnf file {filename}") from None try: layer_specs = _parse_nnf(nnf_str) except Exception: raise NeurochemParseError(f"Could not parse nnf file {filename}") from None activations: tp.List[int] = [] layer_dims: tp.List[int] = [] weight_files: tp.List[Path] = [] bias_files: tp.List[Path] = [] for j, spec in enumerate(layer_specs): # construct linear layer and load parameters _in = spec.blocksize _out = spec.nodes weight_filename = spec.weights weights_numel = spec.weight_numel bias_filename = spec.biases biases_numel = spec.bias_numel if _in * _out != weights_numel: raise NeurochemParseError( f"Bad parameter shape in {filename}:" f" blocksize * nodes=({_in} * {_out})" f" should be equal to weights_numel={weights_numel}" ) if _out != biases_numel: raise NeurochemParseError( f"Bad parameter shape in {filename}:" f" nodes={_out}" f" should be equal to biases_numel={biases_numel}" ) if j == 0: layer_dims.extend([_in, _out]) else: if layer_dims[-1] != _in: raise NeurochemParseError(f"Bad layer dimension in {filename}") layer_dims.append(_out) weight_files.append(filename.parent / weight_filename) bias_files.append(filename.parent / bias_filename) activations.append(spec.activation) assert activations[-1] == 6, "Last activation must have index 6" assert len(set(activations[:-1])) == 1, "All activations must be equal" network = AtomicNetwork( layer_dims, activation=_get_activation(activations[0]), bias=True, ) for linear, wfile, bfile in zip( itertools.chain(network.layers, [network.final_layer]), weight_files, bias_files ): _in = tp.cast(int, linear.in_features) _out = tp.cast(int, linear.out_features) with open(wfile, mode="rb") as wf: _w = struct.unpack("{}f".format(_in * _out), wf.read()) linear.weight.data = torch.tensor(_w).view(_out, _in) with open(bfile, mode="rb") as bf: _b = struct.unpack("{}f".format(_out), bf.read()) linear.bias.data = torch.tensor(_b).view(_out) return network
[docs] def load_member(symbols: tp.Sequence[str], model_dir: StrPath) -> ANINetworks: """Returns a `torchani.nn.ANINetworks` loaded from NeuroChem's network directory. Args: symbols: |symbols| model_dir: Dir storing network configurations. """ model_dir = Path(model_dir).resolve() return ANINetworks( OrderedDict( [(s, load_atomic_network(model_dir / f"ANN-{s}.nnf")) for s in symbols] ) )
[docs] def load_ensemble( symbols: tp.Sequence[str], prefix: StrPath, count: int ) -> Ensemble: r"""Loads `torchani.nn.Ensemble` from NeuroChem's dirs with a given prefix Args: symbols: |symbols| prefix: Prefix of paths of directory where networks configurations are stored. count: Number of models in the ensemble. """ prefix = Path(prefix) return Ensemble( [load_member(symbols, model_dir_from_prefix(prefix, i)) for i in range(count)] )
SUPPORTED_MODELS = {"ani1x", "ani2x", "ani1ccx"} @dataclass class NeurochemInfo: sae: Path const: Path ensemble_prefix: Path ensemble_size: int @classmethod def from_info_file(cls, info_file_path: Path) -> tpx.Self: with open(info_file_path, mode="rt", encoding="utf-8") as f: lines: tp.List[str] = [x.strip() for x in f.readlines()][:4] _const_file, _sae_file, _ensemble_prefix, _ensemble_size = lines ensemble_size: int = int(_ensemble_size) const_file_path: Path = Path(neurochem_dir(), *_const_file.split("/")) sae_file_path: Path = Path(neurochem_dir(), *_sae_file.split("/")) ensemble_prefix: Path = Path(neurochem_dir(), *_ensemble_prefix.split("/")) return cls(sae_file_path, const_file_path, ensemble_prefix, ensemble_size) @classmethod def from_model_name(cls, model_name: str) -> tpx.Self: if model_name not in SUPPORTED_MODELS: raise ValueError( f"Neurochem model {model_name} not supported," f" supported models are: {SUPPORTED_MODELS}", ) suffix = model_name.replace("ani", "") info_file_path = Path(neurochem_dir(), f"ani-{suffix}_8x.info") if not info_file_path.is_file(): download_model_parameters() info = cls.from_info_file(info_file_path) return info def download_model_parameters( root: tp.Optional[Path] = None, verbose: bool = True ) -> None: root = root or neurochem_dir() if any(root.iterdir()): if verbose: print("Found existing files in directory, assuming params already present") return repo = "ani-model-zoo" tag = "ani-2x" extracted_dirname = f"{repo}-{tag}" url = f"https://github.com/aiqm/{repo}/archive/{tag}.zip" download_and_extract(url, "neurochem-builtins.zip", root, verbose=verbose) extracted_dir = Path(root) / extracted_dirname for f in (extracted_dir / "resources").iterdir(): shutil.move(str(f), root / f.name) shutil.rmtree(extracted_dir) def modules_from_info( info: NeurochemInfo, model_index: tp.Optional[int] = None, strategy: str = "pyaev", ) -> tp.Tuple[AEVComputer, AtomicContainer, SelfEnergy, tp.Sequence[str]]: aev_computer, symbols = load_aev_computer_and_symbols( info.const, strategy=strategy, ) adder = load_sae(info.sae) neural_networks: AtomicContainer if model_index is None: neural_networks = load_ensemble( symbols, info.ensemble_prefix, info.ensemble_size ) else: if model_index >= info.ensemble_size: raise ValueError( f"Model index {model_index} should be <= {info.ensemble_size}" ) neural_networks = load_member( symbols, model_dir_from_prefix( info.ensemble_prefix, model_index, ), ) return aev_computer, neural_networks, adder, symbols
[docs] def modules_from_model_name( model_name: str, model_index: tp.Optional[int] = None, strategy: str = "pyaev", ) -> tp.Tuple[AEVComputer, AtomicContainer, SelfEnergy, tp.Sequence[str]]: r"""Creates the necessary modules to generate a pre-trained ANI model Parses data from legacy neurochem files, which are fetched according to the model name. """ return modules_from_info( NeurochemInfo.from_model_name(model_name), model_index, strategy=strategy, )
[docs] def modules_from_info_file( info_file: Path, model_index: tp.Optional[int] = None, strategy: str = "pyaev", ) -> tp.Tuple[AEVComputer, AtomicContainer, SelfEnergy, tp.Sequence[str]]: r""" Creates the necessary modules to generate a pre-trained ANI model, parsing the data from legacy neurochem files. """ return modules_from_info( NeurochemInfo.from_info_file(info_file), model_index, strategy=strategy, )
def load_model_from_info_file( info_file: StrPath, model_index: tp.Optional[int] = None, strategy: str = "pyaev", periodic_table_index: bool = True, ) -> ANI: info_file = Path(info_file).resolve() components = modules_from_info_file( info_file, model_index, strategy=strategy, ) aev_computer, neural_networks, self_energy_potential, symbols = components return ANI( symbols, aev_computer, neural_networks, self_energy_potential, periodic_table_index=periodic_table_index, ) def load_model_from_name( model_name: str, model_index: tp.Optional[int] = None, strategy: str = "pyaev", periodic_table_index: bool = True, ) -> ANI: components = modules_from_model_name( model_name, model_index, strategy=strategy, ) aev_computer, neural_networks, self_energy_potential, symbols = components return ANI( symbols, aev_computer, neural_networks, self_energy_potential, periodic_table_index=periodic_table_index, ) __all__ = [ "load_aev_constants_and_symbols", "load_aev_computer_and_symbols", "load_sae", "load_member", "load_ensemble", "load_model_from_name", "load_model_from_info_file", "modules_from_model_name", "modules_from_info_file", "download_model_parameters", ]