r"""
This module is part of the *Legacy API* of TorchANI 2 and should not be used in new
code. It should only be used if you need to interface with previously trained ANI models
in the NeuroChem format. It contains tools for loading files in the NeuroChem format,
the original file format used in the first `ANI`_ article.

.. _ANI:
    http://pubs.rsc.org/en/Content/ArticleLanding/2017/SC/C6SC05720A#!divAbstract
"""
import itertools
from pathlib import Path
from dataclasses import dataclass
import typing as tp
import struct
import bz2
import shutil
from collections import OrderedDict
import torch
import typing_extensions as tpx
from torchani.paths import neurochem_dir
from torchani.arch import ANI
from torchani.aev import AEVComputer
from torchani.nn import (
ANINetworks,
Ensemble,
AtomicNetwork,
AtomicContainer,
TightCELU,
)
from torchani.cutoffs import CutoffArg
from torchani.neighbors import NeighborlistArg
from torchani.sae import SelfEnergy
from torchani.utils import download_and_extract
from torchani.annotations import StrPath
def model_dir_from_prefix(prefix: Path, idx: int) -> Path:
    r"""Return the ``networks`` directory of the ensemble member number ``idx``

    The member directory lives next to ``prefix``: its name is the last
    component of ``prefix`` with ``idx`` appended to it.

    Args:
        prefix: Common path prefix shared by all the members of an ensemble.
        idx: Index of the ensemble member.

    Returns:
        Path of the form ``<prefix.parent>/<prefix.name><idx>/networks``.
    """
    member_dirname = f"{prefix.name}{idx}"
    return prefix.parent / member_dirname / "networks"
class NeurochemParseError(RuntimeError):
    r"""Error raised when a NeuroChem file can't be decompressed or parsed

    It is also raised when a file holds unsupported or inconsistent values.
    """
@dataclass
class NeurochemLayerSpec:
    r"""Hyperparameters of one layer, as listed in a NeuroChem ``.nnf`` file

    Instances are created by ``_parse_nnf``, one for each ``layer`` entry of
    the file. Field names follow the keys of the file, except for ``kind``,
    ``l2value`` and ``batchnorm``, which the parser renames from ``type``,
    ``l2valu`` and ``btchnorm``.
    """

    # Fields consumed by load_atomic_network
    nodes: int  # Used as the number of output features of the layer
    activation: int  # Index of the activation function (see _get_activation)
    kind: int  # Value of the "type" key of the file
    blocksize: int  # Used as the number of input features of the layer

    # Remaining hyperparameters are parsed, but the loaders in this module
    # don't read them
    dropout: int
    dropset: float
    maskupdate: float
    maxnorm: int
    norm: float
    normupdate: float
    l2norm: int
    l2value: float
    batchnorm: int

    # Parameter files (resolved relative to the dir of the .nnf file), together
    # with the number of elements that the .nnf file declares for each of them
    weights: str
    weight_numel: int
    biases: str
    bias_numel: int
def load_aev_computer_and_symbols(
    consts_file: StrPath,
    strategy: str = "pyaev",
    neighborlist: NeighborlistArg = "all_pairs",
    cutoff_fn: CutoffArg = "cosine",
) -> tp.Tuple[AEVComputer, tp.Tuple[str, ...]]:
    r"""Build an `AEVComputer` from the constants in a NeuroChem const file

    Args:
        consts_file: Path to the NeuroChem constants file.
        strategy: Forwarded to ``AEVComputer.from_constants``.
        neighborlist: Forwarded to ``AEVComputer.from_constants``.
        cutoff_fn: Forwarded to ``AEVComputer.from_constants``.

    Returns:
        Tuple with the AEV computer and the element symbols found in the file.
    """
    constants, symbols = load_aev_constants_and_symbols(consts_file)
    # The number of species is given by the number of symbols in the file
    computer = AEVComputer.from_constants(
        num_species=len(symbols),
        strategy=strategy,
        neighborlist=neighborlist,
        cutoff_fn=cutoff_fn,
        radial_cutoff=constants.radial_cutoff,
        radial_eta=constants.radial_eta,
        radial_shifts=constants.radial_shifts,
        angular_cutoff=constants.angular_cutoff,
        angular_eta=constants.angular_eta,
        angular_zeta=constants.angular_zeta,
        angular_shifts=constants.angular_shifts,
        sections=constants.sections,
    )
    return computer, symbols
@dataclass
class AEVConstants:
    r"""AEV hyperparameters read from a NeuroChem const file

    Each field holds the value of one entry of the file. The name of the
    corresponding entry is given next to each field.
    """

    # Radial terms
    radial_cutoff: float  # "Rcr"
    radial_eta: float  # "EtaR" (must hold a single value)
    radial_shifts: tp.Tuple[float, ...]  # "ShfR"

    # Angular terms
    angular_cutoff: float  # "Rca"
    angular_eta: float  # "EtaA" (must hold a single value)
    angular_zeta: float  # "Zeta" (must hold a single value)
    angular_shifts: tp.Tuple[float, ...]  # "ShfA"
    sections: tp.Tuple[float, ...]  # "ShfZ"
def load_aev_constants_and_symbols(
    consts_file: StrPath,
) -> tp.Tuple[AEVConstants, tp.Tuple[str, ...]]:
    r"""Parse the AEV constants and element symbols from a NeuroChem const file

    Every line of the file must have the form ``name = value``. Entries with a
    name that is not recognized are ignored.

    Args:
        consts_file: Path to the NeuroChem constants file.

    Returns:
        Tuple with the parsed constants, and the symbols listed under ``Atyp``.

    Raises:
        NeurochemParseError: If a line can't be parsed, if ``EtaR``, ``EtaA`` or
            ``Zeta`` don't hold exactly one value, or if a required entry is
            missing from the file.
    """
    # Maps the names used in the file to the names of the AEVConstants fields
    file_name_mapping = {
        "Rcr": "radial_cutoff",
        "Rca": "angular_cutoff",
        "EtaR": "radial_eta",
        "ShfR": "radial_shifts",
        "ShfA": "angular_shifts",
        "ShfZ": "sections",
        "EtaA": "angular_eta",
        "Zeta": "angular_zeta",
    }
    scalar_names = ("Rcr", "Rca")  # Written as bare floats
    single_value_names = ("EtaR", "Zeta", "EtaA")  # Bracketed, one element
    sequence_names = ("ShfR", "ShfZ", "ShfA")  # Bracketed, many elements

    aev_floats: tp.Dict[str, float] = {}
    aev_seqs: tp.Dict[str, tp.Tuple[float, ...]] = {}
    symbols: tp.Optional[tp.Tuple[str, ...]] = None
    with open(consts_file, mode="rt", encoding="utf-8") as f:
        for i in f:
            try:
                line = [x.strip() for x in i.split("=")]
                name = line[0]
                value = line[1]
                if name in scalar_names:
                    aev_floats[file_name_mapping[name]] = float(value)
                elif name in single_value_names or name in sequence_names:
                    float_values = tuple(
                        float(x.strip())
                        for x in value.replace("[", "").replace("]", "").split(",")
                    )
                    if name in single_value_names:
                        # Explicit check instead of an assert, so that the
                        # validation is not stripped when running with -O
                        if len(float_values) != 1:
                            raise ValueError(f"{name} must hold a single value")
                        aev_floats[file_name_mapping[name]] = float_values[0]
                    else:
                        aev_seqs[file_name_mapping[name]] = float_values
                elif name == "Atyp":
                    symbols = tuple(
                        x.strip()
                        for x in value.replace("[", "").replace("]", "").split(",")
                    )
            except Exception:
                raise NeurochemParseError(
                    f"Unable to parse const file {consts_file}"
                ) from None

    # Report absent entries as a parse error, instead of letting them surface
    # later as a KeyError or an UnboundLocalError
    missing = [
        name
        for name, field in file_name_mapping.items()
        if field not in aev_floats and field not in aev_seqs
    ]
    if symbols is None:
        missing.append("Atyp")
    if missing:
        raise NeurochemParseError(
            f"Unable to parse const file {consts_file}:"
            f" missing entries {missing}"
        )

    constants = AEVConstants(
        radial_cutoff=aev_floats["radial_cutoff"],
        angular_cutoff=aev_floats["angular_cutoff"],
        radial_eta=aev_floats["radial_eta"],
        angular_eta=aev_floats["angular_eta"],
        angular_zeta=aev_floats["angular_zeta"],
        radial_shifts=aev_seqs["radial_shifts"],
        angular_shifts=aev_seqs["angular_shifts"],
        sections=aev_seqs["sections"],
    )
    return constants, symbols
[docs]
def load_sae(filename: StrPath) -> SelfEnergy:
    r"""Returns a self-energy calculator, with self energies from NeuroChem sae file

    The constructed object is of class `torchani.sae.SelfEnergy`, and can
    be used to calculate the self atomic energies of a group of molecules.

    Each line of the file is read as ``symbol,index = energy``. Symbols and
    energies are passed to `torchani.sae.SelfEnergy` sorted by their index.

    Args:
        filename: Path to the NeuroChem sae file.
    """
    indexed_symbols = []
    indexed_energies = []
    with open(Path(filename).resolve(), mode="rt", encoding="utf-8") as f:
        for raw_line in f:
            parts = [part.strip() for part in raw_line.split("=")]
            label = parts[0].split(",")
            symbol = label[0].strip()
            idx = int(label[1].strip())
            energy = float(parts[1])
            indexed_symbols.append((idx, symbol))
            indexed_energies.append((idx, energy))
    # Both lists are sorted independently, with the index as the leading key
    indexed_symbols.sort()
    indexed_energies.sort()
    return SelfEnergy(
        [symbol for _, symbol in indexed_symbols],
        [energy for _, energy in indexed_energies],
    )
def _get_activation(activation_index: int) -> torch.nn.Module:
    r"""Return the activation module associated with a NeuroChem activation index

    Only index 9 (CELU) is supported, any other index raises
    `NeurochemParseError`.
    """
    # Activation defined in:
    # https://github.com/Jussmith01/NeuroChem/blob
    # /stable1/src-atomicnnplib/cunetwork/cuannlayer_t.cu#L920
    if activation_index == 9:  # CELU
        return TightCELU()
    if activation_index == 5:  # Gaussian
        message = "Activation index 5 corresponds to a Gaussian which is not supported"
    else:
        message = f"Unsupported activation index {activation_index}"
    raise NeurochemParseError(message)
def _decompress_nnf(buffer_: bytes) -> str:
while buffer_[0] != ord("="):
buffer_ = buffer_[1:]
buffer_ = buffer_[2:]
return bz2.decompress(buffer_)[:-1].decode("ascii").strip()
def _parse_nnf(nnf_str: str) -> tp.List[NeurochemLayerSpec]:
    r"""Convert the text of a decompressed ``.nnf`` file into layer specs

    The text is rewritten, token by token, into one ``dict(...)`` expression
    for each ``layer`` entry. Each expression is then evaluated, and used as
    the keyword arguments of a `NeurochemLayerSpec`.
    """
    # HACK: replace tokens so the file can be eval'ed as a list of python dicts
    # This is unsafe and hacky but since neurochem is legacy it is not a problem
    # NOTE(review): eval runs on the contents of the file, so this must only be
    # used with trusted files
    #
    # The rewrites are applied one after the other, and their order matters
    token_rewrites = (
        ("\n", ""),
        ("$", ""),
        ("FILE:", "'"),
        ("];]", ")"),
        ("]", ""),
        (";", ","),
        ("wparam[", "wparam',weight_numel="),
        ("bparam[", "bparam',bias_numel="),
        ("type", "kind"),
        ("l2valu", "l2value"),
        ("btchnorm", "batchnorm"),
        ("[", "dict("),
        ("=-nan", "=float('nan')"),
        ("=nan", "=float('nan')"),
    )
    rewritten = nnf_str
    for old, new in token_rewrites:
        rewritten = rewritten.replace(old, new)
    # Everything before the first "layer" token is discarded
    layers = [chunk.strip() for chunk in rewritten.split("layer")[1:]]
    return [NeurochemLayerSpec(**eval(layer)) for layer in layers]
def load_atomic_network(filename: StrPath) -> AtomicNetwork:
    r"""Returns a `torchani.nn.AtomicNetwork`

    Hyperparams and params are loaded from NeuroChem's ``.nnf``, ``.wparam`` and
    ``.bparam`` files.

    Args:
        filename: Path to the ``.nnf`` file. The ``.wparam`` and ``.bparam`` files
            it references are looked up in the same directory.

    Raises:
        NeurochemParseError: If the ``.nnf`` file can't be decompressed or parsed,
            if it has no layers, if the declared parameter sizes or layer
            dimensions are inconsistent, or if the activation indices are not
            supported.
    """
    filename = Path(filename).resolve()
    with open(filename, "rb") as f:
        nnf_compressed_buffer = f.read()
    try:
        nnf_str = _decompress_nnf(nnf_compressed_buffer)
    except Exception:
        raise NeurochemParseError(f"Could not decompress nnf file {filename}") from None
    try:
        layer_specs = _parse_nnf(nnf_str)
    except Exception:
        raise NeurochemParseError(f"Could not parse nnf file {filename}") from None
    if not layer_specs:
        # Without this check the activation validation fails with an IndexError
        raise NeurochemParseError(f"No layers found in nnf file {filename}")

    activations: tp.List[int] = []
    layer_dims: tp.List[int] = []
    weight_files: tp.List[Path] = []
    bias_files: tp.List[Path] = []
    for j, spec in enumerate(layer_specs):
        # Collect the dimensions and param files of each linear layer
        _in = spec.blocksize
        _out = spec.nodes
        weight_filename = spec.weights
        weights_numel = spec.weight_numel
        bias_filename = spec.biases
        biases_numel = spec.bias_numel
        if _in * _out != weights_numel:
            raise NeurochemParseError(
                f"Bad parameter shape in {filename}:"
                f" blocksize * nodes=({_in} * {_out})"
                f" should be equal to weights_numel={weights_numel}"
            )
        if _out != biases_numel:
            raise NeurochemParseError(
                f"Bad parameter shape in {filename}:"
                f" nodes={_out}"
                f" should be equal to biases_numel={biases_numel}"
            )
        if j == 0:
            layer_dims.extend([_in, _out])
        else:
            # The input of a layer must match the output of the previous one
            if layer_dims[-1] != _in:
                raise NeurochemParseError(f"Bad layer dimension in {filename}")
            layer_dims.append(_out)
        weight_files.append(filename.parent / weight_filename)
        bias_files.append(filename.parent / bias_filename)
        activations.append(spec.activation)

    # Validate with explicit raises (not asserts) so that the checks are kept
    # when running with -O, and the error type matches the rest of the parser
    if activations[-1] != 6:
        raise NeurochemParseError(
            f"Bad activation in {filename}: Last activation must have index 6"
        )
    if len(set(activations[:-1])) != 1:
        raise NeurochemParseError(
            f"Bad activation in {filename}: All activations must be equal"
        )

    network = AtomicNetwork(
        layer_dims,
        activation=_get_activation(activations[0]),
        bias=True,
    )
    for linear, wfile, bfile in zip(
        itertools.chain(network.layers, [network.final_layer]), weight_files, bias_files
    ):
        _in = tp.cast(int, linear.in_features)
        _out = tp.cast(int, linear.out_features)
        # Param files are read as flat arrays of floats ("f" struct format)
        with open(wfile, mode="rb") as wf:
            _w = struct.unpack(f"{_in * _out}f", wf.read())
            linear.weight.data = torch.tensor(_w).view(_out, _in)
        with open(bfile, mode="rb") as bf:
            _b = struct.unpack(f"{_out}f", bf.read())
            linear.bias.data = torch.tensor(_b).view(_out)
    return network
[docs]
def load_member(symbols: tp.Sequence[str], model_dir: StrPath) -> ANINetworks:
    """Returns a `torchani.nn.ANINetworks` loaded from NeuroChem's network directory.

    One atomic network is loaded for each symbol, from the file
    ``ANN-<symbol>.nnf`` inside ``model_dir``.

    Args:
        symbols: |symbols|
        model_dir: Dir storing network configurations.
    """
    model_dir = Path(model_dir).resolve()
    networks: tp.Dict[str, AtomicNetwork] = OrderedDict()
    for symbol in symbols:
        networks[symbol] = load_atomic_network(model_dir / f"ANN-{symbol}.nnf")
    return ANINetworks(networks)
[docs]
def load_ensemble(
    symbols: tp.Sequence[str], prefix: StrPath, count: int
) -> Ensemble:
    r"""Loads `torchani.nn.Ensemble` from NeuroChem's dirs with a given prefix

    Members are loaded from the dirs associated with indices ``0`` to
    ``count - 1`` (see ``model_dir_from_prefix``).

    Args:
        symbols: |symbols|
        prefix: Prefix of paths of directory where networks configurations are stored.
        count: Number of models in the ensemble.
    """
    prefix = Path(prefix)
    member_dirs = (model_dir_from_prefix(prefix, idx) for idx in range(count))
    return Ensemble([load_member(symbols, member_dir) for member_dir in member_dirs])
# Names of the models accepted by NeurochemInfo.from_model_name
SUPPORTED_MODELS = {"ani1x", "ani2x", "ani1ccx"}
@dataclass
class NeurochemInfo:
    r"""Locations of the files that define a NeuroChem model

    Instances can be built from a NeuroChem ``.info`` file, or from the name of
    one of the supported models.
    """

    sae: Path  # Self atomic energies file
    const: Path  # AEV constants file
    ensemble_prefix: Path  # Common prefix of the dirs of the ensemble members
    ensemble_size: int  # Number of members in the ensemble

    @classmethod
    def from_info_file(cls, info_file_path: Path) -> tpx.Self:
        r"""Build the info from the first four lines of an ``.info`` file

        The lines hold, in order: the const file, the sae file, the ensemble
        prefix and the ensemble size. Paths use ``/`` as separator, and are
        taken to be relative to ``neurochem_dir()``.
        """
        with open(info_file_path, mode="rt", encoding="utf-8") as f:
            header: tp.List[str] = [entry.strip() for entry in f.readlines()][:4]
        const_rel, sae_rel, prefix_rel, size_str = header
        size = int(size_str)
        const_path, sae_path, prefix_path = [
            Path(neurochem_dir(), *relative.split("/"))
            for relative in (const_rel, sae_rel, prefix_rel)
        ]
        return cls(
            sae=sae_path,
            const=const_path,
            ensemble_prefix=prefix_path,
            ensemble_size=size,
        )

    @classmethod
    def from_model_name(cls, model_name: str) -> tpx.Self:
        r"""Build the info of one of the models in ``SUPPORTED_MODELS``

        If the ``.info`` file of the model is not found in ``neurochem_dir()``,
        ``download_model_parameters`` is called before reading it.

        Raises:
            ValueError: If ``model_name`` is not a supported model.
        """
        if model_name not in SUPPORTED_MODELS:
            raise ValueError(
                f"Neurochem model {model_name} not supported,"
                f" supported models are: {SUPPORTED_MODELS}",
            )
        # For example "ani1x" is stored as "ani-1x_8x.info"
        suffix = model_name.replace("ani", "")
        info_file_path = Path(neurochem_dir()) / f"ani-{suffix}_8x.info"
        if not info_file_path.is_file():
            download_model_parameters()
        return cls.from_info_file(info_file_path)
def download_model_parameters(
    root: tp.Optional[Path] = None, verbose: bool = True
) -> None:
    r"""Fetch the built-in NeuroChem parameter files into ``root``

    Nothing is done if ``root`` already has files in it. Otherwise the
    ``ani-2x`` archive of the ``aiqm/ani-model-zoo`` repo is fetched with
    ``download_and_extract``, the contents of its ``resources`` dir are moved
    into ``root``, and the rest of the extracted dir is deleted.

    Args:
        root: Destination dir, ``neurochem_dir()`` is used if not specified.
        verbose: Whether to print a message if files are already present. It is
            also forwarded to ``download_and_extract``.
    """
    if not root:
        root = neurochem_dir()
    # Any pre-existing entry in the dir is taken to mean that params are there
    if next(root.iterdir(), None) is not None:
        if verbose:
            print("Found existing files in directory, assuming params already present")
        return

    repo = "ani-model-zoo"
    tag = "ani-2x"
    url = f"https://github.com/aiqm/{repo}/archive/{tag}.zip"
    download_and_extract(url, "neurochem-builtins.zip", root, verbose=verbose)

    extracted_dir = Path(root) / f"{repo}-{tag}"
    for resource in (extracted_dir / "resources").iterdir():
        shutil.move(str(resource), root / resource.name)
    shutil.rmtree(extracted_dir)
def modules_from_info(
    info: NeurochemInfo,
    model_index: tp.Optional[int] = None,
    strategy: str = "pyaev",
) -> tp.Tuple[AEVComputer, AtomicContainer, SelfEnergy, tp.Sequence[str]]:
    r"""Load the modules of a model from the files referenced by ``info``

    Args:
        info: Locations of the const file, sae file and network dirs.
        model_index: Index of the ensemble member to load. If not specified the
            whole ensemble is loaded.
        strategy: Forwarded to ``load_aev_computer_and_symbols``.

    Returns:
        Tuple with the AEV computer, the networks (an ensemble, or a single
        member), the self energy calculator, and the element symbols.

    Raises:
        ValueError: If ``model_index`` is negative, or not smaller than the size
            of the ensemble.
    """
    aev_computer, symbols = load_aev_computer_and_symbols(
        info.const,
        strategy=strategy,
    )
    adder = load_sae(info.sae)
    neural_networks: AtomicContainer
    if model_index is None:
        neural_networks = load_ensemble(
            symbols, info.ensemble_prefix, info.ensemble_size
        )
    else:
        # Valid indices are 0, ..., ensemble_size - 1. A negative index would
        # otherwise be silently turned into a non-existent directory name
        if not 0 <= model_index < info.ensemble_size:
            raise ValueError(
                f"Model index {model_index} should be >= 0"
                f" and < {info.ensemble_size}"
            )
        neural_networks = load_member(
            symbols,
            model_dir_from_prefix(
                info.ensemble_prefix,
                model_index,
            ),
        )
    return aev_computer, neural_networks, adder, symbols
[docs]
def modules_from_model_name(
    model_name: str,
    model_index: tp.Optional[int] = None,
    strategy: str = "pyaev",
) -> tp.Tuple[AEVComputer, AtomicContainer, SelfEnergy, tp.Sequence[str]]:
    r"""Creates the necessary modules to generate a pre-trained ANI model

    Parses data from legacy neurochem files, which are fetched according to the model
    name.

    Args:
        model_name: Name of the model, resolved by `NeurochemInfo.from_model_name`.
        model_index: Index of the ensemble member to load. If not specified the
            whole ensemble is loaded.
        strategy: Forwarded to ``modules_from_info``.
    """
    info = NeurochemInfo.from_model_name(model_name)
    return modules_from_info(info, model_index, strategy=strategy)
[docs]
def modules_from_info_file(
    info_file: Path,
    model_index: tp.Optional[int] = None,
    strategy: str = "pyaev",
) -> tp.Tuple[AEVComputer, AtomicContainer, SelfEnergy, tp.Sequence[str]]:
    r"""
    Creates the necessary modules to generate a pre-trained ANI model, parsing the data
    from legacy neurochem files.

    Args:
        info_file: Path to a NeuroChem ``.info`` file.
        model_index: Index of the ensemble member to load. If not specified the
            whole ensemble is loaded.
        strategy: Forwarded to ``modules_from_info``.
    """
    info = NeurochemInfo.from_info_file(info_file)
    return modules_from_info(info, model_index, strategy=strategy)
def load_model_from_info_file(
    info_file: StrPath,
    model_index: tp.Optional[int] = None,
    strategy: str = "pyaev",
    periodic_table_index: bool = True,
) -> ANI:
    r"""Build an `ANI` model from the files referenced by a NeuroChem info file

    Args:
        info_file: Path to a NeuroChem ``.info`` file.
        model_index: Index of the ensemble member to load. If not specified the
            whole ensemble is loaded.
        strategy: Forwarded to ``modules_from_info_file``.
        periodic_table_index: Forwarded to the `ANI` constructor.
    """
    aev_computer, neural_networks, self_energy_potential, symbols = (
        modules_from_info_file(
            Path(info_file).resolve(),
            model_index,
            strategy=strategy,
        )
    )
    return ANI(
        symbols,
        aev_computer,
        neural_networks,
        self_energy_potential,
        periodic_table_index=periodic_table_index,
    )
def load_model_from_name(
    model_name: str,
    model_index: tp.Optional[int] = None,
    strategy: str = "pyaev",
    periodic_table_index: bool = True,
) -> ANI:
    r"""Build an `ANI` model from the NeuroChem files of a named model

    Args:
        model_name: Name of the model, resolved by ``modules_from_model_name``.
        model_index: Index of the ensemble member to load. If not specified the
            whole ensemble is loaded.
        strategy: Forwarded to ``modules_from_model_name``.
        periodic_table_index: Forwarded to the `ANI` constructor.
    """
    aev_computer, neural_networks, self_energy_potential, symbols = (
        modules_from_model_name(
            model_name,
            model_index,
            strategy=strategy,
        )
    )
    return ANI(
        symbols,
        aev_computer,
        neural_networks,
        self_energy_potential,
        periodic_table_index=periodic_table_index,
    )
# Names exported by ``from <this module> import *``
__all__ = [
"load_aev_constants_and_symbols",
"load_aev_computer_and_symbols",
"load_sae",
"load_member",
"load_ensemble",
"load_model_from_name",
"load_model_from_info_file",
"modules_from_model_name",
"modules_from_info_file",
"download_model_parameters",
]