Source code for reacnetgenerator.reacnetgen

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
# SPDX-License-Identifier: LGPL-3.0-or-later
# cython: language_level=3
# cython: linetrace=True
"""The main module of ReacNetGenerator."""
doc_run = """ReacNetGenerator: an automatic reaction network generator for reactive
molecular dynamics simulation.

Please cite: ReacNetGenerator: an automatic reaction network generator
for reactive molecular dynamic simulations, Phys. Chem. Chem. Phys.,
2020, 22 (2): 683-691, doi: 10.1039/C9CP05091D

Jinzhe Zeng (jinzhe.zeng@rutgers.edu), Tong Zhu (tzhu@lps.ecnu.edu.cn)

==================
Features
==================
* Processing of MD trajectory containing atomic coordinates or bond orders
* Hidden Markov Model (HMM) based noise filtering
* Isomers identifying accoarding to SMILES
* Generation of reaction network for visualization using force-directed
  algorithm
* Parallel computing

==================
Simple example
==================
ReacNetGenerator can process any kind of trajectory files containing
atomic coordinates, e.g. a LAMMPS dump file prepared by running “dump 1
all custom 100 dump.reaxc id type x y z” in LAMMPS:
$ reacnetgenerator --type dump -i dump.reaxc -a C H O --nohmm
where C, H, and O are atomic names in the input file. Analysis report
will be generated automatically.

Also, ReacNetGenerator can process files containing bond information,
e.g. LAMMPS bond file:
$ reacnetgenerator --type bond -i bonds.reaxc -a C H O --nohmm

You can running the following script for help:
$ reacnetgenerator -h
"""


import gc
import itertools
import os
import time
from enum import Enum
from typing import Any, List, Tuple, Union

import numpy as np

from . import __date__, __version__
from ._detect import _Detect
from ._download import DownloadData
from ._draw import _DrawNetwork
from ._hmmfilter import _HMMFilter
from ._logging import logger
from ._matrix import _GenerateMatrix
from ._mergeiso import _mergeISO
from ._path import _CollectPaths
from ._reachtml import _HTMLResult
from .utils import must_be_list



[docs]
class ReacNetGenerator:
    """Use ReacNetGenerator for trajectory analysis.

    Parameters
    ----------
    inputfiletype: str
        The type of the input file. The following type is allowed:
            - dump: LAMMPS dump file, which can be outputed by
              using `dump 1 all custom 100 dump.reaxc id type x y z`. See
              https://lammps.sandia.gov/doc/dump.html for details.
            - bond: LAMMPS ReaxFF bond file. See https://lammps.sandia.gov/doc/fix_reaxc_bonds.html
              for details.
            - xyz: XYZ file, which can also be outputed by LAMMPS using dump.
    inputfilename: str or list of strs
        The filename(s) of the input file, which can be either relative path or absolute path. If
        it is a list, the files will be read in order.
    atomname: tuple of strs
        The list of the atomic names in the input file, such as `('C', 'H', 'O')`. It should match
        the order of that in the input file.
    runHMM: bool, optional, default: True
        Process trajectory with Hidden Markov Model (HMM) or not. If the user find too many species
        are filtered, they can turn off this option.
    miso: int, optional, default: 0
        Merge the isomers and the highest frequency is used as the representative. 0, off
        two available levels:
        1, merge the isomers with same atoms and same bond-network but different bond levels;
        2, merge the isomers with same atoms with different bond-network.
    pbc: bool, optional, default: True
        Use periodic boundary conditions (PBC) or not.
    cell: (3,3) array_like or (3,) array_like or (9,) array_like, optional, default: None
        The cell (box size) of the system. If None (default), the cell will be read from the input
        file. If the input file doesn't have cell information, this parameter will be necessary.
    nproc: int, optional, default: None
        The number of processors used for analysis. If None (default), the program will try to use
        all processors.
    selectatoms: str, optional, default: None
        Select an element from the atomic names, such as `C`, and only show species with this
        element in the reaction network. If None (default), the network will show all elements.
    split: int, optional, default: None
        Split number for the time axis. For example, if set to 10, the whole trajectroy will
        be divided into 10 parts and reactions of each part will be shown.
    a: (2,2) array_like, optional, default: [[0.999, 0.001], [0.001, 0.009]]
        Transition matrix A of HMM parameters. It is recommended for users to choose their own
        parameters. See the paper for details.
    b: (2,2) array_like, optional, default: [[0.6, 0.4], [0.4, 0.6]]
        Emission matrix B of HMM parameters. It is recommended for users to choose their own
        parameters. See the paper for details.

    Examples
    --------
    >>> from reacnetgenerator import ReacNetGenerator
    >>> rng=ReacNetGenerator(inputfiletype="dump", inputfilename="dump.ch4", atomname=['C', 'H', 'O'])
    >>> rng.runanddraw()
    """

    urls: dict
    moleculetempfilename: str
    moleculetemp2filename: str
    originfilename: str
    hmmfilename: str
    resultfilename: str

    def __init__(self, **kwargs: Any) -> None:
        """Init ReacNetGenerator."""
        logger.info(doc_run)
        logger.info(f"Version: {__version__}  Creation date: {__date__}")

        try:
            nproc = len(os.sched_getaffinity(0))
        except AttributeError:
            # macos and windows
            nproc = os.cpu_count()

        # process kwargs
        necessary_key = ["inputfiletype", "inputfilename", "atomname"]
        default_value = {
            "node_color": [78 / 256, 196 / 256, 238 / 256],
            "p": [0.5, 0.5],
            "a": [[0.999, 0.001], [0.001, 0.999]],
            "b": [[0.6, 0.4], [0.4, 0.6]],
            "speciesfilter": [],
            "start_color": [0, 0, 1],
            "end_color": [1, 0, 0],
            "nproc": nproc,
            "pos": {},
            "pbc": True,
            "split": 1,
            "n_searchspecies": 2,
            "node_size": 200,
            "font_size": 6,
            "widthcoefficient": 1,
            "maxspecies": 20,
            "stepinterval": 1,
            "nolabel": False,
            "printfiltersignal": False,
            "showid": True,
            "runHMM": True,
            "SMILES": True,
            "miso": 0,
            "getoriginfile": False,
            "needprintspecies": True,
            "urls": [],
            "matrix_size": 100,
        }
        none_key = ["selectatoms", "species", "pos", "k", "speciescenter", "cell"]
        accept_keys = [
            "atomtype",
            "step",
            "hmmit",
            "timestep",
            "steplinenum",
            "N",
            "temp1it",
            "originfilename",
            "hmmfilename",
            "moleculetempfilename",
            "moleculetemp2filename",
            "allmoleculeroute",
            "splitmoleculeroute",
        ]
        nparray_key = [
            "atomname",
            "p",
            "a",
            "b",
            "node_color",
            "start_color",
            "end_color",
        ]
        file_key = {
            "moleculefilename": "moname",
            "atomroutefilename": "route",
            "reactionfilename": "reaction",
            "tablefilename": "table",
            "imagefilename": "svg",
            "speciesfilename": "species",
            "resultfilename": "html",
            "jsonfilename": "json",
            "reactionabcdfilename": "reactionabcd",
        }
        assert set(necessary_key).issubset(set(kwargs)), (
            "Must give neccessary key: %s" % ", ".join(necessary_key)
        )
        assert set(kwargs).issubset(
            set(necessary_key) | set(default_value) | set(none_key) | set(file_key)
        ), "Unsupported key"
        kwargs["inputfilename"] = must_be_list(kwargs["inputfilename"])
        for kk in default_value:
            kwargs.setdefault(kk, default_value[kk])
            if kwargs[kk] is None:
                kwargs[kk] = default_value[kk]
        for kk in itertools.chain(none_key, accept_keys):
            kwargs.setdefault(kk, None)
        for kk in file_key:
            kwargs.setdefault(kk, f"{kwargs['inputfilename'][0]}.{file_key[kk]}")
        for kk in nparray_key:
            kwargs[kk] = np.array(kwargs[kk])
        if not kwargs["runHMM"]:
            kwargs["getoriginfile"] = True
        if kwargs["selectatoms"] is None:
            kwargs["selectatoms"] = kwargs["atomname"]
        self.__dict__.update(kwargs)
        if self.cell is not None:
            if len(self.cell) == 9:
                self.cell = np.array(self.cell).reshape((3, 3))
            elif len(self.cell) == 3:
                self.cell = np.diag(self.cell)
            else:
                raise RuntimeError(
                    "cell must be (3,3) array_like or (3,) array_like or (9,) array_like"
                )


[docs]
    def runanddraw(
        self, run: bool = True, draw: bool = True, report: bool = True
    ) -> None:
        """Analyze the trajectory from MD simulation.

        Parameters
        ----------
        run : bool, optional, default: True
            Process the trajectory or not, including DOWNLOAD, DETECT, HMM, PATH, and MATRIX steps.
        draw : bool, optional, default: True
            Draw the reaction network or not, i.e. NETWORK step.
        report : bool, optional, default: True
            Generate the analysis report, i.e. NETWORK step.
        """
        processthing = []
        if run:
            if self.urls:
                processthing.append(self.Status.DOWNLOAD)
            processthing.extend(
                (
                    self.Status.DETECT,
                    self.Status.MISO,
                    self.Status.HMM,
                    self.Status.PATH,
                    self.Status.MATRIX,
                )
            )
        if draw:
            processthing.append(self.Status.NETWORK)
        if report:
            processthing.append(self.Status.REPORT)
        self._process(processthing)



[docs]
    def run(self) -> None:
        """Process MD trajectory, including DOWNLOAD, DETECT, HMM, PATH, and MATRIX steps."""
        processthing = []
        if self.urls:
            processthing.append(self.Status.DOWNLOAD)
        processthing.extend(
            (
                self.Status.DETECT,
                self.Status.MISO,
                self.Status.HMM,
                self.Status.PATH,
                self.Status.MATRIX,
            )
        )
        self._process(processthing)



[docs]
    def draw(self) -> None:
        """Draw the reaction network, i.e. NETWORK step."""
        self._process((self.Status.NETWORK,))



[docs]
    def report(self) -> None:
        """Generate the analysis report, i.e. REPORT step."""
        self._process((self.Status.REPORT,))



[docs]
    class Status(Enum):
        """ReacNetGenerator status.

        The ReacNetGenerator consists of several modules and algorithms to
        process the information from the given trajectory, including:

        - DOWNLOAD: Download trajectory from urls
        - DETECT: Read bond information and detect molecules
        - HMM: HMM filter
        - MISO: Merge isomers
        - PATH: Indentify isomers and collect reaction paths
        - MATRIX: Reaction matrix generation
        - NETWORK: Draw reaction network
        - REPORT: Generate analysis report
        """

        INIT = "Init"
        DETECT = "Read bond information and detect molecules"
        MISO = "Merge isomers"
        HMM = "HMM filter"
        PATH = "Indentify isomers and collect reaction paths"
        MATRIX = "Reaction matrix generation"
        NETWORK = "Draw reaction network"
        REPORT = "Generate analysis report"
        DOWNLOAD = "Download trajectory"

        def __str__(self):
            """Return describtion of the status."""
            return self.value


    def _process(self, steps: Union[List[Status], Tuple[Status, ...]]) -> None:
        """Process steps in order.

        Parameters
        ----------
        steps : tuple of ReacNetGenerator.Status
            The process that needs to be processed.
        """
        timearray = [time.perf_counter()]
        for i, runstep in enumerate(steps, 1):
            if runstep == self.Status.DETECT:
                _Detect.gettype(self).detect()
            elif runstep == self.Status.MISO:
                _mergeISO(self).merge()
            elif runstep == self.Status.HMM:
                _HMMFilter(self).filter()
            elif runstep == self.Status.PATH:
                _CollectPaths.getstype(self).collect()
            elif runstep == self.Status.MATRIX:
                _GenerateMatrix(self).generate()
            elif runstep == self.Status.NETWORK:
                _DrawNetwork(self).draw()
            elif runstep == self.Status.REPORT:
                _HTMLResult(self).report()
            elif runstep == self.Status.DOWNLOAD:
                DownloadData(self).download_files()
            # garbage collect
            gc.collect()
            timearray.append(time.perf_counter())
            logger.info(
                f"Step {i}: Done! Time consumed (s): {timearray[-1]-timearray[-2]:.3f} ({runstep})"
            )

        # delete tempfile
        for tempfilename in (
            self.moleculetempfilename,
            self.moleculetemp2filename,
            self.originfilename,
            self.hmmfilename,
        ):
            if tempfilename is not None:
                try:
                    os.remove(tempfilename)
                except OSError:
                    pass
        # Summary
        logger.info("====== Summary ======")
        logger.info(f"Total time(s): {timearray[-1]-timearray[0]:.3f} s")



# Please import ReacNetGenerator class from reacnetgenerator instead of reacnetgenerator.reacnetgen
__all__ = []