Upload 8 files

Browse files

Files changed (8) hide show

__init__.py +8 -13
cp2k.py +65 -0
i_pi.py +47 -0
mdtraj.py +77 -0
n2p2.py +65 -0
runner.py +134 -0
utilities.py +336 -0
xyz.py +94 -0

__init__.py CHANGED Viewed

@@ -1,14 +1,9 @@
-# __init__.py
-from importlib import resources
-try:
-  import tomllib
-except ModuleNotFoundError:
-  import tomli as tomllib
-# Version of the package
-__version__ = "1.0.0"
-# Read URL of the feed from config file
-_cfg = tomllib.loads(resources.read_text("reader", "config.toml"))
-URL = _cfg["feed"]["url"]

+"""Input and output functionality to serve the rest of the package."""
+from .utilities import *
+from .cp2k import *
+from .mdtraj import *
+from .i_pi import *
+from . import xyz
+from . import runner
+from . import n2p2

cp2k.py ADDED Viewed

	@@ -0,0 +1,65 @@

+"""Functions for CP2K-specific input/output."""
+__all__ = ['add_energy_cp2k_comment', 'read_frames_cp2k']
+from itertools import repeat
+from .utilities import Frame, merge_frames, read_frames
+def add_energy_cp2k_comment(frames):
+    """Parse CP2K energy and inject it into frames.
+    For each frame in `frames`, extract a CP2K-formatted potential energy from the
+    comment string (comma-separated `key = value` pairs, energy under the key `E`)
+    and store it in the frame. Energy from CP2K is in Hartree, so no conversion is needed.
+    Arguments:
+        frames: iterable of frame objects with `comment` and `energy` attributes
+    Yields:
+        the same frame objects, with `energy` set
+    Raises:
+        ValueError: if a frame already has an energy, or if its comment holds no
+            valid `E = <number>` entry
+    """
+    for frame in frames:
+        if frame.energy is not None:
+            raise ValueError('Energy already present.')
+        # a missing comment is handled like a comment without an energy entry
+        comment = frame.comment if frame.comment is not None else ''
+        for pair in comment.split(','):
+            items = pair.split('=')
+            if items[0].strip() != 'E':
+                continue
+            try:
+                frame.energy = float(items[1])
+            except (IndexError, ValueError) as err:
+                raise ValueError('No CP2K energy found in comment line.') from err
+            break
+        else:
+            # no `E` entry at all, do not pass on a frame with the energy silently missing
+            raise ValueError('No CP2K energy found in comment line.')
+        yield frame
+def read_frames_cp2k(fn_positions, cell=None, fn_forces=None, read_energy: bool = True, force_unit=1.0):
+    """Iterate over frames assembled from data produced by CP2K.
+    Positions are read from an XYZ trajectory with the default unit of the XYZ reader
+    (we expect angstrom from CP2K), energies optionally from its comment lines and
+    forces optionally from a second XYZ file.
+    Arguments:
+        fn_positions: position trajectory file name, XYZ format
+        cell: a constant cell to use in all frames, optional
+        fn_forces: forces file name, XYZ format, optional
+        read_energy: whether to read energies from comments in `fn_positions`
+        force_unit: unit to scale forces by, passed to the XYZ reader as `unit`;
+            the default of 1.0 takes the values as they are
+    Yields:
+        merged `Frame` objects
+    """
+    # the position trajectory comes first, energies are parsed from its comments on the fly
+    positions = read_frames(fn_positions, fformat='xyz')
+    sources = [add_energy_cp2k_comment(positions) if read_energy else positions]
+    # an endless supply of the same constant cell
+    if cell is not None:
+        sources.append(repeat(Frame(cell=cell)))
+    # forces from a separate XYZ file
+    if fn_forces is not None:
+        sources.append(read_frames(fn_forces, fformat='xyz', name_data='forces', unit=force_unit))
+    yield from merge_frames(*sources)

i_pi.py ADDED Viewed

	@@ -0,0 +1,47 @@

+"""Read data specifically produced by i-PI"""
+__all__ = ['read_frames_i_pi']
+import itertools
+from .utilities import Frame, merge_frames, read_frames
+def read_frames_i_pi(fn_positions, cell=None, fn_forces=None, fn_energies=None, column_energy=4):
+    """Iterate over frames assembled from data produced by i-PI.
+    We assume typically used units - angstrom for positions, atomic units for forces and energies.
+    i-PI can save data in any units, but we do not attempt to be fully general here. The strides
+    of all files are assumed to be the same. For other units or strides, compose the frames by hand
+    or write a custom reader function.
+    Arguments:
+        fn_positions: position trajectory file name, XYZ format
+        cell: a constant cell to use in all frames, optional
+        fn_forces: forces file name, XYZ format, optional
+        fn_energies: energies file name, n2p2 energy format, optional
+        column_energy: index of the column holding the energy, passed to the
+            energy reader as `column`
+    Yields:
+        merged `Frame` objects
+    """
+    # the position trajectory comes first, read with the default unit of the XYZ reader
+    sources = [read_frames(fn_positions, fformat='xyz')]
+    # an endless supply of the same constant cell
+    if cell is not None:
+        sources.append(itertools.repeat(Frame(cell=cell)))
+    # forces from a separate XYZ file, a unit of 1.0 takes the values as they are
+    if fn_forces is not None:
+        sources.append(read_frames(fn_forces, fformat='xyz', name_data='forces', unit=1.0))
+    # energies from a separate file, no unit conversion is applied
+    if fn_energies is not None:
+        sources.append(read_frames(fn_energies, fformat='N2P2_E', column=column_energy))
+    yield from merge_frames(*sources)

mdtraj.py ADDED Viewed

	@@ -0,0 +1,77 @@

+"""Read data using MDTraj."""
+__all__ = [
+    'read_frames_mdtraj',
+]
+try:
+    import mdtraj
+except ImportError:
+    mdtraj = None
+from .utilities import Frame
+from ..constants import nm
+def read_frames_mdtraj(fn_in, top=None, names_atoms='type', name_data='positions', unit=nm, unit_cell=nm, chunk=100):
+    """Read data from a file using the MDTraj package.
+    Arguments:
+        fn_in: name of trajectory file to read, passed to `mdtraj.iterload`
+        top: MDTraj topology, passed to `mdtraj.iterload`
+        names_atoms: which atom names to use, 'type' or 'element'
+        name_data: what quantity to take the data as, 'positions' or 'forces'
+        unit: unit to scale data by, multiplicative factor in atomic units
+        unit_cell: unit to scale cell by, multiplicative factor in atomic units
+        chunk: size of one trajectory chunk, passed to `mdtraj.iterload`
+    Yields:
+        One AML `Frame` object at a time
+    Raises:
+        ValueError: for an unsupported `name_data` or `names_atoms`
+    """
+    # validate both selectors up front, before the trajectory is touched
+    if name_data not in ('positions', 'forces'):
+        raise ValueError(f'Unsupported `name_data`: {name_data}. Expected "positions" or "forces".')
+    if names_atoms not in ('type', 'element'):
+        raise ValueError(f'Expected "type" or "element" for `names_atoms`, got {names_atoms}.')
+    # atom names are determined once, from the topology of the first chunk
+    # (the `iterload` generator itself carries no topology information)
+    names = None
+    # the loop variable is deliberately not called `chunk`, that is the chunk size argument
+    for trj_chunk in mdtraj.iterload(fn_in, top=top, chunk=chunk):
+        if names is None:
+            if names_atoms == 'type':
+                names = [atom.name for atom in trj_chunk.topology.atoms]
+            else:
+                names = [atom.element.symbol for atom in trj_chunk.topology.atoms]
+        # cell data, if present
+        cells = trj_chunk.unitcell_vectors
+        for i, xyz in enumerate(trj_chunk.xyz):
+            cell = None if cells is None else cells[i, ...] * unit_cell
+            # the atomic data goes in under the requested name
+            yield Frame(names=names, cell=cell, **{name_data: xyz * unit})
+# Without MDTraj (the import above fell back to `None`) the reader cannot work,
+# so it is removed from the module namespace and from `__all__`.
+if mdtraj is None:
+    del read_frames_mdtraj
+    __all__.remove('read_frames_mdtraj')

n2p2.py ADDED Viewed

	@@ -0,0 +1,65 @@

+"""Functions to read and write n2p2 data files."""
+__all__ = [
+    'read_epre_n2p2',
+    'read_fpre_n2p2'
+]
+import numpy as np
+from .utilities import Frame, register_io
+@register_io('N2P2_E','read')
+def read_epre_n2p2(f_in, column=3):
+    """Read one predicted energy from an open n2p2 energy file.
+    Lines containing `#` are treated as comments and skipped, as are blank lines.
+    Arguments:
+        f_in: open file with energy predictions
+        column: index of the whitespace-separated column that holds the energy
+    Returns:
+        `Frame` with only `energy` set, or `None` if there is no more data
+    """
+    # `readline` returns an empty string at the end of the file, which ends the loop;
+    # this also covers comment or blank lines that are not followed by any data
+    for line in iter(f_in.readline, ''):
+        if '#' not in line and line.strip():
+            return Frame(energy=float(line.split()[column]))
+    return None
+@register_io('N2P2_F','read')
+def read_fpre_n2p2(f_in):
+    """Read the predicted forces of one configuration from an open n2p2 force file.
+    Consecutive data lines that share the same first column belong to one configuration,
+    the force component is taken from the column with index 3. Lines containing `#`
+    are treated as comments and skipped, as are blank lines.
+    Arguments:
+        f_in: open file with force predictions, must support `tell` and `seek`
+    Returns:
+        `Frame` with only `forces` set, as an array with three columns,
+        or `None` if there is no more data
+    """
+    # find the first data line of the configuration
+    items = None
+    for line in iter(f_in.readline, ''):
+        if '#' not in line and line.strip():
+            items = line.split()
+            break
+    # no more data in the file
+    if items is None:
+        return None
+    config = items[0]
+    forces = [float(items[3])]
+    while True:
+        # remember where the line starts, it may have to be left for the next call
+        last_pos = f_in.tell()
+        line = f_in.readline()
+        # no more data in the file
+        if not line:
+            break
+        # comments and blank lines carry no data
+        if '#' in line or not line.strip():
+            continue
+        items = line.split()
+        # Stop if config changes
+        if items[0] != config:
+            f_in.seek(last_pos)
+            break
+        forces.append(float(items[3]))
+    # one row per atom, fails if the number of components is not a multiple of three
+    forces = np.array(forces).reshape((-1, 3))
+    return Frame(forces=forces)

runner.py ADDED Viewed

	@@ -0,0 +1,134 @@

+"""Functions to read and write RuNNer data files."""
+__all__ = [
+    'write_frame_runner',
+    'read_frame_runner',
+]
+import numpy as np
+from .utilities import Frame, register_io
+@register_io('RuNNer', 'read', 'data')   # noqa: C901
+def read_frame_runner(f_in):
+    """Read one frame of the RuNNer format from an open file.
+    Blank lines before and inside the frame are ignored.
+    Arguments:
+        f_in: open file in the RuNNer format
+    Returns:
+        `Frame` instance or `None` if there is no more data
+    Raises:
+        ValueError: if the frame does not start with `begin`, contains an unknown
+            keyword, ends before `end`, has no atoms or has a malformed cell
+    """
+    # For reference, in n2p2, this is implemented in `Structure::readFromFile`, found somewhere here:
+    # https://github.com/CompPhysVienna/n2p2/blob/master/src/libnnp/Structure.cpp#L84
+    # find the first non-blank line, blank lines typically trail at the end of the file
+    line_begin = f_in.readline()
+    while line_begin and not line_begin.strip():
+        line_begin = f_in.readline()
+    # no more data in the file
+    if not line_begin:
+        return None
+    # there is some data, frame should start with 'begin'
+    if line_begin.strip() != 'begin':
+        raise ValueError(f'Expected "begin" at the start of a frame, got "{line_begin.strip()}".')
+    comment = None
+    cell = []
+    names = []
+    positions = []
+    forces = []
+    energy = None
+    for line in f_in:
+        items = line.split()
+        # nothing to parse on a blank line
+        if not items:
+            continue
+        tag = items[0]
+        if tag == 'comment':
+            comment = " ".join(items[1:])
+        elif tag == 'lattice':
+            cell.append([float(item) for item in items[1:]])
+        elif tag == 'atom':
+            positions.append([float(item) for item in items[1:4]])
+            names.append(items[4])
+            forces.append([float(item) for item in items[7:10]])
+            # items[5] and items[6] are per-atom quantities that the writer in this
+            # module fills with zeros (atomic charge and atomic energy), not read here
+        elif tag == 'energy':
+            energy = float(items[1])
+        elif tag == 'charge':
+            pass
+        elif tag == 'end':
+            break
+        else:
+            raise ValueError('Unexpected data in file.')
+    else:
+        # the file ran out before 'end', do not pass on a truncated frame
+        raise ValueError('Unexpected end of file inside a frame.')
+    if len(names) == 0:
+        raise ValueError('No atomic data.')
+    # the cell is optional, but if present it has to be three vectors of three numbers
+    if cell:
+        cell = np.array(cell)
+        if cell.shape != (3, 3):
+            raise ValueError('Wrong cell data.')
+    else:
+        cell = None
+    positions = np.array(positions)
+    forces = np.array(forces)
+    # Prepare frame
+    return Frame(names=names, positions=positions, comment=comment, cell=cell, energy=energy, forces=forces)
+@register_io('RuNNer', 'write', 'data')
+def write_frame_runner(f_out, frame):
+    """Write a single frame to an open file in the RuNNer format.
+    "cell" and "lattice" is the same data, we just use the terminology of the file format here.
+    Atomic charges, atomic energies, and total charge are currently not supported
+    and zeros are written in the file for these. Missing forces and a missing energy
+    are written as zeros as well.
+    Arguments:
+        f_out: open file to write to
+        frame: `Frame` to write, atom names and positions are required
+    Raises:
+        ValueError: if the frame has no atom names or no positions
+    """
+    # Check that required data is in the frame:
+    if (frame.positions is None) or (frame.names is None):
+        raise ValueError('Frame does not contain required properties - atom names and positions.')
+    fmt_number = '{:13.6f}'
+    fmt_lattice = 'lattice ' + 3*'{:16.6f}' + '\n'
+    fmt_atom = 'atom ' + 3*fmt_number + '{:^6s}' + 5*fmt_number + '\n'
+    fmt_energy = 'energy ' + fmt_number + '\n'
+    fmt_charge = 'charge ' + fmt_number + '\n'
+    f_out.write('begin\n')
+    if frame.comment is not None:
+        f_out.write('comment ' + frame.comment + '\n')
+    if frame.cell is not None:
+        for lattice_vector in frame.cell:
+            f_out.write(fmt_lattice.format(*lattice_vector))
+    # one line per atom: position, name, two zero placeholders, force
+    no_forces = frame.forces is None
+    for i, name in enumerate(frame.names):
+        force = (0.0, 0.0, 0.0) if no_forces else frame.forces[i]
+        f_out.write(fmt_atom.format(*frame.positions[i], name, 0.0, 0.0, *force))
+    energy = 0.0 if frame.energy is None else frame.energy
+    f_out.write(fmt_energy.format(energy))
+    f_out.write(fmt_charge.format(0.0))
+    f_out.write('end\n')

utilities.py ADDED Viewed

	@@ -0,0 +1,336 @@

+"""Input and output utilities.
+The central concept in the I/O infrastructure is a "frame" - a dataclass that represents one atomic
+configuration that contains data of different kinds. Atomic units are used in the frame object itself,
+unless explicitly stated otherwise. Units specified by the file format are used in the files themselves.
+"""
+__all__ = [
+    'AnyPath',
+    'get_fn_test',
+    'Frame',
+    'open_safe',
+    'working_directory',
+    'temporary_directory',
+    'to_file',
+    'from_file',
+    'read_frames',
+    'write_frames',
+    'merge_frames',
+]
+import os
+import shutil
+from collections import defaultdict
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional, Sequence, Union
+import numpy as np
+from ..utilities import AMLIOError
+# functions that are registered to read and write frames, filled by `register_io`:
+# format name -> dictionary with keys 'read', 'write' and 'extension', as far as registered
+formats = defaultdict(dict)
+# mapping of file extensions to file formats, used to detect the format from a file name
+ext2fmt = dict()
+# type of a file system path given as a string or a `Path`
+AnyPath = Union[str, Path]
+def get_fn_test(filename):
+    """Get absolute file names of test data.
+    The test data directory is located relative to this module: from the directory
+    two levels above this file, it is `../tests/data`.
+    Arguments:
+        filename: name of file in the test data directory, no path
+    Returns:
+        resolved `Path` of the file
+    """
+    dir_base = Path(__file__).parent.parent
+    return dir_base.joinpath('../tests/data', filename).resolve()
+def register_io(fformat: str, operation: str, extension: Union[str, None] = None):
+    """Decorator to register an I/O operation for a specific file format.
+    Optionally, the function can also register a file name extension for automatic
+    detection of file format from file name. The decorated function itself is
+    returned unchanged.
+    Arguments:
+        fformat: name of file format
+        operation: I/O operation - "read" or "write"
+        extension: file name extension or `None`
+    Raises:
+        ValueError: when the decorator is applied with an unknown operation or with
+            an extension that is already registered for a different file format
+    """
+    def decorator(function):
+        if operation not in ('read', 'write'):
+            raise ValueError('Unrecognized operation. Allowed values: "read", "write".')
+        # refuse a conflicting extension before the registry is touched
+        if (extension is not None) and ext2fmt.get(extension, fformat) != fformat:
+            raise ValueError(f'Attempted to register the same file extension ({extension}) twice.')
+        formats[fformat][operation] = function
+        if extension is not None:
+            formats[fformat]['extension'] = extension
+            ext2fmt[extension] = fformat
+        # hand the function back, otherwise the decorated name would be bound to `None`
+        return function
+    return decorator
+@dataclass(eq=False)
+class Frame:
+    """Container for all data that one frame can carry.
+    Used to exchange data between data structure and I/O routines. Every field defaults to `None`,
+    which means that the given kind of data is not set/available. No comparison operator is
+    generated, at least for now, as comparing NumPy arrays is more involved.
+    """
+    # slots do not work correctly with dataclass
+    # Here is an alternative: https://pypi.org/project/dataslots/
+    # Here is some context: https://github.com/ericvsmith/dataclasses/issues/28
+    # __slots__ = ['names', 'positions', 'cell', 'comment', 'energy', 'forces']
+    names: Optional[Sequence] = None
+    positions: Optional[np.ndarray] = None
+    cell: Optional[np.ndarray] = None
+    comment: Optional[str] = None
+    energy: Optional[float] = None
+    forces: Optional[np.ndarray] = None
+    def update(self, other: 'Frame', force: bool = False):
+        """Take over data from another frame, in place.
+        Data that is set in `other` fills in what is missing here; with `force`
+        it also replaces what is already set. Atom names are only checked, never copied.
+        Arguments:
+            other: another frame
+            force: whether to overwrite data
+        Raises:
+            ValueError: if `other` has atom names and they differ from ours
+        """
+        # a frame without names is compatible with anything
+        names_other = other.names
+        if (names_other is not None) and (self.names != names_other):
+            raise ValueError('Inconsistent atom names.')
+        for attr in ('positions', 'cell', 'comment', 'energy', 'forces'):
+            value = getattr(other, attr)
+            # nothing to take over
+            if value is None:
+                continue
+            if force or (getattr(self, attr) is None):
+                setattr(self, attr, value)
+def open_safe(filename, mode='r', buffering=-1, verbose=False):
+    """Open a file like `open` does, but never overwrite an existing one.
+    If opening for writing and `filename` exists, the existing file is first renamed
+    to `#<name>#<i>#` in the same directory, with the lowest `i` that is still free.
+    Arguments:
+        filename: name of file to open
+        mode: file open mode, has to start with 'r', 'a' or 'w'
+        buffering: passed through to `open`
+        verbose: whether to print to standard output what backup was performed
+    Returns:
+        an open file
+    Raises:
+        NotImplementedError: for any other file open mode
+    """
+    kind = mode[0]
+    if kind not in ('w', 'r', 'a'):
+        # did not expect that, more work needed
+        raise NotImplementedError(f'Unsupported file open mode: {mode:s}.')
+    # reading and appending cannot overwrite anything, only writing needs care
+    if kind == 'w':
+        filename = Path(filename)
+        # walk through the backup names until a free one is found
+        counter = 0
+        fn_backup = filename
+        while fn_backup.exists():
+            fn_backup = filename.with_name(f'#{filename.name:s}#{counter:d}#')
+            counter += 1
+        # the name changed only if the file to write exists
+        if fn_backup != filename:
+            filename.rename(fn_backup)
+            if verbose:
+                print(f'Backup performed: {filename} -> {fn_backup}\n')
+    return open(filename, mode, buffering)
+@contextmanager
+def working_directory(directory):
+    """Run the body of the context with a different working directory.
+    This is not available in the standard library [1] but can be useful, especially for testing.
+    The old fixture in pytest (`tmpdir`) used py.path [2] which has `as_cwd`, but this is legacy
+    code now and not recommended [3].
+    [1] https://bugs.python.org/issue25625
+    [2] https://py.readthedocs.io/en/latest/path.html
+    [3] https://docs.pytest.org/en/latest/how-to/tmpdir.html
+    Arguments:
+        directory: directory to change to
+    """
+    # where to come back to
+    dir_previous = Path.cwd()
+    try:
+        os.chdir(directory)
+        yield
+    finally:
+        # go back no matter how the context was left
+        os.chdir(dir_previous)
+@contextmanager
+def temporary_directory(directory: AnyPath, parents: bool = False, keep: bool = False):
+    """Create a directory for the duration of the context.
+    The directory, with everything in it, is removed upon exiting the context,
+    unless the user asks to keep it.
+    Arguments:
+        directory: directory to create
+        parents: whether to create parents as well
+        keep: whether to keep directory after exiting context
+    Yields:
+        the created directory as a `Path`
+    Raises:
+        AMLIOError: if `directory` already exists
+    """
+    path = Path(directory)
+    # catch conflict early, a nicer error message
+    if path.exists():
+        raise AMLIOError(f'Unable to create directory, already exists: {path.absolute()}')
+    path.mkdir(parents=parents)
+    try:
+        yield path
+    finally:
+        # clean up also if the body of the context failed
+        remove = not keep
+        if remove:
+            shutil.rmtree(path)
+def from_file(fn_in, binary=False):
+    """Return the whole contents of a file.
+    By default, the file is read as a text file, which gives a string.
+    If `binary` is true, it is read as a binary file, which gives bytes.
+    Arguments:
+        fn_in: name of file to read
+        binary: whether to read in binary mode
+    """
+    mode = 'rb' if binary else 'r'
+    with open(fn_in, mode) as f_in:
+        return f_in.read()
+def to_file(data, fn_out, binary=False, verbose=False):
+    """Write data to a file without overwriting an existing one.
+    The provided `data` would typically be a string, or bytes if `binary` is true.
+    The file is opened with `open_safe`, so an existing file of that name is backed up
+    first; if `verbose` is true, the backup is reported.
+    Arguments:
+        data: what to write
+        fn_out: name of file to write
+        binary: whether to write in binary mode
+        verbose: passed on to `open_safe`
+    """
+    mode = 'wb' if binary else 'w'
+    with open_safe(fn_out, mode, verbose=verbose) as f_out:
+        f_out.write(data)
+def get_io_operation(fn, fformat, operation):
+    """Select I/O function for given file format.
+    Arguments:
+        fn: name of file to operate on, only used to detect the format from its extension
+        fformat: name of file format, or `None` to detect it from the extension of `fn`
+        operation: I/O operation - "read" or "write"
+    Returns:
+        function to read or write one frame
+    Raises:
+        ValueError: for an unknown operation, or if the operation is not registered
+            for the file format
+        KeyError: if the format has to be detected and the extension is not registered
+    """
+    if operation not in ('read', 'write'):
+        raise ValueError('Unrecognized operation. Allowed values: "read", "write".')
+    # automatically pick a file format
+    if fformat is None:
+        extension = Path(fn).suffix[1:]
+        try:
+            fformat = ext2fmt[extension]
+        except KeyError:
+            raise KeyError(f'Extension "{extension:s}" not registered for file format detection.') from None
+    # `formats` is a `defaultdict`, indexing it with an unknown format would insert
+    # an empty entry into the registry, so look things up without side effects
+    function = formats.get(fformat, {}).get(operation)
+    if function is None:
+        raise ValueError(f'File format "{fformat}" not supported for operation "{operation}".')
+    return function
+def read_frames(fn_in, fformat=None, **kwargs):
+    """Iterate over a trajectory file, yielding all data for each frame.
+    Arguments:
+        fn_in: name of file to read
+        fformat: format of the file, or `None` to detect it from the file extension
+        kwargs: passed on to the function that reads one frame
+    Yields:
+        what the reading function returns, one frame at a time, until it returns `None`
+    """
+    reader = get_io_operation(fn_in, fformat, 'read')
+    # File formats read using MDTraj must be opened differently.
+    # Maybe there is a more elegant way to do that though.
+    with open(fn_in) as f_in:
+        frame = reader(f_in, **kwargs)
+        # `None` from the reader means that there is no more data
+        while frame is not None:
+            yield frame
+            frame = reader(f_in, **kwargs)
+def write_frames(fn_out, frames, fformat=None):
+    """Write all frames to one file.
+    The format of the file is given by `fformat` or inferred from the file
+    extension if `fformat` is `None`. The file is opened with `open_safe`,
+    so an existing file of that name is backed up, not overwritten.
+    Arguments:
+        fn_out: name of output file
+        frames: iterator over `Frame` objects
+        fformat: format of the file, or `None`
+    """
+    # pick the writer before the file is opened, so that a bad format leaves no file behind
+    writer = get_io_operation(fn_out, fformat, 'write')
+    with open_safe(fn_out, 'w') as f_out:
+        for frame in frames:
+            writer(f_out, frame)
+def merge_frames(frames, *frames_others, force: bool = False):
+    """Merge frames from multiple sources.
+    Each frame of `frames` is updated in place with the next frame of each of the other
+    sources, in the order given, and then yielded. The length of the result is determined
+    by the length of `frames`, the other sources should be at least as long as that.
+    Arguments:
+        frames: iterable of `Frame` objects
+        frames_others: more iterables of `Frame` instances
+        force: whether to overwrite data
+    Yields:
+        `Frame` objects
+    Raises:
+        RuntimeError: if one of the other sources runs out before `frames` does
+    """
+    # accept any iterable, lists included, not only iterators
+    iterators = [iter(frames_extra) for frames_extra in frames_others]
+    for frame in frames:
+        for position, iterator in enumerate(iterators, start=1):
+            try:
+                frame_extra = next(iterator)
+            except StopIteration:
+                # a `StopIteration` escaping from a generator becomes a `RuntimeError` anyway
+                # (PEP 479), so keep that type and say what went wrong
+                raise RuntimeError(f'Additional source of frames number {position} is too short.') from None
+            frame.update(frame_extra, force=force)
+        yield frame

xyz.py ADDED Viewed

	@@ -0,0 +1,94 @@

+"""Functions to read and write XYZ files."""
+__all__ = [
+    'write_frame_xyz',
+    'read_frame_xyz',
+]
+import numpy as np
+from ..constants import angstrom
+from .utilities import Frame, register_io
+@register_io('xyz', 'read', 'xyz')
+def read_frame_xyz(f_in, name_data='positions', unit=angstrom):
+    """Read one frame of XYZ format from an open file.
+    Blank lines before the frame, typically at the end of the file, are ignored.
+    Arguments:
+        f_in: open file in XYZ format
+        name_data: what quantity to take the XYZ data as, 'positions' or 'forces'
+        unit: unit to scale data by, multiplicative factor in atomic units
+    Returns:
+        `Frame` object or `None` if there is no more data
+    Raises:
+        ValueError: for an unsupported `name_data`, a malformed number of atoms,
+            or an atom line that is missing or incomplete
+    """
+    # reject a bad `name_data` before anything is consumed from the file
+    if name_data not in ('positions', 'forces'):
+        raise ValueError(f'Unsupported `name_data`: {name_data}. Expected "positions" or "forces".')
+    # find the first non-blank line
+    line_begin = f_in.readline()
+    while line_begin and not line_begin.strip():
+        line_begin = f_in.readline()
+    # no more data in the file
+    if not line_begin:
+        return None
+    # there is some data, frame should begin with natoms
+    natoms = int(line_begin)
+    # read comment line
+    comment = f_in.readline().rstrip()
+    names = []
+    data = []
+    for _ in range(natoms):
+        items = f_in.readline().split()
+        # a name and three numbers are needed; this also catches
+        # a blank line and the end of the file in the middle of a frame
+        if len(items) < 4:
+            raise ValueError('Unexpected data in file.')
+        names.append(items[0])
+        data.append([float(item) for item in items[1:4]])
+    data = np.array(data) * unit
+    # the data goes in under the requested name, the other quantity stays unset
+    return Frame(names=names, comment=comment, **{name_data: data})
+@register_io('xyz', 'write', 'xyz')
+def write_frame_xyz(f_out, frame, unit=angstrom):
+    """Write a single frame to an open file in the XYZ format.
+    This is currently hard-coded to write positions, if we ever need to write forces
+    or something else, it needs generalizing.
+    Arguments:
+        f_out: open file to write to
+        frame: `Frame` to write, atom names and positions are required
+        unit: unit to write positions in, they are divided by this factor
+    Raises:
+        ValueError: if the frame has no atom names or no positions
+    """
+    # Check that required things are in frame:
+    if (frame.positions is None) or (frame.names is None):
+        raise ValueError('Frame does not contain required properties.')
+    fmt_atom = '{:6s} ' + 3*'{:13.6f}' + '\n'
+    # header: number of atoms, then the comment line, empty if there is no comment
+    f_out.write(f'{len(frame.names):d}\n')
+    if frame.comment is None:
+        f_out.write('\n')
+    else:
+        f_out.write(f'{frame.comment:s}\n')
+    # convert to the unit of the file
+    data = frame.positions / unit
+    # one line per atom
+    for i, name in enumerate(frame.names):
+        f_out.write(fmt_atom.format(name, *data[i]))