ase/io/rmc6f.py

import re
import time
import numpy as np

from ase.atoms import Atoms
from ase.utils import reader, writer
from ase.cell import Cell

__all__ = ['read_rmc6f', 'write_rmc6f']

ncols2style = {9: 'no_labels',
               10: 'labels',
               11: 'magnetic'}


def _read_construct_regex(lines):
    """
    Utility for constructing  regular expressions used by reader.
    """
    lines = [l.strip() for l in lines]
    lines_re = '|'.join(lines)
    lines_re = lines_re.replace(' ', r'\s+')
    lines_re = lines_re.replace('(', r'\(')
    lines_re = lines_re.replace(')', r'\)')
    return '({})'.format(lines_re)


def _read_line_of_atoms_section(fields):
    """
    Process `fields` line of Atoms section in rmc6f file and output extracted
    info as atom id (key) and list of properties for Atoms object (values).

    Parameters
    ----------
    fields: list[str]
        List of columns from line in rmc6f file.


    Returns
    ------
    atom_id: int
        Atom ID
    properties: list[str|float]
        List of Atoms properties based on rmc6f style.
        Basically, have 1) element and fractional coordinates for 'labels'
        or 'no_labels' style and 2) same for 'magnetic' style except adds
        the spin.
        Examples for 1) 'labels' or 'no_labels' styles or 2) 'magnetic' style:
            1) [element, xf, yf, zf]
            2) [element, xf, yf, zf, spin]
    """
    # Atom id
    atom_id = int(fields[0])

    # Get style used for rmc6f from file based on number of columns
    ncols = len(fields)
    style = ncols2style[ncols]

    # Create the position dictionary
    properties = list()
    element = str(fields[1])
    if style == 'no_labels':
        # id element xf yf zf ref_num ucell_x ucell_y ucell_z
        xf = float(fields[2])
        yf = float(fields[3])
        zf = float(fields[4])
        properties = [element, xf, yf, zf]

    elif style == 'labels':
        # id element label xf yf zf ref_num ucell_x ucell_y ucell_z
        xf = float(fields[3])
        yf = float(fields[4])
        zf = float(fields[5])
        properties = [element, xf, yf, zf]

    elif style == 'magnetic':
        # id element xf yf zf ref_num ucell_x ucell_y ucell_z M: spin
        xf = float(fields[2])
        yf = float(fields[3])
        zf = float(fields[4])
        spin = float(fields[10].strip("M:"))
        properties = [element, xf, yf, zf, spin]
    else:
        raise Exception("Unsupported style for parsing rmc6f file format.")

    return atom_id, properties


def _read_process_rmc6f_lines_to_pos_and_cell(lines):
    """
    Processes the lines of rmc6f file to atom position dictionary and cell

    Parameters
    ----------
    lines: list[str]
        List of lines from rmc6f file.

    Returns
    ------
    pos : dict{int:list[str|float]}
        Dict for each atom id and Atoms properties based on rmc6f style.
        Basically, have 1) element and fractional coordinates for 'labels'
        or 'no_labels' style and 2) same for 'magnetic' style except adds
        the spin.
        Examples for 1) 'labels' or 'no_labels' styles or 2) 'magnetic' style:
            1) pos[aid] = [element, xf, yf, zf]
            2) pos[aid] = [element, xf, yf, zf, spin]
    cell: Cell object
        The ASE Cell object created from cell parameters read from the 'Cell'
        section of rmc6f file.
    """

    # Inititalize output pos dictionary
    pos = {}

    # Defined the header an section lines we process
    header_lines = [
        "Number of atoms:",
        "Supercell dimensions:",
        "Cell (Ang/deg):",
        "Lattice vectors (Ang):"]
    sections = ["Atoms"]

    # Construct header and sections regex
    header_lines_re = _read_construct_regex(header_lines)
    sections_re = _read_construct_regex(sections)

    section = None
    header = True

    # Remove any lines that are blank
    lines = [line for line in lines if line != '']

    # Process each line of rmc6f file
    pos = {}
    for line in lines:

        # check if in a section
        m = re.match(sections_re, line)
        if m is not None:
            section = m.group(0).strip()
            header = False
            continue

        # header section
        if header:
            field = None
            val = None

            # Regex that matches whitespace-separated floats
            float_list_re = r'\s+(\d[\d|\s\.]+[\d|\.])'
            m = re.search(header_lines_re + float_list_re, line)
            if m is not None:
                field = m.group(1)
                val = m.group(2)

            if field is not None and val is not None:

                if field == "Number of atoms:":
                    pass
                    """
                    NOTE: Can just capture via number of atoms ingested.
                          Maybe use in future for a check.
                    code: natoms = int(val)
                    """

                if field.startswith('Supercell'):
                    pass
                    """
                    NOTE: wrapping back down to unit cell is not
                          necessarily needed for ASE object.

                    code: supercell = [int(x) for x in val.split()]
                    """

                if field.startswith('Cell'):
                    cellpar = [float(x) for x in val.split()]
                    cell = Cell.fromcellpar(cellpar)

                if field.startswith('Lattice'):
                    pass
                    """
                    NOTE: Have questions about RMC fractionalization matrix for
                          conversion in data2config vs. the ASE matrix.
                          Currently, just support the Cell section.
                    """

        # main body section
        if section is not None:
            if section == 'Atoms':
                atom_id, atom_props = _read_line_of_atoms_section(line.split())
                pos[atom_id] = atom_props

    return pos, cell


def _write_output_column_format(columns, arrays):
    """
    Helper function to build output for data columns in rmc6f format

    Parameters
    ----------
    columns: list[str]
        List of keys in arrays. Will be columns in the output file.
    arrays: dict{str:np.array}
        Dict with arrays for each column of rmc6f file that are
        property of Atoms object.

    Returns
    ------
    property_ncols : list[int]
        Number of columns for each property.
    dtype_obj: np.dtype
        Data type object for the columns.
    formats_as_str: str
        The format for printing the columns.

    """
    fmt_map = {'d': ('R', '%14.6f '),
               'f': ('R', '%14.6f '),
               'i': ('I', '%8d '),
               'O': ('S', '%s'),
               'S': ('S', '%s'),
               'U': ('S', '%s'),
               'b': ('L', ' %.1s ')}

    property_types = []
    property_ncols = []
    dtypes = []
    formats = []

    # Take each column and setup formatting vectors
    for column in columns:
        array = arrays[column]
        dtype = array.dtype

        property_type, fmt = fmt_map[dtype.kind]
        property_types.append(property_type)

        # Flags for 1d vectors
        is_1d = len(array.shape) == 1
        is_1d_as_2d = len(array.shape) == 2 and array.shape[1] == 1

        # Construct the list of key pairs of column with dtype
        if (is_1d or is_1d_as_2d):
            ncol = 1
            dtypes.append((column, dtype))
        else:
            ncol = array.shape[1]
            for c in range(ncol):
                dtypes.append((column + str(c), dtype))

        # Add format and number of columns for this property to output array
        formats.extend([fmt] * ncol)
        property_ncols.append(ncol)

    # Prepare outputs to correct data structure
    dtype_obj = np.dtype(dtypes)
    formats_as_str = ''.join(formats) + '\n'

    return property_ncols, dtype_obj, formats_as_str


def _write_output(filename, header_lines, data, fmt, order=None):
    """
    Helper function to write information to the filename

    Parameters
    ----------
    filename : file|str
        A file like object or filename
    header_lines : list[str]
        Header section of output rmc6f file
    data: np.array[len(atoms)]
        Array for the Atoms section to write to file. Has
        the columns that need to be written on each row
    fmt: str
        The format string to use for writing each column in
        the rows of data.
    order : list[str]
        If not None, gives a list of atom types for the order
        to write out each.
    """
    fd = filename

    # Write header section
    for line in header_lines:
        fd.write("%s \n" % line)

    # If specifying the order, fix the atom id and write to file
    natoms = data.shape[0]
    if order is not None:
        new_id = 0
        for atype in order:
            for i in range(natoms):
                if atype == data[i][1]:
                    new_id += 1
                    data[i][0] = new_id
                    fd.write(fmt % tuple(data[i]))
    # ...just write rows to file
    else:
        for i in range(natoms):
            fd.write(fmt % tuple(data[i]))


@reader
def read_rmc6f(filename, atom_type_map=None):
    """
    Parse a RMCProfile rmc6f file into ASE Atoms object

    Parameters
    ----------
    filename : file|str
        A file like object or filename.
    atom_type_map: dict{str:str}
        Map of atom types for conversions. Mainly used if there is
        an atom type in the file that is not supported by ASE but
        want to map to a supported atom type instead.

        Example to map deuterium to hydrogen:
        atom_type_map = { 'D': 'H' }

    Returns
    ------
    structure : Atoms
        The Atoms object read in from the rmc6f file.
    """

    fd = filename
    lines = fd.readlines()

    # Process the rmc6f file to extract positions and cell
    pos, cell = _read_process_rmc6f_lines_to_pos_and_cell(lines)

    # create an atom type map if one does not exist from unique atomic symbols
    if atom_type_map is None:
        symbols = [atom[0] for atom in pos.values()]
        atom_type_map = {atype: atype for atype in symbols}

    # Use map of tmp symbol to actual symbol
    for atom in pos.values():
        atom[0] = atom_type_map[atom[0]]

    # create Atoms from read-in data
    symbols = []
    scaled_positions = []
    spin = None
    magmoms = []
    for atom in pos.values():
        if len(atom) == 4:
            element, x, y, z = atom
        else:
            element, x, y, z, spin = atom

        element = atom_type_map[element]
        symbols.append(element)
        scaled_positions.append([x, y, z])
        if spin is not None:
            magmoms.append(spin)

    atoms = Atoms(scaled_positions=scaled_positions,
                  symbols=symbols,
                  cell=cell,
                  magmoms=magmoms,
                  pbc=[True, True, True])

    return atoms


@writer
def write_rmc6f(filename, atoms, order=None, atom_type_map=None):
    """
    Write output in rmc6f format - RMCProfile v6 fractional coordinates

    Parameters
    ----------
    filename : file|str
        A file like object or filename.
    atoms: Atoms object
        The Atoms object to be written.

    order : list[str]
        If not None, gives a list of atom types for the order
        to write out each.
    atom_type_map: dict{str:str}
        Map of atom types for conversions. Mainly used if there is
        an atom type in the Atoms object that is a placeholder
        for a different atom type. This is used when the atom type
        is not supported by ASE but is in RMCProfile.

        Example to map hydrogen to deuterium:
        atom_type_map = { 'H': 'D' }
    """

    # get atom types and how many of each (and set to order if passed)
    atom_types = set(atoms.symbols)
    if order is not None:
        if set(atom_types) != set(order):
            raise Exception("The order is not a set of the atom types.")
        atom_types = order

    atom_count_dict = atoms.symbols.formula.count()
    natom_types = [str(atom_count_dict[atom_type]) for atom_type in atom_types]

    # create an atom type map if one does not exist from unique atomic symbols
    if atom_type_map is None:
        symbols = set(np.array(atoms.symbols))
        atom_type_map = {atype: atype for atype in symbols}

    # HEADER SECTION

    # get type and number of each atom type
    atom_types_list = [atom_type_map[atype] for atype in atom_types]
    atom_types_present = ' '.join(atom_types_list)
    natom_types_present = ' '.join(natom_types)

    header_lines = [
        "(Version 6f format configuration file)",
        "(Generated by ASE - Atomic Simulation Environment https://wiki.fysik.dtu.dk/ase/ )",  # noqa: E501
        "Metadata date: " + time.strftime('%d-%m-%Y'),
        "Number of types of atoms:   {} ".format(len(atom_types)),
        "Atom types present:          {}".format(atom_types_present),
        "Number of each atom type:   {}".format(natom_types_present),
        "Number of moves generated:           0",
        "Number of moves tried:               0",
        "Number of moves accepted:            0",
        "Number of prior configuration saves: 0",
        "Number of atoms:                     {}".format(len(atoms)),
        "Supercell dimensions:                1 1 1"]

    # magnetic moments
    if atoms.has('magmoms'):
        spin_str = "Number of spins:                     {}"
        spin_line = spin_str.format(len(atoms.get_initial_magnetic_moments()))
        header_lines.extend([spin_line])

    density_str = "Number density (Ang^-3):              {}"
    density_line = density_str.format(len(atoms) / atoms.get_volume())
    cell_angles = [str(x) for x in atoms.cell.cellpar()]
    cell_line = "Cell (Ang/deg): " + ' '.join(cell_angles)
    header_lines.extend([density_line, cell_line])

    # get lattice vectors from cell lengths and angles
    # NOTE: RMCProfile uses a different convention for the fractionalization
    # matrix

    cell_parameters = atoms.cell.cellpar()
    cell = Cell.fromcellpar(cell_parameters).T
    x_line = ' '.join(['{:12.6f}'.format(i) for i in cell[0]])
    y_line = ' '.join(['{:12.6f}'.format(i) for i in cell[1]])
    z_line = ' '.join(['{:12.6f}'.format(i) for i in cell[2]])
    lat_lines = ["Lattice vectors (Ang):", x_line, y_line, z_line]
    header_lines.extend(lat_lines)
    header_lines.extend(['Atoms:'])

    # ATOMS SECTION

    # create columns of data for atoms (fr_cols)
    fr_cols = ['id', 'symbols', 'scaled_positions', 'ref_num', 'ref_cell']
    if atoms.has('magmoms'):
        fr_cols.extend('magmom')

    # Collect data to be written out
    natoms = len(atoms)

    arrays = {}
    arrays['id'] = np.array(range(1, natoms + 1, 1), int)
    arrays['symbols'] = np.array(atoms.symbols)
    arrays['ref_num'] = np.zeros(natoms, int)
    arrays['ref_cell'] = np.zeros((natoms, 3), int)
    arrays['scaled_positions'] = np.array(atoms.get_scaled_positions())

    # get formatting for writing output based on atom arrays
    ncols, dtype_obj, fmt = _write_output_column_format(fr_cols, arrays)

    # Pack fr_cols into record array
    data = np.zeros(natoms, dtype_obj)
    for column, ncol in zip(fr_cols, ncols):
        value = arrays[column]
        if ncol == 1:
            data[column] = np.squeeze(value)
        else:
            for c in range(ncol):
                data[column + str(c)] = value[:, c]

    # Use map of tmp symbol to actual symbol
    for i in range(natoms):
        data[i][1] = atom_type_map[data[i][1]]

    # Write the output
    _write_output(filename, header_lines, data, fmt, order=order)