# Source code for mdsuite.file_io.extxyz_files

"""
MDSuite: A Zincwarecode package.

License
-------
This program and the accompanying materials are made available under the terms
of the Eclipse Public License v2.0 which accompanies this distribution, and is
available at https://www.eclipse.org/legal/epl-v20.html

SPDX-License-Identifier: EPL-2.0

Copyright Contributors to the Zincwarecode Project.

Contact Information
-------------------
email: zincwarecode@gmail.com
github: https://github.com/zincware
web: https://zincwarecode.com/

Citation
--------
If you use this module please cite us with:

Summary
-------
Reader for extxyz trajectory files.
"""
import copy
import logging
import pathlib
import typing

import numpy as np

import mdsuite.database.simulation_database
import mdsuite.file_io.file_read
import mdsuite.file_io.tabular_text_files
from mdsuite.database.mdsuite_properties import mdsuite_properties
from mdsuite.file_io.tabular_text_files import (
    get_species_list_from_tabular_text_reader_data,
)

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Translation from MDSuite properties to the property names used in extxyz
# headers. It is handed to the parent reader as ``file_format_column_names``
# and its "time" entry is used to locate the time value in a header line.
var_names = {
    mdsuite_properties.positions: "pos",
    mdsuite_properties.velocities: "vel",
    mdsuite_properties.forces: "force",
    mdsuite_properties.stress: "stress",
    mdsuite_properties.energy: "energies",
    mdsuite_properties.time: "time",
    mdsuite_properties.momenta: "momenta",
}


class EXTXYZFile(mdsuite.file_io.tabular_text_files.TabularTextFileProcessor):
    """
    Reader for extxyz files
    """

    def __init__(
        self,
        file_path: typing.Union[str, pathlib.Path],
        custom_data_map: typing.Optional[dict] = None,
    ):
        """
        Parameters
        ----------
        file_path
            Path to the extxyz file
        custom_data_map:
            If your file contains columns with data that is not part of the standard
            set of properties (see var_names in this file), you can map the column
            names to the corresponding property.
            example: custom_data_map = {"Reduced_Momentum": "redmom"}, if the file
            header contains "redmom:R:3" to point to the correct 3 columns
            containing the reduced momentum values
        """
        super().__init__(
            file_path,
            file_format_column_names=var_names,
            custom_column_names=custom_data_map,
        )
        # Every configuration is preceded by two lines: the particle count and
        # the property/comment line.
        self.n_header_lines = 2

    def _get_tabular_text_reader_mdata(
        self,
    ) -> mdsuite.file_io.tabular_text_files.TabularTextFileReaderMData:
        """
        Implement abstract parent method

        Reads the two header lines of the first configuration, derives the
        column layout and the species-to-line mapping from it and infers the
        number of configurations from the total number of lines in the file.

        Raises
        ------
        RuntimeError
            If the number of lines in the file is not a whole multiple of the
            number of lines per configuration.
        """
        with open(self.file_path, "r") as file:
            # first header line: number of particles
            n_particles = int(file.readline())
            # second line: other info
            header = file.readline()
            species_idx, property_dict = _get_property_to_column_idx_dict(
                header, self._column_name_dict
            )

            file.seek(0)
            species_dict = self._get_species_information(file, species_idx, n_particles)

            # get number of configs from file length
            file.seek(0)
            num_lines = sum(1 for _ in file)

        lines_per_config = n_particles + self.n_header_lines
        # Exact integer arithmetic; an explicit raise (instead of an assert)
        # keeps the check alive when Python runs with -O.
        n_configs, n_leftover_lines = divmod(num_lines, lines_per_config)
        if n_leftover_lines != 0:
            raise RuntimeError(
                f"The file {self.file_path} has {num_lines} lines, which is not a "
                f"multiple of the {lines_per_config} lines expected per configuration."
            )

        return mdsuite.file_io.tabular_text_files.TabularTextFileReaderMData(
            n_configs=n_configs,
            species_name_to_line_idx_dict=species_dict,
            property_to_column_idx_dict=property_dict,
            n_header_lines=self.n_header_lines,
            n_particles=n_particles,
            header_lines_for_each_config=True,
        )

    def _get_metadata(self):
        """
        Gets the metadata for database creation as an implementation of the parent
        class virtual function by analysing the header lines and one full
        configuration.

        The box size is taken from the header of the first configuration. The
        sample rate is the rounded difference between the time values found in
        the headers of the first and second configuration; if either is
        missing, a warning is logged and the sample rate is set to None.
        """
        with open(self.file_path, "r") as file:
            file.readline()
            # box_l in second header line
            header = file.readline()
            box_l = _get_box_l(header)

            # Move past the first configuration and the particle-count line of
            # the second one, so the next line read is the second header.
            file.seek(0)
            mdsuite.file_io.tabular_text_files.skip_n_lines(
                file,
                self.tabular_text_reader_data.n_particles + self.n_header_lines + 1,
            )
            header_1 = file.readline()

        time_0 = _get_time(header)
        time_1 = _get_time(header_1)
        if time_0 is not None and time_1 is not None:
            sample_rate = int(round(time_1 - time_0))
        else:
            log.warning(
                "Could not read sample rate from file. Please adjust the sample rate"
                " manually if required."
            )
            sample_rate = None

        species_list = get_species_list_from_tabular_text_reader_data(
            self.tabular_text_reader_data
        )

        mdata = mdsuite.database.simulation_database.TrajectoryMetadata(
            n_configurations=self.tabular_text_reader_data.n_configs,
            box_l=box_l,
            sample_rate=sample_rate,
            species_list=species_list,
        )

        return mdata

    def _get_species_information(self, file, species_idx: int, n_particles: int):
        """
        Get the initial species information

        Parameters
        ----------
        file:
            An opened extxyz file, positioned at the start of a configuration
        species_idx:
            The index of the column in which the species name is stored
        n_particles:
            The total number of particles

        Returns
        -------
        species_dict : dict
            Maps each species name to the list of line indices (within one
            configuration, starting at 0) of the particles of that species.

        Raises
        ------
        RuntimeError
            If a particle line is missing or has no species column.
        """
        mdsuite.file_io.tabular_text_files.skip_n_lines(file, self.n_header_lines)

        # Only the species column of the first configuration is needed, so the
        # lines are processed one at a time instead of building an array.
        species_dict = {}
        for particle_idx in range(n_particles):
            columns = file.readline().split()
            if species_idx >= len(columns):
                raise RuntimeError(
                    f"Could not read the species of particle {particle_idx} "
                    "in the first configuration"
                )
            species_dict.setdefault(columns[species_idx], []).append(particle_idx)

        return species_dict
def _get_box_l(header: str) -> list: """ Get the box lengths from the Lattice property in the header Parameters ---------- header: The extxyz header line as one string Returns ------- [box_l_x, box_l_y, box_l_z] The tree sides of the box cuboid """ data = copy.deepcopy(header).split() lattice = None start = None stop = None for idx, item in enumerate(data): if "Lattice" in item: start = idx break if start is not None: for idx, item in enumerate(data[start:]): if item[-1] == '"': stop = idx break else: raise RuntimeError("Could not find lattice size in file header") if stop is not None: lattice = data[start : stop + 1] lattice[0] = lattice[0].split("=")[1].replace('"', "") lattice[-1] = lattice[-1].replace('"', "") lattice = np.array(lattice).astype(float) return [lattice[0], lattice[4], lattice[8]] def _get_time(header: str) -> float: """ Retrieve the time value from a header line. Can be used to infer the sampling step by calling on consecutive config headers. Parameters ---------- header The extxyz header line as one string """ data = copy.deepcopy(header).split() time = None for item in data: if var_names[mdsuite_properties.time] in item: try: time = float(item.split("=")[-1]) except ValueError: time = float(item.split("=")[-1].split(",")[0]) return time def _get_property_to_column_idx_dict( header: str, var_names: dict ) -> typing.Tuple[int, typing.Dict[str, typing.List[int]]]: """ Get the property summary from the header data. Parameters ---------- header: header to analyse var_names: dict of translations from MDsuite property names to extxyz property names Returns ------- species_index: int The index of the column in which the species names are stored property_summary : dict A dictionary of properties and their location in the data file. 
""" data = copy.deepcopy(header).split() properties_string = None for item in data: if "Properties" in item: properties_string = item if properties_string is None: raise RuntimeError("Could not find properties in header") properties_list = ( properties_string.split("=")[1].replace(":S", "").replace(":R", "").split(":") ) property_summary = {} species_index = None index = 0 var_names_values = list(var_names.values()) var_names_keys = list(var_names.keys()) for idx, item in enumerate(properties_list): if item == "species": species_index = index index += 1 if item in var_names_values: key = var_names_keys[var_names_values.index(item)] length = int(properties_list[idx + 1]) property_summary[key] = [index + i for i in range(length)] index += length if species_index is None: raise RuntimeError("could not find species in header") return species_index, property_summary