Source code for kgcnn.data.datasets.MatBenchDataset2020

import os
import pandas as pd

from kgcnn.data.crystal import CrystalDataset
from kgcnn.data.download import DownloadDataset
from kgcnn.data.utils import load_json_file, save_json_file


[docs]class MatBenchDataset2020(CrystalDataset, DownloadDataset): r"""Base class for loading graph datasets from `MatBench <https://matbench.materialsproject.org/>`__ , collection of materials datasets. For graph learning only those with structure are relevant. Process and loads from serialized :obj:`pymatgen` structures. .. note:: This class does not follow the interface of `MatBench <https://github.com/materialsproject/matbench>`__ and therefore also not the original splits required for submission of benchmark values. Matbench is an automated leaderboard for benchmarking state of the art ML algorithms predicting a diverse range of solid materials' properties. It is hosted and maintained by the `Materials Project <https://materialsproject.org/>`_ . `Matbench <https://www.nature.com/articles/s41524-020-00406-3>`__ is an `ImageNet <https://image-net.org/>`__ for materials science; a curated set of 13 supervised, pre-cleaned, ready-to-use ML tasks for benchmarking and fair comparison. The tasks span a wide domain of inorganic materials science applications including electronic, thermodynamic, mechanical, and thermal properties among crystals, 2D materials, disordered metals, and more. References: (1) Dunn, A., Wang, Q., Ganose, A. et al. Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm. npj Comput Mater 6, 138 (2020). `<https://doi.org/10.1038/s41524-020-00406-3>`_ . """ datasets_download_info = { "matbench_steels": {"dataset_name": "matbench_steels", "download_file_name": 'matbench_steels.json.gz', "data_directory_name": "matbench_steels", "extract_gz": True, "extract_file_name": 'matbench_steels.json'}, "matbench_jdft2d": {"dataset_name": "matbench_jdft2d", "download_file_name": 'matbench_jdft2d.json.gz', "data_directory_name": "matbench_jdft2d", "extract_gz": True, "extract_file_name": 'matbench_jdft2d.json'}, "matbench_phonons": {"dataset_name": "matbench_phonons", "download_file_name": 'matbench_phonons.json.gz', "data_directory_name": "matbench_phonons", "extract_gz": True, "extract_file_name": 'matbench_phonons.json'}, "matbench_expt_gap": {"dataset_name": "matbench_expt_gap", "download_file_name": 'matbench_expt_gap.json.gz', "data_directory_name": "matbench_expt_gap", "extract_gz": True, "extract_file_name": 'matbench_expt_gap.json'}, "matbench_dielectric": {"dataset_name": "matbench_dielectric", "download_file_name": 'matbench_dielectric.json.gz', "data_directory_name": "matbench_dielectric", "extract_gz": True, "extract_file_name": 'matbench_dielectric.json'}, "matbench_expt_is_metal": {"dataset_name": "matbench_expt_is_metal", "download_file_name": 'matbench_expt_is_metal.json.gz', "data_directory_name": "matbench_expt_is_metal", "extract_gz": True, "extract_file_name": 'matbench_expt_is_metal.json'}, "matbench_glass": {"dataset_name": "matbench_glass", "download_file_name": 'matbench_glass.json.gz', "data_directory_name": "matbench_glass", "extract_gz": True, "extract_file_name": 'matbench_glass.json'}, "matbench_log_gvrh": {"dataset_name": "matbench_log_gvrh", "download_file_name": 'matbench_log_gvrh.json.gz', "data_directory_name": "matbench_log_gvrh", "extract_gz": True, "extract_file_name": 'matbench_log_gvrh.json'}, "matbench_log_kvrh": {"dataset_name": "matbench_log_kvrh", "download_file_name": 'matbench_log_kvrh.json.gz', "data_directory_name": "matbench_log_kvrh", "extract_gz": True, "extract_file_name": 'matbench_log_kvrh.json'}, "matbench_perovskites": {"dataset_name": "matbench_perovskites", "download_file_name": 'matbench_perovskites.json.gz', "data_directory_name": "matbench_perovskites", "extract_gz": True, "extract_file_name": 'matbench_perovskites.json'}, "matbench_mp_gap": {"dataset_name": "matbench_mp_gap", "download_file_name": 'matbench_mp_gap.json.gz', "data_directory_name": "matbench_mp_gap", "extract_gz": True, "extract_file_name": 'matbench_mp_gap.json'}, "matbench_mp_is_metal": {"dataset_name": "matbench_mp_is_metal", "download_file_name": 'matbench_mp_is_metal.json.gz', "data_directory_name": "matbench_mp_is_metal", "extract_gz": True, "extract_file_name": 'matbench_mp_is_metal.json'}, "matbench_mp_e_form": {"dataset_name": "matbench_mp_e_form", "download_file_name": 'matbench_mp_e_form.json.gz', "data_directory_name": "matbench_mp_e_form", "extract_gz": True, "extract_file_name": 'matbench_mp_e_form.json'}, } datasets_prepare_data_info = { "matbench_steels": {"file_column_name": "composition"}, "matbench_jdft2d": {"file_column_name": "structure"}, "matbench_phonons": {"file_column_name": "structure"}, "matbench_expt_gap": {"file_column_name": "composition"}, "matbench_dielectric": {"file_column_name": "structure"}, "matbench_expt_is_metal": {"file_column_name": "composition"}, "matbench_glass": {"file_column_name": "composition"}, "matbench_log_gvrh": {"file_column_name": "structure"}, "matbench_log_kvrh": {"file_column_name": "structure"}, "matbench_perovskites": {"file_column_name": "structure"}, "matbench_mp_gap": {"file_column_name": "structure"}, "matbench_mp_is_metal": {"file_column_name": "structure"}, "matbench_mp_e_form": {"file_column_name": "structure"}, } datasets_read_in_memory_info = { "matbench_steels": {"label_column_name": "yield strength"}, "matbench_jdft2d": {"label_column_name": "exfoliation_en"}, "matbench_phonons": {"label_column_name": "last phdos peak"}, "matbench_expt_gap": {"label_column_name": "gap expt"}, "matbench_dielectric": {"label_column_name": "n"}, "matbench_expt_is_metal": {"label_column_name": "is_metal"}, "matbench_glass": {"label_column_name": "gfa"}, "matbench_log_gvrh": {"label_column_name": "log10(G_VRH)"}, "matbench_log_kvrh": {"label_column_name": "log10(K_VRH)"}, "matbench_perovskites": {"label_column_name": "e_form"}, "matbench_mp_gap": {"label_column_name": "gap pbe"}, "matbench_mp_is_metal": {"label_column_name": "is_metal"}, "matbench_mp_e_form": {"label_column_name": "e_form"}, }
[docs] def __init__(self, dataset_name: str, reload: bool = False, verbose: int = 10): """Initialize a `GraphTUDataset` instance from string identifier. Args: dataset_name (str): Name of a dataset. reload (bool): Download the dataset again and prepare data on disk. verbose (int): Print progress or info for processing where 60=silent. Default is 10. """ if not isinstance(dataset_name, str): raise ValueError("Please provide string identifier for TUDataset.") CrystalDataset.__init__(self, verbose=verbose, dataset_name=dataset_name) # Prepare download if dataset_name in self.datasets_download_info: self.download_info = self.datasets_download_info[dataset_name] self.download_info.update({"download_url": "https://ml.materialsproject.org/projects/" + self.download_info["download_file_name"]}) else: raise ValueError( "Can not resolve %s as a MatBench dataset. Pick " % dataset_name, self.datasets_download_info.keys(), "For new dataset, add to `datasets_download_info` list manually.") DownloadDataset.__init__(self, **self.download_info, reload=reload, verbose=verbose) self.data_directory = os.path.join(self.data_main_dir, self.data_directory_name) file_name_download = self.download_file_name if self.extract_file_name is None else self.extract_file_name self.file_name = "%s.csv" % os.path.splitext(file_name_download)[0] self.dataset_name = dataset_name self.require_prepare_data = True self.fits_in_memory = True if self.require_prepare_data: self.prepare_data(overwrite=reload, **self.datasets_prepare_data_info[self.dataset_name]) if self.fits_in_memory: self.read_in_memory(**self.datasets_read_in_memory_info[self.dataset_name])
[docs] def prepare_data(self, file_column_name: str = None, overwrite: bool = False): file_name_download = self.download_file_name if self.extract_file_name is None else self.extract_file_name # file_name_base = os.path.splitext(self.file_name)[0] if all([os.path.exists(self.pymatgen_json_file_path), os.path.exists(self.file_path), not overwrite]): self.info("Found '%s' of structures." % self.pymatgen_json_file_path) return self self.info("Load dataset '%s' to memory..." % self.dataset_name) data = load_json_file(os.path.join(self.data_directory, file_name_download)) self.info("Process database with %s and columns %s" % (data.keys(), data["columns"])) data_columns = data["columns"] index_structure = 0 for i, col in enumerate(data_columns): if col == file_column_name: index_structure = i break py_mat_list = [x[index_structure] for x in data["data"]] self.info("Write structures or compositions '%s' to file." % self.pymatgen_json_file_path) save_json_file(py_mat_list, self.pymatgen_json_file_path) df_dict = {"index": data["index"]} for i, col in enumerate(data_columns): if i != index_structure: df_dict[col] = [x[i] for x in data["data"]] df = pd.DataFrame(df_dict) self.info("Write dataset table '%s' to file." % self.file_name) df.to_csv(self.file_path) return self