import os
import logging
from typing import Callable
from kgcnn.molecule.io import read_mol_list_from_sdf_file, read_xyz_file, read_smiles_file, write_mol_block_list_to_sdf, \
parse_list_to_xyz_str
from concurrent.futures import ThreadPoolExecutor # , ProcessPoolExecutor
from kgcnn.molecule.external.ballloon import BalloonInterface
from typing import Union
logging.basicConfig() # Module logger
module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)
# RDkit
try:
    import rdkit
    import rdkit.Chem
    import rdkit.Chem.AllChem
    import rdkit.Chem.rdDetermineBonds
    from rdkit import RDLogger

    def rdkit_smile_to_mol(smile: str, sanitize: bool = True, add_hydrogen: bool = True, make_conformers: bool = True,
                           optimize_conformer: bool = True, random_seed: int = 42, stop_logging: bool = False):
        """Convert a SMILES string to a mol-block string using RDKit.

        Hydrogens are always added before the (optional) embedding and are removed again at the end if
        `add_hydrogen` is False.

        Args:
            smile (str): SMILES string to convert.
            sanitize (bool): Whether to explicitly call `SanitizeMol` on the parsed molecule. Default is True.
            add_hydrogen (bool): Whether the returned mol block keeps the added hydrogens. Default is True.
            make_conformers (bool): Whether to embed a 3D conformer (ETKDGv3 parameters). Default is True.
            optimize_conformer (bool): Whether to MMFF-optimize the embedded conformer. Only has an effect if
                `make_conformers` is True. Default is True.
            random_seed (int): Random seed passed to the embedding parameters. Default is 42.
            stop_logging (bool): Whether to disable the RDKit `'rdApp.*'` log during conversion. Default is False.

        Returns:
            str: Mol block of the converted molecule, or None if any conversion step failed.
        """
        # Order of parameters is important here.
        if stop_logging:
            RDLogger.DisableLog('rdApp.*')
        try:
            m = rdkit.Chem.MolFromSmiles(smile)
            if sanitize:
                rdkit.Chem.SanitizeMol(m)
            m = rdkit.Chem.AddHs(m)
            m.SetProp("_Name", smile.strip())
            if make_conformers:
                params = rdkit.Chem.AllChem.ETKDGv3()
                params.useSmallRingTorsions = True
                params.randomSeed = random_seed
                # params.useRandomCoords = True
                conformer_id = rdkit.Chem.AllChem.EmbedMolecule(m, params=params)
                if conformer_id < 0:
                    # A negative id signals that no conformer was generated; report this as a failed conversion
                    # instead of continuing with a molecule that has no coordinates.
                    raise ValueError("Embedding of conformer failed.")
                # NOTE(review): nesting reconstructed from a flattened source; optimization and 3D stereo
                # assignment both need a conformer, so they are placed under `make_conformers` - confirm.
                if optimize_conformer:
                    rdkit.Chem.AllChem.MMFFOptimizeMolecule(m)
                rdkit.Chem.AssignAtomChiralTagsFromStructure(m)
                rdkit.Chem.AssignStereochemistryFrom3D(m)
            if not add_hydrogen:
                m = rdkit.Chem.RemoveHs(m)
            # NOTE(review): may originally have run only after `RemoveHs` - confirm against upstream.
            rdkit.Chem.AssignStereochemistry(m)
        except Exception:
            # Any failure (unparsable SMILES yields None and the next call raises, failed embedding, ...)
            # is reported to the caller as None. KeyboardInterrupt/SystemExit are deliberately not caught.
            m = None
        finally:
            # Always restore logging, even if a non-`Exception` error propagates.
            if stop_logging:
                RDLogger.EnableLog('rdApp.*')
        if m is not None:
            return rdkit.Chem.MolToMolBlock(m)
        return None

    def rdkit_xyz_to_mol(xyz_string: str, charge: Union[int, list, None] = None):
        """Convert xyz-string to mol-string.

        The order of atoms in the list should be the same as output.

        Args:
            xyz_string (str): Convert the xyz string to mol-string
            charge (int, list): Possible charges of the molecule. They are tried in the given order and the first
                charge for which bonds can be determined is used. Defaults to `[0, 1, -1, 2, -2]` if None.

        Returns:
            str: Mol-string. Generates bond information in addition to coordinates from xyz-string.
                Returns None if the xyz-string can not be parsed or no charge leads to a valid bond assignment.
        """
        if charge is None:
            charge = [0, 1, -1, 2, -2]
        if isinstance(charge, int):
            charge = [charge]
        # Parsing does not depend on the charge, so do it once instead of once per candidate charge.
        try:
            raw_mol = rdkit.Chem.MolFromXYZBlock(xyz_string)
        except Exception:
            raw_mol = None
        if raw_mol is None:
            return None
        out_mol = None
        for c in charge:
            try:
                # Work on a fresh copy so a failed attempt can not leave partial bonds behind.
                candidate = rdkit.Chem.Mol(raw_mol)
                # rdkit.Chem.rdDetermineBonds.DetermineConnectivity(out_mol, charge=charge)
                rdkit.Chem.rdDetermineBonds.DetermineBonds(candidate, charge=c)
            except Exception:
                continue
            out_mol = candidate
            break
        if out_mol is not None:
            return rdkit.Chem.MolToMolBlock(out_mol)
        return None
except ImportError:
    module_logger.error("Can not import `RDKit` package for conversion.")
    rdkit_smile_to_mol = None
    rdkit_xyz_to_mol = None
try:
    # There are problems with openbabel if the system variable is not set.
    # Openbabel may not be fully threadsafe, but is improved in version 3.0.
    from openbabel import openbabel

    if "BABEL_DATADIR" not in os.environ:
        module_logger.warning(
            "In case openbabel fails, you can set `kgcnn.mol.convert.openbabel_smile_to_mol` to `None` for disable.")

    def openbabel_smile_to_mol(smile: str, sanitize: bool = True, add_hydrogen: bool = True,
                               make_conformers: bool = True, optimize_conformer: bool = True,
                               random_seed: int = 42,
                               stop_logging: bool = False):
        """Convert a SMILES string to a mol-block string using OpenBabel.

        Args:
            smile (str): SMILES string to convert.
            sanitize (bool): Not used by this function; present for a signature parallel to the RDKit variant.
            add_hydrogen (bool): Whether to add hydrogens to the molecule. Default is True.
            make_conformers (bool): Whether to build coordinates with `OBBuilder`. Default is True.
            optimize_conformer (bool): Whether to run a short MMFF94 steepest-descent optimization. Only has an
                effect if `make_conformers` is True. Default is True.
            random_seed (int): Not used by this function; present for a signature parallel to the RDKit variant.
            stop_logging (bool): Whether to stop the OpenBabel error log during conversion. Default is False.

        Returns:
            str: Mol block of the converted molecule, or None if the SMILES could not be read or a step raised.
        """
        if stop_logging:
            openbabel.obErrorLog.StopLogging()
        try:
            m = openbabel.OBMol()
            ob_conversion = openbabel.OBConversion()
            format_okay = ob_conversion.SetInAndOutFormats("smi", "mol")
            read_okay = ob_conversion.ReadString(m, smile)
            if not (format_okay and read_okay):
                # Without a successfully parsed molecule the written mol block would be empty but still be
                # counted as a success by callers that only test for None.
                raise ValueError("Could not read smile with OpenBabel.")
            is_okay = {"format_okay": format_okay, "read_okay": read_okay}
            if make_conformers:
                # We need to make conformer with builder
                builder = openbabel.OBBuilder()
                build_okay = builder.Build(m)
                is_okay.update({"build_okay": build_okay})
            if add_hydrogen:
                # It seems hydrogens are made after build, and get embedded too
                m.AddHydrogens()
            if optimize_conformer and make_conformers:
                # If the force field is not available, `ff` has no `Setup` and the resulting error is
                # handled below like any other failure.
                ff = openbabel.OBForceField.FindType("mmff94")
                ff_setup_okay = ff.Setup(m)
                ff.SteepestDescent(100)  # defaults are 50-500 in pybel
                ff.GetCoordinates(m)
                is_okay.update({"ff_setup_okay": ff_setup_okay})
            failed_flags = [key for key, value in is_okay.items() if not value]
            if failed_flags:
                # Build/force-field flags are reported but do not discard the molecule.
                module_logger.warning("Openbabel returned false flag %s", failed_flags)
        except Exception:
            m = None
            ob_conversion = None
        finally:
            # Set back to default
            if stop_logging:
                openbabel.obErrorLog.StartLogging()
        if m is not None:
            return ob_conversion.WriteString(m)
        return None

    def openbabel_xyz_to_mol(xyz_string: str, charge: int = 0, stop_logging: bool = False):
        """Convert xyz-string to mol-string.

        The order of atoms in the list should be the same as output. Uses openbabel for conversion.

        Args:
            xyz_string (str): Convert the xyz string to mol-string
            charge (int): Not used by this function; accepted so it can be called like the RDKit variant.
            stop_logging (bool): Whether to stop logging. Default is False.

        Returns:
            str: Mol-string. Generates bond information in addition to coordinates from xyz-string.
                Returns None if OpenBabel reports that the xyz-string could not be read.
        """
        if stop_logging:
            openbabel.obErrorLog.StopLogging()
        try:
            ob_conversion = openbabel.OBConversion()
            ob_conversion.SetInAndOutFormats("xyz", "mol")
            # ob_conversion.SetInFormat("xyz")
            mol = openbabel.OBMol()
            if not ob_conversion.ReadString(mol, xyz_string):
                # Signal the failure instead of writing out an empty molecule.
                return None
            return ob_conversion.WriteString(mol)
        finally:
            # Set back to default
            if stop_logging:
                openbabel.obErrorLog.StartLogging()
except ImportError:
    module_logger.error("Can not import `OpenBabel` package for conversion.")
    openbabel_smile_to_mol, openbabel_xyz_to_mol = None, None
class MolConverter:
    """Convert SMILES or XYZ files into lists of mol-block strings and optionally write them to an SDF file.

    The conversion relies on the module-level RDKit and OpenBabel conversion functions (each of which is `None`
    if its package could not be imported) or, for SMILES, on an external program.
    """

    def __init__(self, base_path: str = None):
        """Initialize a converter to transform smile or coordinates into mol block information.

        Args:
            base_path (str): Base path for temporary files.
        """
        self.base_path = base_path
        if base_path is None:
            # NOTE(review): this resolves to the path of this module *file*, not its directory - confirm that
            # this is intended before using `base_path` to build temporary file names.
            self.base_path = os.path.realpath(__file__)

    @staticmethod
    def _check_is_same_length(a, b):
        """Raise a `ValueError` if `a` and `b` do not have the same length."""
        if len(a) != len(b):
            module_logger.error("Mismatch in number of converted. Found '%s' vs. '%s'.", len(a), len(b))
            raise ValueError("Conversion was not successful")

    @staticmethod
    def _convert_parallel(conversion_method: Callable, smile_list: list, num_workers: int, *args):
        """Apply `conversion_method(item, *args)` to every item of `smile_list`, optionally with threads.

        Args:
            conversion_method (Callable): Function that converts a single item.
            smile_list (list): Items to convert.
            num_workers (int): Number of worker threads. If None, `os.cpu_count()` is used. With exactly one
                worker the conversion runs in the calling thread.
            *args: Additional positional arguments passed to `conversion_method` for every item.

        Returns:
            list: Conversion results in the order of `smile_list`.

        Raises:
            ModuleNotFoundError: If neither the RDKit nor the OpenBabel smile converter is available.
        """
        if num_workers is None:
            num_workers = os.cpu_count()
        if rdkit_smile_to_mol is None and openbabel_smile_to_mol is None:
            raise ModuleNotFoundError("Can not convert smiles. Missing `RDkit` or `OpenBabel` packages.")
        if num_workers == 1:
            return [conversion_method(x, *args) for x in smile_list]

        def _convert_one(x):
            return conversion_method(x, *args)

        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            return list(executor.map(_convert_one, smile_list))

    @staticmethod
    def _single_smile_to_mol(smile: str,
                             sanitize: bool = True,
                             add_hydrogen: bool = True,
                             make_conformers: bool = True,
                             optimize_conformer: bool = True):
        """Convert one SMILES string, trying RDKit first and OpenBabel as fallback.

        Returns:
            str: Mol block, or None (after logging a warning) if no available converter succeeded.
        """
        for converter in (rdkit_smile_to_mol, openbabel_smile_to_mol):
            if converter is None:
                continue
            mol = converter(smile=smile, sanitize=sanitize, add_hydrogen=add_hydrogen,
                            make_conformers=make_conformers, optimize_conformer=optimize_conformer)
            if mol is not None:
                return mol
        module_logger.warning("Failed conversion for smile '%s'." % smile)
        return None

    def smile_to_mol(self, smiles_path: str, sdf_path: str, external_program: dict = None, num_workers: int = None,
                     sanitize: bool = True, add_hydrogen: bool = True, make_conformers: bool = True,
                     optimize_conformer: bool = True, logger=None, batch_size: int = 5000):
        """Convert a smiles file to SDF structure file.

        Args:
            smiles_path (str): Path of the smiles file to read.
            sdf_path (str): Path of the SDF file to write. With the python packages, nothing is written if None.
            external_program (dict): If None, RDKit/OpenBabel are used. Otherwise a dictionary with the keys
                "class_name" and "config"; only the class name "balloon" is supported.
            num_workers (int): Number of threads for the conversion with python packages.
                If None, `os.cpu_count()` is used.
            sanitize (bool): Passed on to the single-smile converter.
            add_hydrogen (bool): Passed on to the single-smile converter.
            make_conformers (bool): Passed on to the single-smile converter.
            optimize_conformer (bool): Passed on to the single-smile converter.
            logger: Optional object with an `info` method that receives a progress message after each batch.
            batch_size (int): Number of smiles that are converted per batch. Default is 5000.

        Returns:
            list: List of mol-strings. Entries are None for smiles that could not be converted with the python
                packages. For an external program, this is whatever is read back from the written SDF file.

        Raises:
            ValueError: If the external program is unknown or the number of results does not match the number
                of smiles.
        """
        smiles_list = read_smiles_file(smiles_path)

        # Default via python packages RDkit and OpenBabel.
        if external_program is None:
            mol_list = []
            for i in range(0, len(smiles_list), batch_size):
                mg = self._convert_parallel(
                    self._single_smile_to_mol, smiles_list[i:i + batch_size], num_workers,
                    # All args for _single_smile_to_mol.
                    sanitize, add_hydrogen, make_conformers, optimize_conformer
                )
                # Extend in place; re-building the list for each batch would copy all previous results.
                mol_list.extend(mg)
                if logger is not None:
                    logger.info(" ... converted molecules {0} from {1}".format(i + len(mg), len(smiles_list)))
            # Check success
            self._check_is_same_length(smiles_list, mol_list)
            if sdf_path is not None:
                write_mol_block_list_to_sdf(mol_list, sdf_path)
            return mol_list

        # External programs
        if external_program["class_name"] == "balloon":
            ext_program = BalloonInterface(**external_program["config"])
            ext_program.run(input_file=smiles_path, output_file=sdf_path, output_format="sdf")
        else:
            raise ValueError("Unknown program for conversion of smiles '%s'" % external_program)
        mol_list = read_mol_list_from_sdf_file(sdf_path)
        self._check_is_same_length(smiles_list, mol_list)
        return mol_list

    @staticmethod
    def _single_xyz_to_mol(xyz_string, charge=0):
        """Convert one xyz-string, trying RDKit first and OpenBabel as fallback.

        Args:
            xyz_string (str): Content of a single xyz entry.
            charge: Charge information passed on to the converters.

        Returns:
            str: Mol block, or None (after logging a warning) if no available converter succeeded.
        """
        # Guard on the xyz converters that are actually called, not on the smile converters.
        for converter in (rdkit_xyz_to_mol, openbabel_xyz_to_mol):
            if converter is None:
                continue
            mol = converter(xyz_string, charge)
            if mol is not None:
                return mol
        module_logger.warning("Failed conversion for xyz '%s'... ." % xyz_string[:20])
        return None

    def xyz_to_mol(self, xyz_path: str, sdf_path: str, charge: Union[list, int, None] = None):
        """Convert xyz info to structure file.

        Args:
            xyz_path (str): Path of the xyz file to read.
            sdf_path (str): Path of the SDF file to write. Nothing is written if None.
            charge (list, int): Charge information passed on to the single-xyz converter. Default is None.

        Returns:
            list: List of mol blocks as string. Entries are None for structures that could not be converted.

        Raises:
            ModuleNotFoundError: If neither the OpenBabel nor the RDKit xyz converter is available.
            ValueError: If the number of results does not match the number of xyz entries.
        """
        if openbabel_xyz_to_mol is None and rdkit_xyz_to_mol is None:
            raise ModuleNotFoundError("Can not convert XYZ to SDF format, missing package `OpenBabel` or `RDkit`.")
        xyz_list = read_xyz_file(xyz_path)
        # No parallel conversion here, not necessary.
        mol_list = [
            self._single_xyz_to_mol(parse_list_to_xyz_str(x, number_coordinates=3), charge=charge)
            for x in xyz_list
        ]
        self._check_is_same_length(xyz_list, mol_list)
        if sdf_path is not None:
            write_mol_block_list_to_sdf(mol_list, sdf_path)
        return mol_list