6. Constraint Docking#
This feature is available since version 2.0.0
NOTE: Since version 2.1.7 it is not needed to specify protected_ids to avoid failures of moldrug. However, on this tutorial we will let the workaround for user with older versions. May be that the specified CReM parameters completely remove any similarity between a solution and the reference structure for constraint docking; only in this specific scenarios you should protect some atoms in order to preserve some MCS.
import os, requests, gzip, shutil, yaml
from multiprocessing import cpu_count
from moldrug.data import get_data
from moldrug import utils
import numpy as np
from rdkit import Chem
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List
import pandas as pd
# Remember to change to your vina executable
# If you define as a path, it must be absolute
vina_executable = '/Users/klimt/GIT/docking/bin/vina'
# use moldrug's internal data
data_x0161 = get_data('x0161')
# Defining the wd and change directory
wd = 'wd'
os.mkdir(wd)
os.chdir(wd)
# This function is possible from version 2.2.0
def plot_dist(individuals:list[utils.Individual], properties:list[str], every_gen:int = 1):
# Set up the matplotlib figure
sns.set_theme(style="whitegrid")
fig, axes = plt.subplots(nrows = len(properties), figsize=(25, 25))
SawIndividuals = utils.to_dataframe(individuals).drop(['pdbqt'], axis = 1).replace([np.inf, -np.inf], np.nan).dropna()
SawIndividuals = SawIndividuals[SawIndividuals['kept_gens'].map(len) != 0].reset_index(drop=True)
gen_idxs = sorted(set(item for sublist in SawIndividuals['kept_gens'] for item in sublist))
NumGens = max(gen_idxs)
# Set pop to the initial population and pops out the first gen
pop = SawIndividuals[SawIndividuals.genID == gen_idxs.pop(0)].sort_values(by=["cost"])
pops = pop.copy()
for gen_idx in gen_idxs:
idx = [i for i in range(SawIndividuals.shape[0]) if gen_idx in SawIndividuals.loc[i,'kept_gens']]
pop = SawIndividuals.copy().iloc[idx,:].assign(genID=gen_idx)
pops = pd.concat([pops, pop.copy()])
# Draw a violinplot with a narrow bandwidth than the default
pops = pops.loc[pops['genID'].isin([gen for gen in range(0, NumGens+every_gen, every_gen)])]
if len(properties) <= 1:
sns.violinplot(hue = 'genID', y = properties[0], data=pops, palette="Set3", bw_adjust=.2, cut=0, linewidth=1, ax=axes, legend=False)
else:
for i, prop in enumerate(properties):
sns.violinplot(x = 'genID', hue = 'genID', y = prop, data=pops, palette="Set3", bw_adjust=.2, cut=0, linewidth=1, ax=axes[i], legend=False)
return fig, axes
def atom_ids_list(smiles:str) -> list[int]:
"""Return a list of atom IDs for the molecule
Parameters
----------
smiles : str
The SMILES string of the molecule
Returns
-------
list[int]
List of atoms IDs
"""
return list(range(Chem.MolFromSmiles(smiles).GetNumAtoms()))
# Getting the data
lig = data_x0161['smiles']
box = data_x0161['box']
# Getting the CReM data base
url = "http://www.qsar4u.com/files/cremdb/replacements02_sc2.db.gz"
r = requests.get(url, allow_redirects=True)
crem_dbgz_path = 'crem.db.gz'
crem_db_path = 'crem.db'
open(crem_dbgz_path, 'wb').write(r.content)
with gzip.open(crem_dbgz_path, 'rb') as f_in:
with open(crem_db_path, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
print(lig)
print(atom_ids_list(lig))
print(box)
print(os.listdir())
COC(=O)C=1C=CC(=CC1)S(=O)(=O)N
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
{'boxcenter': [12.11, 1.84, 23.56], 'boxsize': [22.5, 22.5, 22.5]}
['crem.db.gz', 'crem.db']
6.1. Setting the configuration#
moldrug’s version 2.0.4 tried to fix some issues with constraint docking (see release description). If follow jobs are needed and mutate_crem_kwargs allows changes in heavy atoms; it MUST be specified the keyword protected_ids for mutate_crem_kwargs. This is needed because during the generation of constraint conformers constraint_ref is used as fix core. Therefore if a generated molecule does not have this core, an error happens. On version 2.0.0 the core was guessed based on MCS but some error happened (see this RDKit bug). You have then two possibles work around:
Use
mutate_crem_kwargsthat only allow grow operations (min_size = max_size = 0). In this way the core will be preserved.Specify
protected_idsinsidemutate_crem_kwargs. Those IDs are the atom indexes ofseed_molthat correspond to the atoms ofconstraint_ref.
On this tutorial we will use the second strategy. In this case constraint_ref is the same as seed_mol but with some specific conformation state. Therefore we could use the the result of atom_ids_list function (specified above) to get the protected_ids list of atoms.
config ={
"01_grow": {
"type": "GA",
"njobs": 4,
"seed_mol": lig,
"AddHs": True,
"randomseed": 1234, # This is only for reproducibility, ot should not be used for production
"costfunc": "Cost",
"costfunc_kwargs": {
"vina_executable": vina_executable,
"receptor_pdbqt_path": data_x0161['protein']['pdbqt'],
"boxcenter": box['boxcenter'],
"boxsize": box['boxsize'],
"exhaustiveness": 4,
"ncores": int(cpu_count() / 4),
"num_modes": 1,
"vina_seed": 1234, # This is only for reproducibility, ot should not be used for production
"constraint": True,
"constraint_type": "score_only", # local_only
"constraint_ref": data_x0161['ligand_3D'],
"constraint_receptor_pdb_path": data_x0161['protein']['pdb'],
"constraint_num_conf": 100,
"constraint_minimum_conf_rms": 0.01
},
"crem_db_path": crem_db_path,
"maxiter": 2,
"popsize": 10,
"beta": 0.001,
"pc": 0.5,
"get_similar": False,
"mutate_crem_kwargs": {
"radius": 3,
"min_size": 0,
"max_size": 0,
"min_inc": -5,
"max_inc": 6,
"protected_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
"ncores": 12
},
"save_pop_every_gen": 10,
"deffnm": "01_grow"
},
"02_allow_grow": {
"mutate_crem_kwargs": {
"radius": 3,
"min_size": 0,
"max_size": 2,
"min_inc": -5,
"max_inc": 3,
"protected_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
"ncores": 12
},
"maxiter": 2,
"deffnm": "02_allow_grow"
},
"03_pure_mutate": {
"mutate_crem_kwargs": {
"radius": 3,
"min_size": 1,
"max_size": 8,
"min_inc": -5,
"max_inc": 3,
"protected_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
"ncores": 12
},
"maxiter": 2,
"deffnm": "03_pure_mutate"
},
"04_local": {
"mutate_crem_kwargs": {
"radius": 3,
"min_size": 0,
"max_size": 1,
"min_inc": -1,
"max_inc": 1,
"protected_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
"ncores": 12
},
"maxiter": 2,
"deffnm": "04_local"
}
}
# Save the config as a yaml file
with open('config.yml', 'w') as f:
yaml.dump(config, f)
os.listdir()
['crem.db.gz', 'config.yml', 'crem.db']
6.2. Run moldrug#
! moldrug config.yml
Started at Thu Jun 13 14:07:32 2024
You are using moldrug: 3.7.2.
CommandLineHelper(yaml_file='config.yml', fitness=None, continuation=False, verbose=False)
Creating the first population with 10 members:
100%|███████████████████████████████████████████| 10/10 [00:47<00:00, 4.72s/it]
Initial Population: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
Accepted rate: 10 / 10
File 01_grow_pop.sdf was createad!
Evaluating generation 1 / 2:
100%|█████████████████████████████████████████████| 5/5 [00:42<00:00, 8.55s/it]
Generation 1: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 5
Evaluating generation 2 / 2:
100%|█████████████████████████████████████████████| 5/5 [00:33<00:00, 6.67s/it]
File 01_grow_pop.sdf was createad!
Generation 2: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 5
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
The simulation finished successfully after 2 generations witha population of 10 individuals. A total number of 20 Individuals were seen during the simulation.
Initial Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
Final Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
The cost function dropped in 0.0 units.
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Total time (2 generations): 146.42 (s).
Finished at Thu Jun 13 14:09:58 2024.
File 01_grow_pop.sdf was createad!
The follow job 02_allow_grow started.
File 02_allow_grow_pop.sdf was createad!
Evaluating generation 3 / 4:
100%|█████████████████████████████████████████████| 5/5 [00:14<00:00, 2.88s/it]
Generation 3: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 5
Evaluating generation 4 / 4:
100%|█████████████████████████████████████████████| 5/5 [00:27<00:00, 5.56s/it]
File 02_allow_grow_pop.sdf was createad!
Generation 4: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 5
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
The simulation finished successfully after 4 generations witha population of 10 individuals. A total number of 30 Individuals were seen during the simulation.
Initial Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
Final Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
The cost function dropped in 0.0 units.
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Note: Check the running warnings and erorrs in error.tar.gz file!
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Total time (2 generations): 63.12 (s).
Finished at Thu Jun 13 14:11:01 2024.
File 02_allow_grow_pop.sdf was createad!
The job 02_allow_grow finished!
The follow job 03_pure_mutate started.
File 03_pure_mutate_pop.sdf was createad!
Note: The mutation on Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0) did not work, it will be returned the same individual
Note: The mutation on Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0) did not work, it will be returned the same individual
Evaluating generation 5 / 6:
100%|█████████████████████████████████████████████| 3/3 [00:52<00:00, 17.52s/it]
Generation 5: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 3
Note: The mutation on Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0) did not work, it will be returned the same individual
Evaluating generation 6 / 6:
100%|█████████████████████████████████████████████| 4/4 [00:55<00:00, 13.93s/it]
File 03_pure_mutate_pop.sdf was createad!
Generation 6: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 4
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
The simulation finished successfully after 6 generations witha population of 10 individuals. A total number of 37 Individuals were seen during the simulation.
Initial Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
Final Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
The cost function dropped in 0.0 units.
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Total time (2 generations): 124.61 (s).
Finished at Thu Jun 13 14:13:06 2024.
File 03_pure_mutate_pop.sdf was createad!
The job 03_pure_mutate finished!
The follow job 04_local started.
File 04_local_pop.sdf was createad!
Evaluating generation 7 / 8:
100%|█████████████████████████████████████████████| 5/5 [00:24<00:00, 4.98s/it]
Generation 7: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 5
Evaluating generation 8 / 8:
100%|█████████████████████████████████████████████| 3/3 [00:32<00:00, 10.72s/it]
File 04_local_pop.sdf was createad!
Generation 8: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 3
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
The simulation finished successfully after 8 generations witha population of 10 individuals. A total number of 45 Individuals were seen during the simulation.
Initial Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
Final Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
The cost function dropped in 0.0 units.
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Total time (2 generations): 76.52 (s).
Finished at Thu Jun 13 14:14:23 2024.
File 04_local_pop.sdf was createad!
The job 04_local finished!
os.listdir()
['02_allow_grow_pop.sdf',
'04_local_pop.sdf',
'01_grow_result.pbz2',
'03_pure_mutate_pop.sdf',
'04_local_result.pbz2',
'01_grow_pop.sdf',
'03_pure_mutate_result.pbz2',
'crem.db.gz',
'03_pure_mutate_pop.pbz2',
'config.yml',
'error.tar.gz',
'02_allow_grow_result.pbz2',
'crem.db',
'01_grow_pop.pbz2',
'02_allow_grow_pop.pbz2',
'04_local_pop.pbz2']
6.3. Visualization#
If needed install the packages through pip:
! pip install nglview, parmed
# import nglview as nv
# import parmed as pmd
# from ipywidgets import IntSlider, VBox
# parms = pmd.rdkit.from_sdf(os.path.join(wd, '04_local_pop.sdf')) # require parmed, rdkit
# parms
# view = nv.NGLWidget()
# slider = IntSlider(max=len(parms)-1)
# def show_one_ligand(change):
# val = change['new']
# view.show_only([val])
# slider.observe(show_one_ligand, 'value')
# VBox([view, slider])
# for p in parms:
# view.add_structure(nv.ParmEdTrajectory(p))
# view.show_only([0])
result = utils.decompress_pickle('04_local_result.pbz2')
plot_dist(result.SawIndividuals, properties=['qed', 'sa_score', 'vina_score', 'cost'])
(<Figure size 2500x2500 with 4 Axes>,
array([<Axes: xlabel='genID', ylabel='qed'>,
<Axes: xlabel='genID', ylabel='sa_score'>,
<Axes: xlabel='genID', ylabel='vina_score'>,
<Axes: xlabel='genID', ylabel='cost'>], dtype=object))
6.4. See how it looks with local_only#
# Save the new config as a yaml file
new_config = config.copy()
new_config['01_grow']['costfunc_kwargs']['constraint_type'] = 'local_only'
with open('new_config.yml', 'w') as f:
yaml.dump(config, f)
os.listdir()
['02_allow_grow_pop.sdf',
'04_local_pop.sdf',
'new_config.yml',
'01_grow_result.pbz2',
'03_pure_mutate_pop.sdf',
'04_local_result.pbz2',
'01_grow_pop.sdf',
'03_pure_mutate_result.pbz2',
'crem.db.gz',
'03_pure_mutate_pop.pbz2',
'config.yml',
'error.tar.gz',
'02_allow_grow_result.pbz2',
'crem.db',
'01_grow_pop.pbz2',
'02_allow_grow_pop.pbz2',
'04_local_pop.pbz2']
6.5. Rerun moldrug#
! moldrug new_config.yml
Started at Thu Jun 13 14:14:25 2024
You are using moldrug: 3.7.2.
CommandLineHelper(yaml_file='new_config.yml', fitness=None, continuation=False, verbose=False)
Creating the first population with 10 members:
100%|███████████████████████████████████████████| 10/10 [01:08<00:00, 6.89s/it]
Initial Population: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
Accepted rate: 10 / 10
File 01_grow_pop.sdf was createad!
Evaluating generation 1 / 2:
100%|█████████████████████████████████████████████| 5/5 [01:33<00:00, 18.71s/it]
Generation 1: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 5
Evaluating generation 2 / 2:
100%|█████████████████████████████████████████████| 5/5 [01:15<00:00, 15.14s/it]
File 01_grow_pop.sdf was createad!
Generation 2: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 5
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
The simulation finished successfully after 2 generations witha population of 10 individuals. A total number of 20 Individuals were seen during the simulation.
Initial Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
Final Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
The cost function dropped in 0.0 units.
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Total time (2 generations): 261.79 (s).
Finished at Thu Jun 13 14:18:47 2024.
File 01_grow_pop.sdf was createad!
The follow job 02_allow_grow started.
File 02_allow_grow_pop.sdf was createad!
Evaluating generation 3 / 4:
100%|█████████████████████████████████████████████| 5/5 [01:03<00:00, 12.65s/it]
Generation 3: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 5
Evaluating generation 4 / 4:
100%|█████████████████████████████████████████████| 5/5 [01:19<00:00, 15.88s/it]
File 02_allow_grow_pop.sdf was createad!
Generation 4: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 5
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
The simulation finished successfully after 4 generations witha population of 10 individuals. A total number of 30 Individuals were seen during the simulation.
Initial Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
Final Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
The cost function dropped in 0.0 units.
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Note: Check the running warnings and erorrs in error.tar.gz file!
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Total time (2 generations): 162.27 (s).
Finished at Thu Jun 13 14:21:29 2024.
File 02_allow_grow_pop.sdf was createad!
The job 02_allow_grow finished!
The follow job 03_pure_mutate started.
File 03_pure_mutate_pop.sdf was createad!
Note: The mutation on Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0) did not work, it will be returned the same individual
Note: The mutation on Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0) did not work, it will be returned the same individual
Evaluating generation 5 / 6:
100%|█████████████████████████████████████████████| 2/2 [00:30<00:00, 15.26s/it]
Generation 5: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 2
Note: The mutation on Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0) did not work, it will be returned the same individual
Evaluating generation 6 / 6:
100%|█████████████████████████████████████████████| 4/4 [00:50<00:00, 12.72s/it]
File 03_pure_mutate_pop.sdf was createad!
Generation 6: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 4
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
The simulation finished successfully after 6 generations witha population of 10 individuals. A total number of 36 Individuals were seen during the simulation.
Initial Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
Final Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
The cost function dropped in 0.0 units.
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Total time (2 generations): 97.61 (s).
Finished at Thu Jun 13 14:23:07 2024.
File 03_pure_mutate_pop.sdf was createad!
The job 03_pure_mutate finished!
The follow job 04_local started.
File 04_local_pop.sdf was createad!
Evaluating generation 7 / 8:
100%|█████████████████████████████████████████████| 5/5 [00:35<00:00, 7.07s/it]
Generation 7: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 5
Evaluating generation 8 / 8:
100%|█████████████████████████████████████████████| 4/4 [00:57<00:00, 14.28s/it]
File 04_local_pop.sdf was createad!
Generation 8: Best Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0).
Accepted rate: 0 / 4
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
The simulation finished successfully after 8 generations witha population of 10 individuals. A total number of 45 Individuals were seen during the simulation.
Initial Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
Final Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 1.0)
The cost function dropped in 0.0 units.
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Total time (2 generations): 112.40 (s).
Finished at Thu Jun 13 14:24:59 2024.
File 04_local_pop.sdf was createad!
The job 04_local finished!
os.listdir()
['02_allow_grow_pop.sdf',
'04_local_pop.sdf',
'new_config.yml',
'01_grow_result.pbz2',
'03_pure_mutate_pop.sdf',
'04_local_result.pbz2',
'01_grow_pop.sdf',
'03_pure_mutate_result.pbz2',
'crem.db.gz',
'03_pure_mutate_pop.pbz2',
'config.yml',
'error.tar.gz',
'02_allow_grow_result.pbz2',
'crem.db',
'01_grow_pop.pbz2',
'02_allow_grow_pop.pbz2',
'04_local_pop.pbz2']
# parms = pmd.rdkit.from_sdf(os.path.join(wd, '04_local_pop.sdf')) # require parmed, rdkit
# parms
# view = nv.NGLWidget()
# slider = IntSlider(max=len(parms)-1)
# def show_one_ligand(change):
# val = change['new']
# view.show_only([val])
# slider.observe(show_one_ligand, 'value')
# VBox([view, slider])
# for p in parms:
# view.add_structure(nv.ParmEdTrajectory(p))
# view.show_only([0])
result = utils.decompress_pickle( '04_local_result.pbz2')
plot_dist(result.SawIndividuals, properties=['qed', 'sa_score', 'vina_score', 'cost'])
(<Figure size 2500x2500 with 4 Axes>,
array([<Axes: xlabel='genID', ylabel='qed'>,
<Axes: xlabel='genID', ylabel='sa_score'>,
<Axes: xlabel='genID', ylabel='vina_score'>,
<Axes: xlabel='genID', ylabel='cost'>], dtype=object))
On both examples moldrug can not optimize the cost function. The population does not change. One of the main cause of this behavior is the desirability definition used for vina_score:
'vina_score': {
'w': 1,
'SmallerTheBest': {
'Target': -12,
'UpperLimit': -6,
'r': 1
}
}
This means that if the vina score is not lower than -6 the cost function will be always 1. In this scenario all were are not optimizing over the generations the “best” individuals. In other words, it is important to see what is the behavior of our system (one small simulation) and then tune your parameters accordantly. On possibility for this system is change UpperLimit to, for example -4. There are a lot of other different options that you could play around (e.g. increase the generations, etc…). It is important to remark that this example is just for the tutorial, in a real project you may need to increase generations, tune the crem keywords, etc. I let it to you as a challenge. Happy simulations!
Just as a bonus I let you here how to change the desirability definition.
6.6. Change the desirability#
# Save the new config as a yaml file
tune_config = new_config.copy()
tune_config['01_grow']['costfunc_kwargs']['desirability'] = {
'qed': {
'w': 1,
'LargerTheBest': {
'LowerLimit': 0.1,
'Target': 0.75,
'r': 1
}
},
'sa_score': {
'w': 1,
'SmallerTheBest': {
'Target': 3,
'UpperLimit': 7,
'r': 1
}
},
'vina_score': {
'w': 1,
'SmallerTheBest': {
'Target': -9,
'UpperLimit': -4,
'r': 1
}
}
}
with open('tune_config.yml', 'w') as f:
yaml.dump(config, f)
os.listdir()
['02_allow_grow_pop.sdf',
'04_local_pop.sdf',
'new_config.yml',
'01_grow_result.pbz2',
'03_pure_mutate_pop.sdf',
'04_local_result.pbz2',
'01_grow_pop.sdf',
'03_pure_mutate_result.pbz2',
'crem.db.gz',
'03_pure_mutate_pop.pbz2',
'tune_config.yml',
'config.yml',
'error.tar.gz',
'02_allow_grow_result.pbz2',
'crem.db',
'01_grow_pop.pbz2',
'02_allow_grow_pop.pbz2',
'04_local_pop.pbz2']
! moldrug tune_config.yml
Started at Thu Jun 13 14:25:01 2024
You are using moldrug: 3.7.2.
CommandLineHelper(yaml_file='tune_config.yml', fitness=None, continuation=False, verbose=False)
Creating the first population with 10 members:
100%|███████████████████████████████████████████| 10/10 [01:00<00:00, 6.05s/it]
Initial Population: Best Individual: Individual(idx = 7, smiles = COC(=O)c1ccc(S(=O)(=O)NC(=O)OC(C)C)cc1, cost = 0.3855366348628305)
Accepted rate: 10 / 10
File 01_grow_pop.sdf was createad!
Evaluating generation 1 / 2:
100%|█████████████████████████████████████████████| 5/5 [01:35<00:00, 19.05s/it]
Generation 1: Best Individual: Individual(idx = 14, smiles = COC(=O)c1ccc(S(=O)(=O)NC2CCCCC2)cc1, cost = 0.3206002991032758).
Accepted rate: 2 / 5
Evaluating generation 2 / 2:
100%|█████████████████████████████████████████████| 5/5 [01:44<00:00, 20.95s/it]
File 01_grow_pop.sdf was createad!
Generation 2: Best Individual: Individual(idx = 19, smiles = COC(=O)c1ccc(S(=O)(=O)NC2CCC(CNC(C)=O)CC2)cc1, cost = 0.309426525654653).
Accepted rate: 2 / 5
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
The simulation finished successfully after 2 generations witha population of 10 individuals. A total number of 20 Individuals were seen during the simulation.
Initial Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 0.4859556664149246)
Final Individual: Individual(idx = 19, smiles = COC(=O)c1ccc(S(=O)(=O)NC2CCC(CNC(C)=O)CC2)cc1, cost = 0.309426525654653)
The cost function dropped in 0.1765291407602716 units.
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Note: Check the running warnings and erorrs in error.tar.gz file!
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Total time (2 generations): 284.06 (s).
Finished at Thu Jun 13 14:29:45 2024.
File 01_grow_pop.sdf was createad!
The follow job 02_allow_grow started.
File 02_allow_grow_pop.sdf was createad!
Evaluating generation 3 / 4:
100%|█████████████████████████████████████████████| 5/5 [01:22<00:00, 16.45s/it]
Generation 3: Best Individual: Individual(idx = 19, smiles = COC(=O)c1ccc(S(=O)(=O)NC2CCC(CNC(C)=O)CC2)cc1, cost = 0.309426525654653).
Accepted rate: 2 / 5
Evaluating generation 4 / 4:
100%|█████████████████████████████████████████████| 5/5 [01:31<00:00, 18.37s/it]
File 02_allow_grow_pop.sdf was createad!
Generation 4: Best Individual: Individual(idx = 19, smiles = COC(=O)c1ccc(S(=O)(=O)NC2CCC(CNC(C)=O)CC2)cc1, cost = 0.309426525654653).
Accepted rate: 2 / 5
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
The simulation finished successfully after 4 generations witha population of 10 individuals. A total number of 30 Individuals were seen during the simulation.
Initial Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 0.4859556664149246)
Final Individual: Individual(idx = 19, smiles = COC(=O)c1ccc(S(=O)(=O)NC2CCC(CNC(C)=O)CC2)cc1, cost = 0.309426525654653)
The cost function dropped in 0.1765291407602716 units.
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Note: Check the running warnings and erorrs in error.tar.gz file!
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Total time (2 generations): 194.93 (s).
Finished at Thu Jun 13 14:33:00 2024.
File 02_allow_grow_pop.sdf was createad!
The job 02_allow_grow finished!
The follow job 03_pure_mutate started.
File 03_pure_mutate_pop.sdf was createad!
Evaluating generation 5 / 6:
100%|█████████████████████████████████████████████| 5/5 [01:20<00:00, 16.10s/it]
Generation 5: Best Individual: Individual(idx = 19, smiles = COC(=O)c1ccc(S(=O)(=O)NC2CCC(CNC(C)=O)CC2)cc1, cost = 0.309426525654653).
Accepted rate: 2 / 5
Evaluating generation 6 / 6:
100%|█████████████████████████████████████████████| 4/4 [01:32<00:00, 23.16s/it]
File 03_pure_mutate_pop.sdf was createad!
Generation 6: Best Individual: Individual(idx = 19, smiles = COC(=O)c1ccc(S(=O)(=O)NC2CCC(CNC(C)=O)CC2)cc1, cost = 0.309426525654653).
Accepted rate: 2 / 4
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
The simulation finished successfully after 6 generations witha population of 10 individuals. A total number of 39 Individuals were seen during the simulation.
Initial Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 0.4859556664149246)
Final Individual: Individual(idx = 19, smiles = COC(=O)c1ccc(S(=O)(=O)NC2CCC(CNC(C)=O)CC2)cc1, cost = 0.309426525654653)
The cost function dropped in 0.1765291407602716 units.
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Note: Check the running warnings and erorrs in error.tar.gz file!
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Total time (2 generations): 194.28 (s).
Finished at Thu Jun 13 14:36:15 2024.
File 03_pure_mutate_pop.sdf was createad!
The job 03_pure_mutate finished!
The follow job 04_local started.
File 04_local_pop.sdf was createad!
Evaluating generation 7 / 8:
100%|█████████████████████████████████████████████| 5/5 [01:54<00:00, 22.88s/it]
Generation 7: Best Individual: Individual(idx = 19, smiles = COC(=O)c1ccc(S(=O)(=O)NC2CCC(CNC(C)=O)CC2)cc1, cost = 0.309426525654653).
Accepted rate: 1 / 5
Evaluating generation 8 / 8:
100%|█████████████████████████████████████████████| 5/5 [01:37<00:00, 19.60s/it]
File 04_local_pop.sdf was createad!
Generation 8: Best Individual: Individual(idx = 19, smiles = COC(=O)c1ccc(S(=O)(=O)NC2CCC(CNC(C)=O)CC2)cc1, cost = 0.309426525654653).
Accepted rate: 0 / 5
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
The simulation finished successfully after 8 generations witha population of 10 individuals. A total number of 49 Individuals were seen during the simulation.
Initial Individual: Individual(idx = 0, smiles = COC(=O)c1ccc(S(N)(=O)=O)cc1, cost = 0.4859556664149246)
Final Individual: Individual(idx = 19, smiles = COC(=O)c1ccc(S(=O)(=O)NC2CCC(CNC(C)=O)CC2)cc1, cost = 0.309426525654653)
The cost function dropped in 0.1765291407602716 units.
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Note: Check the running warnings and erorrs in error.tar.gz file!
=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
Total time (2 generations): 231.61 (s).
Finished at Thu Jun 13 14:40:06 2024.
File 04_local_pop.sdf was createad!
The job 04_local finished!
os.listdir()
['02_allow_grow_pop.sdf',
'04_local_pop.sdf',
'new_config.yml',
'01_grow_result.pbz2',
'03_pure_mutate_pop.sdf',
'04_local_result.pbz2',
'01_grow_pop.sdf',
'03_pure_mutate_result.pbz2',
'crem.db.gz',
'03_pure_mutate_pop.pbz2',
'tune_config.yml',
'config.yml',
'error.tar.gz',
'02_allow_grow_result.pbz2',
'crem.db',
'01_grow_pop.pbz2',
'02_allow_grow_pop.pbz2',
'04_local_pop.pbz2']
result = utils.decompress_pickle('04_local_result.pbz2')
plot_dist(result.SawIndividuals, properties=['qed', 'sa_score', 'vina_score', 'cost'])
(<Figure size 2500x2500 with 4 Axes>,
array([<Axes: xlabel='genID', ylabel='qed'>,
<Axes: xlabel='genID', ylabel='sa_score'>,
<Axes: xlabel='genID', ylabel='vina_score'>,
<Axes: xlabel='genID', ylabel='cost'>], dtype=object))
Now we see completely different picture. The optimization is working again! You can see how the Accepted rate is not always cero over the different generations, this mean that the population is changing. So, we get some improvements changing the desirability ;-).
Take home message: Different consecutive runs with different desirability may help to get out from the local minimum of the initial population.