在生物信息学的小分子性质预测任务中,划分数据集通常有 scaffold split 和 random split 两种方式,在 MoleculeNet 中尤为常见。与 random split 相比,基于 scaffold 划分的任务更难,也更有意义。因为:
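论文 Analyzing Learned Molecular Representations for Property Prediction 中提到(大意):scaffold split 会把具有相同分子骨架(scaffold)的分子划入同一个子集,使测试集中的骨架不会出现在训练集中;因此它比 random split 更具挑战性,也更接近真实应用场景中对模型泛化能力的要求。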
scaffold split代码:
import os
import csv
import math
import numpy as np
import torch
import torch.nn.functional as F
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
def _generate_scaffold(smiles, include_chirality=False):
    """Return the Murcko scaffold SMILES of a molecule.

    Args:
        smiles: SMILES string describing the molecule.
        include_chirality: Forwarded to ``MurckoScaffoldSmiles`` as its
            ``includeChirality`` argument.

    Returns:
        The value returned by ``MurckoScaffoldSmiles`` for the parsed
        molecule.

    Raises:
        ValueError: If ``Chem.MolFromSmiles`` cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail here with the offending input in the message, so a bad row
        # in a large dataset can be located without extra debugging.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    return MurckoScaffoldSmiles(mol=mol, includeChirality=include_chirality)
def generate_scaffolds(dataset, log_every_n=1000):
    """Group the indices of ``dataset`` by molecular scaffold.

    Args:
        dataset: Object supporting ``len()`` and exposing an iterable
            ``smiles_data`` attribute of SMILES strings.
        log_every_n: A progress line is printed whenever the current index
            is a multiple of this value.

    Returns:
        A list of index lists, one per distinct scaffold. Each inner list
        is sorted ascending; the outer list is ordered from the largest
        group to the smallest, with equally sized groups ordered by their
        first index, higher first.
    """
    total = len(dataset)
    print(total)
    print("About to generate scaffolds")

    groups = {}
    for idx, smi in enumerate(dataset.smiles_data):
        if idx % log_every_n == 0:
            print("Generating scaffold %d/%d" % (idx, total))
        groups.setdefault(_generate_scaffold(smi), []).append(idx)

    # Largest scaffold groups first; reverse=True applies to the whole
    # (size, first index) key, so ties go to the higher first index.
    member_lists = [sorted(members) for members in groups.values()]
    member_lists.sort(key=lambda members: (len(members), members[0]),
                      reverse=True)
    return member_lists
def scaffold_split(dataset, valid_size, test_size, seed=None, log_every_n=1000):
    """Split dataset indices into train/valid/test by molecular scaffold.

    Scaffold groups from ``generate_scaffolds`` are consumed in the order
    returned and each group is assigned as a whole to one subset, so a
    group is never divided between subsets.

    Args:
        dataset: Object accepted by ``generate_scaffolds`` that also
            supports ``len()``.
        valid_size: Fraction of the dataset for validation.
        test_size: Fraction of the dataset for testing.
        seed: Unused by this function; kept so existing callers that pass
            it keep working.
        log_every_n: Forwarded to ``generate_scaffolds`` to control how
            often progress is printed.

    Returns:
        A tuple ``(train_inds, valid_inds, test_inds)`` of index lists.

    Raises:
        ValueError: If either fraction is negative or their sum exceeds 1.
    """
    if valid_size < 0 or test_size < 0 or valid_size + test_size > 1.0:
        raise ValueError(
            "valid_size and test_size must be non-negative and sum to at "
            "most 1, got valid_size=%r, test_size=%r" % (valid_size, test_size)
        )

    train_size = 1.0 - valid_size - test_size
    # Forward the caller's logging interval instead of silently using the
    # default of generate_scaffolds.
    scaffold_sets = generate_scaffolds(dataset, log_every_n=log_every_n)

    n_total = len(dataset)
    train_cutoff = train_size * n_total
    valid_cutoff = (train_size + valid_size) * n_total

    train_inds = []
    valid_inds = []
    test_inds = []

    print("About to sort in scaffold sets")
    for scaffold_set in scaffold_sets:
        # A group that would overflow the train budget goes to valid, or
        # to test once the combined train+valid budget would be exceeded.
        if len(train_inds) + len(scaffold_set) > train_cutoff:
            if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff:
                test_inds += scaffold_set
            else:
                valid_inds += scaffold_set
        else:
            train_inds += scaffold_set
    return train_inds, valid_inds, test_inds
参考文献:
Yang, Kevin, et al. "Analyzing learned molecular representations for property prediction." Journal of chemical information and modeling 59.8 (2019): 3370-3388.