Shape Distance Matrix

Calculates the distance between all molecules in a database with themselves. There will only be one entry per molecule, though all conformers will be considered in the comparison. This means the conformer used in a particular row or column of the matrix will not be consistent. The complete distance matrix is written out to the clusters.csv in comma separated format, useful for feeding into downstream clustering software.

Note

The values output will be a “distance”, not the tanimotos. That means a perfect match is ‘0.0’, not 1.0 or 2.0 respectively. The default is to use Tanimoto Combo. The -shapeOnly flag can be used to get only the shape distance.

Warning

This will generate O(N^2) amount of data and runtime. This is not a cheap script to run. This script can generally handle 1,000s to 10,000s in a reasonable timeframe on a modern GPU and a machine with decent memory and disk space.

Code

prompt> ShapeDistanceMatrix.py [-shapeOnly] [-dbase] <database> [-matrix] <clusters.csv>

Download code

ShapeDistanceMatrix.py

#!/usr/bin/env python
# (C) 2022 Cadence Design Systems, Inc. (Cadence) 
# All rights reserved.
# TERMS FOR USE OF SAMPLE CODE The software below ("Sample Code") is
# provided to current licensees or subscribers of Cadence products or
# SaaS offerings (each a "Customer").
# Customer is hereby permitted to use, copy, and modify the Sample Code,
# subject to these terms. Cadence claims no rights to Customer's
# modifications. Modification of Sample Code is at Customer's sole and
# exclusive risk. Sample Code may require Customer to have a then
# current license or subscription to the applicable Cadence offering.
# THE SAMPLE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED.  OPENEYE DISCLAIMS ALL WARRANTIES, INCLUDING, BUT
# NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. In no event shall Cadence be
# liable for any damages or liability in connection with the Sample Code
# or its use.

# Write out a csv file of the similarity matrix of a multi-conformer
# database. Note, all conformers will be compared to each other,
# however, only the best match will be reported between two molecules.

import sys
import os
import csv

from openeye import oechem
from openeye import oefastrocs

oepy = os.path.join(os.path.dirname(__file__), "openeye", "python")
sys.path.insert(0, os.path.realpath(oepy))

InterfaceData = """\
!BRIEF [-shapeOnly] [-dbase] <database> [-matrix] <clusters.csv>
!PARAMETER -dbase
  !TYPE string
  !REQUIRED true
  !BRIEF Input database to select from
  !KEYLESS 1
!END
!PARAMETER -matrix
  !TYPE string
  !REQUIRED true
  !BRIEF csv file to write similarity matrix to
  !KEYLESS 2
!END
!PARAMETER -shapeOnly
  !ALIAS -s
  !TYPE bool
  !DEFAULT false
  !BRIEF Run FastROCS in shape only mode.
!END
"""


def GetScoreGetter(shapeOnly=False):
    if shapeOnly:
        return oefastrocs.OEShapeDatabaseScore.GetShapeTanimoto
    return


def main(argv=[__name__]):
    itf = oechem.OEInterface(InterfaceData, argv)

    ifs = oechem.oemolistream()
    dbname = itf.GetString("-dbase")
    if oechem.OEIsGZip(dbname):
        oechem.OEThrow.Fatal("%s is an unsupported database file format as it is gzipped.\n"
                             "Preferred formats are .oeb, .sdf or .oez", dbname)

    if not ifs.open(dbname):
        oechem.OEThrow.Fatal("Unable to open %s for reading" % dbname)

    colname = "TanimotoCombo"
    getter = oefastrocs.OEShapeDatabaseScore.GetTanimotoCombo
    dbtype = oefastrocs.OEShapeDatabaseType_Default
    if itf.GetBool("-shapeOnly"):
        colname = "ShapeTanimoto"
        getter = oefastrocs.OEShapeDatabaseScore.GetShapeTanimoto
        dbtype = oefastrocs.OEShapeDatabaseType_Shape

    csvwriter = csv.writer(open(itf.GetString("-matrix"), 'w'))
    csvwriter.writerow(["Title1", "Title2", colname])

    shapedb = oefastrocs.OEShapeDatabase(dbtype)
    options = oefastrocs.OEShapeDatabaseOptions()
    options.SetScoreType(dbtype)

    lmat = [[]]
    titles = []
    for mol in ifs.GetOEMols():
        if titles:
            bestscores = [0.0] * len(titles)
            for conf in mol.GetConfs():
                for score in shapedb.GetScores(conf, options):
                    midx = score.GetMolIdx()
                    bestscores[midx] = max(bestscores[midx], getter(score))

            lmat.append(bestscores)

        shapedb.AddMol(mol)

        title = mol.GetTitle()
        if not title:
            title = str(len(titles) + 1)
        titles.append(title)

    # write csv file
    csvwriter = csv.writer(open(itf.GetString("-matrix"), 'w'))
    csvwriter.writerow(titles)
    nrows = len(titles)
    for i in range(nrows):
        row = [i+1]
        for j in range(nrows):
            val = 2.0
            if itf.GetBool("-shapeOnly"):
                val = 1.0

            if j > i:
                val -= lmat[j][i]
            elif j < i:
                val -= lmat[i][j]
            elif j == i:
                val = 0.0

            row.append("%.3f" % val)

        csvwriter.writerow(row)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))