OEMedChem Examples

Bemis Murcko perception

#!/usr/bin/env python
#############################################################################
# Copyright (C) 2014 OpenEye Scientific Software, Inc.
#############################################################################
# Utility to fragment the input structures by Bemis-Murcko rules
# ---------------------------------------------------------------------------
# BemisMurckoPerception.py input_mols output_mols [uncolor]
#
# input_mols: filename of molecules to fragment and uncolor
# output_mols: filename of input structure with SDData of perceived regions
# [uncolor]: optional 4th arg to request uncoloring of output fragment info
#############################################################################
from openeye.oechem import *
from openeye.oemedchem import *
import sys

############################################################
InterfaceData = """
!BRIEF [ -uncolor ] [-i] <infile1> [-o] <infile2>
!PARAMETER -i
  !ALIAS -in
  !ALIAS -input
  !TYPE string
  !REQUIRED true
  !BRIEF Input structure file name
  !KEYLESS 1
!END
!PARAMETER -o
  !ALIAS -out
  !ALIAS -output
  !TYPE string
  !REQUIRED true
  !BRIEF Output SD file name
  !KEYLESS 2
!END
!PARAMETER -uncolor
  !ALIAS -u
  !TYPE bool
  !DEFAULT false
  !BRIEF Uncolor output molecules
!END
"""


def main(argv=[__name__]):
    itf = OEInterface(InterfaceData, argv)

    # flag on command line indicates uncoloring option or not
    bUncolor = itf.GetBool("-uncolor")

    # input structure(s) to transform
    ifsmols = oemolistream()
    if not ifsmols.open(itf.GetString("-i")):
        OEThrow.Fatal("Unable to open %s for reading" % itf.GetString("-i"))

    # save output structure(s) to this file
    ofs = oemolostream()
    if not ofs.open(itf.GetString("-o")):
        OEThrow.Fatal("Unable to open %s for writing" % itf.GetString("-o"))
    if not OEIsSDDataFormat(ofs.GetFormat()):
        OEThrow.Fatal("Unable to open %s for writing" % itf.GetString("-o"))

    irec = 0
    ototal = 0
    frag = OEGraphMol()
    for mol in ifsmols.GetOEGraphMols():
        irec += 1
        OETheFunctionFormerlyKnownAsStripSalts(mol)
        iter = OEGetBemisMurcko(mol)
        if not iter.IsValid():
            name = mol.GetTitle()
            if not mol.GetTitle():
                name = 'Record ' + str(irec)
            OEThrow.Warning("%s: no perceived regions" % name)
            continue
        for bmregion in iter:
            # create a fragment from the perceived region
            OESubsetMol(frag, mol, bmregion, True)
            if bUncolor:
                # uncolor the fragment
                OEUncolorMol(frag)
            smi = OEMolToSmiles(frag)
            # annotate the input molecule with the role information
            for role in bmregion.GetRoles():
                OEAddSDData(mol, role.GetName(), smi)
        ototal += 1
        OEWriteMolecule(ofs, mol)

    if not irec:
        OEThrow.Fatal('No records in input structure file to perceive')

    if not ototal:
        OEThrow.Warning('No annotated structures generated')

    print("Input molecules={0:d}, output annotated {1:s}molecules={2:d}"
          .format(irec, ("(uncolored) " if bUncolor else ""), ototal))

    return 0

if __name__ == "__main__":
    sys.exit(main(sys.argv))

See also

Matched Pair analysis and transformations

#!/usr/bin/env python
#############################################################################
# Copyright (C) 2014-2015 OpenEye Scientific Software, Inc.
#############################################################################
# Utility to perform a matched pair analysis on a set of structures
#  and use the transformations discovered to alter a second set of structures
# ---------------------------------------------------------------------------
# MatchedPairTransform.py index_mols input_mols output_mols
#
# index_mols: filename of input molecules to analyze
# input_mols: filename of molecules to transform based on analysis
# output_mols: filename to collect transformed molecules
#############################################################################
from openeye.oechem import *
from openeye.oemedchem import *
import sys


def Analyze(itf):
    # input structures to index
    ifsindex = oemolistream()
    if not ifsindex.open(itf.GetString("-index")):
        OEThrow.Fatal("Unable to open %s for reading" %
                      itf.GetString("-index"))

    # input structure(s) to transform
    ifsmols = oemolistream()
    if not ifsmols.open(itf.GetString("-input")):
        OEThrow.Fatal("Unable to open %s for reading" %
                      itf.GetString("-input"))

    # save output structure(s) to this file
    ofs = oemolostream()
    if not ofs.open(itf.GetString("-output")):
        OEThrow.Fatal("Unable to open %s for writing" %
                      itf.GetString("-output"))

    # create options class with defaults
    opts = OEMatchedPairAnalyzerOptions()
    # setup options from command line
    if not OESetupMatchedPairIndexOptions(opts, itf):
        OEThrow.Fatal("Error setting matched pair indexing options!")

    if not opts.HasIndexableFragmentHeavyAtomRange():
        OEThrow.Info("Indexing all fragments")
    else:
        print("Using index range={0:.2f}-{1:.2f}%"
              .format(opts.GetIndexableFragmentRangeMin(), opts.GetIndexableFragmentRangeMax()))

    # request a specific context for the transform activity, here 0-bonds
    chemctxt = OEMatchedPairContext_Bond0
    askcontext = itf.GetString("-context")[:1]
    if askcontext == '0':
        chemctxt = OEMatchedPairContext_Bond0
    elif askcontext == '1':
        chemctxt = OEMatchedPairContext_Bond1
    elif askcontext == '2':
        chemctxt = OEMatchedPairContext_Bond2
    elif askcontext == '3':
        chemctxt = OEMatchedPairContext_Bond3
    elif askcontext == 'a' or askcontext == 'A':
        chemctxt = OEMatchedPairContext_AllBonds
    else:
        OEThrow.Fatal("Invalid context specified: " +
                      askcontext + ", only 0|1|2|3|A allowed")

    # create indexing engine
    mmp = OEMatchedPairAnalyzer(opts)

    verbose = itf.GetBool("-verbose")

    # add molecules to be indexed
    record = 0
    for mol in ifsindex.GetOEGraphMols():
        record += 1
        status = mmp.AddMol(mol, record)
        if verbose and status != record:
            OEThrow.Info('Input structure not added to index, record=%d status=%s' %
                         (record, OEMatchedPairIndexStatusName(status)))

    if not mmp.NumMols():
        OEThrow.Fatal('No records in index structure file')

    if not mmp.NumMatchedPairs():
        OEThrow.Fatal('No matched pairs found from indexing, ' +
                      'use -fragGe,-fragLe options to extend index range')

    # return some status information
    print("indexed molecules={0:d} matched pairs={1:d}"
          .format(mmp.NumMols(),
                  mmp.NumMatchedPairs()))

    minpairs = itf.GetInt("-minpairs")
    if minpairs > 1:
        OEThrow.Info('Requiring at least %d matched pairs to apply transformations' % minpairs)

    orec = 0
    ocnt = 0
    for mol in ifsmols.GetOEGraphMols():
        orec += 1
        iter = OEMatchedPairApplyTransforms(mol, mmp, chemctxt, minpairs)
        if not iter.IsValid():
            if verbose:
                # as minpairs increases, fewer transformed mols are generated - output if requested
                name = mol.GetTitle()
                if not mol.GetTitle():
                    name = 'Record ' + str(orec)
                OEThrow.Info("%s did not produce any output" % name)
            continue
        for outmol in iter:
            ocnt += 1
            OEWriteMolecule(ofs, outmol)

    if not orec:
        OEThrow.Fatal('No records in input structure file to transform')

    if not ocnt:
        OEThrow.Warning('No transformed structures generated')

    print("Input molecules={0:d} Output molecules={1:d}".format(orec, ocnt))

    return 0

############################################################
InterfaceData = """
# matchedpairtransform interface file
!CATEGORY MatchedPairTransform

    !CATEGORY I/O
        !PARAMETER -index 1
          !TYPE string
          !REQUIRED true
          !BRIEF Input filename of structures to index
          !KEYLESS 1
        !END

        !PARAMETER -input 2
          !ALIAS -i
          !ALIAS -in
          !TYPE string
          !REQUIRED true
          !BRIEF Input filename of structures to process based on matched pairs discovered from indexing
          !KEYLESS 2
        !END

        !PARAMETER -output 3
          !ALIAS -o
          !ALIAS -out
          !TYPE string
          !REQUIRED true
          !BRIEF Output filename
          !KEYLESS 3
        !END
    !END

    !CATEGORY options
        !PARAMETER -context 1
           !ALIAS -c
           !TYPE string
           !DEFAULT 0
           !BRIEF chemistry context to use for the transformation [0|1|2|3|A]
        !END

        !PARAMETER -minpairs 2
           !TYPE int
           !DEFAULT 0
           !BRIEF require at least -minpairs to apply the transformations (default: all)
        !END

        !PARAMETER -verbose 3
           !TYPE bool
           !DEFAULT 0
           !BRIEF generate verbose output
        !END
    !END
!END
"""


def main(argv=[__name__]):
    itf = OEInterface(InterfaceData)
    OEConfigureMatchedPairIndexOptions(itf)

    if not OEParseCommandLine(itf, argv):
        OEThrow.Fatal("Unable to interpret command line!")

    Analyze(itf)

if __name__ == "__main__":
    sys.exit(main(sys.argv))

Matched Pair analysis and listing of transformations

#!/usr/bin/env python
#############################################################################
# Copyright (C) 2014-2015 OpenEye Scientific Software, Inc.
#############################################################################
# Utility to perform a matched pair analysis on a set of structures
#  and dump a listing of the transformations derived from matched pairs found
# ---------------------------------------------------------------------------
# MatchedPairTransformList.py index_mols
#
# index_mols: filename of input molecules to analyze
#############################################################################
from __future__ import print_function
from openeye.oechem import *
from openeye.oemedchem import *
import sys

# simple statistics
import math
average = lambda x: sum(x) * 1.0 / len(x)
variance = lambda x: list(map(lambda y: (y - average(x)) ** 2, x))
stddev = lambda x: math.sqrt(average(variance(x)))

class MMPXform:
    def __init__(self, xform, avg, std, num):
        self.xform = xform
        self.avg = avg
        self.std = std
        self.num = num

    def __cmp__(self, other):
        return cmp(self.avg, other.avg)


def MMPAnalyze(itf):
    # input structures to index
    ifs = oemolistream()
    if not ifs.open(itf.GetString("-input")):
        OEThrow.Fatal("Unable to open %s for reading" %
                      itf.GetString("-input"))

    # request a specific context for the transform activity, here 0-bonds
    chemctxt = OEMatchedPairContext_Bond0
    askcontext = itf.GetString("-context")[:1]
    if askcontext == '0':
        chemctxt = OEMatchedPairContext_Bond0
    elif askcontext == '1':
        chemctxt = OEMatchedPairContext_Bond1
    elif askcontext == '2':
        chemctxt = OEMatchedPairContext_Bond2
    elif askcontext == '3':
        chemctxt = OEMatchedPairContext_Bond3
    elif askcontext == 'a' or askcontext == 'A':
        chemctxt = OEMatchedPairContext_AllBonds
    else:
        OEThrow.Fatal("Invalid context specified: " +
                      askcontext + ", only 0|1|2|3|A allowed")

    bPrintTransforms = itf.GetBool("-printlist")
    # if a data field was specified, retreive the SD data field name
    field = None
    if itf.HasString("-datafield"):
        field = itf.GetString("-datafield")

    # create options class with defaults
    opts = OEMatchedPairAnalyzerOptions()
    # setup options from command line
    if not OESetupMatchedPairIndexOptions(opts, itf):
        OEThrow.Fatal("Error setting matched pair indexing options!")

    if not opts.HasIndexableFragmentHeavyAtomRange():
        OEThrow.Info("Indexing all fragments")
    else:
        print("Using index range={0:.2f}-{1:.2f}%"
              .format(opts.GetIndexableFragmentRangeMin(), opts.GetIndexableFragmentRangeMax()))

    if field is None and not bPrintTransforms:
        OEThrow.Info("Specify -datafield or -printlist, otherwise nothing to do!")
        return

    # create indexing engine
    mmp = OEMatchedPairAnalyzer(opts)

    # add molecules to be indexed
    bFoundData = False
    record = 0
    for mol in ifs.GetOEGraphMols():
        record += 1
        status = mmp.AddMol(mol, record)
        if status != record:
            OEThrow.Warning(
                "{0}: molecule indexing error, status={1}"
                .format(record, OEMatchedPairIndexStatusName(status)))

        elif field is not None and OEHasSDData(mol,field):
            # validate that data field value is numeric
            dataValue = None
            try:
                dataValue = float(OEGetSDData(mol,field))
            except ValueError:
                dataValue = None
            if dataValue is not None:
                bFoundData = True
            else:
                OEThrow.Fatal(
                    "{0}: Non-numeric data for field {1} found, {2}"
                    .format(record,field,OEGetSDData(mol,field)))

    if not mmp.NumMols():
        OEThrow.Fatal('No records in input structure file were indexed')

    if field is not None and not bFoundData:
        OEThrow.Fatal("No data found for requested field, " + field)

    if not mmp.NumMatchedPairs():
        OEThrow.Fatal('No matched pairs found from indexing, ' +
                      'use -fragGe,-fragLe options to extend index range')

    # controls how transforms are extracted (direction and allowed properties)
    extractMode = (OEMatchedPairTransformExtractMode_Sorted +
                   OEMatchedPairTransformExtractMode_NoSMARTS)

    # walk the transforms from the indexed matched pairs
    xforms = []

    xfmidx = 0
    for mmpxform in OEMatchedPairGetTransforms(mmp,
                                               chemctxt,extractMode):
        xfmidx += 1
        if bPrintTransforms:
            print("{0:2} {1}".format(xfmidx, mmpxform.GetTransform()))

        # compute delta property
        mmpidx = 0
        prop = []
        for mmppair in mmpxform.GetMatchedPairs():
            mmpidx += 1
            mmpinfo = "\t{0:2}: ({1:2},{2:2})".format(mmpidx,
                                                      mmppair.FromIndex(),
                                                      mmppair.ToIndex())
            for tag in mmppair.GetDataTags():
                mmpinfo = mmpinfo + " {0}=({1},{2})".format(tag,
                                                            mmppair.GetFromSDData(tag),
                                                            mmppair.GetToSDData(tag))
                if field is not None and tag == field:
                    # already validated above
                    fromData = float(mmppair.GetFromSDData(tag))
                    toData = float(mmppair.GetToSDData(tag))
                    prop.append(toData - fromData)

            if bPrintTransforms:
                print(mmpinfo)

        # skip if property not found
        if len(prop):
            xforms.append(MMPXform(mmpxform, average(prop), stddev(prop), len(prop)))

    if not field:
        return 0

    if field and not len(xforms):
        OEThrow.Error("No matched pairs found with %s data" % field)

    print("")
    print("*** Transforms sorted by delta", field)

    xforms.sort(key=lambda c: -abs(float(c.avg)))
    idx = 0
    for xfm in xforms:
        idx += 1
        if (extractMode & OEMatchedPairTransformExtractMode_NoSMARTS) != 0:
            # not 'invertable' if SMARTS qualifiers were applied
            if xfm.avg < 0.:
                xfm.avg = -1. * xfm.avg
                xfm.xform.Invert()
        print("{0:2} {2}=(avg={3:.2f},stdev={4:.2f},num={5}) {1}".format(idx,
                                                                         xfm.xform.GetTransform(),
                                                                         field,
                                                                         xfm.avg,
                                                                         xfm.std,
                                                                         xfm.num))

############################################################
InterfaceData = """
# matchedpairtransformlist interface file
!BRIEF [ -datafield sdDataField ] index.sdf
!CATEGORY MatchedPairTransformList

    !CATEGORY I/O
        !PARAMETER -input 1
          !ALIAS -i
          !TYPE string
          !REQUIRED true
          !BRIEF Input filename of structures to index
          !KEYLESS 1
        !END
    !END

    !CATEGORY options
        !PARAMETER -context 1
           !ALIAS -c
           !TYPE string
           !DEFAULT 0
           !BRIEF chemistry context to use for the transformation [0|1|2|3|A]
        !END

        !PARAMETER -printlist 2
           !ALIAS -p
           !ALIAS -pri
           !TYPE bool
           !DEFAULT 1
           !BRIEF print all transforms and matched pairs
        !END

        !PARAMETER -datafield 3
           !ALIAS -d
           !TYPE string
           !BRIEF sort transforms based on delta change in this property
        !END
    !END
!END
"""


def main(argv=[__name__]):
    itf = OEInterface(InterfaceData)
    OEConfigureMatchedPairIndexOptions(itf)

    if not OEParseCommandLine(itf, argv):
        OEThrow.Fatal("Unable to interpret command line!")

    MMPAnalyze(itf)

if __name__ == "__main__":
    sys.exit(main(sys.argv))

Apply ChEMBL solubility transformations

#!/usr/bin/env python
#############################################################################
# Copyright (C) 2014 OpenEye Scientific Software, Inc.
#############################################################################
# Utility to apply ChEMBL18 solubility transforms to an input set of structures
# ---------------------------------------------------------------------------
# ChEMBLsolubility.py input_mols output_mols [ -verbose ] [ -context [0|2] ] [ -minpairs # ]
#
# input_mols: filename of molecules to transform based on analysis
# output_mols: filename to collect transformed molecules
#############################################################################
from openeye.oechem import *
from openeye.oemedchem import *
import sys

############################################################
InterfaceData = """
!BRIEF [-i] <infile1> [-o] <infile2> [ -verbose ] [ -context [0|2]] [ -minpairs # ]
!PARAMETER -i
  !ALIAS -in
  !ALIAS -input
  !TYPE string
  !REQUIRED true
  !BRIEF Input file name
  !KEYLESS 1
!END
!PARAMETER -o
  !ALIAS -out
  !ALIAS -output
  !TYPE string
  !REQUIRED true
  !BRIEF Output file name
  !KEYLESS 2
!END
!PARAMETER -verbose
  !ALIAS -v
  !TYPE bool
  !DEFAULT false
  !BRIEF Verbose output
!END
!PARAMETER -context
  !ALIAS -c
  !TYPE string
  !DEFAULT 0
  !BRIEF Chemistry context for output
!END
!PARAMETER -minpairs 2
   !TYPE int
   !DEFAULT 0
   !BRIEF require at least -minpairs to apply the transformations (default: all)
!END
"""


def main(argv=[__name__]):
    itf = OEInterface(InterfaceData, argv)

    verbose = itf.GetBool("-verbose")

    # input structure(s) to transform
    ifsmols = oemolistream()
    if not ifsmols.open(itf.GetString("-i")):
        OEThrow.Fatal("Unable to open %s for reading" % itf.GetString("-i"))

    # save output structure(s) to this file
    ofs = oemolostream()
    if not ofs.open(itf.GetString("-o")):
        OEThrow.Fatal("Unable to open %s for writing" % itf.GetString("-o"))

    # request a specific context for the transform activity, here 0-bonds
    chemctxt = OEMatchedPairContext_Bond0
    askcontext = itf.GetString("-context")[:1]
    if askcontext == '0':
        chemctxt = OEMatchedPairContext_Bond0
    elif askcontext == '2':
        chemctxt = OEMatchedPairContext_Bond2
    else:
        OEThrow.Fatal("Invalid context specified: " +
                      askcontext + ", only 0|2 allowed")

    minpairs = itf.GetInt("-minpairs")
    if minpairs > 1:
        print('Requiring at least {0:d} matched pairs to apply transformations'.format(minpairs))

    irec = 0
    ocnt = 0
    ototal = 0
    for mol in ifsmols.GetOEGraphMols():
        irec += 1
        OETheFunctionFormerlyKnownAsStripSalts(mol)
        iter = OEApplyChEMBL18SolubilityTransforms(mol, chemctxt, minpairs)
        if not iter.IsValid():
            name = mol.GetTitle()
            if not mol.GetTitle():
                name = 'record ' + str(irec)
            OEThrow.Warning("%s: did not produce any output" % name)
            continue
        ocnt = 0
        for outmol in iter:
            ocnt += 1
            OEWriteMolecule(ofs, outmol)
        if not ocnt:
            print('Record', irec, 'No output generated')
            print(OEMolToSmiles(mol))
        else:
            ototal += ocnt
            if verbose:
                print('Record:', "{0:4d}".format(irec),
                      'transformation count=', "{0:6d}".format(ocnt),
                      'total mols=', "{0:7d}".format(ototal))

    if not irec:
        OEThrow.Fatal('No records in input structure file to transform')

    if not ocnt:
        OEThrow.Warning('No transformed structures generated')

    print("Input molecules={0:d} output molecules={1:d}".format(irec, ototal))

    return 0

if __name__ == "__main__":
    sys.exit(main(sys.argv))

MCS Similarity score reporting

#!/usr/bin/env python
###############################################################################
# Copyright (C) 2016 OpenEye Scientific Software, Inc.
###############################################################################
# Utility to index a set of structures and return the highest MCS similarity
#  structures from an input query structure
# ---------------------------------------------------------------------------
# MCSFragDatabase [ -inmols index_mols | -indb index.mcsfrag ]
#                 [ -query query_file ]
#                 [ -type bond -limit 10 ]    // 10 Tanimoto sim bond scores
#                 [ -verbose 1 ]              // optional verbosity
#                 [ -outdb outindex.mcsfrag ] // optional save index file
#
# index_mols: filename of input molecules to analyze
# input.mcsfrag: filename of index file to load
# query_file: filename of query to report MCS similarity results
###############################################################################
from __future__ import print_function
from openeye.oechem import *
from openeye.oemedchem import *
import sys


def MCSFragAnalyze(itf):

    mcsopt = OEMCSFragDatabaseOptions()
    if not OESetupMCSFragDatabaseOptions(mcsopt, itf):
        OEThrow.Fatal("Error setting MCS fragment database options")

    # either:
    #   1) input structures to index
    #   2) MCS fragment database to load
    indb = None
    outdb = None
    inmols = None

    verbose = itf.GetBool("-verbose")
    timer = itf.GetBool("-timer")

    if not itf.HasString("-inmols") and not itf.HasString("-indb"):
        OEThrow.Fatal("Must specify either -inmols or -indb")
    elif itf.HasString("-inmols") and itf.HasString("-indb"):
        OEThrow.Fatal("Must specify only one of -inmols or -indb")
    elif itf.HasString("-inmols") and not itf.HasString("-outdb") and not itf.HasString("-query"):
        OEThrow.Fatal("Must specify one of -outdb or -query with -inmols")
    elif itf.HasString("-indb") and not itf.HasString("-query"):
        OEThrow.Fatal("Must specify -query with -indb")

    if itf.HasString("-outdb"):
        outdb = itf.GetString("-outdb")
        if not OEIsMCSFragDatabaseFiletype(outdb):
            OEThrow.Fatal("Option -outdb must specify a .mcsfrag filename: " + outdb)

    watch = OEStopwatch()

    # initialize the MCS fragment index
    mcsdb = OEMCSFragDatabase(mcsopt)

    ifsindex = oemolistream()
    if itf.HasString("-indb"):
        indb = itf.GetString("-indb")
        if not OEIsMCSFragDatabaseFiletype(indb):
            OEThrow.Fatal("Option -indb must specify a .mcsfrag filename: " + indb)
        if verbose:
            OEThrow.Info("Loading index from " + indb)
        watch.Start()
        if not OEReadMCSFragDatabase(indb, mcsdb):
            OEThrow.Fatal("Error deserializing MCS fragment database: " + indb)
        loadtime = watch.Elapsed()

        if itf.HasString("-outdb"):
            OEThrow.Warning("Option -outdb only meaningful with -inmols, ignoring")
            outdb = None

        if not mcsdb.NumMols():
            OEThrow.Fatal("Loaded empty index")
        elif timer:
            if not verbose or not loadtime:
                print("Loaded index of {0} molecules, {1} fragments"
                      .format(mcsdb.NumMols(), mcsdb.NumFragments()))
            else:
                print("Loaded index of {0} molecules, {1} fragments in {2:.2F} sec: {3:,.1F} mols/sec {4:,.1F} frags/sec"
                      .format(mcsdb.NumMols(), mcsdb.NumFragments(), loadtime, float(mcsdb.NumMols())/loadtime, float(mcsdb.NumFragments())/loadtime))

        # index instantiated - drop through to results

    elif itf.HasString("-inmols"):
        inmols = itf.GetString("-inmols")
        if not ifsindex.open(inmols):
            OEThrow.Fatal("Unable to open input file for indexing: " + inmols)

    queryfile = None
    ifsquery = oemolistream()
    if itf.HasString("-query"):
        queryfile = itf.GetString("-query")
        if not ifsquery.open(queryfile):
            OEThrow.Fatal("Unable to open query file: " + queryfile)

    if inmols:
        if not mcsopt.HasIndexableFragmentHeavyAtomRange():
            OEThrow.Info("Indexing all fragments")
        else:
            print("Using index range={0:.1f}-{1:.1f}%".format(mcsopt.GetIndexableFragmentRangeMin(),
                                                              mcsopt.GetIndexableFragmentRangeMax()))

        if (mcsopt.GetOptions() & OEMatchedPairOptions_AllCuts) != OEMatchedPairOptions_AllCuts:
            print("Using nondefault cut option={0}".format(mcsopt.GetOptions() & OEMatchedPairOptions_AllCuts))
        if (mcsopt.GetOptions() & OEMatchedPairOptions_IndexHSites) == 0:
            print("Using nondefault HMember off option")

        # add molecules to be indexed
        maxrec = 0
        if itf.HasInt("-maxrec"):
            maxrec = itf.GetInt("-maxrec")
        if maxrec:
            print("Indexing a maximum of", maxrec, "records")

        vstatus = 0
        if itf.HasInt("-status"):
            vstatus = itf.GetInt("-status")

        record = 0
        mol = OEGraphMol()
        indexed = 0
        watch.Start()
        while OEReadMolecule(ifsindex, mol):
            record += 1
            status = mcsdb.AddMol(mol, record)
            if status == record:
                indexed += 1
            elif verbose:
                OEThrow.Info("Input structure not added to index, record={0} status={1}"
                             .format(record,
                                     OEMatchedPairIndexStatusName(status)))
            if maxrec and record >= maxrec:
                break  # maximum record limit reached
            if vstatus and (record % vstatus) == 0:
                print("{0} records processed, {1} structures indexed".format(record, indexed))

        indextime = watch.Elapsed()
        if record == 0:
            OEThrow.Fatal("No records in input structure file for indexing")
        elif timer:
            if not verbose or not indextime:
                print("Processed {0} molecules, generating {1} fragments"
                      .format(record, mcsdb.NumFragments()))
            else:
                print("Processed {0} molecules, generating {1} fragments in {2:.2F} sec: {3:,.1F} mols/sec {4:,.1F} frags/sec"
                      .format(record, mcsdb.NumFragments(), indextime, float(record)/float(indextime), float(mcsdb.NumFragments())/float(indextime)))

        # indexing complete - drop through to results

    if not mcsdb.NumFragments():
        OEThrow.Fatal("No fragments found from indexing, use -fragGe,-fragLe options to extend indexable range")

    if outdb:
        if verbose:
            OEThrow.Info("Writing index to " + outdb)
        if not OEWriteMCSFragDatabase(outdb, mcsdb):
            OEThrow.Fatal("Error serializing MCS fragment database: " + outdb)

    # return some status information about the index
    print("Index: molecules={0} fragments={1}\n".format(mcsdb.NumMols(), mcsdb.NumFragments()))

    # check for (optional) query
    if not queryfile:
        return

    # process the query options
    qmaxrec = 0
    if itf.HasInt("-qlim"):
        qmaxrec = itf.GetInt("-qlim")

    numscores = 10
    if itf.HasInt("-limit"):
        numscores = itf.GetInt("-limit")

    cnttype = OEMCSScoreType_BondCount
    if itf.HasString("-type"):
        if "atom" in itf.GetString("-type"):
            cnttype = OEMCSScoreType_AtomCount
        elif "bond" in itf.GetString("-type"):
            cnttype = OEMCSScoreType_BondCount
        else:
            OEThrow.Warning("Ignoring unrecognized -type option, expecting atom or bond: " + itf.GetString("-type"))

    cntname = "bond"
    if cnttype != OEMCSScoreType_BondCount:
        cntname = "atom"

    # process the query (or queries)
    qmol = OEGraphMol()
    qnum = 0
    while OEReadMolecule(ifsquery, qmol):
        qnum += 1

        numhits = 0
        for score in mcsdb.GetSortedScores(qmol, numscores, 0, 0, True, cnttype):
            if numhits == 0:
                print("Query: {0}: {1}".format(qnum, qmol.GetTitle()))

            print("\trecord: {0:6}\ttanimoto_{1}_score: {2:.2f}\tmcs_core: {3}"
                  .format(score.GetIdx(), cntname, score.GetScore(), score.GetMCSCore()))
            numhits += 1

        if numhits == 0:
            OEThrow.Warning("Query: {0}: {1} - no hits".format(qnum, qmol.GetTitle()))

        if qmaxrec and qnum >= qmaxrec:
            break

    if qnum == 0:
        OEThrow.Fatal("Error reading query structure(s): " + queryfile)


############################################################
InterfaceData = """
#MCSFragmentDatabase interface file
!CATEGORY MCSFragmentDatabase

    !CATEGORY I/O
        !PARAMETER -inmols 1
          !TYPE string
          !BRIEF Input filename of structure(s) to index
        !END

        !PARAMETER -indb 2
          !TYPE string
          !BRIEF Input filename of index file to load
        !END

        !PARAMETER -query 3
          !ALIAS -q
          !TYPE string
          !BRIEF query structure file to report similarity results against
        !END

        !PARAMETER -outdb 4
          !TYPE string
          !BRIEF Output filename of serialized index
        !END

    !END

    !CATEGORY options
        !PARAMETER -type 1
           !TYPE string
           !DEFAULT bond
           !BRIEF MCS core counts to use for reported scores: atom or bond
        !END
        !PARAMETER -limit 2
           !TYPE int
           !DEFAULT 10
           !BRIEF report -limit scores for the query structure
        !END
        !PARAMETER -verbose 3
           !TYPE bool
           !DEFAULT 0
           !BRIEF generate verbose output
        !END
        !PARAMETER -maxrec 4
           !TYPE int
           !DEFAULT 0
           !BRIEF limit indexing to -maxrec records from the -inputmols structures
        !END
        !PARAMETER -qlim 5
           !TYPE int
           !DEFAULT 1
           !BRIEF limit query processing to -qlim records from the -query structures
        !END
        !PARAMETER -timer 6
           !TYPE bool
           !DEFAULT 1
           !BRIEF report loading or indexing time
        !END
        !PARAMETER -status 7
           !TYPE int
           !DEFAULT 0
           !BRIEF print indexing status every -status records
        !END
    !END
!END
"""


def main(argv=[__name__]):
    itf = OEInterface(InterfaceData)
    OEConfigureMCSFragDatabaseOptions(itf)

    if not OEParseCommandLine(itf, argv):
        OEThrow.Fatal("Unable to interpret command line!")

    MCSFragAnalyze(itf)


if __name__ == "__main__":
    sys.exit(main(sys.argv))

MCS Fragmentation API

#!/usr/bin/env python
###############################################################################
# Copyright (C) 2017 OpenEye Scientific Software, Inc.
###############################################################################
# report common fragments from an input set of molcules
###############################################################################
from __future__ import print_function
import openeye.oechem as oechem
import openeye.oemedchem as oemedchem

import operator
import sys


def MCSFragOccurrence(itf):

    mcsopt = oemedchem.OEMCSFragDatabaseOptions()
    if not oemedchem.OESetupMCSFragDatabaseOptions(mcsopt, itf):
        oechem.OEThrow.Fatal("Error setting MCS fragment database options")

    indb = None
    outdb = None
    inmols = None

    verbose = itf.GetBool("-verbose")
    timer = itf.GetBool("-timer")

    if itf.HasString("-outdb"):
        outdb = itf.GetString("-outdb")
        if not oemedchem.OEIsMCSFragDatabaseFiletype(outdb):
            oechem.OEThrow.Fatal("Option -outdb must specify a .mcsfrag filename: " + outdb)

    watch = oechem.OEStopwatch()

    # initialize the MCS fragment index
    mcsdb = oemedchem.OEMCSFragDatabase(mcsopt)
    mol = oechem.OEMol() # multiconformer molecules will only process the active conformer

    maxrec = itf.GetInt("-maxrec")
    vstatus = itf.GetInt("-status")

    if itf.HasString("-indb"):
        indb = itf.GetString("-indb")
        if not oemedchem.OEIsMCSFragDatabaseFiletype(indb):
            oechem.OEThrow.Fatal("Option -indb must specify a .mcsfrag filename: " + indb)
        if verbose:
            print("Loading index from {0}".format(indb))
        watch.Start()
        if not oemedchem.OEReadMCSFragDatabase(indb, mcsdb):
            oemedchem.OEThrow.Fatal("Error deserializing MCS fragment database: " + indb)
        loadtime = watch.Elapsed()

        if itf.HasString("-outdb"):
            oechem.OEThrow.Warning("Option -outdb only meaningful with -inmols, ignoring")
            outdb = None

        if not mcsdb.NumMols():
            oechem.OEThrow.Fatal("Loaded empty index")
        elif timer:
            if not verbose or not loadtime:
                print("Loaded index of {0} molecules, {1} fragments"
                      .format(mcsdb.NumMols(), mcsdb.NumFragments()))
            else:
                print("Loaded index of {0} molecules, {1} fragments in {2:.2F} sec: {3:,.1F} mols/sec {4:,.1F} frags/sec"
                      .format(mcsdb.NumMols(), mcsdb.NumFragments(), loadtime, float(mcsdb.NumMols())/loadtime, float(mcsdb.NumFragments())/loadtime))

        # index instantiated - drop through to results

    inmols = itf.GetString("-inmols")
    ifsindex = oechem.oemolistream()
    if not ifsindex.open(inmols):
        oechem.OEThrow.Fatal("Unable to open input file for indexing: " + inmols)

    if inmols and not itf.HasString("-indb"):
        if not mcsopt.HasIndexableFragmentHeavyAtomRange():
            print("Indexing all fragments")
        else:
            print("Using index range={0:.1f}-{1:.1f}%".format(mcsopt.GetIndexableFragmentRangeMin(),
                                                              mcsopt.GetIndexableFragmentRangeMax()))

        if (mcsopt.GetOptions() & oemedchem.OEMatchedPairOptions_AllCuts) != oemedchem.OEMatchedPairOptions_AllCuts:
            print("Using nondefault cut option={0}".format(mcsopt.GetOptions() & oemedchem.OEMatchedPairOptions_AllCuts))
        if (mcsopt.GetOptions() & oemedchem.OEMatchedPairOptions_IndexHSites) == 0:
            print("Using nondefault HMember off option")

        # add molecules to be indexed
        if maxrec:
            print("Indexing a maximum of {0} records".format(maxrec))

        record = 0
        indexed = 0
        watch.Start()
        while oechem.OEReadMolecule(ifsindex, mol):
            record += 1
            status = mcsdb.AddMol(mol, record)
            if status == record:
                indexed += 1
            elif verbose:
                print("Input structure not added to index, record={0} status={1}"
                      .format(record,oemedchem.OEMatchedPairIndexStatusName(status)))
            if maxrec and record >= maxrec:
                break  # maximum record limit reached
            if vstatus and (record % vstatus) == 0:
                print("{0} records processed, {1} structures indexed".format(record, indexed))

        indextime = watch.Elapsed()
        if record == 0:
            oechem.OEThrow.Fatal("No records in input structure file for indexing")
        elif timer:
            if not verbose or not indextime:
                print("Processed {0} molecules, generating {1} fragments"
                      .format(record, mcsdb.NumFragments()))
            else:
                print("Processed {0} molecules, generating {1} fragments in {2:.2F} sec: {3:,.1F} mols/sec {4:,.1F} frags/sec"
                      .format(record, mcsdb.NumFragments(), indextime, float(record)/float(indextime), float(mcsdb.NumFragments())/float(indextime)))

        # indexing complete - drop through to results

    if not mcsdb.NumFragments():
        oechem.OEThrow.Fatal("No fragments found from indexing, use -fragGe,-fragLe options to extend indexable range")

    if outdb:
        if verbose:
            print("Writing index to {0}".format(outdb))
        if not oemedchem.OEWriteMCSFragDatabase(outdb, mcsdb):
            oechem.OEThrow.Fatal("Error serializing MCS fragment database: " + outdb)

    # return some status information about the index
    print("Index: molecules={0} fragments={1}\n".format(mcsdb.NumMols(), mcsdb.NumFragments()))

    # now rewind the input, fragment and collect stats
    ifsindex.rewind()
    watch.Start()

    record = 0
    uniqueCores = set()
    while oechem.OEReadMolecule(ifsindex, mol):
        record += 1
        cores = mcsdb.MoleculeToCores(mol)
        for core in cores:
            if not mcsdb.CoreToMoleculeCount(core):
                oechem.OEThrow.Warning('Occurrence 0 for {0}???'.format(core))
                continue
            uniqueCores.add(core)  # collect unique cores with >0 counts in the index

        if maxrec and record >= maxrec:
            break  # maximum record limit reached
        if vstatus and (record % vstatus) == 0:
            print("{0} records processed, {1} unique core fragments found".format(record, len(uniqueCores)))

    coreOcc = dict()
    for core in uniqueCores:
        coreOcc[core] = mcsdb.CoreToMoleculeCount(core)

    print()
    print('Common fragments with occurrence >1:')
    num = 0
    topnum = itf.GetInt('-top')
    for k, v in sorted(coreOcc.items(), key=operator.itemgetter(1,0), reverse=True):
        if v <= 1:
            break
        num += 1
        ids = list(mcsdb.CoreToMolecules(k))
        print(v, k, ids)
        if topnum and num >= topnum:
            break

    if itf.GetBool('-uncommon'):
        print()
        print('Uncommon fragments:')
        for k, v in sorted(coreOcc.items(), key=operator.itemgetter(1)):
            if v > 1:
                break
            ids = list(mcsdb.CoreToMolecules(k))
            print(v, k, ids)


############################################################
InterfaceData = """
#MCSFragOccurrence interface file
!CATEGORY MCSFragOccurrence

    !CATEGORY I/O
        !PARAMETER -inmols 1
          !TYPE string
          !REQUIRED true
          !BRIEF Input filename of structure(s) to index
        !END

        !PARAMETER -indb 2
          !TYPE string
          !BRIEF Input filename of index file to load
        !END

        !PARAMETER -outdb 3
          !TYPE string
          !BRIEF Output filename of serialized index
        !END

    !END

    !CATEGORY options
        !PARAMETER -verbose 1
           !TYPE bool
           !DEFAULT 0
           !BRIEF generate verbose output
        !END
        !PARAMETER -maxrec 2
           !TYPE int
           !DEFAULT 0
           !LEGAL_RANGE 0 inf
           !BRIEF limit indexing to -maxrec records from the -inmols structures
        !END
        !PARAMETER -timer 3
           !TYPE bool
           !DEFAULT 1
           !BRIEF report loading or indexing time
        !END
        !PARAMETER -status 4
           !TYPE int
           !DEFAULT 0
           !LEGAL_RANGE 0 inf
           !BRIEF print indexing status every -status records
        !END
        !PARAMETER -top 5
           !TYPE int
           !DEFAULT 10
           !LEGAL_RANGE 0 inf
           !BRIEF print the -top number of common fragment occurrences (0:all, 10:default)
        !END
        !PARAMETER -uncommon 6
           !TYPE bool
           !DEFAULT 0
           !BRIEF print the uncommon fragments also
        !END
    !END
!END
"""


def main(argv=[__name__]):
    itf = oechem.OEInterface(InterfaceData)
    oemedchem.OEConfigureMCSFragDatabaseOptions(itf)

    if not oechem.OEParseCommandLine(itf, argv):
        oechem.OEThrow.Fatal("Unable to interpret command line!")

    MCSFragOccurrence(itf)


if __name__ == "__main__":
    sys.exit(main(sys.argv))