Preparing a Protein
A program that selects the highest occupancy alternate locations of a protein, adds hydrogens in orientations that make appropriate hydrogen bonds, and splits out the complex for the selected binding site (optionally separating the ligand from the protein complex). This is an initial prototype of a “protein prep” workflow.
An example command that splits out the ligand and limits water sampling is as follows:
prompt> python proteinprep.py -waterprocessing ignore input.pdb prot.oeb.gz lig.oeb.gz
Help is available for all the supported parameters:
prompt> python proteinprep.py --help all
proteinprep [-options] <inmol> [<outcplx> [<outlig>]]
Complete parameter list
SplitMolComplex options :
-bindingsitenum : Select this binding site
-covalentligand : Split covalent ligands
-ligandfilter : Ligand filter category
-ligandname : Ligand name
-maxsitedistance : Maximum distance to be associated with the binding site
-maxsurfacedistance : Maximum distance to be associated with the protein
surface
-modelnum : Select this NMR model number
-proteinfilter : Protein filter category
-separateresidues : Separate individual residues before selection
-surfacewaters : Select surface waters
-waterfilter : Water filter category
input/output options :
-in : Input molecule filename (must have 3D coordinates)
-cplxout : Output complex filename
-ligout : Output ligand filename
Calculation options :
-alts : Alternate location atom handling (affects atom:atom interactions)
-placehydrogens : If false, hydrogens will not be added
-waterprocessing : How waters are processed (can greatly affect runtime)
-standardizehyd : If false, bonds for hydrogens are not adjusted to standard
lengths
-clashcutoff : Van der Waals overlap (in Angstroms) defined to be a bad
clash
-flipbias : Scale factor for the bias against flipping sidechains such as
HIS
Display options :
-verbose : Display more information about the process
Code
Download code
#!/usr/bin/env python
# (C) 2022 Cadence Design Systems, Inc. (Cadence)
# All rights reserved.
# TERMS FOR USE OF SAMPLE CODE The software below ("Sample Code") is
# provided to current licensees or subscribers of Cadence products or
# SaaS offerings (each a "Customer").
# Customer is hereby permitted to use, copy, and modify the Sample Code,
# subject to these terms. Cadence claims no rights to Customer's
# modifications. Modification of Sample Code is at Customer's sole and
# exclusive risk. Sample Code may require Customer to have a then
# current license or subscription to the applicable Cadence offering.
# THE SAMPLE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED. OPENEYE DISCLAIMS ALL WARRANTIES, INCLUDING, BUT
# NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. In no event shall Cadence be
# liable for any damages or liability in connection with the Sample Code
# or its use.
#############################################################################
# prepare a molecule: alts, hydrogens, split ligand
#############################################################################
import sys
from openeye import oechem
def WaterProcess(processName):
if processName == "fullsearch":
return oechem.OEPlaceHydrogensWaterProcessing_FullSearch
elif processName == "focused":
return oechem.OEPlaceHydrogensWaterProcessing_Focused
return oechem.OEPlaceHydrogensWaterProcessing_Ignore
def main(argv=[__name__]):
itf = oechem.OEInterface(InterfaceData)
oechem.OEConfigureSplitMolComplexOptions(itf,
oechem.OESplitMolComplexSetup_All &
~ (oechem.OESplitMolComplexSetup_CovBondTreatment |
oechem.OESplitMolComplexSetup_CovCofactor))
if not oechem.OEParseCommandLine(itf, argv):
oechem.OEThrow.Fatal("Unable to interpret command line!")
verbose = itf.GetBool("-verbose")
if verbose:
oechem.OEThrow.SetLevel(oechem.OEErrorLevel_Verbose)
altProcess = itf.GetString("-alts")
keepAlts = (altProcess != "a")
highestOcc = (altProcess == "occupancy")
compareAlts = (altProcess == "compare")
siteNum = itf.GetUnsignedInt("-bindingsitenum")
allSites = (siteNum == 0)
otherModel = (itf.GetUnsignedInt("-modelnum") != 1)
placeHyd = itf.GetBool("-placehydrogens")
splitlig = itf.HasString("-ligout")
watProcessName = itf.GetString("-waterprocessing")
waterProcess = WaterProcess(watProcessName)
standardize = itf.GetBool("-standardizehyd")
badclash = itf.GetDouble("-clashcutoff")
flipbias = itf.GetDouble("-flipbias")
maxStates = itf.GetDouble("-maxsubstates")
flavor = oechem.OEIFlavor_PDB_Default | oechem.OEIFlavor_PDB_DATA
if keepAlts:
flavor = flavor | oechem.OEIFlavor_PDB_ALTLOC
if otherModel:
flavor = flavor & ~ oechem.OEIFlavor_PDB_ENDM
ims = oechem.oemolistream()
ims.SetFlavor(oechem.OEFormat_PDB, flavor)
inputFile = itf.GetString("-in")
if not ims.open(inputFile):
oechem.OEThrow.Fatal("Unable to open %s for reading." % inputFile)
if not oechem.OEIs3DFormat(ims.GetFormat()):
oechem.OEThrow.Fatal("%s is not in a 3D format." % inputFile)
inftype = oechem.OEGetFileType(oechem.OEGetFileExtension(inputFile))
if (inftype == oechem.OEFormat_PDB) and not keepAlts:
oechem.OEThrow.Verbose("Default processing of alt locations (keep just 'A' and ' ').")
sopt = oechem.OESplitMolComplexOptions()
oechem.OESetupSplitMolComplexOptions(sopt, itf)
inmol = oechem.OEGraphMol()
if not oechem.OEReadMolecule(ims, inmol):
oechem.OEThrow.Fatal("Unable to read %s." % inputFile)
ims.close()
if (inmol.NumAtoms() == 0):
oechem.OEThrow.Fatal("Input molecule %s contains no atoms." % inputFile)
if inmol.GetTitle() == "":
inmol.SetTitle("input mol")
oechem.OEThrow.Verbose("Processing %s." % inmol.GetTitle())
if not oechem.OEHasResidues(inmol):
oechem.OEPerceiveResidues(inmol, oechem.OEPreserveResInfo_All)
if highestOcc or compareAlts:
alf = oechem.OEAltLocationFactory(inmol)
if alf.GetGroupCount() != 0:
if highestOcc:
oechem.OEThrow.Verbose("Dropping alternate locations from input.")
alf.MakePrimaryAltMol(inmol)
elif compareAlts:
oechem.OEThrow.Verbose("Fixing alternate location issues.")
inmol = alf.GetSourceMol()
outmol = oechem.OEGraphMol()
if allSites:
outmol = inmol
else:
oechem.OEThrow.Verbose("Splitting out selected complex.")
soptSiteSel = oechem.OESplitMolComplexOptions(sopt)
soptSiteSel.SetSplitCovalent(False) # do any cov lig splitting later
frags = oechem.OEAtomBondSetVector()
if not oechem.OEGetMolComplexFragments(frags, inmol, soptSiteSel):
oechem.OEThrow.Fatal("Unable to fragment %s." % inmol.GetTitle())
howManySites = oechem.OECountMolComplexSites(frags)
if howManySites < siteNum:
oechem.OEThrow.Warning(("Binding site count (%d) " +
"less than requested site (%d) in %s.") %
(howManySites, siteNum, inmol.GetTitle()))
exit(0)
if not oechem.OECombineMolComplexFragments(outmol, frags, soptSiteSel):
oechem.OEThrow.Fatal("Unable to collect fragments from %s." % inmol.GetTitle())
if (outmol.NumAtoms() == 0):
oechem.OEThrow.Fatal("No fragments selected from %s." % inmol.GetTitle())
if placeHyd:
oechem.OEThrow.Verbose("Adding hydrogens to complex.")
hopt = oechem.OEPlaceHydrogensOptions()
hopt.SetAltsMustBeCompatible(compareAlts)
hopt.SetStandardizeBondLen(standardize)
hopt.SetWaterProcessing(waterProcess)
hopt.SetBadClashOverlapDistance(badclash)
hopt.SetFlipBiasScale(flipbias)
hopt.SetMaxSubstateCutoff(maxStates)
if verbose:
details = oechem.OEPlaceHydrogensDetails()
if not oechem.OEPlaceHydrogens(outmol, details, hopt):
oechem.OEThrow.Fatal("Unable to place hydrogens and get details on %s."
% inmol.GetTitle())
oechem.OEThrow.Verbose(details.Describe())
else:
if not oechem.OEPlaceHydrogens(outmol, hopt):
oechem.OEThrow.Fatal("Unable to place hydrogens on %s." % inmol.GetTitle())
oms1 = oechem.oemolostream()
cplxFile = itf.GetString("-cplxout")
if not oms1.open(cplxFile):
oechem.OEThrow.Fatal("Unable to open %s for writing." % cplxFile)
if splitlig:
oechem.OEThrow.Verbose("Splitting ligand from complex.")
returnAllSites = 0
oechem.OESetupSplitMolComplexOptions(sopt, itf, returnAllSites)
frags = oechem.OEAtomBondSetVector()
if not oechem.OEGetMolComplexFragments(frags, outmol, sopt):
oechem.OEThrow.Fatal("Unable to fragment complex from %s." % inmol.GetTitle())
lfilter = sopt.GetLigandFilter()
protComplex = oechem.OEGraphMol()
if not oechem.OECombineMolComplexFragments(protComplex,
frags,
sopt,
oechem.OENotRoleSet(lfilter)):
oechem.OEThrow.Fatal("Unable to collect complex from %s." % inmol.GetTitle())
if (protComplex.NumAtoms() == 0):
oechem.OEThrow.Warning("No complex identified in %s." % inmol.GetTitle())
else:
oechem.OEWriteMolecule(oms1, protComplex)
lig = oechem.OEGraphMol()
if not oechem.OECombineMolComplexFragments(lig, frags, sopt, lfilter):
oechem.OEThrow.Fatal("Unable to collect ligand from %s." % inmol.GetTitle())
if (lig.NumAtoms() == 0):
oechem.OEThrow.Warning("No ligand identified in %s." % inmol.GetTitle())
else:
oms2 = oechem.oemolostream()
if splitlig:
ligFile = itf.GetString("-ligout")
if not oms2.open(ligFile):
oechem.OEThrow.Fatal("Unable to open %s for writing." % ligFile)
oechem.OEThrow.Verbose("Ligand: %s" % lig.GetTitle())
oechem.OEWriteMolecule(oms2, lig)
oms2.close()
else:
oechem.OEWriteMolecule(oms1, outmol)
oms1.close()
#############################################################################
# INTERFACE
#############################################################################
InterfaceData = '''
!BRIEF proteinprep.py [-options] <inmol> [<outcplx> [<outlig>]]
!CATEGORY "input/output options :" 1
!PARAMETER -in 1
!ALIAS -i
!TYPE string
!BRIEF Input molecule filename (must have 3D coordinates)
!SIMPLE true
!REQUIRED true
!KEYLESS 1
!END
!PARAMETER -cplxout 2
!ALIAS -p
!TYPE string
!DEFAULT proteinprep.oeb.gz
!BRIEF Output complex filename
!SIMPLE true
!REQUIRED false
!KEYLESS 2
!END
!PARAMETER -ligout 3
!ALIAS -l
!TYPE string
!BRIEF Output ligand filename
!SIMPLE true
!REQUIRED false
!KEYLESS 3
!END
!END
!CATEGORY "Calculation options :" 2
!PARAMETER -alts 1
!TYPE string
!LEGAL_VALUE occupancy
!LEGAL_VALUE a
!LEGAL_VALUE ignore
!LEGAL_VALUE compare
!DEFAULT occupancy
!BRIEF Alternate location atom handling (affects atom:atom interactions)
!SIMPLE true
!REQUIRED false
!DETAIL
occupancy - keep just the highest average occupancy for each alt group
a - keep only loc code A (and blank)
ignore - assume alts already selected appropriately
compare - keep all alts but only interact if same loc code (or blank)
!END
!PARAMETER -placehydrogens 2
!TYPE bool
!DEFAULT true
!BRIEF If false, hydrogens will not be added
!SIMPLE true
!REQUIRED false
!END
!PARAMETER -waterprocessing 3
!TYPE string
!LEGAL_VALUE ignore
!LEGAL_VALUE focused
!LEGAL_VALUE fullsearch
!DEFAULT fullsearch
!BRIEF How waters are processed
!SIMPLE true
!REQUIRED false
!DETAIL
ignore - leave water hydrogens in a random orientation
focused - search orientations based on neighboring polar groups
fullsearch - do an extensive search of water orientations
!END
!PARAMETER -standardizehyd 4
!ALIAS -stdhyd
!TYPE bool
!DEFAULT true
!BRIEF If false, bonds for hydrogens are not adjusted to standard lengths
!SIMPLE false
!REQUIRED false
!END
!PARAMETER -clashcutoff 5
!TYPE double
!DEFAULT 0.4
!BRIEF Van der Waals overlap (in Angstroms) defined to be a bad clash
!SIMPLE false
!REQUIRED false
!END
!PARAMETER -flipbias 6
!TYPE double
!DEFAULT 1.0
!BRIEF Scale factor for the bias against flipping sidechains such as HIS
!SIMPLE false
!REQUIRED false
!END
!PARAMETER -maxsubstates 7
!TYPE double
!DEFAULT 1.0e8
!BRIEF Maximum number of substates in a single step of hydrogen placement optimization
!SIMPLE false
!REQUIRED false
!END
!END
!CATEGORY "Display options :" 9
!PARAMETER -verbose 1
!ALIAS -v
!TYPE bool
!DEFAULT false
!BRIEF Display more information about the process
!SIMPLE true
!REQUIRED false
!END
!END
'''
if __name__ == "__main__":
sys.exit(main(sys.argv))
See also
Protein Preparation section
OEAltLocationFactory object
OEIFlavor_PDB_ALTLOCflavorOEGetMolComplexFragmentsfunctionOECombineMolComplexFragmentsfunctionOESplitMolComplexOptions object
OEConfigureSplitMolComplexOptionsfunctionOESplitMolComplexSetupfunctionOEPlaceHydrogensfunctionOEPlaceHydrogensOptions object
OEPlaceHydrogensDetails object