Shape Database Client Histogram
ShapeDatabaseClientHistogram.py <server:port> <query> <results> [num_hits = 100]
Example script to send a query to a specified ShapeDatabaseServer and print the histogram of scores for the entire database of molecules. The histogram is a simple ASCII representation of the distribution. The histogram will also be updated in ‘real-time’ as the query progresses.
Download code
#!/usr/bin/env python
# (C) 2022 Cadence Design Systems, Inc. (Cadence)
# All rights reserved.
# TERMS FOR USE OF SAMPLE CODE The software below ("Sample Code") is
# provided to current licensees or subscribers of Cadence products or
# SaaS offerings (each a "Customer").
# Customer is hereby permitted to use, copy, and modify the Sample Code,
# subject to these terms. Cadence claims no rights to Customer's
# modifications. Modification of Sample Code is at Customer's sole and
# exclusive risk. Sample Code may require Customer to have a then
# current license or subscription to the applicable Cadence offering.
# THE SAMPLE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED. OPENEYE DISCLAIMS ALL WARRANTIES, INCLUDING, BUT
# NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. In no event shall Cadence be
# liable for any damages or liability in connection with the Sample Code
# or its use.
from __future__ import unicode_literals
import os
import sys
import argparse
try:
from xmlrpclib import ServerProxy, Binary, Fault
except ImportError: # python 3
from xmlrpc.client import ServerProxy, Binary, Fault
class Pyasciigraph:
    """Render (info, value) pairs as a horizontal ASCII bar chart.

    Copied from https://pypi.python.org/pypi/ascii_graph/0.2.1
    """

    def __init__(self, line_length=79, min_graph_length=50, separator_length=2):
        """Constructor of Pyasciigraph

        :param int line_length: the max number of char on a line
            if any line cannot be shorter,
            it will go over this limit
        :param int min_graph_length: the min number of char used by the graph
        :param int separator_length: the length of field separator
        """
        self.line_length = line_length
        self.separator_length = separator_length
        self.min_graph_length = min_graph_length

    def _u(self, x):
        """Return *x* decoded to unicode on Python 2, unchanged on Python 3."""
        # Compare the numeric major version: comparing sys.version as a
        # string misclassifies any version whose text sorts before '3'
        # (for example '10.0').
        if sys.version_info[0] < 3:
            import codecs
            return codecs.unicode_escape_decode(x)[0]
        return x

    def _get_maximum(self, data):
        """Collect the extremes needed to lay out the graph.

        :param iterable data: (info, value) tuples, info already a string
        :rtype: dict with keys 'max_value' (largest value, never below 0),
            'info_max_length' and 'value_max_length' (widest renderings)
        """
        all_max = {
            'value_max_length': 0,
            'info_max_length': 0,
            'max_value': 0,
        }
        for (info, value) in data:
            if value > all_max['max_value']:
                all_max['max_value'] = value
            all_max['info_max_length'] = max(all_max['info_max_length'],
                                             len(info))
            all_max['value_max_length'] = max(all_max['value_max_length'],
                                              len(str(value)))
        return all_max

    def _gen_graph_string(self, value, max_value, graph_length, start_value):
        """Build the bar for one row, padded with spaces up to start_value."""
        number_of_square = 0
        # A zero max_value means every bar is empty; skip the division.
        if max_value:
            number_of_square = int(value * graph_length / max_value)
        number_of_space = int(start_value - number_of_square)
        return '#' * number_of_square + self._u(' ') * number_of_space

    def _gen_info_string(self, info, start_info, line_length):
        """Return the info text right-padded to the end of the line."""
        number_of_space = line_length - start_info - len(info)
        return info + self._u(' ') * number_of_space

    def _gen_value_string(self, value, start_value, start_info):
        """Return the value right-aligned in its column plus a separator."""
        number_space = (start_info - start_value -
                        len(str(value)) - self.separator_length)
        return ' ' * number_space + str(value) + ' ' * self.separator_length

    def _sanitize_string(self, string):
        """Return *string* as text.

        Text values are returned unchanged; anything else is converted with
        str().  Previously an unsupported type (e.g. bool) left the result
        unassigned and raised UnboundLocalError.
        """
        # The second entry is the unicode type on Python 2 (str on Python 3).
        text_types = (str, type(self._u('t')))
        if isinstance(string, text_types):
            return string
        return str(string)

    def _sanitize_data(self, data):
        """Return data as a list of (sanitized info, value) tuples."""
        return [(self._sanitize_string(item[0]), item[1]) for item in data]

    def graph(self, label, data, sort=0, with_value=True):
        """function generating the graph

        :param string label: the label of the graph
        :param iterable data: the data (list of tuple (info, value))
            info must be "castable" to a unicode string
            value must be an int or a float
        :param int sort: flag sorted
            0: not sorted (same order as given) (default)
            1: increasing order
            2: decreasing order
        :param boolean with_value: flag printing value
            (accepted for compatibility; this implementation does not
            consult it and always prints the value)
        :rtype: a list of strings (each lines)
        """
        san_data = self._sanitize_data(data)
        san_label = self._sanitize_string(label)

        if sort == 1:
            san_data = sorted(san_data, key=lambda item: item[1],
                              reverse=False)
        elif sort == 2:
            san_data = sorted(san_data, key=lambda item: item[1],
                              reverse=True)

        all_max = self._get_maximum(san_data)

        # Measure the sanitized label: the raw label may be a number, which
        # has no len().
        real_line_length = max(self.line_length, len(san_label))
        min_line_length = (self.min_graph_length +
                           2 * self.separator_length +
                           all_max['value_max_length'] +
                           all_max['info_max_length'])

        if min_line_length < real_line_length:
            # Everything fits: lay the columns out from the right edge.
            start_info = self.line_length - all_max['info_max_length']
            start_value = (start_info - self.separator_length -
                           all_max['value_max_length'])
            graph_length = start_value - self.separator_length
        else:
            # Too wide: give the graph its minimum width and let the line
            # grow past line_length.
            start_value = self.min_graph_length + self.separator_length
            start_info = (start_value + all_max['value_max_length'] +
                          self.separator_length)
            graph_length = self.min_graph_length
            real_line_length = min_line_length

        result = [san_label, self._u('#') * real_line_length]
        for info, value in san_data:
            graph_string = self._gen_graph_string(
                value, all_max['max_value'], graph_length, start_value)
            value_string = self._gen_value_string(
                value, start_value, start_info)
            info_string = self._gen_info_string(
                info, start_info, real_line_length)
            result.append(graph_string + value_string + info_string)

        return result
def AddBin(bins, binSize, binIdx, curTotal):
    """Append one histogram bar to *bins*.

    The bar is stored as a (label, count) tuple whose label is the lower
    bound of the bin (binSize * binIdx) formatted with two decimals.
    """
    entry = ("%.2f" % (binIdx * binSize), curTotal)
    bins.append(entry)
def GetGraphTitle(tversky, shapeOnly):
    """Return the histogram title for the selected scoring mode.

    Both arguments are interpreted by truthiness: tversky selects Tversky
    over Tanimoto, shapeOnly selects Shape over Combo.
    """
    titles = {
        (False, False): "FastROCS Tanimoto Combo Score Distribution",
        (False, True): "FastROCS Tanimoto Shape Score Distribution",
        (True, False): "FastROCS Tversky Combo Score Distribution",
        (True, True): "FastROCS Tversky Shape Score Distribution",
    }
    return titles[(bool(tversky), bool(shapeOnly))]
def PrintHistogram(hist, tversky=None, shapeOnly=None):
    """Print an ASCII histogram of a score distribution to stdout.

    Every run of ``squashFactor`` (10) consecutive entries of *hist* is
    summed into one bar, labelled with the lower bound of its score range.
    The score axis spans 0 to 1.0 when *shapeOnly* is truthy and 0 to 2.0
    otherwise.

    :param hist: sequence of per-bucket counts
    :param tversky: truthy to title the graph as Tversky (else Tanimoto)
    :param shapeOnly: truthy for shape-only scores (else combo)
    """
    if len(hist) == 0:
        # Nothing to plot, and the bin-size computation below would
        # otherwise divide by zero.
        return

    squashFactor = 10
    maxScore = 1.0 if shapeOnly else 2.0
    binSize = maxScore / (len(hist) / squashFactor)

    bins = []
    chunkStarts = range(0, len(hist), squashFactor)
    for binIdx, start in enumerate(chunkStarts):
        # The final chunk may be shorter than squashFactor.
        chunkTotal = sum(hist[start:start + squashFactor])
        AddBin(bins, binSize, binIdx, chunkTotal)

    graph = Pyasciigraph()
    for line in graph.graph(GetGraphTitle(tversky, shapeOnly), bins):
        print(line)
def GetFormatExtension(fname):
    """Return the lower-cased extension of *fname*.

    A trailing ".gz" is kept together with the extension beneath it, so a
    gzipped file yields a compound extension such as ".oeb.gz".
    """
    root, suffix = os.path.splitext(fname.lower())
    if suffix != ".gz":
        return suffix
    inner = os.path.splitext(root)[1]
    return inner + ".gz"
def main(argv=[__name__]):
    """Submit a query to a ShapeDatabaseServer and print score histograms.

    The command line is parsed by argparse from ``sys.argv``; the *argv*
    parameter is kept for interface compatibility but is not read.  The
    query file is sent to the server, the histogram is printed after each
    status update until the search completes, and the results are written
    to the output file.

    :returns: 0 on success, 1 if the query file cannot be opened or the
        server reports a Fault
    :raises SystemExit: with status 1 when unexpected trailing arguments
        are given
    """
    parser = argparse.ArgumentParser()

    # positional arguments retaining backward compatibility
    parser.add_argument('server:port',
                        help='Server name and port number '
                             'of database to search i.e. localhost:8080.')
    parser.add_argument('query',
                        help='File containing the query molecule to search '
                             '(format not restricted to *.oeb).')
    parser.add_argument('results',
                        help='Output file to store results '
                             '(format not restricted to *.oeb).')
    parser.add_argument('nHits', nargs='?', type=int, default=100,
                        help='Number of hits to return (default=100).')
    # Options default to SUPPRESS so that only the ones actually given end
    # up in the dictionary forwarded to the server.
    parser.add_argument('--tversky', action='store_true',
                        default=argparse.SUPPRESS,
                        help='Switch to Tversky similarity scoring '
                             '(default=Tanimoto).')
    parser.add_argument('--shapeOnly', action='store_true',
                        default=argparse.SUPPRESS,
                        help='Switch to shape-only scores (default=Combo).')
    parser.add_argument('--alternativeStarts', default=argparse.SUPPRESS,
                        nargs=1, dest='altStarts',
                        choices=('random', 'subrocs',
                                 'inertialAtHeavyAtoms',
                                 'inertialAtColorAtoms'),
                        help='Optimize using alternative starts. '
                             'To perform N random starts do '
                             '"--alternativeStarts random N" (default N=10)')

    known, remaining = parser.parse_known_args()
    dargs = vars(known)

    qfname = dargs.pop('query')
    numHits = dargs.pop('nHits')

    startType = dargs.get('altStarts', None)
    if startType:
        dargs['altStarts'] = str(startType[0])
        # A single leftover argument after "random" is the number of starts.
        if len(remaining) == 1 and dargs['altStarts'] == 'random':
            try:
                dargs['randStarts'] = int(remaining[0])
            except ValueError:
                print("Invalid argument given. See --help menu for argument list")
                # Exit non-zero so callers can detect the failure.
                sys.exit(1)
        if len(remaining) > 1:
            print("Too many arguments given. See --help menu for argument list")
            sys.exit(1)
    elif remaining:
        print("Too many arguments given. See --help menu for argument list")
        sys.exit(1)

    try:
        fh = open(qfname, 'rb')
    except IOError:
        sys.stderr.write("Unable to open '%s' for reading" % qfname)
        return 1
    # Close the query file as soon as its contents have been read.
    with fh:
        data = Binary(fh.read())

    iformat = GetFormatExtension(qfname)
    ofname = dargs.pop('results')
    oformat = GetFormatExtension(ofname)

    s = ServerProxy("http://" + dargs.pop('server:port'))

    try:
        # Whatever is left in dargs is forwarded as the search options.
        idx = s.SubmitQuery(data, numHits, iformat, oformat, dargs)
    except Fault as e:
        sys.stderr.write(str(e))
        return 1

    while True:
        blocking = True
        try:
            current, total = s.QueryStatus(idx, blocking)
            hist = s.QueryHistogram(idx)
        except Fault as e:
            # sys.stderr.write keeps this consistent with the other error
            # paths and parseable on Python 2; the newline matches what
            # print() emitted.
            sys.stderr.write(str(e) + "\n")
            return 1

        if total == 0:
            continue

        PrintHistogram(hist, dargs.get('tversky', None),
                       dargs.get('shapeOnly', None))

        if total <= current:
            break

    results = s.QueryResults(idx)

    # assuming the results come back as a string in the requested format
    with open(ofname, 'wb') as output:
        output.write(results.data)

    return 0
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)