/usr/share/RDKit/Contrib/mmpa/indexing.py

# Copyright (c) 2012, GlaxoSmithKline Research & Development Ltd.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
#       copyright notice, this list of conditions and the following
#       disclaimer in the documentation and/or other materials provided
#       with the distribution.
#     * Neither the name of GlaxoSmithKline Research & Development Ltd.
#       nor the names of its contributors may be used to endorse or promote
#       products derived from this software without specific prior written
#       permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Created by Jameed Hussain, September 2012

import sys
import re
from rdkit import Chem
from optparse import OptionParser

def heavy_atom_count(smi):

    m = Chem.MolFromSmiles(smi)
    return m.GetNumAtoms()

def add_to_index(smi,attachments,cmpd_heavy):

    result = False
    core_size = heavy_atom_count(smi) - attachments

    if(use_ratio):
        core_ratio = float(core_size) / float(cmpd_heavy)
        if(core_ratio <= ratio ):
            result = True
    else:
        if(core_size <= max_size):
            result = True

    return result

def get_symmetry_class(smi):

    symmetry = []

    m = Chem.MolFromSmiles(smi)

    #determine the symmetry class
    #see: http://www.mail-archive.com/rdkit-discuss@lists.sourceforge.net/msg01894.html
    #A thanks to Greg (and Alan)
    Chem.AssignStereochemistry(m,cleanIt=True,force=True,flagPossibleStereoCenters=True)

    #get the symmetry class of the attachements points
    #Note: 1st star is the zero index,
    #2nd star is first index, etc
    for atom in m.GetAtoms():
        if(atom.GetMass() == 0):
            symmetry.append(atom.GetProp('_CIPRank'))

    return symmetry

def cansmirk(lhs,rhs,context):

    #cansmirk algorithm
    #1) cansmi the LHS.
    #2) For the LHS the 1st star will have label 1, 2nd star will have label 2 and so on
    #3) Do a symmetry check of lhs and rhs and use that to decide if the labels on
    #   RHS or/and context need to change.
    #4) For the rhs, if you have a choice (ie. two attachement points are symmetrically
    #   equivalent), always put the label with lower numerical value on the earlier
    #   attachement point on the cansmi-ed smiles

    #print "in: %s,%s" % (lhs,rhs)

    isotope_track={}
    #if the star count of lhs/context/rhs is 1, single cut
    stars = lhs.count("*")

    if(stars > 1):
        #get the symmetry class of stars of lhs and rhs
        lhs_sym = get_symmetry_class(lhs)
        rhs_sym = get_symmetry_class(rhs)

    #deal with double cuts
    if(stars == 2):
        #simple cases
        #unsymmetric lhs and unsymmetric rhs
        if( (lhs_sym[0] != lhs_sym[1]) and (rhs_sym[0] != rhs_sym[1]) ):
            #get 1st and 2nd labels and store the new label for it in isotope_track
            #structure: isotope_track[old_label]=new_label (as strings)
            isotope_track = build_track_dictionary(lhs,stars)

            #switch labels using isotope track
            lhs = switch_labels_on_position(lhs)
            rhs = switch_labels(isotope_track,stars,rhs)
            context = switch_labels(isotope_track,stars,context)
                        
        #symmetric lhs and symmetric rhs
        elif( (lhs_sym[0] == lhs_sym[1]) and (rhs_sym[0] == rhs_sym[1]) ):            
            #the points are all equivalent so change labels on lhs and rhs based on position
            #labels on context don't need to change
            lhs = switch_labels_on_position(lhs)
            rhs = switch_labels_on_position(rhs)
            
        #more difficult cases..
        #symmetric lhs and unsymmetric rhs
        elif( (lhs_sym[0] == lhs_sym[1]) and (rhs_sym[0] != rhs_sym[1]) ):            
            #switch labels lhs based on position
            lhs = switch_labels_on_position(lhs)
            #change labels on rhs based on position but need to record
            #the changes as need to appy them to the context
            isotope_track = build_track_dictionary(rhs,stars)
            rhs = switch_labels_on_position(rhs)
            context = switch_labels(isotope_track,stars,context)
            
        #unsymmetric lhs and symmetric rhs
        elif( (lhs_sym[0] != lhs_sym[1]) and (rhs_sym[0] == rhs_sym[1]) ):            
            #change labels on lhs based on position but need to record
            #the changes as need to appy them to the context
            isotope_track = build_track_dictionary(lhs,stars)
            lhs = switch_labels_on_position(lhs)
            context = switch_labels(isotope_track,stars,context)
            #as rhs is symmetric, positions are equivalent so change labels on position
            rhs = switch_labels_on_position(rhs)
            
    #deal with triple cut
    #unwieldy code but most readable I can make it
    elif(stars == 3):
        #simple cases
        #completely symmetric lhs and completely symmetric rhs
        if( ( (lhs_sym[0] == lhs_sym[1]) and (lhs_sym[1] == lhs_sym[2]) and (lhs_sym[0] == lhs_sym[2]) ) and
        ( (rhs_sym[0] == rhs_sym[1]) and (rhs_sym[1] == rhs_sym[2]) and (rhs_sym[0] == rhs_sym[2]) ) ):
            #the points are all equivalent so change labels on lhs and rhs based on position
            #labels on context don't need to change
            lhs = switch_labels_on_position(lhs)
            rhs = switch_labels_on_position(rhs)            
            
        #completely symmetric lhs and completely unsymmetric rhs
        elif( ( (lhs_sym[0] == lhs_sym[1]) and (lhs_sym[1] == lhs_sym[2]) and (lhs_sym[0] == lhs_sym[2]) ) and
        ( (rhs_sym[0] != rhs_sym[1]) and (rhs_sym[1] != rhs_sym[2]) and (rhs_sym[0] != rhs_sym[2]) ) ):
            #alter lhs in usual way
            lhs = switch_labels_on_position(lhs)
            #change labels on rhs based on position but need to record
            #the changes as need to appy them to the context
            isotope_track = build_track_dictionary(rhs,stars)
            rhs = switch_labels_on_position(rhs)
            context = switch_labels(isotope_track,stars,context)
            
        #completely unsymmetric lhs and completely unsymmetric rhs
        elif( ( (lhs_sym[0] != lhs_sym[1]) and (lhs_sym[1] != lhs_sym[2]) and (lhs_sym[0] != lhs_sym[2]) ) and 
        ( (rhs_sym[0] != rhs_sym[1]) and (rhs_sym[1] != rhs_sym[2]) and (rhs_sym[0] != rhs_sym[2]) ) ):
            #build the isotope track
            isotope_track = build_track_dictionary(lhs,stars)
            #alter lhs in usual way
            lhs = switch_labels_on_position(lhs)
            #change rhs and context based on isotope_track
            rhs = switch_labels(isotope_track,stars,rhs)
            context = switch_labels(isotope_track,stars,context)
                        
        #completely unsymmetric lhs and completely symmetric rhs
        elif( ( (lhs_sym[0] != lhs_sym[1]) and (lhs_sym[1] != lhs_sym[2]) and (lhs_sym[0] != lhs_sym[2]) ) and
        ( (rhs_sym[0] == rhs_sym[1]) and (rhs_sym[1] == rhs_sym[2]) and (rhs_sym[0] == rhs_sym[2]) ) ):
            #build isotope trach on lhs
            isotope_track = build_track_dictionary(lhs,stars)
            #alter lhs in usual way
            lhs = switch_labels_on_position(lhs)
            #change labels on context
            context = switch_labels(isotope_track,stars,context)
            #all positions on rhs equivalent so add labels on position
            rhs = switch_labels_on_position(rhs)            

        #more difficult cases, partial symmetry
        #completely unsymmetric on lhs and partial symmetry on rhs
        elif( (lhs_sym[0] != lhs_sym[1]) and (lhs_sym[1] != lhs_sym[2]) and (lhs_sym[0] != lhs_sym[2]) ):            
            #build the isotope track
            isotope_track = build_track_dictionary(lhs,stars)
            #alter lhs in usual way
            lhs = switch_labels_on_position(lhs)
            #change rhs and context based on isotope_track
            rhs = switch_labels(isotope_track,stars,rhs)
            context = switch_labels(isotope_track,stars,context)

            #tweak positions on rhs based on symmetry
            #rhs 1,2 equivalent
            if(rhs_sym[0] == rhs_sym[1]):
                #tweak rhs position 1 and 2 as they are symmetric
                rhs = switch_specific_labels_on_symmetry(rhs,rhs_sym,1,2)

            #rhs 2,3 equivalent
            elif(rhs_sym[1] == rhs_sym[2]):
                #tweak rhs position 1 and 2 as they are symmetric
                rhs = switch_specific_labels_on_symmetry(rhs,rhs_sym,2,3)
                
            #rhs 1,3 equivalent - try for larger set in future
            elif(rhs_sym[0] == rhs_sym[2]):
                #tweak rhs position 1 and 2 as they are symmetric
                rhs = switch_specific_labels_on_symmetry(rhs,rhs_sym,1,3)
                
        #now we are left with things with partial symmetry on lhs and not completely symmetric or unsymmetric on rhs
        else:
            #lhs 1,2,3 equivalent and any sort of partial symmetry on rhs
            if( (lhs_sym[0] == lhs_sym[1]) and (lhs_sym[1] == lhs_sym[2]) and (lhs_sym[0] == lhs_sym[2]) ):
                
                #alter lhs in usual way
                lhs = switch_labels_on_position(lhs)
                #change labels on rhs based on position but need to record
                #the changes as need to appy them to the context
                isotope_track = build_track_dictionary(rhs,stars)
                rhs = switch_labels_on_position(rhs)
                context = switch_labels(isotope_track,stars,context)
                

            #now deal partial symmetry on lhs or rhs.
            #Cases where:
            #lhs 1,2 equivalent
            #lhs 2,3 equivalent
            #lhs 1,3 equivalent
            else:
                #build isotope track on lhs
                isotope_track = build_track_dictionary(lhs,stars)
                #alter lhs in usual way
                lhs = switch_labels_on_position(lhs)
                #change rhs and context based on isotope_track
                rhs = switch_labels(isotope_track,stars,rhs)
                context = switch_labels(isotope_track,stars,context)

                #tweak positions on rhs based on symmetry

                #lhs 1,2 equivalent
                if(lhs_sym[0] == lhs_sym[1]):
                    #tweak rhs position 1 and 2 as they are symmetric on lhs
                    rhs = switch_specific_labels_on_symmetry(rhs,rhs_sym,1,2)                    

                #lhs 2,3 equivalent
                elif(lhs_sym[1] == lhs_sym[2]):
                    #tweak rhs position 1 and 2 as they are symmetric on lhs
                    rhs = switch_specific_labels_on_symmetry(rhs,rhs_sym,2,3)
                    
                #lhs 1,3 equivalent - try for larger set in future
                elif(lhs_sym[0] == lhs_sym[2]):
                    #tweak rhs position 1 and 2 as they are symmetric on lhs
                    rhs = switch_specific_labels_on_symmetry(rhs,rhs_sym,1,3)

    smirk = "%s>>%s" % (lhs,rhs)

    return smirk,context

def switch_specific_labels_on_symmetry(smi,symmetry_class,a,b):

    #check if a and b positions are symmetrically equivalent
    #if equivalent, swap labels if the lower numerical label is not on the
    #1st symmetrically equivalent attachment points in the smi

    if(symmetry_class[a-1] == symmetry_class[b-1]):
        #what are the labels on a and b

        matchObj = re.search( r'\[\*\:([123])\].*\[\*\:([123])\].*\[\*\:([123])\]', smi )
        if matchObj:
            #if the higher label comes first, fix
            if(int(matchObj.group(a)) > int(matchObj.group(b))):
            #if(int(matchObj.group(1)) > int(matchObj.group(2))):
                smi = re.sub(r'\[\*\:'+matchObj.group(a)+'\]', '[*:XX' + matchObj.group(b) + 'XX]' , smi)
                smi = re.sub(r'\[\*\:'+matchObj.group(b)+'\]', '[*:XX' + matchObj.group(a) + 'XX]' , smi)
                smi = re.sub('XX', '' , smi)

    return smi

def switch_labels_on_position(smi):
    
    #move the labels in order of position
    smi = re.sub(r'\[\*\:[123]\]', '[*:XX1XX]' , smi, 1)
    smi = re.sub(r'\[\*\:[123]\]', '[*:XX2XX]' , smi, 1)
    smi = re.sub(r'\[\*\:[123]\]', '[*:XX3XX]' , smi, 1)
    smi = re.sub('XX', '' , smi)
    
    return smi

def switch_labels(track,stars,smi):

    #switch labels based on the input dictionary track
    if(stars > 1):
        #for k in track:
        #        print "old: %s, new: %s" % (k,track[k])

        if(track['1'] != '1'):
            smi = re.sub(r'\[\*\:1\]', '[*:XX' + track['1'] + 'XX]' , smi)

        if(track['2'] != '2'):
            smi = re.sub(r'\[\*\:2\]', '[*:XX' + track['2'] + 'XX]' , smi)

        if(stars == 3):
            if(track['3'] != '3'):
                smi = re.sub(r'\[\*\:3\]', '[*:XX' + track['3'] + 'XX]' , smi)

        #now remove the XX
        smi = re.sub('XX', '' , smi)

    return smi

def build_track_dictionary(smi,stars):

    isotope_track = {}

    #find 1st label, record it in isotope_track as key, with value being the
    #new label based on its position (1st star is 1, 2nd star 2 etc.)
    if(stars ==2):
        matchObj = re.search( r'\[\*\:([123])\].*\[\*\:([123])\]', smi )
        if matchObj:
            isotope_track[matchObj.group(1)] = '1'
            isotope_track[matchObj.group(2)] = '2'

    elif(stars ==3):
        matchObj = re.search( r'\[\*\:([123])\].*\[\*\:([123])\].*\[\*\:([123])\]', smi )
        if matchObj:
            isotope_track[matchObj.group(1)] = '1'
            isotope_track[matchObj.group(2)] = '2'
            isotope_track[matchObj.group(3)] = '3'

    return isotope_track

def index_hydrogen_change():
    #Algorithm details
    #have an index of common fragment(key) => fragments conected to it (values)
    #Need to add *-H to the values where appropriate - and its
    #appropriate when the key is what you would get if you chopped a H off a cmpd.
    #Therefore simply need to check if key with the * replaced with a H is
    #the same as any full smiles in the set
    #
    #Specific details:
    #1) Loop through keys of index
    #2) If key is the result of a single cut (so contains only 1 *) replace the * with H, and cansmi
    #3) If full smiles matches key in hash above, add *-H to that fragment index.

    for key in index:

        attachments = key.count('*')
        #print attachments

        if(attachments==1):

            smi = key

            #simple method
            smi = re.sub(r'\[\*\:1\]', '[H]' , smi)
            
            #now cansmi it
            temp = Chem.MolFromSmiles(smi)

            if(temp == None):
                sys.stderr.write('Error with key: %s, Added H: %s\n' %(key,smi) )
            else:
                c_smi = Chem.MolToSmiles( temp, isomericSmiles=True )

                if(c_smi in smi_to_id):
                    core = "[*:1][H]"
                    id = smi_to_id[c_smi]

                    value = "%s;t%s" % (id,core)
                    #add to index
                    index[key].append(value)
                

if __name__=='__main__':

    #note max heavy atom count does not
    #include the attachement points (*)
    max_size = 10
    ratio = 0.3
    use_ratio = False

    index={}
    smi_to_id={}
    id_to_smi={}

    id_to_heavy={}

    #set up the command line options
    #parser = OptionParser()
    parser = OptionParser(description="Program to generate MMPs")
    parser.add_option('-s', '--symmetric', default=False, action='store_true', dest='sym',
                      help='Output symmetrically equivalent MMPs, i.e output both cmpd1,cmpd2, SMIRKS:A>>B and cmpd2,cmpd1, SMIRKS:B>>A')
    parser.add_option('-m','--maxsize',action='store', dest='maxsize', type='int',
                      help='Maximum size of change (in heavy atoms) allowed in matched molecular pairs identified. DEFAULT=10. \
                      Note: This option overrides the ratio option if both are specified.')
    parser.add_option('-r','--ratio',action='store', dest='ratio', type='float',
                      help='Maximum ratio of change allowed in matched molecular pairs identified. The ratio is: size of change / \
                      size of cmpd (in terms of heavy atoms). DEFAULT=0.3. Note: If this option is used with the maxsize option, the maxsize option will be used.')

    #parse the command line options
    (options, args) = parser.parse_args()

    #print options
    if(options.maxsize != None):
        max_size = options.maxsize
    elif(options.ratio != None):
        ratio = options.ratio
        if(ratio >= 1):
            print "Ratio specified: %s. Ratio needs to be less than 1."
            sys.exit(1)
        use_ratio = True

    #read the STDIN
    for line in sys.stdin:

        line = line.rstrip()
        smi,id,core,context = line.split(',')

        #fill in dictionaries
        smi_to_id[smi]=id
        id_to_smi[id]=smi

        #if using the ratio option, check if heavy atom
        #of mol already calculated. If not, calculate and store
        cmpd_heavy = None
        if(use_ratio):
            if( (id in id_to_heavy) == False):
                id_to_heavy[id] = heavy_atom_count(smi)

            cmpd_heavy = id_to_heavy[id]

        #deal with cmpds that have not been fragmented
        if(len(core) == 0) and (len(context) == 0):
            continue

        #deal with single cuts
        if(len(core) == 0):
            side_chains = context.split('.')

            #minus 1 for the attachement pt
            if( add_to_index(side_chains[1],1,cmpd_heavy)==True ):
                context = side_chains[0]
                core = side_chains[1]

                value = "%s;t%s" % (id,core)

                #add the array if no key exists
                #add the context with id to index
                index.setdefault(context, []).append(value)

            #minus 1 for the attachement pt
            if( add_to_index(side_chains[0],1,cmpd_heavy)==True ):
                context = side_chains[1]
                core = side_chains[0]

                value = "%s;t%s" % (id,core)

                #add the array if no key exists
                #add the context with id to index
                index.setdefault(context, []).append(value)

        #double or triple cut
        else:

            attachments = core.count('*')

            if( add_to_index(core,attachments,cmpd_heavy)==True ):
                value = "%s;t%s" % (id,core)

                #add the array if no key exists
                #add the context with id to index
                index.setdefault(context, []).append(value)

    #index the H change
    index_hydrogen_change()

    #Now index is ready

    #loop through the index
    for key in index:

        total = len(index[key])

        #check if have more than one value
        if(total == 1):
            continue

        for xa in xrange(total):

            for xb in xrange(xa, total):

                if(xa != xb):
                    #now generate the pairs

                    id_a,core_a = index[key][xa].split(";t")
                    id_b,core_b = index[key][xb].split(";t")

                    #make sure pairs are not same molecule
                    if(id_a != id_b):

                        #make sure LHS and RHS of SMIRKS are not the same
                        if(core_a != core_b):

                            smirks,context = cansmirk(core_a,core_b,key)
                            print "%s,%s,%s,%s,%s,%s" % ( id_to_smi[id_a], id_to_smi[id_b], id_a, id_b, smirks, context )

                            #deal with symmetry switch
                            if(options.sym == True):
                                smirks,context = cansmirk(core_b,core_a,key)
                                print "%s,%s,%s,%s,%s,%s" % ( id_to_smi[id_b], id_to_smi[id_a], id_b, id_a, smirks, context )
rdkit-data 201309-1 / usr / share / RDKit / Contrib / mmpa / indexing.py