view chembl.py @ 5:1ade252ebe08 draft

"planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 944ea4bb8a9cd4244152a4a4fecd0485fabc2ad0"
author bgruening
date Tue, 28 Jul 2020 08:20:47 -0400
parents 59eea7d0d720
children a57de37f12c2
line wrap: on
line source

from chembl_webresource_client.settings import Settings
# Turn off the client's caching. This is done before new_client is imported
# below; presumably the setting must be in place by then -- TODO confirm
# against the chembl_webresource_client documentation before reordering.
Settings.Instance().CACHING = False
from chembl_webresource_client.new_client import new_client
import argparse

def open_file(filename):
    """
    Return the first whitespace-delimited token on the first line of a file

    Only the first line is read; anything following the first token on that
    line, and every later line, is ignored.

    Raises ValueError if the first line is empty or holds only whitespace.
    """
    with open(filename) as f:
        tokens = f.readline().split()
    if not tokens:
        # An empty first line would otherwise surface as an opaque
        # "list index out of range" IndexError.
        raise ValueError('No SMILES found on the first line of {}'.format(filename))
    return tokens[0]

def get_smiles(res):
    """
    Collect unique 'SMILES<TAB>ChEMBL ID' lines from search results

    Each item of res is indexed by 'molecule_structures' (then
    'canonical_smiles') and by 'molecule_chembl_id'. Items for which that
    lookup raises TypeError (e.g. 'molecule_structures' is None) are skipped.
    Returns a set, so duplicates collapse and no ordering is implied.
    """
    lines = set()
    for record in res:
        try:
            structures = record['molecule_structures']
            line = '{}\t{}'.format(structures['canonical_smiles'], record['molecule_chembl_id'])
        except TypeError:
            # record carries no usable structure; leave it out of the output
            continue
        lines.add(line)
    return lines

def sim_search(smiles, tanimoto):
    """
    Run a similarity search through new_client.similarity

    smiles is the query structure and tanimoto is passed on as the
    similarity argument. The returned query is restricted to the
    'molecule_structures' and 'molecule_chembl_id' fields.
    """
    wanted_fields = ['molecule_structures', 'molecule_chembl_id']
    hits = new_client.similarity.filter(smiles=smiles, similarity=tanimoto)
    return hits.only(wanted_fields)
    
def substr_search(smiles):
    """
    Run a substructure search through new_client.substructure

    smiles is the query substructure. The returned query is restricted to
    the 'molecule_structures' and 'molecule_chembl_id' fields.
    """
    wanted_fields = ['molecule_structures', 'molecule_chembl_id']
    hits = new_client.substructure.filter(smiles=smiles)
    return hits.only(wanted_fields)
    
def filter_drugs(mols):
    """
    Narrow a query to approved drugs

    Applies the max_phase=4 filter to mols and returns the result.
    """
    approved_only = {'max_phase': 4}
    return mols.filter(**approved_only)

def filter_biotherapeutic(mols):
    """
    Narrow a query to biotherapeutic molecules

    Applies the biotherapeutic__isnull=False filter to mols and returns the
    result.
    """
    has_biotherapeutic = {'biotherapeutic__isnull': False}
    return mols.filter(**has_biotherapeutic)

def filter_nat_prod(mols):
    """
    Narrow a query to natural products

    Applies the natural_product=1 filter to mols and returns the result.
    """
    natural_only = {'natural_product': 1}
    return mols.filter(**natural_only)

def filter_ro5(mols):
    """
    Narrow a query to compounds without rule-of-five violations

    Applies the molecule_properties__num_ro5_violations=0 filter to mols and
    returns the result.
    """
    no_violations = {'molecule_properties__num_ro5_violations': 0}
    return mols.filter(**no_violations)

def main():
    """
    Command-line entry point: search ChEMBL and write the hits to a file

    The query SMILES comes from -i/--input, or from the first token of the
    file given with -f/--file (which takes precedence). A substructure
    search is run when -s is set, otherwise a similarity search using the
    -t threshold. The optional -d/-b/-n/-r filters are applied in that
    order, and the resulting 'SMILES<TAB>ChEMBL ID' lines are written to
    the -o/--output path, one per line.

    Exits with an argparse usage error if no SMILES input or no output path
    was supplied. Raises IOError if the SMILES is shorter than 5 characters.
    """
    parser = argparse.ArgumentParser(description='Search ChEMBL database for compounds')
    parser.add_argument('-i', '--input', help='SMILES input')
    parser.add_argument('-f', '--file', help='SMILES input as file')
    parser.add_argument('-o', '--output', help="SMILES output")
    parser.add_argument('-t', '--tanimoto', type=int, help='Tanimoto similarity score')
    parser.add_argument('-s', '--substructure', action='store_true', help='Substructure search using the SMILES input.')
    parser.add_argument('-d', '--drugs', action='store_true', help='Filter approved drugs')
    parser.add_argument('-b', '--biotherapeutic', action='store_true', help='Filter biotherapeutic molecules')
    parser.add_argument('-n', '--nat-prod', action='store_true', help='Filter natural products')
    parser.add_argument('-r', '--ro5', action='store_true', help='Filter compounds that pass Lipinski RO5')

    args = parser.parse_args()

    if args.file:  # get SMILES from file rather than -i option
        args.input = open_file(args.file)

    # Neither -i nor -f leaves args.input as None, which would otherwise fail
    # in the length check below with an unrelated TypeError.
    if args.input is None:
        parser.error('a SMILES input is required: use -i/--input or -f/--file')

    # Check the output path before searching, so a missing -o is reported
    # immediately instead of failing in open() after the search has run.
    if args.output is None:
        parser.error('an output path is required: use -o/--output')

    if len(args.input) < 5:
        raise IOError('SMILES must be at least 5 characters long.')

    if args.substructure:  # specify search type: substructure or similarity
        mols = substr_search(args.input)
    else:
        # NOTE(review): args.tanimoto is None when -t is omitted; confirm how
        # the ChEMBL client treats similarity=None before relying on it.
        mols = sim_search(args.input, args.tanimoto)

    # filter options:
    if args.drugs:
        mols = filter_drugs(mols)

    if args.biotherapeutic:
        mols = filter_biotherapeutic(mols)

    if args.nat_prod:
        mols = filter_nat_prod(mols)

    if args.ro5:
        mols = filter_ro5(mols)

    # get SMILES from search output
    mols = get_smiles(mols)

    # write to file
    with open(args.output, 'w') as f:
        f.write('\n'.join(mols))
    

# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()