Mercurial > repos > bgruening > chembl_structure_pipeline

diff chembl.py @ 0:2f59c6239f25 draft default tip
"planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e-dirty"
author: bgruening
date: Sat, 10 Oct 2020 09:43:40 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/chembl.py	Sat Oct 10 09:43:40 2020 +0000
@@ -0,0 +1,118 @@
+import argparse
+
+from chembl_webresource_client.new_client import new_client
+from chembl_webresource_client.settings import Settings
+
+# Switch off the CACHING setting of the ChEMBL web resource client
+Settings.Instance().CACHING = False
+
+
+def open_file(filename):
+    """
+    Return the first whitespace-delimited token on the first line of a file
+
+    Only the first line is read; anything after the first token is ignored.
+    Raises IOError if the first line is empty or contains only whitespace.
+    """
+    with open(filename) as f:
+        tokens = f.readline().split()
+    if not tokens:
+        # fail with a clear message instead of an IndexError on tokens[0]
+        raise IOError('No SMILES found on the first line of {}'.format(filename))
+    return tokens[0]
+
+
+def get_smiles(res):
+    """
+    Collect a set of 'SMILES<TAB>ChEMBL ID' strings from search results
+
+    Records whose lookup raises a TypeError (for example when
+    'molecule_structures' is None) are skipped.
+    """
+    entries = set()
+    for record in res:
+        try:
+            structure = record['molecule_structures']['canonical_smiles']
+            chembl_id = record['molecule_chembl_id']
+        except TypeError:
+            # no structure available for this record
+            continue
+        entries.add('{}\t{}'.format(structure, chembl_id))
+    return entries
+
+
+def sim_search(smiles, tanimoto):
+    """
+    Run a ChEMBL similarity query for the SMILES input
+
+    The query is filtered by the SMILES string and the given similarity
+    score, and restricted to the structure and ChEMBL ID fields.
+    """
+    fields = ['molecule_structures', 'molecule_chembl_id']
+    hits = new_client.similarity.filter(smiles=smiles, similarity=tanimoto)
+    return hits.only(fields)
+
+
+def substr_search(smiles):
+    """
+    Run a ChEMBL substructure query for the SMILES input
+
+    The query is filtered by the SMILES string and restricted to the
+    structure and ChEMBL ID fields.
+    """
+    fields = ['molecule_structures', 'molecule_chembl_id']
+    hits = new_client.substructure.filter(smiles=smiles)
+    return hits.only(fields)
+
+
+def filter_drugs(mols):
+    """
+    Narrow a query to approved drugs (max_phase equal to 4)
+    """
+    criteria = {'max_phase': 4}
+    return mols.filter(**criteria)
+
+
+def filter_biotherapeutic(mols):
+    """
+    Narrow a query to molecules whose biotherapeutic field is not null
+    """
+    criteria = {'biotherapeutic__isnull': False}
+    return mols.filter(**criteria)
+
+
+def filter_nat_prod(mols):
+    """
+    Narrow a query to natural products (natural_product equal to 1)
+    """
+    criteria = {'natural_product': 1}
+    return mols.filter(**criteria)
+
+
+def filter_ro5(mols):
+    """
+    Narrow a query to compounds that have zero RO5 violations
+    """
+    criteria = {'molecule_properties__num_ro5_violations': 0}
+    return mols.filter(**criteria)
+
+
+def main():
+    """
+    Command-line entry point: search ChEMBL and write the hits to a file
+
+    The SMILES query is taken from -i/--input, or from the first token of
+    the file given with -f/--file (which takes precedence). A substructure
+    search is run when -s is given, otherwise a similarity search using the
+    -t/--tanimoto score. The optional filters are then applied in the order
+    drugs, biotherapeutic, natural products, RO5, and the resulting
+    'SMILES<TAB>ChEMBL ID' lines are written to -o/--output.
+
+    Exits with a usage error if no SMILES input, no output file or (for a
+    similarity search) no Tanimoto score is given. Raises IOError if the
+    SMILES input is shorter than 5 characters.
+    """
+    parser = argparse.ArgumentParser(description='Search ChEMBL database for compounds')
+    parser.add_argument('-i', '--input', help='SMILES input')
+    parser.add_argument('-f', '--file', help='SMILES input as file')
+    parser.add_argument('-o', '--output', help="SMILES output")
+    parser.add_argument('-t', '--tanimoto', type=int, help='Tanimoto similarity score')
+    parser.add_argument('-s', '--substructure', action='store_true', help='Substructure search using the SMILES input.')
+    parser.add_argument('-d', '--drugs', action='store_true', help='Filter approved drugs')
+    parser.add_argument('-b', '--biotherapeutic', action='store_true', help='Filter biotherapeutic molecules')
+    parser.add_argument('-n', '--nat-prod', action='store_true', help='Filter natural products')
+    parser.add_argument('-r', '--ro5', action='store_true', help='Filter compounds that pass Lipinski RO5')
+
+    args = parser.parse_args()
+
+    if args.file:  # get SMILES from file rather than -i option
+        args.input = open_file(args.file)
+
+    # argparse leaves omitted options as None; reject them up front with a
+    # usage message instead of failing later with a TypeError (and, for the
+    # output file, only after the search has already been run)
+    if args.input is None:
+        parser.error('a SMILES input is required (-i/--input or -f/--file)')
+    if args.output is None:
+        parser.error('an output file is required (-o/--output)')
+    if not args.substructure and args.tanimoto is None:
+        # sim_search would otherwise be called with a similarity of None
+        parser.error('a Tanimoto score (-t/--tanimoto) is required '
+                     'for a similarity search')
+
+    if len(args.input) < 5:
+        raise IOError('SMILES must be at least 5 characters long.')
+
+    if args.substructure:  # specify search type: substructure or similarity
+        mols = substr_search(args.input)
+    else:
+        mols = sim_search(args.input, args.tanimoto)
+
+    # filter options:
+    if args.drugs:
+        mols = filter_drugs(mols)
+
+    if args.biotherapeutic:
+        mols = filter_biotherapeutic(mols)
+
+    if args.nat_prod:
+        mols = filter_nat_prod(mols)
+
+    if args.ro5:
+        mols = filter_ro5(mols)
+
+    # get SMILES from search output
+    mols = get_smiles(mols)
+
+    # write to file
+    with open(args.output, 'w') as f:
+        f.write('\n'.join(mols))
+
+
+# Only run the command-line entry point when executed as a script
+if __name__ == "__main__":
+    main()
author	bgruening
date	Sat, 10 Oct 2020 09:43:40 +0000
parents
children