Mercurial > repos > bgruening > chembl
changeset 0:915e9be38994 draft
planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 2e3c3c2bd7ecdc9c2968a32f91e81136e0cb3835
author | bgruening |
---|---|
date | Mon, 05 Aug 2019 05:21:58 -0400 |
parents | |
children | 6f8458d1cf46 |
files | chembl.py chembl.xml test-data/in1.smi test-data/out1.smi test-data/out2.smi test-data/out3.smi test-data/out4.smi |
diffstat | 7 files changed, 345 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# chembl.py -- added Mon Aug 05 05:21:58 2019 -0400.
"""Search the ChEMBL database for compounds related to a SMILES query.

A similarity or substructure search is run for the SMILES given with -i
(text) or -f (file), the optional filters are applied, and the resulting
SMILES are written to the output file, one per line.
"""
import argparse


def _new_client():
    """Return the chembl_webresource_client entry point.

    The import is deferred to the first search so that command-line parsing
    and input validation in main() finish before the client package is
    loaded.
    """
    from chembl_webresource_client.new_client import new_client
    return new_client


def open_file(filename):
    """
    Return the SMILES string stored on the first line of a file.

    Only the first whitespace-delimited token of the first line is returned,
    so the trailing newline (and a title column, if present) never becomes
    part of the query. An empty or blank first line yields ''.
    """
    with open(filename) as f:
        fields = f.readline().split()
    return fields[0] if fields else ''


def get_smiles(res):
    """
    Get a list of SMILES from function results

    Duplicates are dropped and the order in which the results were returned
    is kept, so the output is reproducible between runs. Results without a
    structure record or without a canonical SMILES are skipped.
    """
    smiles = []
    seen = set()
    for mol in res:
        structures = mol.get('molecule_structures') or {}
        smi = structures.get('canonical_smiles')
        if smi and smi not in seen:
            seen.add(smi)
            smiles.append(smi)
    return smiles


def sim_search(smiles, tanimoto):
    """
    Return compounds which are within a Tanimoto range of the SMILES input
    """
    similarity = _new_client().similarity
    return similarity.filter(smiles=smiles, similarity=tanimoto).only(['molecule_structures'])


def substr_search(smiles):
    """
    Return compounds which contain the SMILES substructure input
    """
    substructure = _new_client().substructure
    return substructure.filter(smiles=smiles).only(['molecule_structures'])


def filter_drugs(mols):
    """
    Return only compounds which are approved drugs
    """
    return mols.filter(max_phase=4)


def filter_biotherapeutic(mols):
    """
    Return only biotherapeutic molecules
    """
    return mols.filter(biotherapeutic__isnull=False)


def filter_nat_prod(mols):
    """
    Return only natural products
    """
    return mols.filter(natural_product=1)


def filter_ro5(mols):
    """
    Return only compounds with no RO5 violations
    """
    return mols.filter(molecule_properties__num_ro5_violations=0)


def main(argv=None):
    """
    Parse the command line, run the search and write the SMILES output.

    argv is the list of command-line arguments; None (the default) makes
    argparse read sys.argv[1:]. Missing required options end the program
    through parser.error(); a SMILES shorter than 5 characters raises
    IOError.
    """
    parser = argparse.ArgumentParser(description='Search ChEMBL database for compounds')
    parser.add_argument('-i', '--input', help='SMILES input')
    parser.add_argument('-f', '--file', help='SMILES input as file')
    parser.add_argument('-o', '--output', required=True, help="SMILES output")
    parser.add_argument('-t', '--tanimoto', type=int, help='Tanimoto similarity score')
    parser.add_argument('-s', '--substructure', action='store_true', help='Substructure search using the SMILES input.')
    parser.add_argument('-d', '--drugs', action='store_true', help='Filter approved drugs')
    parser.add_argument('-b', '--biotherapeutic', action='store_true', help='Filter biotherapeutic molecules')
    parser.add_argument('-n', '--nat-prod', action='store_true', help='Filter natural products')
    parser.add_argument('-r', '--ro5', action='store_true', help='Filter compounds that pass Lipinski RO5')

    args = parser.parse_args(argv)

    if args.file:  # get SMILES from file rather than -i option
        args.input = open_file(args.file)

    # Without this check len(None) below would fail with an unhelpful TypeError.
    if args.input is None:
        parser.error('a SMILES input is required (use -i/--input or -f/--file)')

    if len(args.input) < 5:
        raise IOError('SMILES must be at least 5 characters long.')

    # A similarity search cannot run without a cutoff; fail before any query.
    if not args.substructure and args.tanimoto is None:
        parser.error('-t/--tanimoto is required for a similarity search')

    if args.substructure:  # specify search type: substructure or similarity
        mols = substr_search(args.input)
    else:
        mols = sim_search(args.input, args.tanimoto)

    # filter options:
    if args.drugs:
        mols = filter_drugs(mols)

    if args.biotherapeutic:
        mols = filter_biotherapeutic(mols)

    if args.nat_prod:
        mols = filter_nat_prod(mols)

    if args.ro5:
        mols = filter_ro5(mols)

    # get SMILES from search output
    mols = get_smiles(mols)

    # write to file
    with open(args.output, 'w') as f:
        f.write('\n'.join(mols))


if __name__ == "__main__":
    main()
<!-- chembl.xml: Galaxy tool wrapper around chembl.py (added Mon Aug 05 05:21:58 2019 -0400). -->
<tool id="chembl" name="Search ChEMBL database" version="0.1.0">
    <description>for compounds which are similar to a SMILES string</description>
    <requirements>
        <requirement type="package" version="0.9.31">chembl_webresource_client</requirement>
    </requirements>
    <!-- The selected option values of the two conditionals are the command-line flags themselves. -->
    <command detect_errors="exit_code"><![CDATA[
        python -W ignore '$__tool_directory__/chembl.py'
            $input.format '$input.smiles'
            -o '$outfile'
            $search.type
            #if $search.type == '-t':
                $search.tanimoto
            #end if
            $drugs
            $biotherapeutic
            $natprod
            $ro5
    ]]></command>
    <inputs>
        <conditional name="input">
            <param name='format' type='select' format='text' label="SMILES input type" help="Enter SMILES as either text or file.">
                <option value='-i'>Text</option>
                <option value='-f'>File</option>
            </param>
            <when value='-i'>
                <param name="smiles" type="text" label="SMILES input" help="Enter SMILES for a compound.">
                    <validator type='length' min='5'/>
                    <!-- The default text sanitizer rewrites characters such as [ ] @ and #, which are part of
                         ordinary SMILES. Allow printable characters; the single quote stays excluded because
                         the value is placed inside single quotes on the command line. -->
                    <sanitizer>
                        <valid initial="string.printable">
                            <remove value="&apos;"/>
                        </valid>
                    </sanitizer>
                </param>
            </when>
            <when value='-f'>
                <param name="smiles" type="data" format="smi" label="Input file" help="File containing a single compound in SMILES format. Note only the first line of the file will be read, if the file contains multiple compounds."/>
            </when>
        </conditional>

        <conditional name="search">
            <param name='type' type='select' format='text' label="Search type" help="Search for compounds which are similar to the SMILES input, or which contain the SMILES input as a substructure">
                <option value='-t'>Similarity</option>
                <option value='-s'>Substructure</option>
            </param>
            <when value="-t">
                <param type="integer" name="tanimoto" label="Tanimoto cutoff score" help="Score for similarity search. Minimum value is 70." value="70" min="70" max="100"/>
            </when>
            <when value="-s"/>
        </conditional>

        <param name="drugs" type="boolean" value="false" label="Filter to return only approved drugs" truevalue="-d" falsevalue=""/>
        <param name="biotherapeutic" type="boolean" value="false" label="Filter to return only biotherapeutic molecules" truevalue="-b" falsevalue=""/>
        <param name="natprod" type="boolean" value="false" label="Filter to return only natural products" truevalue="-n" falsevalue=""/>
        <param name="ro5" type="boolean" value="false" label="Filter for Lipinski's Rule of Five" truevalue="-r" falsevalue=""/>
    </inputs>
    <outputs>
        <data name="outfile" format="smi" />
    </outputs>
    <tests>
        <test>
            <param name="format" value="-f"/>
            <param name="smiles" value="in1.smi"/>
            <param name='type' value='-t' />
            <param name='tanimoto' value='70' />
            <output name="outfile" ftype="smi" file='out1.smi'/>
        </test>
        <test>
            <param name="format" value="-f"/>
            <param name="smiles" value="in1.smi"/>
            <param name='type' value='-t' />
            <param name='tanimoto' value='70' />
            <param name='drugs' value='true'/>
            <output name="outfile" ftype="smi" file='out2.smi'/>
        </test>
        <test>
            <param name="format" value="-f"/>
            <param name="smiles" value="in1.smi"/>
            <param name='type' value='-s' />
            <output name="outfile" ftype="smi" file='out3.smi'/>
        </test>
        <test>
            <param name="format" value="-i"/>
            <param name="smiles" value="C1CCCCC1"/>
            <param name='type' value='-t' />
            <param name='tanimoto' value='70' />
            <output name="outfile" ftype="smi" file='out4.smi'/>
        </test>
    </tests>
    <help><![CDATA[

Search the ChEMBL database for compounds which resemble a SMILES string. Two
search options are possible: similarity (searches for compounds which are
similar to the input within a specified Tanimoto cutoff) and substructure
(searches for compounds which contain the input substructure).

Results can be filtered for compounds which are 1) approved drugs 2) biotherapeutic
3) natural products and 4) fulfil all of the Lipinski rule of five criteria.

-----

.. class:: infomark

**Input**

A single molecule in SMILES format. This can be submitted either as text or as a
file containing the SMILES string on the first line. Note that if the file contains
multiple lines, only the SMILES string on the first line will be used for the search.

-----

.. class:: infomark

**Output**

A SMILES file with search results, each on a new line.

    ]]></help>

    <citations>
        <citation type="doi">10.1093/nar/gkv352</citation>
        <!-- arXiv:1607.00378v1, written as the DOI that arXiv registers for it. -->
        <citation type="doi">10.48550/arXiv.1607.00378</citation>
    </citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/in1.smi Mon Aug 05 05:21:58 2019 -0400 @@ -0,0 +1,1 @@ +CN1CCC[C@H]1c2cccnc2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out1.smi Mon Aug 05 05:21:58 2019 -0400 @@ -0,0 +1,47 @@ +CN1CCCC1c2cccnc2 +CN1CCC[C@@H]1c2cccnc2 +CN1CCC[C@H]1c2cccnc2 +CCN1CCCC1c2cccnc2 +CN1CCCC1c2ccc(C)nc2 +CCc1ccc(cn1)C2CCCN2C +CN1CCCC1c2cncc(C)c2 +CCCc1ccc(cn1)C2CCCN2C +CCc1cncc(c1)C2CCCN2C +CN1CCCC[C@H]1c2cccnc2 +CN1CCCCC1c2cccnc2 +CCCc1cncc(c1)C2CCCN2C +CN1CCCC1c2cccnc2C +CCCCc1ccc(cn1)C2CCCN2C +CCCCCc1ccc(cn1)C2CCCN2C +CC1CCN(C)[C@@H]1c2cccnc2 +CN1CCCC1c2ccc(CCCc3ccccc3)nc2 +CN1CCCC1c2cncc(Cl)c2 +CN1CCCC1c2ccc(CCc3ccccc3)nc2 +CN1CCC[C@H]1c2ccccc2 +CN1CCCC1c2ccccc2 +CN1CCC[C@H]1c2ccccn2 +CN1CCCC1c2cncc(F)c2 +COc1cncc(c1)C2CCCN2C +CN1CCCC1c2cncc(Br)c2 +CN1CCCC1c2ccc(nc2)c3ccccc3 +CN1CCCC1c2ccc(\C=C\c3ccccc3)nc2 +COc1ccncc1C2CCCN2C +CCCC[C@H]1CC[C@H](N1C)c2cccnc2 +CCCC[C@@H]1CC[C@H](N1C)c2cccnc2 +CN1CCCC1c2cnccc2N +C[C@H]1C[C@H](N(C)C1)c2cccnc2 +CC[C@H]1C[C@H](N(C)C1)c2cccnc2 +CN1CCCC1c2ccc(CCc3ccc(Cl)cc3)nc2 +CN1CCCC1c2ccc(Cl)nc2 +CN1CCCC1c2ccc(C)cc2 +COCC1CCN(C)[C@@H]1c2cccnc2 +C(N1CCCC1c2cccnc2)c3ccccc3 +C[C@H]1CC[C@H](N1C)c2cccnc2 +C[C@@H]1CC[C@H](N1C)c2cccnc2 +CN1CCCC1c2ccc(Cl)cc2 +CN1CCCC1c2ccc(F)nc2 +CN1CCC(CF)[C@H]1c2cccnc2 +CN1CCCC1c2ccc(Br)nc2 +COc1ccc(CCc2ccc(cn2)C3CCCN3C)cc1 +CN1CCC(CO)[C@H]1c2cccnc2 +CN1CCCC1c2cnc3ccccc3c2 \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out2.smi Mon Aug 05 05:21:58 2019 -0400 @@ -0,0 +1,1 @@ +CN1CCC[C@H]1c2cccnc2 \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out3.smi Mon Aug 05 05:21:58 2019 -0400 @@ -0,0 +1,72 @@ +CN1CCC[C@H]1c2cccnc2 +CN1CCC[C@H]1c2ccc[n+]([BH2-]C#N)c2 +CN1CC[C@H]2CCc3ncccc3[C@@H]12 +CN1CC[C@H]2CCc3c(ccc[n+]3[BH2-]C#N)[C@@H]12 +CN1[C@@H](CC[C@H]1c2cccnc2)C#N +CN1[C@H](CC[C@H]1c2cccnc2)C#N +CN1CCC[C@H]1c2cncc(c2)C#C +CN1C[C@@H](Cc2ccccc2)C[C@H]1c3cccnc3 +C[C@@H]1CC[C@H](N1C)c2cccnc2 +C[C@H]1CC[C@H](N1C)c2cccnc2 +CC[C@H]1C[C@H](N(C)C1)c2cccnc2 +CN1C[C@@H](O)C[C@H]1c2cccnc2 +CN1CCC(CO)[C@H]1c2cccnc2 +CSC[C@H]1C[C@H](N(C)C1)c2cccnc2 +CN1C[C@H](CO)C[C@H]1c2cccnc2 +CN1C[C@@H](CC#N)C[C@H]1c2cccnc2 +CN1C[C@@H](CF)C[C@H]1c2cccnc2 +CO[C@H]1C[C@H](N(C)C1)c2cccnc2 +CN1CCC(CF)[C@H]1c2cccnc2 +CC1CCN(C)[C@@H]1c2cccnc2 +COCC1CCN(C)[C@@H]1c2cccnc2 +CN1C[C@@H](CO)C[C@H]1c2cccnc2 +CN1C[C@H](C[C@H]1c2cccnc2)OC(=O)C +CN1C[C@@H](C[C@H]1c2cccnc2)C#N +CC1CN(C)[C@@H](C1C)c2cccnc2 +C[C@H]1C[C@H](N(C)C1)c2cccnc2 +CN1C[C@H](C[C@H]1c2cccnc2)OS(=O)(=O)C +COC[C@H]1C[C@H](N(C)C1)c2cccnc2 +CCCC[C@@H]1CC[C@H](N1C)c2cccnc2 +CCCC[C@H]1CC[C@H](N1C)c2cccnc2 +CN1[C@@H](CC[C@H]1c2cccnc2)c3ccccc3 +CN1[C@@H](CC[C@@H]1c2ccccc2)c3cccnc3 +Clc1ccc(OC[C@H]2CN3C(=O)CC[C@@]3(O2)c4cccnc4)cc1 +Clc1ccc(OC[C@@H]2CN3C(=O)CC[C@@]3(O2)c4cccnc4)cc1 +CN1[C@@H](CCC1=O)c2cccnc2 +CCCCCCCCC[n+]1cccc2[C@@H]3[C@@H](CCN3C)CCc12 +CN1CCC[C@H]1c2ccc[n+](CCCCCCCCCCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2 +CCCCCCCCCCCC[n+]1cccc2[C@@H]3[C@@H](CCN3C)CCc12 +CCCCCCCCCCCC[n+]1cccc(c1)[C@@H]2CCCN2C +CCCCCCCCCC[n+]1cccc(c1)[C@@H]2CCCN2C +CN1CCC[C@H]1c2ccc[n+](CCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2 +CN1CCC[C@H]1c2ccc[n+](CCCCCCCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2 +CN1CCC[C@H]1c2ccc[n+](CCCCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2 +CN1CCC[C@H]1c2ccc[n+](CCCCCCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2 +CN1CCC[C@H]1c2ccc[n+](CCCCc3ccc(CCCC[n+]4cccc(c4)[C@@H]5CCCN5C)cc3)c2 +CN1CCC[C@H]1c2ccc[n+](CCCCCc3ccccc3CCCCC[n+]4cccc(c4)[C@@H]5CCCN5C)c2 +CN1CCC[C@H]1c2ccc[n+](CCCC#Cc3ccccc3C#CCCC[n+]4cccc(c4)[C@@H]5CCCN5C)c2 
+CN1CCC[C@H]1c2ccc[n+](CCCCCc3cc(CCCCC[n+]4cccc(c4)[C@@H]5CCCN5C)cc(CCCCC[n+]6cccc(c6)[C@@H]7CCCN7C)c3)c2 +CN1CCC[C@H]1c2ccc[n+](CCCCCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2 +CCCCCCCCCC[n+]1cccc2c1CC[C@]3(C)CCN(C)[C@]23C +CN1CCC[C@H]1c2ccc[n+](CCCC#Cc3cccc(c3)C#CCCC[n+]4cccc(c4)[C@@H]5CCCN5C)c2 +CCCCCCCC[n+]1cccc2[C@@H]3[C@@H](CCN3C)CCc12 +CN1CCC[C@H]1c2ccc[n+](CCCC#Cc3cc(cc(c3)C#CCCC[n+]4cccc(c4)[C@@H]5CCCN5C)C#CCCC[n+]6cccc(c6)[C@@H]7CCCN7C)c2 +CCCCCCCCCCC[n+]1cccc2[C@@H]3[C@@H](CCN3C)CCc12 +CCCCCCCCCC[n+]1cccc2[C@@H]3[C@@H](CCN3C)CCc12 +C[N@+]1(CC[N@+]2(C)CCC[C@@H]2c3cccnc3)CCC[C@H]1c4cccnc4 +CN1[C@@H](C[C@@H](OC2O[C@@H]([C@@H](O)[C@H](O)[C@H]2O)C(=O)O)C1=O)c3cccnc3 +CN1C(=O)CC[C@@]1(O)c2cccnc2 +OCN1[C@@H](CCC1=O)c2cccnc2 +CN1CCC[C@H]1c2ccc[n+](c2)[C@@H]3O[C@@H]([C@@H](O)[C@H](O)[C@H]3O)C(=O)O +C[N+]1([O-])CCC[C@H]1c2cccnc2 +CN1CCC[C@@]1(O)c2cccnc2 +CN1[C@@H](CCC1=O)c2ccc[n+](C)c2 +CN1[C@@H](CCC1=O)c2ccc[n+]([O-])c2 +CN1[C@@H](CCC1=O)c2ccc[n+](c2)C3O[C@@H]([C@@H](O)[C@H](O)[C@H]3O)C(=O)C +Cc1cncc(c1)[C@@H]2CCC[N+]2(C)[O-] +COc1ncc(cc1c2ncc(cc2[C@@H]3CC[C@H]4[C@H](OC(=O)N34)c5cc(cc(c5)C(F)(F)F)C(F)(F)F)C(F)(F)F)c6c(C)cc(cc6C)C(=O)O +COc1ccc(cc1c2ncc(cc2[C@@H]3CC[C@H]4[C@H](OC(=O)N34)c5cc(cc(c5)C(F)(F)F)C(F)(F)F)C(F)(F)F)c6c(C)cc(cc6C)C(=O)O +COc1ncc(cc1c2ncc(cc2[C@@H]3CC[C@H]4[C@H](OC(=O)N34)c5cc(cc(c5)C(F)(F)F)C(F)(F)F)C(F)(F)F)c6ccc(cc6C)C(=O)O +COCCOc1ncccc1[C@@H]2C(C(=O)C(C)C)C(=O)C(=O)N2c3ccc(cc3)c4ccsc4 +COCCOc1ncccc1[C@@H]2C(C(=O)C(C)C)C(=O)C(=O)N2c3ccc(cc3)c4ccc(C)s4 +O=S(=O)(Nc1ncns1)c2ccc3c(cccc3c2)N4CCC[C@H]4c5cccnc5 \ No newline at end of file