annotate sdf_to_tab.py @ 0:06828e0cc8a7 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
author bgruening
date Wed, 16 Oct 2019 07:26:19 -0400
parents
children 40ff81f67f5e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
1 #!/usr/bin/env python3
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
2 import argparse
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
3 import pandas as pd
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
4 from rdkit import Chem
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
5
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
6 def sdf_to_tab(vars):
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
7 mols = Chem.SDMolSupplier(vars.inp, sanitize=False)
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
8 df = pd.DataFrame() # for output
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
9
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
10 for n in range(len(mols)):
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
11 if mols[n]:
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
12 d = mols[n].GetPropsAsDict()
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
13 # filter dict for desired props
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
14 if vars.props.strip() == '': # none specified, return all
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
15 d = {prop: val for (prop, val) in d.items() if not any(x in str(val) for x in ['\n', '\t'])} # remove items containing newlines or tabs
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
16 else:
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
17 d = {prop: val for (prop, val) in d.items() if prop in vars.props.replace(' ', '').split(',')} # remove items not requested via CLI
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
18 if vars.name:
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
19 d['Name'] = mols[n].GetProp('_Name')
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
20 if vars.smiles:
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
21 d['SMILES'] = Chem.MolToSmiles(mols[n], isomericSmiles=False)
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
22 d['Index'] = int(n)
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
23
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
24 df = df.append(d, ignore_index=True)
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
25 else:
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
26 print("Molecule could not be read - skipped.")
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
27
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
28 df = df.astype({'Index': int}).set_index('Index')
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
29 df.to_csv(vars.out, sep='\t', header=vars.header)
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
30
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
31 def main():
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
32 parser = argparse.ArgumentParser(description="Convert SDF to tabular")
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
33 parser.add_argument('--inp', '-i', help="The input file", required=True)
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
34 parser.add_argument('--out', '-o', help="The output file", required=True)
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
35 parser.add_argument('--props', '-p', help="Properties to filter (leave blank for all)", required=True)
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
36 parser.add_argument('--header', '-t', action='store_true',
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
37 help="Write property name as the first row.")
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
38 parser.add_argument('--smiles', '-s', action='store_true',
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
39 help="Include SMILES in output.")
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
40 parser.add_argument('--name', '-n', action='store_true',
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
41 help="Include molecule name in output.")
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
42 sdf_to_tab(parser.parse_args())
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
43
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
44
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
45 if __name__ == "__main__":
06828e0cc8a7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 714e984db6ba1198cacf4dcf325320a5889fa02c"
bgruening
parents:
diff changeset
46 main()