diff sdf_to_tab.py @ 0:0f3e5c69251e draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 20df7e562341cd30e89a14d6bde9054956fadc06"
author bgruening
date Tue, 10 Mar 2020 12:57:24 -0400
parents
children 3d96dc99698f
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sdf_to_tab.py	Tue Mar 10 12:57:24 2020 -0400
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+import argparse
+import pandas as pd
+from rdkit import Chem
+
+def sdf_to_tab(vars):
+    mols = Chem.SDMolSupplier(vars.inp, sanitize=False)
+    df = pd.DataFrame()  # for output
+
+    for n in range(len(mols)):
+        if mols[n]:
+            d = mols[n].GetPropsAsDict()
+            # filter dict for desired props
+            if vars.props.strip() == '':  # none specified, return all
+                d = {prop: val for (prop, val) in d.items() if not any(x in str(val) for x in ['\n', '\t'])}  # remove items containing newlines or tabs
+            else:
+                d = {prop: val for (prop, val) in d.items() if prop in vars.props.replace(' ', '').split(',')}  # remove items not requested via CLI
+            if vars.name:
+                d['Name'] = mols[n].GetProp('_Name')
+            if vars.smiles:
+                d['SMILES'] = Chem.MolToSmiles(mols[n], isomericSmiles=False)
+            d['Index'] = int(n)
+
+            df = df.append(d, ignore_index=True)
+        else:
+            print("Molecule could not be read - skipped.")
+
+    df = df.astype({'Index': int}).set_index('Index')
+    df.to_csv(vars.out, sep='\t', header=vars.header)
+
+def main():
+    parser = argparse.ArgumentParser(description="Convert SDF to tabular")
+    parser.add_argument('--inp', '-i', help="The input file", required=True)
+    parser.add_argument('--out', '-o', help="The output file", required=True)
+    parser.add_argument('--props', '-p', help="Properties to filter (leave blank for all)", required=True)
+    parser.add_argument('--header', '-t', action='store_true',
+                        help="Write property name as the first row.")
+    parser.add_argument('--smiles', '-s', action='store_true',
+                        help="Include SMILES in output.")
+    parser.add_argument('--name', '-n', action='store_true',
+                        help="Include molecule name in output.")
+    sdf_to_tab(parser.parse_args())
+    
+
+# Run the converter only when this file is executed as a script.
+if __name__ == "__main__":
+    main()