Mercurial > repos > bgruening > ctb_rdkit_descriptors
view sdf_to_tab.py @ 9:0993ac4f4a23 draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit c1d813d3f0fec60ea6efe8a11e59d98bfdc1636f"
author | bgruening |
---|---|
date | Sat, 04 Dec 2021 16:40:00 +0000 |
parents | a1c53f0533b0 |
children |
line wrap: on
line source
#!/usr/bin/env python3
"""Convert the property fields of an SDF file into a tab-separated table."""
import argparse

import pandas as pd
from rdkit import Chem


def _requested_props(spec):
    """Parse the ``--props`` value into a set of property names.

    Returns ``None`` when *spec* is blank, meaning "keep every property".
    Otherwise returns the comma-separated names, each in two forms: trimmed
    of surrounding whitespace, and with every space removed. The second form
    is what earlier versions matched against, so existing invocations keep
    selecting the same columns, while names that contain spaces can now be
    selected as well.
    """
    if spec.strip() == "":
        return None
    wanted = set()
    for raw in spec.split(","):
        wanted.add(raw.strip())
        wanted.add(raw.replace(" ", ""))
    wanted.discard("")
    return frozenset(wanted)


def sdf_to_tab(vars):
    """Write the properties of every readable molecule in an SDF as a table.

    Args:
        vars: Parsed command-line namespace providing ``inp`` (SDF path),
            ``out`` (output path), ``props`` (comma-separated property names,
            blank for all), and the boolean flags ``header``, ``smiles`` and
            ``name``.

    Each readable molecule becomes one row, indexed by its position in the
    input file; records that cannot be read are reported on stdout and
    skipped. Columns are written in sorted order, tab-separated.
    """
    # NOTE: the parameter name shadows the builtin ``vars``; it is kept so
    # that existing callers are unaffected.
    wanted = _requested_props(vars.props)
    mols = Chem.SDMolSupplier(vars.inp, sanitize=False)

    # Rows are collected first and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for idx, mol in enumerate(mols):
        if not mol:
            print("Molecule could not be read - skipped.")
            continue

        props = mol.GetPropsAsDict()
        if wanted is None:
            # No explicit selection: keep everything except values that
            # contain newlines or tabs.
            row = {
                prop: val
                for prop, val in props.items()
                if not any(ch in str(val) for ch in ("\n", "\t"))
            }
        else:
            # Keep only the properties requested via the CLI.
            row = {prop: val for prop, val in props.items() if prop in wanted}

        if vars.name:
            row["SDFMoleculeName"] = mol.GetProp("_Name")
        if vars.smiles:
            row["SMILES"] = Chem.MolToSmiles(mol, isomericSmiles=False)
        row["Index"] = int(idx)
        rows.append(row)

    if rows:
        df = pd.DataFrame(rows).astype({"Index": int}).set_index("Index")
    else:
        # Without any rows there is no "Index" column to convert, so build an
        # empty frame that still carries the index label.
        print("No molecules could be read - writing an empty table.")
        df = pd.DataFrame(index=pd.Index([], name="Index", dtype=int))

    sorted_cols = sorted(df.columns.values.tolist())
    df.to_csv(vars.out, sep="\t", header=vars.header, columns=sorted_cols)


def main():
    """Parse the command-line arguments and run the conversion."""
    parser = argparse.ArgumentParser(description="Convert SDF to tabular")
    parser.add_argument("--inp", "-i", help="The input file", required=True)
    parser.add_argument("--out", "-o", help="The output file", required=True)
    parser.add_argument(
        "--props",
        "-p",
        help="Properties to filter (leave blank for all)",
        required=True,
    )
    parser.add_argument(
        "--header",
        "-t",
        action="store_true",
        help="Write property name as the first row.",
    )
    parser.add_argument(
        "--smiles", "-s", action="store_true", help="Include SMILES in output."
    )
    parser.add_argument(
        "--name", "-n", action="store_true", help="Include molecule name in output."
    )
    sdf_to_tab(parser.parse_args())


if __name__ == "__main__":
    main()