Mercurial > repos > bgruening > openbabel_remduplicates

#!/usr/bin/env python
"""
    Input: set of molecules with pre-calculated physico-chemical properties
    Output: set of molecules that pass all the filters
    Copyright 2012, Bjoern Gruening and Xavier Lucas

    TODO: AND/OR conditions?
"""
import argparse
import json
import shlex
import subprocess
import sys

import cheminfolib
from openbabel import pybel

cheminfolib.pybel_stop_logging()


def parse_command_line():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Input file name")
    parser.add_argument("-iformat", help="Input file format")
    parser.add_argument("-oformat", default="smi", help="Output file format")
    parser.add_argument("-o", "--output", help="Output file name", required=True)
    parser.add_argument("--filters", help="Specify the filters to apply", required=True)
    parser.add_argument(
        "--list_of_names",
        required=False,
        help="A file with list of molecule names to extract. Every name is in one line.",
    )
    return parser.parse_args()


def filter_precalculated_compounds(args, filters):
    outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
    for mol in pybel.readfile("sdf", args.input):
        for key, elem in filters.items():
            # map the short description to the larger metadata names stored in the sdf file
            property = cheminfolib.ColumnNames.get(key, key)
            min = elem[0]
            max = elem[1]
            if float(mol.data[property]) >= float(min) and float(
                mol.data[property]
            ) <= float(max):
                pass
            else:
                # leave the filter loop, because one filter constrained are not satisfied
                break
        else:
            # if the filter loop terminates in a normal way (no break) all filter rules are satisfied, so save the compound
            outfile.write(mol)
    outfile.close()


def filter_new_compounds(args, filters):

    if args.iformat == args.oformat:
        # use the -ocopy option from openbabel to speed up the filtering, additionally no conversion is carried out
        # http://openbabel.org/docs/dev/FileFormats/Copy_raw_text.html#copy-raw-text
        cmd = "obabel -i%s %s -ocopy -O %s --filter" % (
            args.iformat,
            args.input,
            args.output,
        )
    else:
        cmd = "obabel -i%s %s -o%s -O %s --filter" % (
            args.iformat,
            args.input,
            args.oformat,
            args.output,
        )
    filter_cmd = ""
    # OBDescriptor stores a mapping from our desc shortcut to the OB name [0] and a long description [1]
    for key, elem in filters.items():
        ob_descriptor_name = cheminfolib.OBDescriptor[key][0]
        min = elem[0]
        max = elem[1]
        filter_cmd += " %s>=%s %s<=%s " % (
            ob_descriptor_name,
            min,
            ob_descriptor_name,
            max,
        )

    args = shlex.split('%s "%s"' % (cmd, filter_cmd))
    # print '%s "%s"' % (cmd, filter_cmd)
    # calling openbabel with subprocess and pipe potential errors occuring in openbabel to stdout
    child = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    stdout, stderr = child.communicate()
    return_code = child.returncode

    if return_code:
        sys.stdout.write(stdout.decode("utf-8"))
        sys.stderr.write(stderr.decode("utf-8"))
        sys.stderr.write("Return error code %i from command:\n" % return_code)
        sys.stderr.write("%s\n" % cmd)
    else:
        sys.stdout.write(stdout.decode("utf-8"))
        sys.stdout.write(stderr.decode("utf-8"))


def filter_by_name(args):
    outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
    for mol in pybel.readfile("sdf", args.input):
        for name in open(args.list_of_names):
            if mol.title.strip() == name.strip():
                outfile.write(mol)
    outfile.close()


def __main__():
    """
    Select compounds with certain properties from a small library
    """
    args = parse_command_line()

    if args.filters == "__filter_by_name__":
        filter_by_name(args)
        return

    # Its a small trick to get the parameters in an easy way from the xml file.
    # To keep it readable in the xml file, many white-spaces are included in that string it needs to be removed.
    # Also the last loop creates a ',{' that is not an valid jason expression.
    filters = json.loads((args.filters).replace(" ", "").replace(",}", "}"))
    if args.iformat == "sdf":
        # Check if the sdf file contains all of the required metadata to invoke the precalculation filtering
        mol = next(pybel.readfile("sdf", args.input))
        for key, elem in filters.items():
            property = cheminfolib.ColumnNames.get(key, key)
            if property not in mol.data:
                break
        else:
            # if the for loop finishes in a normal way, we should habe all properties at least in the first molecule
            # assume it is the same for all other molecules and start the precalculated filtering
            filter_precalculated_compounds(args, filters)
            return True
    filter_new_compounds(args, filters)


if __name__ == "__main__":
    __main__()
author	bgruening
date	Thu, 15 Aug 2024 11:00:46 +0000
parents	12aca74f07d7
children