Mercurial > repos > peterjc > make_nr

diff tools/make_nr/make_nr.py @ 0:c84f12187af9 draft
v0.0.1
author: peterjc
date: Fri, 09 Nov 2018 11:00:03 -0500
children: 84e483325b04
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/make_nr/make_nr.py	Fri Nov 09 11:00:03 2018 -0500
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+"""Make FASTA files non-redundant by combining duplicated sequences.
+
+This script takes one or more (optionally gzipped) FASTA filenames as input,
+and will return a non-zero error if any duplicate identifiers are found.
+
+Writes output to stdout by default.
+
+Keeps all the sequences in memory, beware!
+"""
+from __future__ import print_function
+
+import gzip
+import os
+import sys
+
+from optparse import OptionParser
+
+
+if "-v" in sys.argv or "--version" in sys.argv:
+    print("v0.0.1")
+    sys.exit(0)
+
+
+# Parse Command Line
+usage = """Use as follows:
+
+$ python make_nr.py [options] A.fasta [B.fasta ...]
+
+For example,
+
+$ python make_nr.py -o dedup.fasta -s ";" input1.fasta input2.fasta
+
+The input files should be plain text FASTA format, optionally gzipped.
+
+The -a option controls how the representative replacement record for
+duplicated records are named. By default the identifiers are taken
+in the input file order, combined with the separator. If the -a or
+alphasort option is picked, the identifiers are alphabetically sorted
+first. This ensures the same names are used even if the input file
+order (or the record order within the input files) is randomised.
+
+There is additional guidance in the help text in the make_nr.xml file,
+which is shown to the user via the Galaxy interface to this tool.
+"""
+
+parser = OptionParser(usage=usage)
+parser.add_option("-s", "--sep", dest="sep",
+                  default=";",
+                  help="Separator character for combining identifiers "
+                  "of duplicated records e.g. '|' or ';' (required)")
+parser.add_option("-a", "--alphasort", action="store_true",
+                  help="When merging duplicated records sort their "
+                  "identifiers alphabetically before combining them. "
+                  "Default is input file order.")
+parser.add_option("-o", "--output", dest="output",
+                  default="/dev/stdout", metavar="FILE",
+                  help="Output filename (defaults to stdout)")
+options, args = parser.parse_args()
+
+if not args:
+    sys.exit("Expects at least one input FASTA filename")
+
+
+def gzip_open(filename):
+    """Open a possibly gzipped text file."""
+    with open(filename, "rb") as h:
+        magic = h.read(2)
+    if magic == b'\x1f\x8b':
+        return gzip.open(filename, "rt")
+    else:
+        return open(filename)
+
+
+def make_nr(input_fasta, output_fasta, sep=";", sort_ids=False):
+    """Make the sequences in FASTA files non-redundant.
+
+    Argument input_fasta is a list of filenames.
+    """
+    by_seq = dict()
+    try:
+        from Bio.SeqIO.FastaIO import SimpleFastaParser
+    except ImportError:
+        sys.exit("Missing Biopython")
+    for f in input_fasta:
+        with gzip_open(f) as handle:
+            for title, seq in SimpleFastaParser(handle):
+                idn = title.split(None, 1)[0]  # first word only
+                seq = seq.upper()
+                try:
+                    by_seq[seq].append(idn)
+                except KeyError:
+                    by_seq[seq] = [idn]
+    unique = 0
+    representatives = dict()
+    duplicates = set()
+    for cluster in by_seq.values():
+        if len(cluster) > 1:
+            # Is it useful to offer to sort here?
+            # if sort_ids:
+            #     cluster.sort()
+            representatives[cluster[0]] = cluster
+            duplicates.update(cluster[1:])
+        else:
+            unique += 1
+    del by_seq
+    if duplicates:
+        # TODO - refactor as a generator with single SeqIO.write(...) call
+        with open(output_fasta, "w") as handle:
+            for f in input_fasta:
+                with gzip_open(f) as in_handle:
+                    for title, seq in SimpleFastaParser(in_handle):
+                        idn = title.split(None, 1)[0]  # first word only
+                        if idn in representatives:
+                            cluster = representatives[idn]
+                            if sort_ids:
+                                cluster.sort()
+                            idn = sep.join(cluster)
+                            title = "%s representing %i records" % (idn, len(cluster))
+                        elif idn in duplicates:
+                            continue
+                        # TODO - line wrapping
+                        handle.write(">%s\n%s\n" % (title, seq))
+        sys.stderr.write("%i unique entries; removed %i duplicates "
+                         "leaving %i representative records\n"
+                         % (unique, len(duplicates), len(representatives)))
+    else:
+        os.symlink(os.path.abspath(input_fasta), output_fasta)
+        sys.stderr.write("No perfect duplicates in file, %i unique entries\n"
+                         % unique)
+
+
+make_nr(args, options.output, options.sep, options.alphasort)
author	peterjc
date	Fri, 09 Nov 2018 11:00:03 -0500
parents
children	84e483325b04