0
|
1 #!/usr/bin/env python3
|
|
2 """Make FASTA files non-redundant by combining duplicated sequences.
|
|
3
|
|
4 This script takes one or more (optionally gzipped) FASTA filenames as input,
|
|
5 and will return a non-zero error if any duplicate identifiers are found.
|
|
6
|
|
7 Writes output to stdout by default.
|
|
8
|
|
9 Keeps all the sequences in memory, beware!
|
|
10 """
|
|
11 from __future__ import print_function
|
|
12
|
|
13 import gzip
|
|
14 import os
|
|
15 import sys
|
|
16
|
|
17 from optparse import OptionParser
|
|
18
|
|
19
|
|
20 if "-v" in sys.argv or "--version" in sys.argv:
|
|
21 print("v0.0.1")
|
|
22 sys.exit(0)
|
|
23
|
|
24
|
|
25 # Parse Command Line
|
|
26 usage = """Use as follows:
|
|
27
|
|
28 $ python make_nr.py [options] A.fasta [B.fasta ...]
|
|
29
|
|
30 For example,
|
|
31
|
|
32 $ python make_nr.py -o dedup.fasta -s ";" input1.fasta input2.fasta
|
|
33
|
|
34 The input files should be plain text FASTA format, optionally gzipped.
|
|
35
|
|
36 The -a option controls how the representative replacement record for
|
|
37 duplicated records are named. By default the identifiers are taken
|
|
38 in the input file order, combined with the separator. If the -a or
|
|
39 alphasort option is picked, the identifiers are alphabetically sorted
|
|
40 first. This ensures the same names are used even if the input file
|
|
41 order (or the record order within the input files) is randomised.
|
|
42
|
|
43 There is additional guidance in the help text in the make_nr.xml file,
|
|
44 which is shown to the user via the Galaxy interface to this tool.
|
|
45 """
|
|
46
|
|
47 parser = OptionParser(usage=usage)
|
|
48 parser.add_option("-s", "--sep", dest="sep",
|
|
49 default=";",
|
|
50 help="Separator character for combining identifiers "
|
|
51 "of duplicated records e.g. '|' or ';' (required)")
|
|
52 parser.add_option("-a", "--alphasort", action="store_true",
|
|
53 help="When merging duplicated records sort their "
|
|
54 "identifiers alphabetically before combining them. "
|
|
55 "Default is input file order.")
|
|
56 parser.add_option("-o", "--output", dest="output",
|
|
57 default="/dev/stdout", metavar="FILE",
|
|
58 help="Output filename (defaults to stdout)")
|
|
59 options, args = parser.parse_args()
|
|
60
|
|
61 if not args:
|
|
62 sys.exit("Expects at least one input FASTA filename")
|
|
63
|
|
64
|
|
65 def gzip_open(filename):
|
|
66 """Open a possibly gzipped text file."""
|
|
67 with open(filename, "rb") as h:
|
|
68 magic = h.read(2)
|
|
69 if magic == b'\x1f\x8b':
|
|
70 return gzip.open(filename, "rt")
|
|
71 else:
|
|
72 return open(filename)
|
|
73
|
|
74
|
|
75 def make_nr(input_fasta, output_fasta, sep=";", sort_ids=False):
|
|
76 """Make the sequences in FASTA files non-redundant.
|
|
77
|
|
78 Argument input_fasta is a list of filenames.
|
|
79 """
|
|
80 by_seq = dict()
|
|
81 try:
|
|
82 from Bio.SeqIO.FastaIO import SimpleFastaParser
|
|
83 except ImportError:
|
|
84 sys.exit("Missing Biopython")
|
|
85 for f in input_fasta:
|
|
86 with gzip_open(f) as handle:
|
|
87 for title, seq in SimpleFastaParser(handle):
|
|
88 idn = title.split(None, 1)[0] # first word only
|
|
89 seq = seq.upper()
|
|
90 try:
|
|
91 by_seq[seq].append(idn)
|
|
92 except KeyError:
|
|
93 by_seq[seq] = [idn]
|
|
94 unique = 0
|
|
95 representatives = dict()
|
|
96 duplicates = set()
|
|
97 for cluster in by_seq.values():
|
|
98 if len(cluster) > 1:
|
|
99 # Is it useful to offer to sort here?
|
|
100 # if sort_ids:
|
|
101 # cluster.sort()
|
|
102 representatives[cluster[0]] = cluster
|
|
103 duplicates.update(cluster[1:])
|
|
104 else:
|
|
105 unique += 1
|
|
106 del by_seq
|
|
107 if duplicates:
|
|
108 # TODO - refactor as a generator with single SeqIO.write(...) call
|
|
109 with open(output_fasta, "w") as handle:
|
|
110 for f in input_fasta:
|
|
111 with gzip_open(f) as in_handle:
|
|
112 for title, seq in SimpleFastaParser(in_handle):
|
|
113 idn = title.split(None, 1)[0] # first word only
|
|
114 if idn in representatives:
|
|
115 cluster = representatives[idn]
|
|
116 if sort_ids:
|
|
117 cluster.sort()
|
|
118 idn = sep.join(cluster)
|
|
119 title = "%s representing %i records" % (idn, len(cluster))
|
|
120 elif idn in duplicates:
|
|
121 continue
|
|
122 # TODO - line wrapping
|
|
123 handle.write(">%s\n%s\n" % (title, seq))
|
|
124 sys.stderr.write("%i unique entries; removed %i duplicates "
|
|
125 "leaving %i representative records\n"
|
|
126 % (unique, len(duplicates), len(representatives)))
|
|
127 else:
|
|
128 os.symlink(os.path.abspath(input_fasta), output_fasta)
|
|
129 sys.stderr.write("No perfect duplicates in file, %i unique entries\n"
|
|
130 % unique)
|
|
131
|
|
132
|
|
133 make_nr(args, options.output, options.sep, options.alphasort)
|