annotate tools/make_nr/make_nr.py @ 0:c84f12187af9 draft

v0.0.1
author peterjc
date Fri, 09 Nov 2018 11:00:03 -0500
parents
children 84e483325b04
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
peterjc
parents:
diff changeset
1 #!/usr/bin/env python3
peterjc
parents:
diff changeset
2 """Make FASTA files non-redundant by combining duplicated sequences.
peterjc
parents:
diff changeset
3
peterjc
parents:
diff changeset
4 This script takes one or more (optionally gzipped) FASTA filenames as input,
peterjc
parents:
diff changeset
5 and will return a non-zero error if any duplicate identifiers are found.
peterjc
parents:
diff changeset
6
peterjc
parents:
diff changeset
7 Writes output to stdout by default.
peterjc
parents:
diff changeset
8
peterjc
parents:
diff changeset
9 Keeps all the sequences in memory, beware!
peterjc
parents:
diff changeset
10 """
peterjc
parents:
diff changeset
11 from __future__ import print_function
peterjc
parents:
diff changeset
12
peterjc
parents:
diff changeset
13 import gzip
peterjc
parents:
diff changeset
14 import os
peterjc
parents:
diff changeset
15 import sys
peterjc
parents:
diff changeset
16
peterjc
parents:
diff changeset
17 from optparse import OptionParser
peterjc
parents:
diff changeset
18
peterjc
parents:
diff changeset
19
peterjc
parents:
diff changeset
20 if "-v" in sys.argv or "--version" in sys.argv:
peterjc
parents:
diff changeset
21 print("v0.0.1")
peterjc
parents:
diff changeset
22 sys.exit(0)
peterjc
parents:
diff changeset
23
peterjc
parents:
diff changeset
24
peterjc
parents:
diff changeset
25 # Parse Command Line
peterjc
parents:
diff changeset
26 usage = """Use as follows:
peterjc
parents:
diff changeset
27
peterjc
parents:
diff changeset
28 $ python make_nr.py [options] A.fasta [B.fasta ...]
peterjc
parents:
diff changeset
29
peterjc
parents:
diff changeset
30 For example,
peterjc
parents:
diff changeset
31
peterjc
parents:
diff changeset
32 $ python make_nr.py -o dedup.fasta -s ";" input1.fasta input2.fasta
peterjc
parents:
diff changeset
33
peterjc
parents:
diff changeset
34 The input files should be plain text FASTA format, optionally gzipped.
peterjc
parents:
diff changeset
35
peterjc
parents:
diff changeset
36 The -a option controls how the representative replacement record for
peterjc
parents:
diff changeset
37 duplicated records are named. By default the identifiers are taken
peterjc
parents:
diff changeset
38 in the input file order, combined with the separator. If the -a or
peterjc
parents:
diff changeset
39 alphasort option is picked, the identifiers are alphabetically sorted
peterjc
parents:
diff changeset
40 first. This ensures the same names are used even if the input file
peterjc
parents:
diff changeset
41 order (or the record order within the input files) is randomised.
peterjc
parents:
diff changeset
42
peterjc
parents:
diff changeset
43 There is additional guidance in the help text in the make_nr.xml file,
peterjc
parents:
diff changeset
44 which is shown to the user via the Galaxy interface to this tool.
peterjc
parents:
diff changeset
45 """
peterjc
parents:
diff changeset
46
peterjc
parents:
diff changeset
47 parser = OptionParser(usage=usage)
peterjc
parents:
diff changeset
48 parser.add_option("-s", "--sep", dest="sep",
peterjc
parents:
diff changeset
49 default=";",
peterjc
parents:
diff changeset
50 help="Separator character for combining identifiers "
peterjc
parents:
diff changeset
51 "of duplicated records e.g. '|' or ';' (required)")
peterjc
parents:
diff changeset
52 parser.add_option("-a", "--alphasort", action="store_true",
peterjc
parents:
diff changeset
53 help="When merging duplicated records sort their "
peterjc
parents:
diff changeset
54 "identifiers alphabetically before combining them. "
peterjc
parents:
diff changeset
55 "Default is input file order.")
peterjc
parents:
diff changeset
56 parser.add_option("-o", "--output", dest="output",
peterjc
parents:
diff changeset
57 default="/dev/stdout", metavar="FILE",
peterjc
parents:
diff changeset
58 help="Output filename (defaults to stdout)")
peterjc
parents:
diff changeset
59 options, args = parser.parse_args()
peterjc
parents:
diff changeset
60
peterjc
parents:
diff changeset
61 if not args:
peterjc
parents:
diff changeset
62 sys.exit("Expects at least one input FASTA filename")
peterjc
parents:
diff changeset
63
peterjc
parents:
diff changeset
64
peterjc
parents:
diff changeset
65 def gzip_open(filename):
peterjc
parents:
diff changeset
66 """Open a possibly gzipped text file."""
peterjc
parents:
diff changeset
67 with open(filename, "rb") as h:
peterjc
parents:
diff changeset
68 magic = h.read(2)
peterjc
parents:
diff changeset
69 if magic == b'\x1f\x8b':
peterjc
parents:
diff changeset
70 return gzip.open(filename, "rt")
peterjc
parents:
diff changeset
71 else:
peterjc
parents:
diff changeset
72 return open(filename)
peterjc
parents:
diff changeset
73
peterjc
parents:
diff changeset
74
peterjc
parents:
diff changeset
75 def make_nr(input_fasta, output_fasta, sep=";", sort_ids=False):
peterjc
parents:
diff changeset
76 """Make the sequences in FASTA files non-redundant.
peterjc
parents:
diff changeset
77
peterjc
parents:
diff changeset
78 Argument input_fasta is a list of filenames.
peterjc
parents:
diff changeset
79 """
peterjc
parents:
diff changeset
80 by_seq = dict()
peterjc
parents:
diff changeset
81 try:
peterjc
parents:
diff changeset
82 from Bio.SeqIO.FastaIO import SimpleFastaParser
peterjc
parents:
diff changeset
83 except ImportError:
peterjc
parents:
diff changeset
84 sys.exit("Missing Biopython")
peterjc
parents:
diff changeset
85 for f in input_fasta:
peterjc
parents:
diff changeset
86 with gzip_open(f) as handle:
peterjc
parents:
diff changeset
87 for title, seq in SimpleFastaParser(handle):
peterjc
parents:
diff changeset
88 idn = title.split(None, 1)[0] # first word only
peterjc
parents:
diff changeset
89 seq = seq.upper()
peterjc
parents:
diff changeset
90 try:
peterjc
parents:
diff changeset
91 by_seq[seq].append(idn)
peterjc
parents:
diff changeset
92 except KeyError:
peterjc
parents:
diff changeset
93 by_seq[seq] = [idn]
peterjc
parents:
diff changeset
94 unique = 0
peterjc
parents:
diff changeset
95 representatives = dict()
peterjc
parents:
diff changeset
96 duplicates = set()
peterjc
parents:
diff changeset
97 for cluster in by_seq.values():
peterjc
parents:
diff changeset
98 if len(cluster) > 1:
peterjc
parents:
diff changeset
99 # Is it useful to offer to sort here?
peterjc
parents:
diff changeset
100 # if sort_ids:
peterjc
parents:
diff changeset
101 # cluster.sort()
peterjc
parents:
diff changeset
102 representatives[cluster[0]] = cluster
peterjc
parents:
diff changeset
103 duplicates.update(cluster[1:])
peterjc
parents:
diff changeset
104 else:
peterjc
parents:
diff changeset
105 unique += 1
peterjc
parents:
diff changeset
106 del by_seq
peterjc
parents:
diff changeset
107 if duplicates:
peterjc
parents:
diff changeset
108 # TODO - refactor as a generator with single SeqIO.write(...) call
peterjc
parents:
diff changeset
109 with open(output_fasta, "w") as handle:
peterjc
parents:
diff changeset
110 for f in input_fasta:
peterjc
parents:
diff changeset
111 with gzip_open(f) as in_handle:
peterjc
parents:
diff changeset
112 for title, seq in SimpleFastaParser(in_handle):
peterjc
parents:
diff changeset
113 idn = title.split(None, 1)[0] # first word only
peterjc
parents:
diff changeset
114 if idn in representatives:
peterjc
parents:
diff changeset
115 cluster = representatives[idn]
peterjc
parents:
diff changeset
116 if sort_ids:
peterjc
parents:
diff changeset
117 cluster.sort()
peterjc
parents:
diff changeset
118 idn = sep.join(cluster)
peterjc
parents:
diff changeset
119 title = "%s representing %i records" % (idn, len(cluster))
peterjc
parents:
diff changeset
120 elif idn in duplicates:
peterjc
parents:
diff changeset
121 continue
peterjc
parents:
diff changeset
122 # TODO - line wrapping
peterjc
parents:
diff changeset
123 handle.write(">%s\n%s\n" % (title, seq))
peterjc
parents:
diff changeset
124 sys.stderr.write("%i unique entries; removed %i duplicates "
peterjc
parents:
diff changeset
125 "leaving %i representative records\n"
peterjc
parents:
diff changeset
126 % (unique, len(duplicates), len(representatives)))
peterjc
parents:
diff changeset
127 else:
peterjc
parents:
diff changeset
128 os.symlink(os.path.abspath(input_fasta), output_fasta)
peterjc
parents:
diff changeset
129 sys.stderr.write("No perfect duplicates in file, %i unique entries\n"
peterjc
parents:
diff changeset
130 % unique)
peterjc
parents:
diff changeset
131
peterjc
parents:
diff changeset
132
peterjc
parents:
diff changeset
133 make_nr(args, options.output, options.sep, options.alphasort)