0
|
1 #!/usr/bin/env python
|
|
2 """Sub-sample sequence from a FASTA, FASTQ or SFF file.
|
|
3
|
|
4 This tool is a short Python script which requires Biopython 1.62 or later
|
|
5 for SFF file support. If you use this tool in scientific work leading to a
|
|
6 publication, please cite the Biopython application note:
|
|
7
|
|
8 Cock et al 2009. Biopython: freely available Python tools for computational
|
|
9 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
|
|
10 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
|
|
11
|
|
12 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute
|
|
13 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
|
|
14 See accompanying text file for licence details (MIT license).
|
|
15
|
|
16 This is version 0.1.0 of the script, use -v or --version to get the version.
|
|
17 """
|
|
18 import os
|
|
19 import sys
|
|
20
|
|
21 def stop_err(msg, err=1):
|
|
22 sys.stderr.write(msg.rstrip() + "\n")
|
|
23 sys.exit(err)
|
|
24
|
|
25 if "-v" in sys.argv or "--version" in sys.argv:
|
|
26 print("v0.1.0")
|
|
27 sys.exit(0)
|
|
28
|
|
29 #Parse Command Line
|
|
30 if len(sys.argv) < 5:
|
|
31 stop_err("Requires at least four arguments: seq_format, in_file, out_file, mode, ...")
|
|
32 seq_format, in_file, out_file, mode = sys.argv[1:5]
|
|
33 if in_file != "/dev/stdin" and not os.path.isfile(in_file):
|
|
34 stop_err("Missing input file %r" % in_file)
|
|
35
|
|
36 if mode == "everyNth":
|
|
37 if len(sys.argv) != 6:
|
|
38 stop_err("If using everyNth, just need argument N (integer, at least 2)")
|
|
39 try:
|
|
40 N = int(sys.argv[5])
|
|
41 except:
|
|
42 stop_err("Bad N argument %r" % sys.argv[5])
|
|
43 if N < 2:
|
|
44 stop_err("Bad N argument %r" % sys.argv[5])
|
|
45 if (N % 10) == 1:
|
|
46 sys.stderr.write("Sampling every %ist sequence\n" % N)
|
|
47 elif (N % 10) == 2:
|
|
48 sys.stderr.write("Sampling every %ind sequence\n" % N)
|
|
49 elif (N % 10) == 3:
|
|
50 sys.stderr.write("Sampling every %ird sequence\n" % N)
|
|
51 else:
|
|
52 sys.stderr.write("Sampling every %ith sequence\n" % N)
|
|
53 def sampler(iterator):
|
|
54 global N
|
|
55 count = 0
|
|
56 for record in iterator:
|
|
57 count += 1
|
|
58 if count % N == 1:
|
|
59 yield record
|
|
60 elif mode == "percentage":
|
|
61 if len(sys.argv) != 6:
|
|
62 stop_err("If using percentage, just need percentage argument (float, range 0 to 100)")
|
|
63 try:
|
|
64 percent = float(sys.argv[5]) / 100.0
|
|
65 except:
|
|
66 stop_err("Bad percent argument %r" % sys.argv[5])
|
|
67 if percent <= 0.0 or 1.0 <= percent:
|
|
68 stop_err("Bad percent argument %r" % sys.argv[5])
|
|
69 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))
|
|
70 def sampler(iterator):
|
|
71 global percent
|
|
72 count = 0
|
|
73 taken = 0
|
|
74 for record in iterator:
|
|
75 count += 1
|
|
76 if percent * count > taken:
|
|
77 taken += 1
|
|
78 yield record
|
|
79 else:
|
|
80 stop_err("Unsupported mode %r" % mode)
|
|
81
|
|
82 def raw_fasta_iterator(handle):
|
|
83 """Yields raw FASTA records as multi-line strings."""
|
|
84 while True:
|
|
85 line = handle.readline()
|
|
86 if line == "":
|
|
87 return # Premature end of file, or just empty?
|
|
88 if line[0] == ">":
|
|
89 break
|
|
90
|
|
91 no_id_warned = False
|
|
92 while True:
|
|
93 if line[0] != ">":
|
|
94 raise ValueError(
|
|
95 "Records in Fasta files should start with '>' character")
|
|
96 try:
|
|
97 id = line[1:].split(None, 1)[0]
|
|
98 except IndexError:
|
|
99 if not no_id_warned:
|
|
100 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n")
|
|
101 no_id_warned = True
|
|
102 id = None
|
|
103 lines = [line]
|
|
104 line = handle.readline()
|
|
105 while True:
|
|
106 if not line:
|
|
107 break
|
|
108 if line[0] == ">":
|
|
109 break
|
|
110 lines.append(line)
|
|
111 line = handle.readline()
|
|
112 yield "".join(lines)
|
|
113 if not line:
|
|
114 return # StopIteration
|
|
115
|
|
116 def fasta_filter(in_file, out_file, iterator_filter):
|
|
117 count = 0
|
|
118 #Galaxy now requires Python 2.5+ so can use with statements,
|
|
119 with open(in_file) as in_handle:
|
|
120 with open(out_file, "w") as pos_handle:
|
|
121 for record in iterator_filter(raw_fasta_iterator(in_handle)):
|
|
122 count += 1
|
|
123 pos_handle.write(record)
|
|
124 return count
|
|
125
|
|
126 try:
|
|
127 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
|
|
128 def fastq_filter(in_file, out_file, iterator_filter):
|
|
129 count = 0
|
|
130 #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
|
|
131 reader = fastqReader(open(in_file, "rU"))
|
|
132 writer = fastqWriter(open(out_file, "w"))
|
|
133 for record in iterator_filter(reader):
|
|
134 count += 1
|
|
135 writer.write(record)
|
|
136 writer.close()
|
|
137 reader.close()
|
|
138 return count
|
|
139 except ImportError:
|
|
140 from Bio.SeqIO.QualityIO import FastqGeneralIterator
|
|
141 def fastq_filter(in_file, out_file, iterator_filter):
|
|
142 count = 0
|
|
143 with open(in_file) as in_handle:
|
|
144 with open(out_file, "w") as pos_handle:
|
|
145 for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)):
|
|
146 count += 1
|
|
147 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
|
|
148 return count
|
|
149
|
|
150 def sff_filter(in_file, out_file, iterator_filter):
|
|
151 count = 0
|
|
152 try:
|
|
153 from Bio.SeqIO.SffIO import SffIterator, SffWriter
|
|
154 except ImportError:
|
|
155 stop_err("SFF filtering requires Biopython 1.54 or later")
|
|
156 try:
|
|
157 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
|
|
158 except ImportError:
|
|
159 #Prior to Biopython 1.56 this was a private function
|
|
160 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
|
|
161 with open(in_file, "rb") as in_handle:
|
|
162 try:
|
|
163 manifest = ReadRocheXmlManifest(in_handle)
|
|
164 except ValueError:
|
|
165 manifest = None
|
|
166 in_handle.seek(0)
|
|
167 with open(out_file, "wb") as out_handle:
|
|
168 writer = SffWriter(out_handle, xml=manifest)
|
|
169 in_handle.seek(0) #start again after getting manifest
|
|
170 count = writer.write_file(iterator_filter(SffIterator(in_handle)))
|
|
171 #count = writer.write_file(SffIterator(in_handle))
|
|
172 return count
|
|
173
|
|
174 if seq_format.lower()=="sff":
|
|
175 count = sff_filter(in_file, out_file, sampler)
|
|
176 elif seq_format.lower()=="fasta":
|
|
177 count = fasta_filter(in_file, out_file, sampler)
|
|
178 elif seq_format.lower().startswith("fastq"):
|
|
179 count = fastq_filter(in_file, out_file, sampler)
|
|
180 else:
|
|
181 stop_err("Unsupported file type %r" % seq_format)
|
|
182
|
|
183 sys.stderr.write("Sampled %i records\n" % count)
|