annotate tools/sample_seqs/sample_seqs.py @ 1:16ecf25d521f draft

Uploaded v0.0.1 with fixed README file
author peterjc
date Thu, 27 Mar 2014 12:13:22 -0400
parents 3a807e5ea6c8
children da64f6a9e32b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
1 #!/usr/bin/env python
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
2 """Sub-sample sequence from a FASTA, FASTQ or SFF file.
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
3
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
4 This tool is a short Python script which requires Biopython 1.62 or later
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
5 for SFF file support. If you use this tool in scientific work leading to a
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
6 publication, please cite the Biopython application note:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
7
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
8 Cock et al 2009. Biopython: freely available Python tools for computational
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
9 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
10 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
11
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
12 This script is copyright 2010-2013 by Peter Cock, The James Hutton Institute
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
13 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
14 See accompanying text file for licence details (MIT license).
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
15
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
16 This is version 0.1.0 of the script, use -v or --version to get the version.
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
17 """
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
18 import os
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
19 import sys
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
20
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
21 def stop_err(msg, err=1):
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
22 sys.stderr.write(msg.rstrip() + "\n")
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
23 sys.exit(err)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
24
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
25 if "-v" in sys.argv or "--version" in sys.argv:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
26 print("v0.1.0")
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
27 sys.exit(0)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
28
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
29 #Parse Command Line
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
30 if len(sys.argv) < 5:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
31 stop_err("Requires at least four arguments: seq_format, in_file, out_file, mode, ...")
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
32 seq_format, in_file, out_file, mode = sys.argv[1:5]
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
33 if in_file != "/dev/stdin" and not os.path.isfile(in_file):
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
34 stop_err("Missing input file %r" % in_file)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
35
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
36 if mode == "everyNth":
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
37 if len(sys.argv) != 6:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
38 stop_err("If using everyNth, just need argument N (integer, at least 2)")
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
39 try:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
40 N = int(sys.argv[5])
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
41 except:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
42 stop_err("Bad N argument %r" % sys.argv[5])
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
43 if N < 2:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
44 stop_err("Bad N argument %r" % sys.argv[5])
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
45 if (N % 10) == 1:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
46 sys.stderr.write("Sampling every %ist sequence\n" % N)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
47 elif (N % 10) == 2:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
48 sys.stderr.write("Sampling every %ind sequence\n" % N)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
49 elif (N % 10) == 3:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
50 sys.stderr.write("Sampling every %ird sequence\n" % N)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
51 else:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
52 sys.stderr.write("Sampling every %ith sequence\n" % N)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
53 def sampler(iterator):
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
54 global N
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
55 count = 0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
56 for record in iterator:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
57 count += 1
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
58 if count % N == 1:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
59 yield record
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
60 elif mode == "percentage":
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
61 if len(sys.argv) != 6:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
62 stop_err("If using percentage, just need percentage argument (float, range 0 to 100)")
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
63 try:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
64 percent = float(sys.argv[5]) / 100.0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
65 except:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
66 stop_err("Bad percent argument %r" % sys.argv[5])
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
67 if percent <= 0.0 or 1.0 <= percent:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
68 stop_err("Bad percent argument %r" % sys.argv[5])
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
69 sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
70 def sampler(iterator):
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
71 global percent
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
72 count = 0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
73 taken = 0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
74 for record in iterator:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
75 count += 1
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
76 if percent * count > taken:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
77 taken += 1
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
78 yield record
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
79 else:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
80 stop_err("Unsupported mode %r" % mode)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
81
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
82 def raw_fasta_iterator(handle):
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
83 """Yields raw FASTA records as multi-line strings."""
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
84 while True:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
85 line = handle.readline()
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
86 if line == "":
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
87 return # Premature end of file, or just empty?
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
88 if line[0] == ">":
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
89 break
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
90
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
91 no_id_warned = False
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
92 while True:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
93 if line[0] != ">":
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
94 raise ValueError(
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
95 "Records in Fasta files should start with '>' character")
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
96 try:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
97 id = line[1:].split(None, 1)[0]
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
98 except IndexError:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
99 if not no_id_warned:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
100 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n")
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
101 no_id_warned = True
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
102 id = None
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
103 lines = [line]
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
104 line = handle.readline()
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
105 while True:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
106 if not line:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
107 break
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
108 if line[0] == ">":
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
109 break
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
110 lines.append(line)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
111 line = handle.readline()
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
112 yield "".join(lines)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
113 if not line:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
114 return # StopIteration
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
115
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
116 def fasta_filter(in_file, out_file, iterator_filter):
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
117 count = 0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
118 #Galaxy now requires Python 2.5+ so can use with statements,
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
119 with open(in_file) as in_handle:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
120 with open(out_file, "w") as pos_handle:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
121 for record in iterator_filter(raw_fasta_iterator(in_handle)):
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
122 count += 1
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
123 pos_handle.write(record)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
124 return count
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
125
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
126 try:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
127 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
128 def fastq_filter(in_file, out_file, iterator_filter):
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
129 count = 0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
130 #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
131 reader = fastqReader(open(in_file, "rU"))
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
132 writer = fastqWriter(open(out_file, "w"))
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
133 for record in iterator_filter(reader):
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
134 count += 1
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
135 writer.write(record)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
136 writer.close()
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
137 reader.close()
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
138 return count
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
139 except ImportError:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
140 from Bio.SeqIO.QualityIO import FastqGeneralIterator
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
141 def fastq_filter(in_file, out_file, iterator_filter):
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
142 count = 0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
143 with open(in_file) as in_handle:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
144 with open(out_file, "w") as pos_handle:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
145 for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)):
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
146 count += 1
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
147 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
148 return count
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
149
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
150 def sff_filter(in_file, out_file, iterator_filter):
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
151 count = 0
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
152 try:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
153 from Bio.SeqIO.SffIO import SffIterator, SffWriter
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
154 except ImportError:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
155 stop_err("SFF filtering requires Biopython 1.54 or later")
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
156 try:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
157 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
158 except ImportError:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
159 #Prior to Biopython 1.56 this was a private function
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
160 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
161 with open(in_file, "rb") as in_handle:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
162 try:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
163 manifest = ReadRocheXmlManifest(in_handle)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
164 except ValueError:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
165 manifest = None
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
166 in_handle.seek(0)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
167 with open(out_file, "wb") as out_handle:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
168 writer = SffWriter(out_handle, xml=manifest)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
169 in_handle.seek(0) #start again after getting manifest
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
170 count = writer.write_file(iterator_filter(SffIterator(in_handle)))
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
171 #count = writer.write_file(SffIterator(in_handle))
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
172 return count
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
173
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
174 if seq_format.lower()=="sff":
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
175 count = sff_filter(in_file, out_file, sampler)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
176 elif seq_format.lower()=="fasta":
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
177 count = fasta_filter(in_file, out_file, sampler)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
178 elif seq_format.lower().startswith("fastq"):
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
179 count = fastq_filter(in_file, out_file, sampler)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
180 else:
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
181 stop_err("Unsupported file type %r" % seq_format)
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
182
3a807e5ea6c8 Uploaded v0.0.1
peterjc
parents:
diff changeset
183 sys.stderr.write("Sampled %i records\n" % count)