Mercurial > repos > peterjc > seq_filter_by_id
comparison tools/seq_filter_by_id/seq_filter_by_id.py @ 10:4a7d8ad2a983 draft
Bump Biopython dependency
author | peterjc |
---|---|
date | Thu, 30 Nov 2023 09:50:34 +0000 |
parents | 141612f8c3e3 |
children | 85ef5f5a0562 |
comparison
equal
deleted
inserted
replaced
9:141612f8c3e3 | 10:4a7d8ad2a983 |
---|---|
17 If you use this tool in scientific work leading to a publication, please | 17 If you use this tool in scientific work leading to a publication, please |
18 cite the Biopython application note: | 18 cite the Biopython application note: |
19 | 19 |
20 Cock et al 2009. Biopython: freely available Python tools for computational | 20 Cock et al 2009. Biopython: freely available Python tools for computational |
21 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | 21 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. |
22 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | 22 https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878. |
23 | 23 |
24 This script is copyright 2010-2017 by Peter Cock, The James Hutton Institute | 24 This script is copyright 2010-2017 by Peter Cock, The James Hutton Institute |
25 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. | 25 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. |
26 See accompanying text file for licence details (MIT license). | 26 See accompanying text file for licence details (MIT license). |
27 | 27 |
47 | 47 |
48 Multiple tabular files and column numbers may be given, or replaced with | 48 Multiple tabular files and column numbers may be given, or replaced with |
49 the -t or --text option. | 49 the -t or --text option. |
50 """ | 50 """ |
51 parser = OptionParser(usage=usage) | 51 parser = OptionParser(usage=usage) |
52 parser.add_option('-i', '--input', dest='input', | 52 parser.add_option( |
53 default=None, help='Input sequences filename', | 53 "-i", |
54 metavar="FILE") | 54 "--input", |
55 parser.add_option('-f', '--format', dest='format', | 55 dest="input", |
56 default=None, | 56 default=None, |
57 help='Input sequence format (e.g. fasta, fastq, sff)') | 57 help="Input sequences filename", |
58 parser.add_option('-t', '--text', dest='id_list', | 58 metavar="FILE", |
59 default=None, help="Lists of white space separated IDs (instead of a tabular file)") | 59 ) |
60 parser.add_option('-p', '--positive', dest='output_positive', | 60 parser.add_option( |
61 default=None, | 61 "-f", |
62 help='Output filename for matches', | 62 "--format", |
63 metavar="FILE") | 63 dest="format", |
64 parser.add_option('-n', '--negative', dest='output_negative', | 64 default=None, |
65 default=None, | 65 help="Input sequence format (e.g. fasta, fastq, sff)", |
66 help='Output filename for non-matches', | 66 ) |
67 metavar="FILE") | 67 parser.add_option( |
68 parser.add_option("-l", "--logic", dest="logic", | 68 "-t", |
69 default="UNION", | 69 "--text", |
70 help="How to combined multiple ID columns (UNION or INTERSECTION)") | 70 dest="id_list", |
71 parser.add_option("-s", "--suffix", dest="suffix", | 71 default=None, |
72 action="store_true", | 72 help="Lists of white space separated IDs (instead of a tabular file)", |
73 help="Ignore pair-read suffices for matching names") | 73 ) |
74 parser.add_option("-v", "--version", dest="version", | 74 parser.add_option( |
75 default=False, action="store_true", | 75 "-p", |
76 help="Show version and quit") | 76 "--positive", |
77 dest="output_positive", | |
78 default=None, | |
79 help="Output filename for matches", | |
80 metavar="FILE", | |
81 ) | |
82 parser.add_option( | |
83 "-n", | |
84 "--negative", | |
85 dest="output_negative", | |
86 default=None, | |
87 help="Output filename for non-matches", | |
88 metavar="FILE", | |
89 ) | |
90 parser.add_option( | |
91 "-l", | |
92 "--logic", | |
93 dest="logic", | |
94 default="UNION", | |
95 help="How to combined multiple ID columns (UNION or INTERSECTION)", | |
96 ) | |
97 parser.add_option( | |
98 "-s", | |
99 "--suffix", | |
100 dest="suffix", | |
101 action="store_true", | |
102 help="Ignore pair-read suffixes for matching names", | |
103 ) | |
104 parser.add_option( | |
105 "-v", | |
106 "--version", | |
107 dest="version", | |
108 default=False, | |
109 action="store_true", | |
110 help="Show version and quit", | |
111 ) | |
77 | 112 |
78 options, args = parser.parse_args() | 113 options, args = parser.parse_args() |
79 | 114 |
80 if options.version: | 115 if options.version: |
81 print("v0.2.7") | 116 print("v0.2.7") |
84 in_file = options.input | 119 in_file = options.input |
85 seq_format = options.format | 120 seq_format = options.format |
86 out_positive_file = options.output_positive | 121 out_positive_file = options.output_positive |
87 out_negative_file = options.output_negative | 122 out_negative_file = options.output_negative |
88 logic = options.logic | 123 logic = options.logic |
89 drop_suffices = bool(options.suffix) | 124 drop_suffixes = bool(options.suffix) |
90 | 125 |
91 if in_file is None or not os.path.isfile(in_file): | 126 if in_file is None or not os.path.isfile(in_file): |
92 sys.exit("Missing input file: %r" % in_file) | 127 sys.exit("Missing input file: %r" % in_file) |
93 if out_positive_file is None and out_negative_file is None: | 128 if out_positive_file is None and out_negative_file is None: |
94 sys.exit("Neither output file requested") | 129 sys.exit("Neither output file requested") |
130 if not os.path.isfile(tabular_file): | 165 if not os.path.isfile(tabular_file): |
131 sys.exit("Missing tabular identifier file %r" % tabular_file) | 166 sys.exit("Missing tabular identifier file %r" % tabular_file) |
132 try: | 167 try: |
133 columns = [int(arg) - 1 for arg in cols_arg.split(",")] | 168 columns = [int(arg) - 1 for arg in cols_arg.split(",")] |
134 except ValueError: | 169 except ValueError: |
135 sys.exit("Expected list of columns (comma separated integers), got %r" % cols_arg) | 170 sys.exit( |
171 "Expected list of columns (comma separated integers), got %r" % cols_arg | |
172 ) | |
136 if min(columns) < 0: | 173 if min(columns) < 0: |
137 sys.exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg) | 174 sys.exit( |
175 "Expect one-based column numbers (not zero-based counting), got %r" | |
176 % cols_arg | |
177 ) | |
138 identifiers.append((tabular_file, columns)) | 178 identifiers.append((tabular_file, columns)) |
139 | 179 |
140 name_warn = False | 180 name_warn = False |
141 | 181 |
142 | 182 |
143 def check_white_space(name): | 183 def check_white_space(name): |
144 """Check identifier for white space, take first word only.""" | 184 """Check identifier for white space, take first word only.""" |
145 parts = name.split(None, 1) | 185 parts = name.split(None, 1) |
146 global name_warn | 186 global name_warn |
147 if not name_warn and len(parts) > 1: | 187 if not name_warn and len(parts) > 1: |
148 name_warn = "WARNING: Some of your identifiers had white space in them, " + \ | 188 name_warn = ( |
149 "using first word only. e.g.:\n%s\n" % name | 189 "WARNING: Some of your identifiers had white space in them, " |
190 + "using first word only. e.g.:\n%s\n" % name | |
191 ) | |
150 return parts[0] | 192 return parts[0] |
151 | 193 |
152 | 194 |
153 if drop_suffices: | 195 if drop_suffixes: |
196 | |
154 def clean_name(name): | 197 def clean_name(name): |
155 """Remove suffix.""" | 198 """Remove suffix.""" |
156 name = check_white_space(name) | 199 name = check_white_space(name) |
157 match = re_suffix.search(name) | 200 match = re_suffix.search(name) |
158 if match: | 201 if match: |
159 # Use the fact this is a suffix, and regular expression will be | 202 # Use the fact this is a suffix, and regular expression will be |
160 # anchored to the end of the name: | 203 # anchored to the end of the name: |
161 return name[:match.start()] | 204 return name[: match.start()] |
162 else: | 205 else: |
163 # Nothing to do | 206 # Nothing to do |
164 return name | 207 return name |
208 | |
165 assert clean_name("foo/1") == "foo" | 209 assert clean_name("foo/1") == "foo" |
166 assert clean_name("foo/2") == "foo" | 210 assert clean_name("foo/2") == "foo" |
167 assert clean_name("bar.f") == "bar" | 211 assert clean_name("bar.f") == "bar" |
168 assert clean_name("bar.r") == "bar" | 212 assert clean_name("bar.r") == "bar" |
169 assert clean_name("baz.p1") == "baz" | 213 assert clean_name("baz.p1") == "baz" |
172 # Just check the white space | 216 # Just check the white space |
173 clean_name = check_white_space | 217 clean_name = check_white_space |
174 | 218 |
175 | 219 |
176 mapped_chars = { | 220 mapped_chars = { |
177 '>': '__gt__', | 221 ">": "__gt__", |
178 '<': '__lt__', | 222 "<": "__lt__", |
179 "'": '__sq__', | 223 "'": "__sq__", |
180 '"': '__dq__', | 224 '"': "__dq__", |
181 '[': '__ob__', | 225 "[": "__ob__", |
182 ']': '__cb__', | 226 "]": "__cb__", |
183 '{': '__oc__', | 227 "{": "__oc__", |
184 '}': '__cc__', | 228 "}": "__cc__", |
185 '@': '__at__', | 229 "@": "__at__", |
186 '\n': '__cn__', | 230 "\n": "__cn__", |
187 '\r': '__cr__', | 231 "\r": "__cr__", |
188 '\t': '__tc__', | 232 "\t": "__tc__", |
189 '#': '__pd__', | 233 "#": "__pd__", |
190 } | 234 } |
191 | 235 |
192 # Read tabular file(s) and record all specified identifiers | 236 # Read tabular file(s) and record all specified identifiers |
193 ids = None # Will be a set | 237 ids = None # Will be a set |
194 if options.id_list: | 238 if options.id_list: |
223 continue | 267 continue |
224 if not line.startswith("#"): | 268 if not line.startswith("#"): |
225 name = clean_name(line.rstrip("\n").split("\t")[col]) | 269 name = clean_name(line.rstrip("\n").split("\t")[col]) |
226 if name: | 270 if name: |
227 file_ids.add(name) | 271 file_ids.add(name) |
228 print("Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns))) | 272 print( |
273 "Using %i IDs from column %s in tabular file" | |
274 % (len(file_ids), ", ".join(str(col + 1) for col in columns)) | |
275 ) | |
229 if ids is None: | 276 if ids is None: |
230 ids = file_ids | 277 ids = file_ids |
231 if logic == "UNION": | 278 if logic == "UNION": |
232 ids.update(file_ids) | 279 ids.update(file_ids) |
233 else: | 280 else: |
234 ids.intersection_update(file_ids) | 281 ids.intersection_update(file_ids) |
235 handle.close() | 282 handle.close() |
236 if len(identifiers) > 1: | 283 if len(identifiers) > 1: |
237 if logic == "UNION": | 284 if logic == "UNION": |
238 print("Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers))) | 285 print( |
286 "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers)) | |
287 ) | |
239 else: | 288 else: |
240 print("Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers))) | 289 print( |
290 "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)) | |
291 ) | |
241 if name_warn: | 292 if name_warn: |
242 sys.stderr.write(name_warn) | 293 sys.stderr.write(name_warn) |
243 | 294 |
244 | 295 |
245 def crude_fasta_iterator(handle): | 296 def crude_fasta_iterator(handle): |
246 """Yields tuples, record ID and the full record as a string.""" | 297 """Parse FASTA file yielding tuples of (name, sequence).""" |
247 while True: | 298 while True: |
248 line = handle.readline() | 299 line = handle.readline() |
249 if line == "": | 300 if line == "": |
250 return # Premature end of file, or just empty? | 301 return # Premature end of file, or just empty? |
251 if line[0] == ">": | 302 if line[0] == ">": |
252 break | 303 break |
253 | 304 |
254 no_id_warned = False | 305 no_id_warned = False |
255 while True: | 306 while True: |
256 if line[0] != ">": | 307 if line[0] != ">": |
257 raise ValueError( | 308 raise ValueError("Records in Fasta files should start with '>' character") |
258 "Records in Fasta files should start with '>' character") | |
259 try: | 309 try: |
260 id = line[1:].split(None, 1)[0] | 310 id = line[1:].split(None, 1)[0] |
261 except IndexError: | 311 except IndexError: |
262 if not no_id_warned: | 312 if not no_id_warned: |
263 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") | 313 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") |
318 | 368 |
319 | 369 |
320 def fastq_filter(in_file, pos_file, neg_file, wanted): | 370 def fastq_filter(in_file, pos_file, neg_file, wanted): |
321 """FASTQ filter.""" | 371 """FASTQ filter.""" |
322 from Bio.SeqIO.QualityIO import FastqGeneralIterator | 372 from Bio.SeqIO.QualityIO import FastqGeneralIterator |
373 | |
323 handle = open(in_file, "r") | 374 handle = open(in_file, "r") |
324 if pos_file is not None and neg_file is not None: | 375 if pos_file is not None and neg_file is not None: |
325 print("Generating two FASTQ files") | 376 print("Generating two FASTQ files") |
326 positive_handle = open(pos_file, "w") | 377 positive_handle = open(pos_file, "w") |
327 negative_handle = open(neg_file, "w") | 378 negative_handle = open(neg_file, "w") |
376 pos_count = neg_count = 0 | 427 pos_count = neg_count = 0 |
377 if pos_file is not None: | 428 if pos_file is not None: |
378 out_handle = open(pos_file, "wb") | 429 out_handle = open(pos_file, "wb") |
379 writer = SffWriter(out_handle, xml=manifest) | 430 writer = SffWriter(out_handle, xml=manifest) |
380 in_handle.seek(0) # start again after getting manifest | 431 in_handle.seek(0) # start again after getting manifest |
381 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted) | 432 pos_count = writer.write_file( |
433 rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted | |
434 ) | |
382 out_handle.close() | 435 out_handle.close() |
383 if neg_file is not None: | 436 if neg_file is not None: |
384 out_handle = open(neg_file, "wb") | 437 out_handle = open(neg_file, "wb") |
385 writer = SffWriter(out_handle, xml=manifest) | 438 writer = SffWriter(out_handle, xml=manifest) |
386 in_handle.seek(0) # start again | 439 in_handle.seek(0) # start again |
387 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted) | 440 neg_count = writer.write_file( |
441 rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted | |
442 ) | |
388 out_handle.close() | 443 out_handle.close() |
389 # And we're done | 444 # And we're done |
390 in_handle.close() | 445 in_handle.close() |
391 # At the time of writing, Galaxy doesn't show SFF file read counts, | 446 # At the time of writing, Galaxy doesn't show SFF file read counts, |
392 # so it is useful to put them in stdout and thus shown in job info. | 447 # so it is useful to put them in stdout and thus shown in job info. |
393 return pos_count, neg_count | 448 return pos_count, neg_count |
394 | 449 |
395 | 450 |
396 if seq_format.lower() == "sff": | 451 if seq_format.lower() == "sff": |
397 # Now write filtered SFF file based on IDs wanted | 452 # Now write filtered SFF file based on IDs wanted |
398 pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids) | 453 pos_count, neg_count = sff_filter( |
454 in_file, out_positive_file, out_negative_file, ids | |
455 ) | |
399 # At the time of writing, Galaxy doesn't show SFF file read counts, | 456 # At the time of writing, Galaxy doesn't show SFF file read counts, |
400 # so it is useful to put them in stdout and thus shown in job info. | 457 # so it is useful to put them in stdout and thus shown in job info. |
401 elif seq_format.lower() == "fasta": | 458 elif seq_format.lower() == "fasta": |
402 # Write filtered FASTA file based on IDs from tabular file | 459 # Write filtered FASTA file based on IDs from tabular file |
403 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) | 460 pos_count, neg_count = fasta_filter( |
461 in_file, out_positive_file, out_negative_file, ids | |
462 ) | |
404 print("%i with and %i without specified IDs" % (pos_count, neg_count)) | 463 print("%i with and %i without specified IDs" % (pos_count, neg_count)) |
405 elif seq_format.lower().startswith("fastq"): | 464 elif seq_format.lower().startswith("fastq"): |
406 # Write filtered FASTQ file based on IDs from tabular file | 465 # Write filtered FASTQ file based on IDs from tabular file |
407 fastq_filter(in_file, out_positive_file, out_negative_file, ids) | 466 fastq_filter(in_file, out_positive_file, out_negative_file, ids) |
408 # This does not currently track the counts | 467 # This does not currently track the counts |