comparison tools/seq_filter_by_id/seq_filter_by_id.py @ 10:4a7d8ad2a983 draft

Bump Biopython dependency
author peterjc
date Thu, 30 Nov 2023 09:50:34 +0000
parents 141612f8c3e3
children 85ef5f5a0562
comparison
equal deleted inserted replaced
9:141612f8c3e3 10:4a7d8ad2a983
17 If you use this tool in scientific work leading to a publication, please 17 If you use this tool in scientific work leading to a publication, please
18 cite the Biopython application note: 18 cite the Biopython application note:
19 19
20 Cock et al 2009. Biopython: freely available Python tools for computational 20 Cock et al 2009. Biopython: freely available Python tools for computational
21 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. 21 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
22 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. 22 https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
23 23
24 This script is copyright 2010-2017 by Peter Cock, The James Hutton Institute 24 This script is copyright 2010-2017 by Peter Cock, The James Hutton Institute
25 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. 25 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
26 See accompanying text file for licence details (MIT license). 26 See accompanying text file for licence details (MIT license).
27 27
47 47
48 Multiple tabular files and column numbers may be given, or replaced with 48 Multiple tabular files and column numbers may be given, or replaced with
49 the -t or --text option. 49 the -t or --text option.
50 """ 50 """
51 parser = OptionParser(usage=usage) 51 parser = OptionParser(usage=usage)
52 parser.add_option('-i', '--input', dest='input', 52 parser.add_option(
53 default=None, help='Input sequences filename', 53 "-i",
54 metavar="FILE") 54 "--input",
55 parser.add_option('-f', '--format', dest='format', 55 dest="input",
56 default=None, 56 default=None,
57 help='Input sequence format (e.g. fasta, fastq, sff)') 57 help="Input sequences filename",
58 parser.add_option('-t', '--text', dest='id_list', 58 metavar="FILE",
59 default=None, help="Lists of white space separated IDs (instead of a tabular file)") 59 )
60 parser.add_option('-p', '--positive', dest='output_positive', 60 parser.add_option(
61 default=None, 61 "-f",
62 help='Output filename for matches', 62 "--format",
63 metavar="FILE") 63 dest="format",
64 parser.add_option('-n', '--negative', dest='output_negative', 64 default=None,
65 default=None, 65 help="Input sequence format (e.g. fasta, fastq, sff)",
66 help='Output filename for non-matches', 66 )
67 metavar="FILE") 67 parser.add_option(
68 parser.add_option("-l", "--logic", dest="logic", 68 "-t",
69 default="UNION", 69 "--text",
70 help="How to combined multiple ID columns (UNION or INTERSECTION)") 70 dest="id_list",
71 parser.add_option("-s", "--suffix", dest="suffix", 71 default=None,
72 action="store_true", 72 help="Lists of white space separated IDs (instead of a tabular file)",
73 help="Ignore pair-read suffices for matching names") 73 )
74 parser.add_option("-v", "--version", dest="version", 74 parser.add_option(
75 default=False, action="store_true", 75 "-p",
76 help="Show version and quit") 76 "--positive",
77 dest="output_positive",
78 default=None,
79 help="Output filename for matches",
80 metavar="FILE",
81 )
82 parser.add_option(
83 "-n",
84 "--negative",
85 dest="output_negative",
86 default=None,
87 help="Output filename for non-matches",
88 metavar="FILE",
89 )
90 parser.add_option(
91 "-l",
92 "--logic",
93 dest="logic",
94 default="UNION",
95 help="How to combined multiple ID columns (UNION or INTERSECTION)",
96 )
97 parser.add_option(
98 "-s",
99 "--suffix",
100 dest="suffix",
101 action="store_true",
102 help="Ignore pair-read suffixes for matching names",
103 )
104 parser.add_option(
105 "-v",
106 "--version",
107 dest="version",
108 default=False,
109 action="store_true",
110 help="Show version and quit",
111 )
77 112
78 options, args = parser.parse_args() 113 options, args = parser.parse_args()
79 114
80 if options.version: 115 if options.version:
81 print("v0.2.7") 116 print("v0.2.7")
84 in_file = options.input 119 in_file = options.input
85 seq_format = options.format 120 seq_format = options.format
86 out_positive_file = options.output_positive 121 out_positive_file = options.output_positive
87 out_negative_file = options.output_negative 122 out_negative_file = options.output_negative
88 logic = options.logic 123 logic = options.logic
89 drop_suffices = bool(options.suffix) 124 drop_suffixes = bool(options.suffix)
90 125
91 if in_file is None or not os.path.isfile(in_file): 126 if in_file is None or not os.path.isfile(in_file):
92 sys.exit("Missing input file: %r" % in_file) 127 sys.exit("Missing input file: %r" % in_file)
93 if out_positive_file is None and out_negative_file is None: 128 if out_positive_file is None and out_negative_file is None:
94 sys.exit("Neither output file requested") 129 sys.exit("Neither output file requested")
130 if not os.path.isfile(tabular_file): 165 if not os.path.isfile(tabular_file):
131 sys.exit("Missing tabular identifier file %r" % tabular_file) 166 sys.exit("Missing tabular identifier file %r" % tabular_file)
132 try: 167 try:
133 columns = [int(arg) - 1 for arg in cols_arg.split(",")] 168 columns = [int(arg) - 1 for arg in cols_arg.split(",")]
134 except ValueError: 169 except ValueError:
135 sys.exit("Expected list of columns (comma separated integers), got %r" % cols_arg) 170 sys.exit(
171 "Expected list of columns (comma separated integers), got %r" % cols_arg
172 )
136 if min(columns) < 0: 173 if min(columns) < 0:
137 sys.exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg) 174 sys.exit(
175 "Expect one-based column numbers (not zero-based counting), got %r"
176 % cols_arg
177 )
138 identifiers.append((tabular_file, columns)) 178 identifiers.append((tabular_file, columns))
139 179
140 name_warn = False 180 name_warn = False
141 181
142 182
143 def check_white_space(name): 183 def check_white_space(name):
144 """Check identifier for white space, take first word only.""" 184 """Check identifier for white space, take first word only."""
145 parts = name.split(None, 1) 185 parts = name.split(None, 1)
146 global name_warn 186 global name_warn
147 if not name_warn and len(parts) > 1: 187 if not name_warn and len(parts) > 1:
148 name_warn = "WARNING: Some of your identifiers had white space in them, " + \ 188 name_warn = (
149 "using first word only. e.g.:\n%s\n" % name 189 "WARNING: Some of your identifiers had white space in them, "
190 + "using first word only. e.g.:\n%s\n" % name
191 )
150 return parts[0] 192 return parts[0]
151 193
152 194
153 if drop_suffices: 195 if drop_suffixes:
196
154 def clean_name(name): 197 def clean_name(name):
155 """Remove suffix.""" 198 """Remove suffix."""
156 name = check_white_space(name) 199 name = check_white_space(name)
157 match = re_suffix.search(name) 200 match = re_suffix.search(name)
158 if match: 201 if match:
159 # Use the fact this is a suffix, and regular expression will be 202 # Use the fact this is a suffix, and regular expression will be
160 # anchored to the end of the name: 203 # anchored to the end of the name:
161 return name[:match.start()] 204 return name[: match.start()]
162 else: 205 else:
163 # Nothing to do 206 # Nothing to do
164 return name 207 return name
208
165 assert clean_name("foo/1") == "foo" 209 assert clean_name("foo/1") == "foo"
166 assert clean_name("foo/2") == "foo" 210 assert clean_name("foo/2") == "foo"
167 assert clean_name("bar.f") == "bar" 211 assert clean_name("bar.f") == "bar"
168 assert clean_name("bar.r") == "bar" 212 assert clean_name("bar.r") == "bar"
169 assert clean_name("baz.p1") == "baz" 213 assert clean_name("baz.p1") == "baz"
172 # Just check the white space 216 # Just check the white space
173 clean_name = check_white_space 217 clean_name = check_white_space
174 218
175 219
176 mapped_chars = { 220 mapped_chars = {
177 '>': '__gt__', 221 ">": "__gt__",
178 '<': '__lt__', 222 "<": "__lt__",
179 "'": '__sq__', 223 "'": "__sq__",
180 '"': '__dq__', 224 '"': "__dq__",
181 '[': '__ob__', 225 "[": "__ob__",
182 ']': '__cb__', 226 "]": "__cb__",
183 '{': '__oc__', 227 "{": "__oc__",
184 '}': '__cc__', 228 "}": "__cc__",
185 '@': '__at__', 229 "@": "__at__",
186 '\n': '__cn__', 230 "\n": "__cn__",
187 '\r': '__cr__', 231 "\r": "__cr__",
188 '\t': '__tc__', 232 "\t": "__tc__",
189 '#': '__pd__', 233 "#": "__pd__",
190 } 234 }
191 235
192 # Read tabular file(s) and record all specified identifiers 236 # Read tabular file(s) and record all specified identifiers
193 ids = None # Will be a set 237 ids = None # Will be a set
194 if options.id_list: 238 if options.id_list:
223 continue 267 continue
224 if not line.startswith("#"): 268 if not line.startswith("#"):
225 name = clean_name(line.rstrip("\n").split("\t")[col]) 269 name = clean_name(line.rstrip("\n").split("\t")[col])
226 if name: 270 if name:
227 file_ids.add(name) 271 file_ids.add(name)
228 print("Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns))) 272 print(
273 "Using %i IDs from column %s in tabular file"
274 % (len(file_ids), ", ".join(str(col + 1) for col in columns))
275 )
229 if ids is None: 276 if ids is None:
230 ids = file_ids 277 ids = file_ids
231 if logic == "UNION": 278 if logic == "UNION":
232 ids.update(file_ids) 279 ids.update(file_ids)
233 else: 280 else:
234 ids.intersection_update(file_ids) 281 ids.intersection_update(file_ids)
235 handle.close() 282 handle.close()
236 if len(identifiers) > 1: 283 if len(identifiers) > 1:
237 if logic == "UNION": 284 if logic == "UNION":
238 print("Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers))) 285 print(
286 "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers))
287 )
239 else: 288 else:
240 print("Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers))) 289 print(
290 "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers))
291 )
241 if name_warn: 292 if name_warn:
242 sys.stderr.write(name_warn) 293 sys.stderr.write(name_warn)
243 294
244 295
245 def crude_fasta_iterator(handle): 296 def crude_fasta_iterator(handle):
246 """Yields tuples, record ID and the full record as a string.""" 297 """Parse FASTA file yielding tuples of (name, sequence)."""
247 while True: 298 while True:
248 line = handle.readline() 299 line = handle.readline()
249 if line == "": 300 if line == "":
250 return # Premature end of file, or just empty? 301 return # Premature end of file, or just empty?
251 if line[0] == ">": 302 if line[0] == ">":
252 break 303 break
253 304
254 no_id_warned = False 305 no_id_warned = False
255 while True: 306 while True:
256 if line[0] != ">": 307 if line[0] != ">":
257 raise ValueError( 308 raise ValueError("Records in Fasta files should start with '>' character")
258 "Records in Fasta files should start with '>' character")
259 try: 309 try:
260 id = line[1:].split(None, 1)[0] 310 id = line[1:].split(None, 1)[0]
261 except IndexError: 311 except IndexError:
262 if not no_id_warned: 312 if not no_id_warned:
263 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") 313 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n")
318 368
319 369
320 def fastq_filter(in_file, pos_file, neg_file, wanted): 370 def fastq_filter(in_file, pos_file, neg_file, wanted):
321 """FASTQ filter.""" 371 """FASTQ filter."""
322 from Bio.SeqIO.QualityIO import FastqGeneralIterator 372 from Bio.SeqIO.QualityIO import FastqGeneralIterator
373
323 handle = open(in_file, "r") 374 handle = open(in_file, "r")
324 if pos_file is not None and neg_file is not None: 375 if pos_file is not None and neg_file is not None:
325 print("Generating two FASTQ files") 376 print("Generating two FASTQ files")
326 positive_handle = open(pos_file, "w") 377 positive_handle = open(pos_file, "w")
327 negative_handle = open(neg_file, "w") 378 negative_handle = open(neg_file, "w")
376 pos_count = neg_count = 0 427 pos_count = neg_count = 0
377 if pos_file is not None: 428 if pos_file is not None:
378 out_handle = open(pos_file, "wb") 429 out_handle = open(pos_file, "wb")
379 writer = SffWriter(out_handle, xml=manifest) 430 writer = SffWriter(out_handle, xml=manifest)
380 in_handle.seek(0) # start again after getting manifest 431 in_handle.seek(0) # start again after getting manifest
381 pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted) 432 pos_count = writer.write_file(
433 rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted
434 )
382 out_handle.close() 435 out_handle.close()
383 if neg_file is not None: 436 if neg_file is not None:
384 out_handle = open(neg_file, "wb") 437 out_handle = open(neg_file, "wb")
385 writer = SffWriter(out_handle, xml=manifest) 438 writer = SffWriter(out_handle, xml=manifest)
386 in_handle.seek(0) # start again 439 in_handle.seek(0) # start again
387 neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted) 440 neg_count = writer.write_file(
441 rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted
442 )
388 out_handle.close() 443 out_handle.close()
389 # And we're done 444 # And we're done
390 in_handle.close() 445 in_handle.close()
391 # At the time of writing, Galaxy doesn't show SFF file read counts, 446 # At the time of writing, Galaxy doesn't show SFF file read counts,
392 # so it is useful to put them in stdout and thus shown in job info. 447 # so it is useful to put them in stdout and thus shown in job info.
393 return pos_count, neg_count 448 return pos_count, neg_count
394 449
395 450
396 if seq_format.lower() == "sff": 451 if seq_format.lower() == "sff":
397 # Now write filtered SFF file based on IDs wanted 452 # Now write filtered SFF file based on IDs wanted
398 pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids) 453 pos_count, neg_count = sff_filter(
454 in_file, out_positive_file, out_negative_file, ids
455 )
399 # At the time of writing, Galaxy doesn't show SFF file read counts, 456 # At the time of writing, Galaxy doesn't show SFF file read counts,
400 # so it is useful to put them in stdout and thus shown in job info. 457 # so it is useful to put them in stdout and thus shown in job info.
401 elif seq_format.lower() == "fasta": 458 elif seq_format.lower() == "fasta":
402 # Write filtered FASTA file based on IDs from tabular file 459 # Write filtered FASTA file based on IDs from tabular file
403 pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) 460 pos_count, neg_count = fasta_filter(
461 in_file, out_positive_file, out_negative_file, ids
462 )
404 print("%i with and %i without specified IDs" % (pos_count, neg_count)) 463 print("%i with and %i without specified IDs" % (pos_count, neg_count))
405 elif seq_format.lower().startswith("fastq"): 464 elif seq_format.lower().startswith("fastq"):
406 # Write filtered FASTQ file based on IDs from tabular file 465 # Write filtered FASTQ file based on IDs from tabular file
407 fastq_filter(in_file, out_positive_file, out_negative_file, ids) 466 fastq_filter(in_file, out_positive_file, out_negative_file, ids)
408 # This does not currently track the counts 467 # This does not currently track the counts