davidvanzessen/change_o: ParseDb.py @ 0:183edf446dcf (draft default tip)
author: davidvanzessen
date: Mon, 17 Jul 2017 07:44:27 -0400
commit message: Uploaded
#!/usr/bin/env python3
"""
Parses tab delimited database files
"""
# Info
__author__ = 'Jason Anthony Vander Heiden'
from changeo import __version__, __date__

# Imports
import csv
import os
import re
from argparse import ArgumentParser
from collections import OrderedDict

from textwrap import dedent
from time import time
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

# Presto and changeo imports
from presto.Defaults import default_delimiter, default_out_args
from presto.Annotation import flattenAnnotation
from presto.IO import getOutputHandle, printLog, printProgress, printMessage
from changeo.Defaults import default_csv_size
from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
from changeo.IO import getDbWriter, readDbFile, countDbFile

# System settings
csv.field_size_limit(default_csv_size)

# Defaults
default_id_field = 'SEQUENCE_ID'
default_seq_field = 'SEQUENCE_IMGT'
default_germ_field = 'GERMLINE_IMGT_D_MASK'
default_index_field = 'INDEX'

# TODO: convert SQL-ish operations to modify_func() as per ParseHeaders

def getDbSeqRecord(db_record, id_field, seq_field, meta_fields=None,
                   delimiter=default_delimiter):
    """
    Parses a database record into a SeqRecord

    Arguments:
    db_record = a dictionary containing a database record
    id_field = the field containing identifiers
    seq_field = the field containing sequences
    meta_fields = a list of fields to add to sequence annotations
    delimiter = a tuple of delimiters for (fields, values, value lists)

    Returns:
    a SeqRecord
    """
    # Return None if ID or sequence fields are empty
    if not db_record[id_field] or not db_record[seq_field]:
        return None

    # Create description string
    desc_dict = OrderedDict([('ID', db_record[id_field])])
    if meta_fields is not None:
        desc_dict.update([(f, db_record[f]) for f in meta_fields if f in db_record])
    desc_str = flattenAnnotation(desc_dict, delimiter=delimiter)

    # Create SeqRecord
    seq_record = SeqRecord(Seq(db_record[seq_field], IUPAC.ambiguous_dna),
                           id=desc_str, name=desc_str, description='')

    return seq_record

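# Example of what getDbSeqRecord returns (hypothetical field values; the default
# presto delimiters assumed): meta_fields are flattened into the record id, so
#   rec = {'SEQUENCE_ID': 'seq1', 'SEQUENCE_IMGT': 'NNACGT', 'V_CALL': 'IGHV1-2*02'}
#   getDbSeqRecord(rec, 'SEQUENCE_ID', 'SEQUENCE_IMGT', meta_fields=['V_CALL'])
# yields a SeqRecord with id roughly 'seq1|V_CALL=IGHV1-2*02'.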

def splitDbFile(db_file, field, num_split=None, out_args=default_out_args):
    """
    Divides a tab-delimited database file into segments by values in a field

    Arguments:
    db_file = filename of the tab-delimited database file to split
    field = the field name by which to split db_file
    num_split = the numerical threshold by which to group sequences;
                if None treat field as textual
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    a list of output file names
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'split'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUM_SPLIT'] = num_split
    printLog(log)

    # Open reader over the database records
    reader = readDbFile(db_file, ig=False)

    # Determine total number of records
    rec_count = countDbFile(db_file)

    start_time = time()
    count = 0
    # Sort records into files based on textual field
    if num_split is None:
        # Create set of unique field tags
        tmp_iter = readDbFile(db_file, ig=False)
        tag_list = list(set([row[field] for row in tmp_iter]))

        # Forbidden characters in filenames and their replacements
        noGood = {'/': 'f', '\\': 'b', '?': 'q', '%': 'p', '*': 's', ':': 'c',
                  '|': 'pi', '"': 'dq', '\'': 'sq', '<': 'lt', '>': 'gt', ' ': '_'}
        # Replace forbidden characters in tag_list
        tag_dict = {}
        for tag in tag_list:
            for c, r in noGood.items():
                tag_dict[tag] = (tag_dict.get(tag, tag).replace(c, r)
                                 if c in tag else tag_dict.get(tag, tag))

        # Create output handles
        handles_dict = {tag: getOutputHandle(db_file,
                                             '%s-%s' % (field, label),
                                             out_type=out_args['out_type'],
                                             out_name=out_args['out_name'],
                                             out_dir=out_args['out_dir'])
                        for tag, label in tag_dict.items()}

        # Create Db writer instances
        writers_dict = {tag: getDbWriter(handles_dict[tag], db_file)
                        for tag in tag_dict}

        # Iterate over records
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            # Write row to appropriate file
            tag = row[field]
            writers_dict[tag].writerow(row)

    # Sort records into files based on numeric num_split
    else:
        num_split = float(num_split)

        # Create output handles
        handles_dict = {'under': getOutputHandle(db_file,
                                                 'under-%.1f' % num_split,
                                                 out_type=out_args['out_type'],
                                                 out_name=out_args['out_name'],
                                                 out_dir=out_args['out_dir']),
                        'atleast': getOutputHandle(db_file,
                                                   'atleast-%.1f' % num_split,
                                                   out_type=out_args['out_type'],
                                                   out_name=out_args['out_name'],
                                                   out_dir=out_args['out_dir'])}

        # Create Db writer instances
        writers_dict = {'under': getDbWriter(handles_dict['under'], db_file),
                        'atleast': getDbWriter(handles_dict['atleast'], db_file)}

        # Iterate over records
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            tag = row[field]
            tag = 'under' if float(tag) < num_split else 'atleast'
            writers_dict[tag].writerow(row)

    # Write log
    printProgress(count, rec_count, 0.05, start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['RECORDS'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'ParseDb'
    printLog(log)

    # Close output file handles
    for t in handles_dict: handles_dict[t].close()

    return [handles_dict[t].name for t in handles_dict]

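# Example of the output naming (hypothetical inputs; presto's default
# '<base>_<label>.<type>' file naming assumed): a textual split writes one file
# per unique field value, a numeric split writes exactly two files:
#   splitDbFile('db.tab', 'PRCONS')                 -> db_PRCONS-IgG.tab, db_PRCONS-IgM.tab, ...
#   splitDbFile('db.tab', 'DUPCOUNT', num_split=2)  -> db_under-2.0.tab, db_atleast-2.0.tab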

# TODO: SHOULD ALLOW FOR UNSORTED CLUSTER COLUMN
# TODO: SHOULD ALLOW FOR GROUPING FIELDS
def convertDbBaseline(db_file, id_field=default_id_field, seq_field=default_seq_field,
                      germ_field=default_germ_field, cluster_field=None,
                      meta_fields=None, out_args=default_out_args):
    """
    Builds a BASELINe fasta file from database records

    Arguments:
    db_file = the database file name
    id_field = the field containing identifiers
    seq_field = the field containing sample sequences
    germ_field = the field containing germline sequences
    cluster_field = the field containing clonal groupings;
                    if None write the germline for each record
    meta_fields = a list of fields to add to sequence annotations
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'baseline'
    log['FILE'] = os.path.basename(db_file)
    log['ID_FIELD'] = id_field
    log['SEQ_FIELD'] = seq_field
    log['GERM_FIELD'] = germ_field
    log['CLUSTER_FIELD'] = cluster_field
    if meta_fields is not None: log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='clip')
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = germ_count = pass_count = fail_count = 0
    cluster_last = None
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Update cluster ID
        cluster = rec.get(cluster_field, None)

        # Get germline SeqRecord when needed
        if cluster_field is None:
            germ = getDbSeqRecord(rec, id_field, germ_field, meta_fields,
                                  delimiter=out_args['delimiter'])
            germ.id = '>' + germ.id
        elif cluster != cluster_last:
            germ = getDbSeqRecord(rec, cluster_field, germ_field,
                                  delimiter=out_args['delimiter'])
            germ.id = '>' + germ.id
        else:
            germ = None

        # Get read SeqRecord
        seq = getDbSeqRecord(rec, id_field, seq_field, meta_fields,
                             delimiter=out_args['delimiter'])

        # Write germline
        if germ is not None:
            germ_count += 1
            SeqIO.write(germ, pass_handle, 'fasta')

        # Write sequences
        if seq is not None:
            pass_count += 1
            SeqIO.write(seq, pass_handle, 'fasta')
        else:
            fail_count += 1

        # Set last cluster ID
        cluster_last = cluster

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['GERMLINES'] = germ_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name

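# Sketch of the BASELINe layout this writes (hypothetical ids), assuming the
# records arrive sorted by the cluster field: each clone's germline header gets
# an extra '>' prepended, so SeqIO emits it as '>>', followed by the members:
#   >>42
#   NNNACGTGG...
#   >seq1|DUPCOUNT=3
#   NNAACGTGG...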

def convertDbFasta(db_file, id_field=default_id_field, seq_field=default_seq_field,
                   meta_fields=None, out_args=default_out_args):
    """
    Builds fasta files from database records

    Arguments:
    db_file = the database file name
    id_field = the field containing identifiers
    seq_field = the field containing sequences
    meta_fields = a list of fields to add to sequence annotations
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'fasta'
    log['FILE'] = os.path.basename(db_file)
    log['ID_FIELD'] = id_field
    log['SEQ_FIELD'] = seq_field
    if meta_fields is not None: log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)

    # Open file handles
    out_type = 'fasta'
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type=out_type)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Get SeqRecord
        seq = getDbSeqRecord(rec, id_field, seq_field, meta_fields, out_args['delimiter'])

        # Write sequences
        if seq is not None:
            pass_count += 1
            SeqIO.write(seq, pass_handle, out_type)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name

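# Example (hypothetical path; presto's default naming assumed, which would give
# 'db_sequences.fasta'): export the IMGT-gapped sequences with the duplicate
# count carried into each fasta header:
#   convertDbFasta('db.tab', meta_fields=['DUPCOUNT'])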

def addDbFile(db_file, fields, values, out_args=default_out_args):
    """
    Adds field and value pairs to a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to add
    values = a list of values to assign to all rows of each field
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'add'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-add', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, add_fields=fields)
    # Count records
    result_count = countDbFile(db_file)

    # Define fields and values to append
    add_dict = {k: v for k, v in zip(fields, values) if k not in db_iter.fieldnames}

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # Write updated row
        rec.update(add_dict)
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name

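# Example (hypothetical values): stamp constant annotations onto every row;
# fields already present in the input header are filtered out of add_dict above
# and therefore left untouched:
#   addDbFile('db.tab', fields=['SAMPLE', 'TISSUE'], values=['S1', 'PBMC'])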

def indexDbFile(db_file, field=default_index_field, out_args=default_out_args):
    """
    Adds an index column to a database file

    Arguments:
    db_file = the database file name
    field = the name of the index field to add
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'index'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-index', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, add_fields=field)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Add count and write updated row
        rec.update({field: rec_count})
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name


def dropDbFile(db_file, fields, out_args=default_out_args):
    """
    Deletes entire fields from a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to drop
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'drop'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-drop', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, exclude_fields=fields)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # Write row
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name


def deleteDbFile(db_file, fields, values, logic='any', regex=False,
                 out_args=default_out_args):
    """
    Deletes records from a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to check for deletion criteria
    values = a list of values defining deletion targets
    logic = one of 'any' or 'all' defining whether one or all fields must have a match.
    regex = if False do exact full string matches; if True allow partial regex matches.
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    # Define string match function
    if regex:
        def _match_func(x, patterns): return any([re.search(p, x) for p in patterns])
    else:
        def _match_func(x, patterns): return x in patterns

    # Define logic function
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all

    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'delete'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-delete', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Check for deletion values in the specified fields
        delete = _logic_func([_match_func(rec.get(f, False), values) for f in fields])

        # Write records that do not match the deletion criteria
        if not delete:
            pass_count += 1
            pass_writer.writerow(rec)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['KEPT'] = pass_count
    log['DELETED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name

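# Example of the matching semantics (hypothetical file): with the defaults
# (logic='any', regex=False) a row is deleted when any listed field exactly
# equals any listed value, so
#   deleteDbFile('db.tab', fields=['PRCONS', 'CPRIMER'], values=['', 'NA'])
# drops rows where either primer annotation is blank or 'NA'. logic='all'
# requires every field to match, and regex=True switches to re.search() so
# values act as partial-match patterns.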

def renameDbFile(db_file, fields, names, out_args=default_out_args):
    """
    Renames fields in a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to rename
    names = a list of new names for the fields
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'rename'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['NAMES'] = ','.join(names)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-rename', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')

    # Get header and rename fields
    header = readDbFile(db_file, ig=False).fieldnames
    for f, n in zip(fields, names):
        i = header.index(f)
        header[i] = n

    # Open writer and write new header
    # TODO: should modify getDbWriter to take a list of fields
    pass_writer = csv.DictWriter(pass_handle, fieldnames=header, dialect='excel-tab')
    pass_writer.writeheader()

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # TODO: repeating the renaming on every row is unnecessary; should add a non-dict reader/writer to DbCore
        # Rename fields
        for f, n in zip(fields, names):
            rec[n] = rec.pop(f)
        # Write
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name

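# Example (hypothetical column names): fields and names are parallel lists, one
# new name per renamed field:
#   renameDbFile('db.tab', fields=['PRCONS', 'BARCODE'], names=['C_CALL', 'UMI'])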

def selectDbFile(db_file, fields, values, logic='any', regex=False,
                 out_args=default_out_args):
    """
    Selects records from a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to check for selection criteria
    values = a list of values defining selection targets
    logic = one of 'any' or 'all' defining whether one or all fields must have a match.
    regex = if False do exact full string matches; if True allow partial regex matches.
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    # Define string match function
    if regex:
        def _match_func(x, patterns): return any([re.search(p, x) for p in patterns])
    else:
        def _match_func(x, patterns): return x in patterns

    # Define logic function
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all

    # Print console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'select'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    log['REGEX'] = regex
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-select', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Check for selection values in the specified fields
        select = _logic_func([_match_func(rec.get(f, False), values) for f in fields])

        # Write records that match the selection criteria
        if select:
            pass_count += 1
            pass_writer.writerow(rec)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['SELECTED'] = pass_count
    log['DISCARDED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name


def sortDbFile(db_file, field, numeric=False, descend=False,
               out_args=default_out_args):
    """
    Sorts records by values in an annotation field

    Arguments:
    db_file = the database filename
    field = the field name to sort by
    numeric = if True sort field numerically;
              if False sort field alphabetically
    descend = if True sort in descending order;
              if False sort in ascending order
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'sort'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUMERIC'] = numeric
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-sort', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)

    # Store all records in a dictionary
    start_time = time()
    printMessage("Indexing: Running", start_time=start_time)
    db_dict = {i: r for i, r in enumerate(db_iter)}
    result_count = len(db_dict)

    # Sort db_dict by field values
    tag_dict = {k: v[field] for k, v in db_dict.items()}
    if numeric: tag_dict = {k: float(v or 0) for k, v in tag_dict.items()}
    sorted_keys = sorted(tag_dict, key=tag_dict.get, reverse=descend)
    printMessage("Indexing: Done", start_time=start_time, end=True)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for key in sorted_keys:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Write records
        pass_writer.writerow(db_dict[key])

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name

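# Example: textual and numeric orderings differ ('10' < '9' as strings but
# 10 > 9 as numbers), and empty values sort as 0 under numeric=True. Note that
# the whole file is held in memory while sorting. Hypothetical call:
#   sortDbFile('db.tab', 'DUPCOUNT', numeric=True, descend=True)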

def updateDbFile(db_file, field, values, updates, out_args=default_out_args):
    """
    Updates values in a field of a database file

    Arguments:
    db_file = the database file name
    field = the field to update
    values = a list of values specifying which rows to update
    updates = a list of values to update each value with
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'update'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['VALUES'] = ','.join(values)
    log['UPDATES'] = ','.join(updates)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-update', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Update value if found
        for x, y in zip(values, updates):
            if rec[field] == x:
                rec[field] = y
                pass_count += 1

        # Write records
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['UPDATED'] = pass_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name

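# Example (hypothetical values): values and updates are parallel lists of
# old -> new replacements applied to a single field:
#   updateDbFile('db.tab', 'SAMPLE', values=['S1', 'S2'], updates=['donor1', 'donor2'])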

def getArgParser():
    """
    Defines the ArgumentParser

    Arguments:
    None

    Returns:
    an ArgumentParser object
    """
    # Define input and output field help message
    fields = dedent(
             '''
             output files:
                 sequences
                     FASTA formatted sequences output from the subcommands fasta and baseline.
                 <field>-<value>
                     database files partitioned by annotation <field> and <value>.
                 parse-<command>
                     output of the database modification functions where <command> is one of
                     the subcommands add, index, drop, delete, rename, select, sort or update.

             required fields:
                 SEQUENCE_ID

             optional fields:
                 JUNCTION, SEQUENCE_IMGT, SEQUENCE_VDJ, GERMLINE_IMGT, GERMLINE_VDJ,
                 GERMLINE_IMGT_D_MASK, GERMLINE_VDJ_D_MASK,
                 GERMLINE_IMGT_V_REGION, GERMLINE_VDJ_V_REGION

             output fields:
                 None
             ''')

    # Define ArgumentParser
    parser = ArgumentParser(description=__doc__, epilog=fields,
                            formatter_class=CommonHelpFormatter)
    parser.add_argument('--version', action='version',
                        version='%(prog)s:' + ' %s-%s' % (__version__, __date__))
    subparsers = parser.add_subparsers(title='subcommands', dest='command', metavar='',
                                       help='Database operation')
    # TODO: This is a temporary fix for Python issue 9253
    subparsers.required = True

    # Define parent parser
    parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True,
                                       failed=False, log=False)

    # Subparser to convert database entries to sequence file
    parser_seq = subparsers.add_parser('fasta', parents=[parser_parent],
                                       formatter_class=CommonHelpFormatter,
                                       help='Creates a fasta file from database records.',
                                       description='Creates a fasta file from database records.')
    parser_seq.add_argument('--if', action='store', dest='id_field',
                            default=default_id_field,
                            help='The name of the field containing identifiers')
    parser_seq.add_argument('--sf', action='store', dest='seq_field',
                            default=default_seq_field,
                            help='The name of the field containing sequences')
    parser_seq.add_argument('--mf', nargs='+', action='store', dest='meta_fields',
                            help='List of annotation fields to add to the sequence description')
    parser_seq.set_defaults(func=convertDbFasta)

    # Subparser to convert database entries to a BASELINe fasta file
    parser_baseln = subparsers.add_parser('baseline', parents=[parser_parent],
                                          formatter_class=CommonHelpFormatter,
                                          description='Creates a BASELINe fasta file from database records.',
                                          help='''Creates a specially formatted fasta file
                                               from database records for input into the BASELINe
                                               website. The format groups clonally related sequences
                                               sequentially, with the germline sequence preceding
                                               each clone and denoted by headers starting with ">>".''')
    parser_baseln.add_argument('--if', action='store', dest='id_field',
                               default=default_id_field,
                               help='The name of the field containing identifiers')
    parser_baseln.add_argument('--sf', action='store', dest='seq_field',
                               default=default_seq_field,
                               help='The name of the field containing reads')
    parser_baseln.add_argument('--gf', action='store', dest='germ_field',
                               default=default_germ_field,
                               help='The name of the field containing germline sequences')
    parser_baseln.add_argument('--cf', action='store', dest='cluster_field', default=None,
                               help='The name of the field containing sorted clone IDs')
    parser_baseln.add_argument('--mf', nargs='+', action='store', dest='meta_fields',
                               help='List of annotation fields to add to the sequence description')
    parser_baseln.set_defaults(func=convertDbBaseline)

    # Subparser to partition files by annotation values
    parser_split = subparsers.add_parser('split', parents=[parser_parent],
                                         formatter_class=CommonHelpFormatter,
                                         help='Splits database files by field values.',
                                         description='Splits database files by field values.')
    parser_split.add_argument('-f', action='store', dest='field', type=str, required=True,
                              help='Annotation field by which to split database files.')
    parser_split.add_argument('--num', action='store', dest='num_split', type=float, default=None,
                              help='''Specify to define the field as numeric and group
                                   records by whether they are less than or at least
                                   (greater than or equal to) the specified value.''')
    parser_split.set_defaults(func=splitDbFile)

    # Subparser to add fields
    parser_add = subparsers.add_parser('add', parents=[parser_parent],
                                       formatter_class=CommonHelpFormatter,
                                       help='Adds field and value pairs.',
                                       description='Adds field and value pairs.')
    parser_add.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
                            help='The name of the fields to add.')
    parser_add.add_argument('-u', nargs='+', action='store', dest='values', required=True,
                            help='The value to assign to all rows for each field.')
    parser_add.set_defaults(func=addDbFile)

    # Subparser to delete records
    parser_delete = subparsers.add_parser('delete', parents=[parser_parent],
                                          formatter_class=CommonHelpFormatter,
                                          help='Deletes specific records.',
                                          description='Deletes specific records.')
    parser_delete.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
                               help='The name of the fields to check for deletion criteria.')
    parser_delete.add_argument('-u', nargs='+', action='store', dest='values', default=['', 'NA'],
                               help='''The values defining which records to delete. A value
                                    may appear in any of the fields specified with -f.''')
    parser_delete.add_argument('--logic', action='store', dest='logic',
                               choices=('any', 'all'), default='any',
                               help='''Defines whether a value may appear in any field (any)
                                    or whether it must appear in all fields (all).''')
    parser_delete.add_argument('--regex', action='store_true', dest='regex',
                               help='''If specified, treat values as regular expressions
                                    and allow partial string matches.''')
    parser_delete.set_defaults(func=deleteDbFile)

    # Subparser to drop fields
    parser_drop = subparsers.add_parser('drop', parents=[parser_parent],
                                        formatter_class=CommonHelpFormatter,
                                        help='Deletes entire fields.',
                                        description='Deletes entire fields.')
    parser_drop.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
                             help='The name of the fields to delete from the database.')
    parser_drop.set_defaults(func=dropDbFile)

    # Subparser to index fields
    parser_index = subparsers.add_parser('index', parents=[parser_parent],
                                         formatter_class=CommonHelpFormatter,
                                         help='Adds a numeric index field.',
                                         description='Adds a numeric index field.')
    parser_index.add_argument('-f', action='store', dest='field',
                              default=default_index_field,
                              help='The name of the index field to add to the database.')
    parser_index.set_defaults(func=indexDbFile)

    # Subparser to rename fields
    parser_rename = subparsers.add_parser('rename', parents=[parser_parent],
                                          formatter_class=CommonHelpFormatter,
                                          help='Renames fields.',
                                          description='Renames fields.')
    parser_rename.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
                               help='List of fields to rename.')
    parser_rename.add_argument('-k', nargs='+', action='store', dest='names', required=True,
                               help='List of new names for each field.')
    parser_rename.set_defaults(func=renameDbFile)

    # Subparser to select records
    parser_select = subparsers.add_parser('select', parents=[parser_parent],
                                          formatter_class=CommonHelpFormatter,
                                          help='Selects specific records.',
                                          description='Selects specific records.')
    parser_select.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
                               help='The name of the fields to check for selection criteria.')
    parser_select.add_argument('-u', nargs='+', action='store', dest='values', required=True,
                               help='''The values defining which records to select. A value
                                    may appear in any of the fields specified with -f.''')
    parser_select.add_argument('--logic', action='store', dest='logic',
                               choices=('any', 'all'), default='any',
                               help='''Defines whether a value may appear in any field (any)
                                    or whether it must appear in all fields (all).''')
    parser_select.add_argument('--regex', action='store_true', dest='regex',
                               help='''If specified, treat values as regular expressions
                                    and allow partial string matches.''')
    parser_select.set_defaults(func=selectDbFile)

    # Subparser to sort records by field values
    parser_sort = subparsers.add_parser('sort', parents=[parser_parent],
                                        formatter_class=CommonHelpFormatter,
                                        help='Sorts records by field values.',
                                        description='Sorts records by field values.')
    parser_sort.add_argument('-f', action='store', dest='field', type=str, required=True,
                             help='The annotation field by which to sort records.')
    parser_sort.add_argument('--num', action='store_true', dest='numeric', default=False,
                             help='''Specify to define the sort column as numeric rather
                                  than textual.''')
    parser_sort.add_argument('--descend', action='store_true', dest='descend',
                             help='''If specified, sort records in descending, rather
                                  than ascending, order by values in the target field.''')
    parser_sort.set_defaults(func=sortDbFile)

    # Subparser to update records
    parser_update = subparsers.add_parser('update', parents=[parser_parent],
                                          formatter_class=CommonHelpFormatter,
                                          help='Updates field and value pairs.',
                                          description='Updates field and value pairs.')
    parser_update.add_argument('-f', action='store', dest='field', required=True,
                               help='The name of the field to update.')
    parser_update.add_argument('-u', nargs='+', action='store', dest='values', required=True,
                               help='The values that will be replaced.')
    parser_update.add_argument('-t', nargs='+', action='store', dest='updates', required=True,
                               help='''The new values to assign, one per value specified
                                    with -u.''')
    parser_update.set_defaults(func=updateDbFile)

    return parser


if __name__ == '__main__':
    """
    Parses command line arguments and calls main function
    """
    # Parse arguments
    parser = getArgParser()
    checkArgs(parser)
    args = parser.parse_args()
    args_dict = parseCommonArgs(args)
    # Convert case of fields
    if 'id_field' in args_dict:
        args_dict['id_field'] = args_dict['id_field'].upper()
    if 'seq_field' in args_dict:
        args_dict['seq_field'] = args_dict['seq_field'].upper()
    if 'germ_field' in args_dict:
        args_dict['germ_field'] = args_dict['germ_field'].upper()
    if 'field' in args_dict:
        args_dict['field'] = args_dict['field'].upper()
    if 'cluster_field' in args_dict and args_dict['cluster_field'] is not None:
        args_dict['cluster_field'] = args_dict['cluster_field'].upper()
    if 'meta_fields' in args_dict and args_dict['meta_fields'] is not None:
        args_dict['meta_fields'] = [f.upper() for f in args_dict['meta_fields']]
    if 'fields' in args_dict:
        args_dict['fields'] = [f.upper() for f in args_dict['fields']]

    # Check that paired arguments match in length
    if args.command == 'add' and len(args_dict['fields']) != len(args_dict['values']):
        parser.error('You must specify exactly one value (-u) per field (-f)')
    elif args.command == 'rename' and len(args_dict['fields']) != len(args_dict['names']):
        parser.error('You must specify exactly one new name (-k) per field (-f)')
    elif args.command == 'update' and len(args_dict['values']) != len(args_dict['updates']):
        parser.error('You must specify exactly one value (-u) per replacement (-t)')

    # Call parser function for each database file
    del args_dict['command']
    del args_dict['func']
    del args_dict['db_files']
    for f in args.__dict__['db_files']:
        args_dict['db_file'] = f
        args.func(**args_dict)
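
# Example command lines (hypothetical input files; the -d flag comes from the
# common changeo argument parser):
#   ParseDb.py fasta -d db.tab --if SEQUENCE_ID --sf SEQUENCE_IMGT --mf DUPCOUNT
#   ParseDb.py split -d db.tab -f PRCONS
#   ParseDb.py select -d db.tab -f V_CALL -u IGHV1 --regex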
1119 |