comparison ParseDb.py @ 0:183edf446dcf draft default tip

Uploaded
author davidvanzessen
date Mon, 17 Jul 2017 07:44:27 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:183edf446dcf
1 #!/usr/bin/env python3
2 """
3 Parses tab delimited database files
4 """
5 # Info
6 __author__ = 'Jason Anthony Vander Heiden'
7 from changeo import __version__, __date__
8
9 # Imports
10 import csv
11 import os
12 import re
13 from argparse import ArgumentParser
14 from collections import OrderedDict
15
16 from textwrap import dedent
17 from time import time
18 from Bio import SeqIO
19 from Bio.Seq import Seq
20 from Bio.SeqRecord import SeqRecord
21 from Bio.Alphabet import IUPAC
22
23 # Presto and changeo imports
24 from presto.Defaults import default_delimiter, default_out_args
25 from presto.Annotation import flattenAnnotation
26 from presto.IO import getOutputHandle, printLog, printProgress, printMessage
27 from changeo.Defaults import default_csv_size
28 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
29 from changeo.IO import getDbWriter, readDbFile, countDbFile
30
31 # System settings
32 csv.field_size_limit(default_csv_size)
33
34 # Defaults
35 default_id_field = 'SEQUENCE_ID'
36 default_seq_field = 'SEQUENCE_IMGT'
37 default_germ_field = 'GERMLINE_IMGT_D_MASK'
38 default_index_field = 'INDEX'
39
40 # TODO: convert SQL-ish operations to modify_func() as per ParseHeaders
41
def getDbSeqRecord(db_record, id_field, seq_field, meta_fields=None,
                   delimiter=default_delimiter):
    """
    Converts a database record into a SeqRecord

    Arguments:
      db_record = a dictionary containing a database record
      id_field = the field containing identifiers
      seq_field = the field containing sequences
      meta_fields = a list of fields to add to sequence annotations
      delimiter = a tuple of delimiters for (fields, values, value lists)

    Returns:
      a SeqRecord, or None if the ID or sequence field is empty
    """
    # Skip records with a missing identifier or sequence
    if not (db_record[id_field] and db_record[seq_field]):
        return None

    # Build the annotation dictionary and flatten it into a description string
    annotations = OrderedDict()
    annotations['ID'] = db_record[id_field]
    if meta_fields is not None:
        for meta in meta_fields:
            if meta in db_record:
                annotations[meta] = db_record[meta]
    description = flattenAnnotation(annotations, delimiter=delimiter)

    # Assemble the SeqRecord using the flattened annotation as its ID
    return SeqRecord(Seq(db_record[seq_field], IUPAC.ambiguous_dna),
                     id=description, name=description, description='')
72
73
def splitDbFile(db_file, field, num_split=None, out_args=default_out_args):
    """
    Divides a tab-delimited database file into segments by description tags

    Arguments:
      db_file = filename of the tab-delimited database file to split
      field = the field name by which to split db_file
      num_split = the numerical threshold by which to group sequences;
                  if None treat field as textual
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      a list of output file names
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'split'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUM_SPLIT'] = num_split
    printLog(log)

    # Open IgRecord reader iter object
    reader = readDbFile(db_file, ig=False)

    # Determine total numbers of records
    rec_count = countDbFile(db_file)

    start_time = time()
    count = 0
    # Sort records into files based on textual field
    if num_split is None:
        # Create set of unique field tags
        tmp_iter = readDbFile(db_file, ig=False)
        tag_list = list(set([row[field] for row in tmp_iter]))

        # Forbidden characters in filenames and their replacements.
        # FIX: the original map used regex-style escapes ('\/', '\%', '\|', '\"')
        # as str.replace targets; those two-character sequences never occur in a
        # tag, so '/', '%', '|' and '"' were never actually replaced. Plain
        # characters are used here so the substitution works as intended.
        noGood = {'/': 'f', '\\': 'b', '?': 'q', '%': 'p', '*': 's', ':': 'c',
                  '|': 'pi', '"': 'dq', '\'': 'sq', '<': 'gt', '>': 'lt', ' ': '_'}
        # Replace forbidden characters in each tag to build a safe file label
        tag_dict = {}
        for tag in tag_list:
            label = tag
            for c, r in noGood.items():
                label = label.replace(c, r)
            tag_dict[tag] = label

        # Create output handles
        handles_dict = {tag: getOutputHandle(db_file,
                                             '%s-%s' % (field, label),
                                             out_type=out_args['out_type'],
                                             out_name=out_args['out_name'],
                                             out_dir=out_args['out_dir'])
                        for tag, label in tag_dict.items()}

        # Create Db writer instances
        writers_dict = {tag: getDbWriter(handles_dict[tag], db_file)
                        for tag in tag_dict}

        # Iterate over IgRecords
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            # Write row to appropriate file
            tag = row[field]
            writers_dict[tag].writerow(row)

    # Sort records into files based on numeric num_split
    else:
        num_split = float(num_split)

        # Create output handles
        handles_dict = {'under': getOutputHandle(db_file,
                                                 'under-%.1f' % num_split,
                                                 out_type=out_args['out_type'],
                                                 out_name=out_args['out_name'],
                                                 out_dir=out_args['out_dir']),
                        'atleast': getOutputHandle(db_file,
                                                   'atleast-%.1f' % num_split,
                                                   out_type=out_args['out_type'],
                                                   out_name=out_args['out_name'],
                                                   out_dir=out_args['out_dir'])}

        # Create Db writer instances
        writers_dict = {'under': getDbWriter(handles_dict['under'], db_file),
                        'atleast': getDbWriter(handles_dict['atleast'], db_file)}

        # Iterate over IgRecords
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            # Route each record by comparing its field value to the threshold
            tag = 'under' if float(row[field]) < num_split else 'atleast'
            writers_dict[tag].writerow(row)

    # Write log
    printProgress(count, rec_count, 0.05, start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['RECORDS'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'ParseDb'
    printLog(log)

    # Close output file handles
    for t in handles_dict:
        handles_dict[t].close()

    return [handles_dict[t].name for t in handles_dict]
182
183
184 # TODO: SHOULD ALLOW FOR UNSORTED CLUSTER COLUMN
185 # TODO: SHOULD ALLOW FOR GROUPING FIELDS
def convertDbBaseline(db_file, id_field=default_id_field, seq_field=default_seq_field,
                      germ_field=default_germ_field, cluster_field=None,
                      meta_fields=None, out_args=default_out_args):
    """
    Builds BASELINe formatted fasta files from database records

    Arguments:
      db_file = the database file name
      id_field = the field containing identifiers
      seq_field = the field containing sample sequences
      germ_field = the field containing germline sequences
      cluster_field = the field containing clonal groupings;
                      if None write the germline for each record
      meta_fields = a list of fields to add to sequence annotations
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'fasta'
    log['FILE'] = os.path.basename(db_file)
    log['ID_FIELD'] = id_field
    log['SEQ_FIELD'] = seq_field
    log['GERM_FIELD'] = germ_field
    log['CLUSTER_FIELD'] = cluster_field
    if meta_fields is not None: log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='clip')
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = germ_count = pass_count = fail_count = 0
    cluster_last = None
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Update cluster ID
        cluster = rec.get(cluster_field, None)

        # Get germline SeqRecord when needed
        if cluster_field is None:
            germ = getDbSeqRecord(rec, id_field, germ_field, meta_fields,
                                  delimiter=out_args['delimiter'])
        elif cluster != cluster_last:
            germ = getDbSeqRecord(rec, cluster_field, germ_field,
                                  delimiter=out_args['delimiter'])
        else:
            germ = None

        # Prefix the germline ID so its fasta header begins with '>>' (the
        # BASELINe germline marker). FIX: getDbSeqRecord returns None when the
        # ID or sequence field is empty; the original code dereferenced it
        # unconditionally and raised AttributeError for such records.
        if germ is not None:
            germ.id = '>' + germ.id

        # Get read SeqRecord
        seq = getDbSeqRecord(rec, id_field, seq_field, meta_fields,
                             delimiter=out_args['delimiter'])

        # Write germline
        if germ is not None:
            germ_count += 1
            SeqIO.write(germ, pass_handle, 'fasta')

        # Write sequences
        if seq is not None:
            pass_count += 1
            SeqIO.write(seq, pass_handle, 'fasta')
        else:
            fail_count += 1

        # Set last cluster ID
        cluster_last = cluster

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['GERMLINES'] = germ_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
281
282
def convertDbFasta(db_file, id_field=default_id_field, seq_field=default_seq_field,
                   meta_fields=None, out_args=default_out_args):
    """
    Builds fasta files from database records

    Arguments:
      db_file = the database file name
      id_field = the field containing identifiers
      seq_field = the field containing sequences
      meta_fields = a list of fields to add to sequence annotations
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Print console log
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'fasta'),
                       ('FILE', os.path.basename(db_file)),
                       ('ID_FIELD', id_field),
                       ('SEQ_FIELD', seq_field)])
    if meta_fields is not None:
        log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)

    # Open reader and output handle
    out_type = 'fasta'
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type=out_type)
    # Total record count for progress reporting
    total = countDbFile(db_file)

    # Convert each database row into a fasta record
    start_time = time()
    done = passed = failed = 0
    for row in reader:
        # Progress reflects the previous iteration
        printProgress(done, total, 0.05, start_time)
        done += 1

        record = getDbSeqRecord(row, id_field, seq_field, meta_fields, out_args['delimiter'])
        if record is None:
            failed += 1
        else:
            passed += 1
            SeqIO.write(record, out_handle, out_type)

    # Final progress and summary log
    printProgress(done, total, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('RECORDS', done),
                       ('PASS', passed),
                       ('FAIL', failed),
                       ('END', 'ParseDb')])
    printLog(log)

    out_handle.close()
    return out_handle.name
347
348
def addDbFile(db_file, fields, values, out_args=default_out_args):
    """
    Adds field and value pairs to a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to add
      values = a list of values to assign to all rows of each field
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Print console log
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'add'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELDS', ','.join(fields)),
                       ('VALUES', ','.join(values))])
    printLog(log)

    # Open reader, output handle and writer (with the new columns appended)
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='parse-add', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type='tab')
    writer = getDbWriter(out_handle, db_file, add_fields=fields)
    # Total record count for progress reporting
    total = countDbFile(db_file)

    # Only columns absent from the input file are actually added
    new_columns = {f: v for f, v in zip(fields, values) if f not in reader.fieldnames}

    # Append the constant values to every record
    start_time = time()
    done = 0
    for row in reader:
        # Progress reflects the previous iteration
        printProgress(done, total, 0.05, start_time)
        done += 1
        row.update(new_columns)
        writer.writerow(row)

    # Final progress and summary log
    printProgress(done, total, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('RECORDS', done),
                       ('END', 'ParseDb')])
    printLog(log)

    out_handle.close()
    return out_handle.name
404
405
def indexDbFile(db_file, field=default_index_field, out_args=default_out_args):
    """
    Adds an index column to a database file

    Arguments:
      db_file = the database file name
      field = the name of the index field to add
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Print console log
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'index'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELD', field)])
    printLog(log)

    # Open reader, output handle and writer (with the index column appended)
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='parse-index', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type='tab')
    writer = getDbWriter(out_handle, db_file, add_fields=field)
    # Total record count for progress reporting
    total = countDbFile(db_file)

    # Number each record sequentially
    start_time = time()
    done = 0
    for row in reader:
        # Progress reflects the previous iteration
        printProgress(done, total, 0.05, start_time)
        done += 1
        # The 1-based record number becomes the index value
        row[field] = done
        writer.writerow(row)

    # Final progress and summary log
    printProgress(done, total, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('RECORDS', done),
                       ('END', 'ParseDb')])
    printLog(log)

    out_handle.close()
    return out_handle.name
457
458
def dropDbFile(db_file, fields, out_args=default_out_args):
    """
    Deletes entire fields from a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to drop
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    # FIX: the log previously reported the command as 'add' (copy-paste error)
    log['COMMAND'] = 'drop'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    printLog(log)

    # Open file handles; the writer is configured to exclude the dropped fields
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-drop', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, exclude_fields=fields)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # Write row; the writer drops the excluded fields
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
508
509
def deleteDbFile(db_file, fields, values, logic='any', regex=False,
                 out_args=default_out_args):
    """
    Deletes records from a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to check for deletion criteria
      values = a list of values defining deletion targets
      logic = one of 'any' or 'all' defining whether one or all fields must have a match.
      regex = if False do exact full string matches; if True allow partial regex matches.
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name

    Raises:
      ValueError = if logic is not one of 'any' or 'all'
    """
    # Define string match function
    if regex:
        def _match_func(x, patterns): return any([re.search(p, x) for p in patterns])
    else:
        def _match_func(x, patterns): return x in patterns

    # Define logic function.
    # FIX: fail fast on an invalid argument instead of leaving _logic_func
    # undefined and raising NameError mid-iteration.
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all
    else:
        raise ValueError('logic must be one of "any" or "all"')

    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'delete'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    # Added for consistency with the select subcommand, which logs REGEX
    log['REGEX'] = regex
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-delete', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Check for deletion values in all fields.
        # NOTE(review): a missing field contributes False, which never equals a
        # string value but would break re.search in regex mode — presumably the
        # named fields always exist in the input; confirm against callers.
        delete = _logic_func([_match_func(rec.get(f, False), values) for f in fields])

        # Write sequences
        if not delete:
            pass_count += 1
            pass_writer.writerow(rec)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['KEPT'] = pass_count
    log['DELETED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
586
587
def renameDbFile(db_file, fields, names, out_args=default_out_args):
    """
    Renames fields in a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to rename
      names = a list of new names for the fields
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Print console log
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'rename'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELDS', ','.join(fields)),
                       ('NAMES', ','.join(names))])
    printLog(log)

    # Open reader and output handle
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='parse-rename', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type='tab')

    # Read the header from a fresh reader and substitute the new names in place
    header = readDbFile(db_file, ig=False).fieldnames
    for old, new in zip(fields, names):
        header[header.index(old)] = new

    # Open writer and write new header
    # TODO: should modify getDbWriter to take a list of fields
    writer = csv.DictWriter(out_handle, fieldnames=header, dialect='excel-tab')
    writer.writeheader()

    # Total record count for progress reporting
    total = countDbFile(db_file)

    # Rewrite every record under the new field names
    start_time = time()
    done = 0
    for row in reader:
        # Progress reflects the previous iteration
        printProgress(done, total, 0.05, start_time)
        done += 1
        # TODO: repeating renaming is unnecessary. should had a non-dict reader/writer to DbCore
        # Move each value from its old key to its new key
        for old, new in zip(fields, names):
            row[new] = row.pop(old)
        writer.writerow(row)

    # Final progress and summary log
    printProgress(done, total, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('RECORDS', done),
                       ('END', 'ParseDb')])
    printLog(log)

    out_handle.close()
    return out_handle.name
654
655
def selectDbFile(db_file, fields, values, logic='any', regex=False,
                 out_args=default_out_args):
    """
    Selects records from a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to check for selection criteria
      values = a list of values defining selection targets
      logic = one of 'any' or 'all' defining whether one or all fields must have a match.
      regex = if False do exact full string matches; if True allow partial regex matches.
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Matching: either a partial regex search or exact membership in values
    if regex:
        def _match_func(x, patterns):
            return any(re.search(p, x) for p in patterns)
    else:
        def _match_func(x, patterns):
            return x in patterns

    # Per-field matches are combined with any() or all()
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all

    # Print console log
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'select'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELDS', ','.join(fields)),
                       ('VALUES', ','.join(values)),
                       ('REGEX', regex)])
    printLog(log)

    # Open reader, output handle and writer
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='parse-select', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type='tab')
    writer = getDbWriter(out_handle, db_file)
    # Total record count for progress reporting
    total = countDbFile(db_file)

    # Keep only records accepted by the match/logic combination
    start_time = time()
    done = kept = dropped = 0
    for row in reader:
        # Progress reflects the previous iteration
        printProgress(done, total, 0.05, start_time)
        done += 1

        # NOTE(review): a missing field contributes False, which never matches a
        # string value — presumably the named fields always exist in the input
        matches = [_match_func(row.get(f, False), values) for f in fields]
        if _logic_func(matches):
            kept += 1
            writer.writerow(row)
        else:
            dropped += 1

    # Final progress and summary log
    printProgress(done, total, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('RECORDS', done),
                       ('SELECTED', kept),
                       ('DISCARDED', dropped),
                       ('END', 'ParseDb')])
    printLog(log)

    out_handle.close()
    return out_handle.name
734
735
def sortDbFile(db_file, field, numeric=False, descend=False,
               out_args=default_out_args):
    """
    Sorts records by values in an annotation field

    Arguments:
      db_file = the database filename
      field = the field name to sort by
      numeric = if True sort field numerically;
                if False sort field alphabetically
      descend = if True sort in descending order;
                if False sort in ascending order
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Print console log
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'sort'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELD', field),
                       ('NUMERIC', numeric)])
    printLog(log)

    # Open reader, output handle and writer
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='parse-sort', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type='tab')
    writer = getDbWriter(out_handle, db_file)

    # Load every record into memory, keyed by its input position
    start_time = time()
    printMessage("Indexing: Running", start_time=start_time)
    records = dict(enumerate(reader))
    total = len(records)

    # Extract the sort key per record; empty numeric values sort as 0
    keys = {i: rec[field] for i, rec in records.items()}
    if numeric:
        keys = {i: float(v or 0) for i, v in keys.items()}
    order = sorted(keys, key=keys.get, reverse=descend)
    printMessage("Indexing: Done", start_time=start_time, end=True)

    # Write records in sorted order
    start_time = time()
    done = 0
    for i in order:
        # Progress reflects the previous iteration
        printProgress(done, total, 0.05, start_time)
        done += 1
        writer.writerow(records[i])

    # Final progress and summary log
    printProgress(done, total, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('RECORDS', done),
                       ('END', 'ParseDb')])
    printLog(log)

    out_handle.close()
    return out_handle.name
804
805
def updateDbFile(db_file, field, values, updates, out_args=default_out_args):
    """
    Updates field and value pairs to a database file

    Arguments:
      db_file = the database file name
      field = the field to update
      values = a list of values specifying which rows to update
      updates = a list of values to update each value with
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Print console log
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'update'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELD', field),
                       ('VALUES', ','.join(values)),
                       ('UPDATES', ','.join(updates))])
    printLog(log)

    # Open reader, output handle and writer
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='parse-update', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type='tab')
    writer = getDbWriter(out_handle, db_file)
    # Total record count for progress reporting
    total = countDbFile(db_file)

    # Substitute values and write every record
    start_time = time()
    done = updated = 0
    for row in reader:
        # Progress reflects the previous iteration
        printProgress(done, total, 0.05, start_time)
        done += 1

        # Apply each old->new pair in turn; later pairs see the result of
        # earlier substitutions, matching the original chained behavior
        for old, new in zip(values, updates):
            if row[field] == old:
                row[field] = new
                updated += 1

        writer.writerow(row)

    # Final progress and summary log
    printProgress(done, total, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('RECORDS', done),
                       ('UPDATED', updated),
                       ('END', 'ParseDb')])
    printLog(log)

    out_handle.close()
    return out_handle.name
867
868
869 def getArgParser():
870 """
871 Defines the ArgumentParser
872
873 Arguments:
874 None
875
876 Returns:
877 an ArgumentParser object
878 """
879 # Define input and output field help message
880 fields = dedent(
881 '''
882 output files:
883 sequences
884 FASTA formatted sequences output from the subcommands fasta and clip.
885 <field>-<value>
886 database files partitioned by annotation <field> and <value>.
887 parse-<command>
888 output of the database modification functions where <command> is one of
889 the subcommands add, index, drop, delete, rename, select, sort or update.
890
891 required fields:
892 SEQUENCE_ID
893
894 optional fields:
895 JUNCTION, SEQUENCE_IMGT, SEQUENCE_VDJ, GERMLINE_IMGT, GERMLINE_VDJ,
896 GERMLINE_IMGT_D_MASK, GERMLINE_VDJ_D_MASK,
897 GERMLINE_IMGT_V_REGION, GERMLINE_VDJ_V_REGION
898
899 output fields:
900 None
901 ''')
902
903 # Define ArgumentParser
904 parser = ArgumentParser(description=__doc__, epilog=fields,
905 formatter_class=CommonHelpFormatter)
906 parser.add_argument('--version', action='version',
907 version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
908 subparsers = parser.add_subparsers(title='subcommands', dest='command', metavar='',
909 help='Database operation')
910 # TODO: This is a temporary fix for Python issue 9253
911 subparsers.required = True
912
913 # Define parent parser
914 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True,
915 failed=False, log=False)
916
917 # Subparser to convert database entries to sequence file
918 parser_seq = subparsers.add_parser('fasta', parents=[parser_parent],
919 formatter_class=CommonHelpFormatter,
920 help='Creates a fasta file from database records.',
921 description='Creates a fasta file from database records.')
922 parser_seq.add_argument('--if', action='store', dest='id_field',
923 default=default_id_field,
924 help='The name of the field containing identifiers')
925 parser_seq.add_argument('--sf', action='store', dest='seq_field',
926 default=default_seq_field,
927 help='The name of the field containing sequences')
928 parser_seq.add_argument('--mf', nargs='+', action='store', dest='meta_fields',
929 help='List of annotation fields to add to the sequence description')
930 parser_seq.set_defaults(func=convertDbFasta)
931
932 # Subparser to convert database entries to clip-fasta file
933 parser_baseln = subparsers.add_parser('baseline', parents=[parser_parent],
934 formatter_class=CommonHelpFormatter,
935 description='Creates a BASELINe fasta file from database records.',
936 help='''Creates a specially formatted fasta file
937 from database records for input into the BASELINe
938 website. The format groups clonally related sequences
939 sequentially, with the germline sequence preceding
940 each clone and denoted by headers starting with ">>".''')
941 parser_baseln.add_argument('--if', action='store', dest='id_field',
942 default=default_id_field,
943 help='The name of the field containing identifiers')
944 parser_baseln.add_argument('--sf', action='store', dest='seq_field',
945 default=default_seq_field,
946 help='The name of the field containing reads')
947 parser_baseln.add_argument('--gf', action='store', dest='germ_field',
948 default=default_germ_field,
949 help='The name of the field containing germline sequences')
950 parser_baseln.add_argument('--cf', action='store', dest='cluster_field', default=None,
951 help='The name of the field containing containing sorted clone IDs')
952 parser_baseln.add_argument('--mf', nargs='+', action='store', dest='meta_fields',
953 help='List of annotation fields to add to the sequence description')
954 parser_baseln.set_defaults(func=convertDbBaseline)
955
956 # Subparser to partition files by annotation values
957 parser_split = subparsers.add_parser('split', parents=[parser_parent],
958 formatter_class=CommonHelpFormatter,
959 help='Splits database files by field values.',
960 description='Splits database files by field values')
961 parser_split.add_argument('-f', action='store', dest='field', type=str, required=True,
962 help='Annotation field by which to split database files.')
963 parser_split.add_argument('--num', action='store', dest='num_split', type=float, default=None,
964 help='''Specify to define the field as numeric and group
965 records by whether they are less than or at least
966 (greater than or equal to) the specified value.''')
967 parser_split.set_defaults(func=splitDbFile)
968
969 # Subparser to add records
970 parser_add = subparsers.add_parser('add', parents=[parser_parent],
971 formatter_class=CommonHelpFormatter,
972 help='Adds field and value pairs.',
973 description='Adds field and value pairs.')
974 parser_add.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
975 help='The name of the fields to add.')
976 parser_add.add_argument('-u', nargs='+', action='store', dest='values', required=True,
977 help='The value to assign to all rows for each field.')
978 parser_add.set_defaults(func=addDbFile)
979
980 # Subparser to delete records
981 parser_delete = subparsers.add_parser('delete', parents=[parser_parent],
982 formatter_class=CommonHelpFormatter,
983 help='Deletes specific records.',
984 description='Deletes specific records.')
985 parser_delete.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
986 help='The name of the fields to check for deletion criteria.')
987 parser_delete.add_argument('-u', nargs='+', action='store', dest='values', default=['', 'NA'],
988 help='''The values defining which records to delete. A value
989 may appear in any of the fields specified with -f.''')
990 parser_delete.add_argument('--logic', action='store', dest='logic',
991 choices=('any', 'all'), default='any',
992 help='''Defines whether a value may appear in any field (any)
993 or whether it must appear in all fields (all).''')
994 parser_delete.add_argument('--regex', action='store_true', dest='regex',
995 help='''If specified, treat values as regular expressions
996 and allow partial string matches.''')
997 parser_delete.set_defaults(func=deleteDbFile)
998
999 # Subparser to drop fields
1000 parser_drop = subparsers.add_parser('drop', parents=[parser_parent],
1001 formatter_class=CommonHelpFormatter,
1002 help='Deletes entire fields.',
1003 description='Deletes specific records.')
1004 parser_drop.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
1005 help='The name of the fields to delete from the database.')
1006 parser_drop.set_defaults(func=dropDbFile)
1007
1008 # Subparser to index fields
1009 parser_index = subparsers.add_parser('index', parents=[parser_parent],
1010 formatter_class=CommonHelpFormatter,
1011 help='Adds a numeric index field.',
1012 description='Adds a numeric index field.')
1013 parser_index.add_argument('-f', action='store', dest='field',
1014 default=default_index_field,
1015 help='The name of the index field to add to the database.')
1016 parser_index.set_defaults(func=indexDbFile)
1017
1018 # Subparser to rename fields
1019 parser_rename = subparsers.add_parser('rename', parents=[parser_parent],
1020 formatter_class=CommonHelpFormatter,
1021 help='Renames fields.',
1022 description='Renames fields.')
1023 parser_rename.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
1024 help='List of fields to rename.')
1025 parser_rename.add_argument('-k', nargs='+', action='store', dest='names', required=True,
1026 help='List of new names for each field.')
1027 parser_rename.set_defaults(func=renameDbFile)
1028
1029 # Subparser to select records
1030 parser_select = subparsers.add_parser('select', parents=[parser_parent],
1031 formatter_class=CommonHelpFormatter,
1032 help='Selects specific records.',
1033 description='Selects specific records.')
1034 parser_select.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
1035 help='The name of the fields to check for selection criteria.')
1036 parser_select.add_argument('-u', nargs='+', action='store', dest='values', required=True,
1037 help='''The values defining with records to select. A value
1038 may appear in any of the fields specified with -f.''')
1039 parser_select.add_argument('--logic', action='store', dest='logic',
1040 choices=('any', 'all'), default='any',
1041 help='''Defines whether a value may appear in any field (any)
1042 or whether it must appear in all fields (all).''')
1043 parser_select.add_argument('--regex', action='store_true', dest='regex',
1044 help='''If specified, treat values as regular expressions
1045 and allow partial string matches.''')
1046 parser_select.set_defaults(func=selectDbFile)
1047
1048 # Subparser to sort file by records
1049 parser_sort = subparsers.add_parser('sort', parents=[parser_parent],
1050 formatter_class=CommonHelpFormatter,
1051 help='Sorts records by field values.',
1052 description='Sorts records by field values.')
1053 parser_sort.add_argument('-f', action='store', dest='field', type=str, required=True,
1054 help='The annotation field by which to sort records.')
1055 parser_sort.add_argument('--num', action='store_true', dest='numeric', default=False,
1056 help='''Specify to define the sort column as numeric rather
1057 than textual.''')
1058 parser_sort.add_argument('--descend', action='store_true', dest='descend',
1059 help='''If specified, sort records in descending, rather
1060 than ascending, order by values in the target field.''')
1061 parser_sort.set_defaults(func=sortDbFile)
1062
1063 # Subparser to update records
1064 parser_update = subparsers.add_parser('update', parents=[parser_parent],
1065 formatter_class=CommonHelpFormatter,
1066 help='Updates field and value pairs.',
1067 description='Updates field and value pairs.')
1068 parser_update.add_argument('-f', action='store', dest='field', required=True,
1069 help='The name of the field to update.')
1070 parser_update.add_argument('-u', nargs='+', action='store', dest='values', required=True,
1071 help='The values that will be replaced.')
1072 parser_update.add_argument('-t', nargs='+', action='store', dest='updates', required=True,
1073 help='''The new value to assign to each selected row.''')
1074 parser_update.set_defaults(func=updateDbFile)
1075
1076 return parser
1077
1078
if __name__ == '__main__':
    """
    Command line entry point: parses arguments and dispatches to the
    subcommand function for each input database file.
    """
    # Build and validate the argument parser, then parse the command line
    parser = getArgParser()
    checkArgs(parser)
    args = parser.parse_args()
    args_dict = parseCommonArgs(args)

    # Normalize field-name arguments to upper case.
    # Single-valued field arguments that are always set when present:
    for key in ('id_field', 'seq_field', 'germ_field', 'field'):
        if key in args_dict:
            args_dict[key] = args_dict[key].upper()
    # Optional arguments that may be None:
    if args_dict.get('cluster_field') is not None:
        args_dict['cluster_field'] = args_dict['cluster_field'].upper()
    if args_dict.get('meta_fields') is not None:
        args_dict['meta_fields'] = [x.upper() for x in args_dict['meta_fields']]
    # List-valued field argument:
    if 'fields' in args_dict:
        args_dict['fields'] = [x.upper() for x in args_dict['fields']]

    # Subcommands whose paired list arguments must be the same length:
    # command -> (first key, second key, error message)
    pair_checks = {
        'add': ('fields', 'values',
                'You must specify exactly one value (-u) per field (-f)'),
        'rename': ('fields', 'names',
                   'You must specify exactly one new name (-k) per field (-f)'),
        'update': ('values', 'updates',
                   'You must specify exactly one value (-u) per replacement (-t)'),
    }
    check = pair_checks.get(args.command)
    if check is not None:
        first, second, message = check
        if len(args_dict[first]) != len(args_dict[second]):
            parser.error(message)

    # Strip dispatch-only entries, then run the subcommand once per input file
    for key in ('command', 'func', 'db_files'):
        del args_dict[key]
    for db_file in args.db_files:
        args_dict['db_file'] = db_file
        args.func(**args_dict)