diff ParseDb.py @ 0:183edf446dcf draft default tip
Uploaded
author:   davidvanzessen
date:     Mon, 17 Jul 2017 07:44:27 -0400
parents:
children:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ParseDb.py	Mon Jul 17 07:44:27 2017 -0400
@@ -0,0 +1,1119 @@

#!/usr/bin/env python3
"""
Parses tab delimited database files
"""
# Info
__author__ = 'Jason Anthony Vander Heiden'
from changeo import __version__, __date__

# Imports
import csv
import os
import re
from argparse import ArgumentParser
from collections import OrderedDict

from textwrap import dedent
from time import time
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

# Presto and changeo imports
from presto.Defaults import default_delimiter, default_out_args
from presto.Annotation import flattenAnnotation
from presto.IO import getOutputHandle, printLog, printProgress, printMessage
from changeo.Defaults import default_csv_size
from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
from changeo.IO import getDbWriter, readDbFile, countDbFile

# System settings
csv.field_size_limit(default_csv_size)

# Defaults
default_id_field = 'SEQUENCE_ID'
default_seq_field = 'SEQUENCE_IMGT'
default_germ_field = 'GERMLINE_IMGT_D_MASK'
default_index_field = 'INDEX'

# TODO: convert SQL-ish operations to modify_func() as per ParseHeaders


def getDbSeqRecord(db_record, id_field, seq_field, meta_fields=None,
                   delimiter=default_delimiter):
    """
    Parses a database record into a SeqRecord

    Arguments:
    db_record = a dictionary containing a database record
    id_field = the field containing identifiers
    seq_field = the field containing sequences
    meta_fields = a list of fields to add to sequence annotations
    delimiter = a tuple of delimiters for (fields, values, value lists)

    Returns:
    a SeqRecord
    """
    # Return None if ID or sequence fields are empty
    if not db_record[id_field] or not db_record[seq_field]:
        return None

    # Create description string
    desc_dict = OrderedDict([('ID', db_record[id_field])])
    if meta_fields is not None:
        desc_dict.update([(f, db_record[f]) for f in meta_fields if f in db_record])
    desc_str = flattenAnnotation(desc_dict, delimiter=delimiter)

    # Create SeqRecord
    seq_record = SeqRecord(Seq(db_record[seq_field], IUPAC.ambiguous_dna),
                           id=desc_str, name=desc_str, description='')

    return seq_record
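
# A minimal usage sketch for getDbSeqRecord; the row and field names below are
# hypothetical, and real rows come from changeo.IO.readDbFile:
#
#   row = {'SEQUENCE_ID': 'seq1', 'SEQUENCE_IMGT': 'NNNACGTACGT', 'CLONE': '17'}
#   rec = getDbSeqRecord(row, 'SEQUENCE_ID', 'SEQUENCE_IMGT', meta_fields=['CLONE'])
#   # rec is a Bio.SeqRecord; its id/name flatten the annotations, giving
#   # something like 'seq1|CLONE=17' with the default presto delimiters, and
#   # an empty ID or sequence field returns None instead of a record.
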

def splitDbFile(db_file, field, num_split=None, out_args=default_out_args):
    """
    Divides a tab-delimited database file into segments by annotation field values

    Arguments:
    db_file = filename of the tab-delimited database file to split
    field = the field name by which to split db_file
    num_split = the numerical threshold by which to group sequences;
                if None treat field as textual
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    a list of output file names
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'split'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUM_SPLIT'] = num_split
    printLog(log)

    # Open IgRecord reader iter object
    reader = readDbFile(db_file, ig=False)

    # Determine total numbers of records
    rec_count = countDbFile(db_file)

    start_time = time()
    count = 0
    # Sort records into files based on textual field
    if num_split is None:
        # Create set of unique field tags
        tmp_iter = readDbFile(db_file, ig=False)
        tag_list = list(set([row[field] for row in tmp_iter]))

        # Forbidden characters in filename and replacements
        noGood = {'/': 'f', '\\': 'b', '?': 'q', '%': 'p', '*': 's', ':': 'c',
                  '|': 'pi', '"': 'dq', '\'': 'sq', '<': 'lt', '>': 'gt', ' ': '_'}
        # Replace forbidden characters in tag_list
        tag_dict = {}
        for tag in tag_list:
            for c, r in noGood.items():
                tag_dict[tag] = (tag_dict.get(tag, tag).replace(c, r) \
                                 if c in tag else tag_dict.get(tag, tag))

        # Create output handles
        handles_dict = {tag: getOutputHandle(db_file,
                                             '%s-%s' % (field, label),
                                             out_type=out_args['out_type'],
                                             out_name=out_args['out_name'],
                                             out_dir=out_args['out_dir'])
                        for tag, label in tag_dict.items()}

        # Create Db writer instances
        writers_dict = {tag: getDbWriter(handles_dict[tag], db_file)
                        for tag in tag_dict}

        # Iterate over IgRecords
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            # Write row to appropriate file
            tag = row[field]
            writers_dict[tag].writerow(row)

    # Sort records into files based on numeric num_split
    else:
        num_split = float(num_split)

        # Create output handles
        handles_dict = {'under': getOutputHandle(db_file,
                                                 'under-%.1f' % num_split,
                                                 out_type=out_args['out_type'],
                                                 out_name=out_args['out_name'],
                                                 out_dir=out_args['out_dir']),
                        'atleast': getOutputHandle(db_file,
                                                   'atleast-%.1f' % num_split,
                                                   out_type=out_args['out_type'],
                                                   out_name=out_args['out_name'],
                                                   out_dir=out_args['out_dir'])}

        # Create Db writer instances
        writers_dict = {'under': getDbWriter(handles_dict['under'], db_file),
                        'atleast': getDbWriter(handles_dict['atleast'], db_file)}

        # Iterate over IgRecords
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            tag = row[field]
            tag = 'under' if float(tag) < num_split else 'atleast'
            writers_dict[tag].writerow(row)

    # Write log
    printProgress(count, rec_count, 0.05, start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['RECORDS'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'ParseDb'
    printLog(log)

    # Close output file handles
    for t in handles_dict:
        handles_dict[t].close()

    return [handles_dict[t].name for t in handles_dict]
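
# Usage sketch for splitDbFile, with hypothetical file and field names. A
# textual field yields one output file per unique value (with filename-unsafe
# characters relabeled via noGood); a numeric threshold yields two files:
#
#   splitDbFile('db.tab', 'PRCONS')
#   # -> one db_PRCONS-<value>.tab file per unique PRCONS value
#   splitDbFile('db.tab', 'DUPCOUNT', num_split=2)
#   # -> db_under-2.0.tab and db_atleast-2.0.tab
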

# TODO: SHOULD ALLOW FOR UNSORTED CLUSTER COLUMN
# TODO: SHOULD ALLOW FOR GROUPING FIELDS
def convertDbBaseline(db_file, id_field=default_id_field, seq_field=default_seq_field,
                      germ_field=default_germ_field, cluster_field=None,
                      meta_fields=None, out_args=default_out_args):
    """
    Builds a BASELINe fasta file from database records

    Arguments:
    db_file = the database file name
    id_field = the field containing identifiers
    seq_field = the field containing sample sequences
    germ_field = the field containing germline sequences
    cluster_field = the field containing clonal groupings;
                    if None write the germline for each record
    meta_fields = a list of fields to add to sequence annotations
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'baseline'
    log['FILE'] = os.path.basename(db_file)
    log['ID_FIELD'] = id_field
    log['SEQ_FIELD'] = seq_field
    log['GERM_FIELD'] = germ_field
    log['CLUSTER_FIELD'] = cluster_field
    if meta_fields is not None:
        log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='clip')
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = germ_count = pass_count = fail_count = 0
    cluster_last = None
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Update cluster ID
        cluster = rec.get(cluster_field, None)

        # Get germline SeqRecord when needed
        if cluster_field is None:
            germ = getDbSeqRecord(rec, id_field, germ_field, meta_fields,
                                  delimiter=out_args['delimiter'])
            germ.id = '>' + germ.id
        elif cluster != cluster_last:
            germ = getDbSeqRecord(rec, cluster_field, germ_field,
                                  delimiter=out_args['delimiter'])
            germ.id = '>' + germ.id
        else:
            germ = None

        # Get read SeqRecord
        seq = getDbSeqRecord(rec, id_field, seq_field, meta_fields,
                             delimiter=out_args['delimiter'])

        # Write germline
        if germ is not None:
            germ_count += 1
            SeqIO.write(germ, pass_handle, 'fasta')

        # Write sequences
        if seq is not None:
            pass_count += 1
            SeqIO.write(seq, pass_handle, 'fasta')
        else:
            fail_count += 1

        # Set last cluster ID
        cluster_last = cluster

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['GERMLINES'] = germ_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name


def convertDbFasta(db_file, id_field=default_id_field, seq_field=default_seq_field,
                   meta_fields=None, out_args=default_out_args):
    """
    Builds fasta files from database records

    Arguments:
    db_file = the database file name
    id_field = the field containing identifiers
    seq_field = the field containing sequences
    meta_fields = a list of fields to add to sequence annotations
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'fasta'
    log['FILE'] = os.path.basename(db_file)
    log['ID_FIELD'] = id_field
    log['SEQ_FIELD'] = seq_field
    if meta_fields is not None:
        log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)

    # Open file handles
    out_type = 'fasta'
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type=out_type)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Get SeqRecord
        seq = getDbSeqRecord(rec, id_field, seq_field, meta_fields, out_args['delimiter'])

        # Write sequences
        if seq is not None:
            pass_count += 1
            SeqIO.write(seq, pass_handle, out_type)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
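
# Usage sketch for the two converters, with hypothetical paths. convertDbFasta
# writes one FASTA record per row; convertDbBaseline additionally writes the
# germline before each record, or once per clone when cluster_field is given,
# prefixing germline ids with '>' so BASELINe sees '>>' headers:
#
#   convertDbFasta('db.tab', meta_fields=['DUPCOUNT'])
#   convertDbBaseline('db.tab', cluster_field='CLONE')
#   # Baseline output, schematically:
#   #   >>17          germline header (clone ID)
#   #   NNNACGT...
#   #   >seq1         clone members follow
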

def addDbFile(db_file, fields, values, out_args=default_out_args):
    """
    Adds field and value pairs to a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to add
    values = a list of values to assign to all rows of each field
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'add'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-add', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, add_fields=fields)
    # Count records
    result_count = countDbFile(db_file)

    # Define fields and values to append
    add_dict = {k: v for k, v in zip(fields, values) if k not in db_iter.fieldnames}

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # Write updated row
        rec.update(add_dict)
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name


def indexDbFile(db_file, field=default_index_field, out_args=default_out_args):
    """
    Adds an index column to a database file

    Arguments:
    db_file = the database file name
    field = the name of the index field to add
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'index'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-index', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, add_fields=field)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Add count and write updated row
        rec.update({field: rec_count})
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
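
# Usage sketch for addDbFile and indexDbFile, with hypothetical values. add
# appends constant-valued columns, silently skipping fields that already
# exist; index adds a 1-based row counter column:
#
#   addDbFile('db.tab', fields=['SAMPLE'], values=['S1'])
#   indexDbFile('db.tab', field='INDEX')
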

def dropDbFile(db_file, fields, out_args=default_out_args):
    """
    Deletes entire fields from a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to drop
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'drop'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-drop', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, exclude_fields=fields)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # Write row
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name


def deleteDbFile(db_file, fields, values, logic='any', regex=False,
                 out_args=default_out_args):
    """
    Deletes records from a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to check for deletion criteria
    values = a list of values defining deletion targets
    logic = one of 'any' or 'all' defining whether one or all fields must have a match.
    regex = if False do exact full string matches; if True allow partial regex matches.
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    # Define string match function
    if regex:
        def _match_func(x, patterns): return any([re.search(p, x) for p in patterns])
    else:
        def _match_func(x, patterns): return x in patterns

    # Define logic function
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all

    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'delete'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-delete', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Check for deletion values in all fields
        delete = _logic_func([_match_func(rec.get(f, False), values) for f in fields])

        # Write sequences
        if not delete:
            pass_count += 1
            pass_writer.writerow(rec)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['KEPT'] = pass_count
    log['DELETED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
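
# Usage sketch for dropDbFile and deleteDbFile, with hypothetical values.
# drop removes whole columns; delete removes rows where any (or, with
# logic='all', every) listed field matches a target value, optionally as a
# partial regular expression match:
#
#   dropDbFile('db.tab', fields=['SEQUENCE_VDJ'])
#   deleteDbFile('db.tab', fields=['FUNCTIONAL'], values=['F'])
#   deleteDbFile('db.tab', fields=['V_CALL'], values=['IGHV1'], regex=True)
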

def renameDbFile(db_file, fields, names, out_args=default_out_args):
    """
    Renames fields in a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to rename
    names = a list of new names for fields
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'rename'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['NAMES'] = ','.join(names)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-rename', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')

    # Get header and rename fields
    header = (readDbFile(db_file, ig=False)).fieldnames
    for f, n in zip(fields, names):
        i = header.index(f)
        header[i] = n

    # Open writer and write new header
    # TODO: should modify getDbWriter to take a list of fields
    pass_writer = csv.DictWriter(pass_handle, fieldnames=header, dialect='excel-tab')
    pass_writer.writeheader()

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # TODO: repeated renaming is unnecessary; should add a non-dict reader/writer to DbCore
        # Rename fields
        for f, n in zip(fields, names):
            rec[n] = rec.pop(f)
        # Write
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name


def selectDbFile(db_file, fields, values, logic='any', regex=False,
                 out_args=default_out_args):
    """
    Selects records from a database file

    Arguments:
    db_file = the database file name
    fields = a list of fields to check for selection criteria
    values = a list of values defining selection targets
    logic = one of 'any' or 'all' defining whether one or all fields must have a match.
    regex = if False do exact full string matches; if True allow partial regex matches.
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    # Define string match function
    if regex:
        def _match_func(x, patterns): return any([re.search(p, x) for p in patterns])
    else:
        def _match_func(x, patterns): return x in patterns

    # Define logic function
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all

    # Print console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'select'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    log['REGEX'] = regex
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-select', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Check for selection values in all fields
        select = _logic_func([_match_func(rec.get(f, False), values) for f in fields])

        # Write sequences
        if select:
            pass_count += 1
            pass_writer.writerow(rec)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['SELECTED'] = pass_count
    log['DISCARDED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
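
# Usage sketch for renameDbFile and selectDbFile, with hypothetical values.
# rename pairs each old field name with a new one positionally; select is the
# mirror image of delete, keeping only the matching rows:
#
#   renameDbFile('db.tab', fields=['PRCONS'], names=['PRIMER'])
#   selectDbFile('db.tab', fields=['FUNCTIONAL'], values=['T'])
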

def sortDbFile(db_file, field, numeric=False, descend=False,
               out_args=default_out_args):
    """
    Sorts records by values in an annotation field

    Arguments:
    db_file = the database filename
    field = the field name to sort by
    numeric = if True sort field numerically;
              if False sort field alphabetically
    descend = if True sort in descending order;
              if False sort in ascending order
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'sort'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUMERIC'] = numeric
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-sort', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)

    # Store all records in a dictionary
    start_time = time()
    printMessage("Indexing: Running", start_time=start_time)
    db_dict = {i: r for i, r in enumerate(db_iter)}
    result_count = len(db_dict)

    # Sort db_dict by field values
    tag_dict = {k: v[field] for k, v in db_dict.items()}
    if numeric: tag_dict = {k: float(v or 0) for k, v in tag_dict.items()}
    sorted_keys = sorted(tag_dict, key=tag_dict.get, reverse=descend)
    printMessage("Indexing: Done", start_time=start_time, end=True)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for key in sorted_keys:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Write records
        pass_writer.writerow(db_dict[key])

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name


def updateDbFile(db_file, field, values, updates, out_args=default_out_args):
    """
    Updates field and value pairs in a database file

    Arguments:
    db_file = the database file name
    field = the field to update
    values = a list of values specifying which rows to update
    updates = a list of values to update each value with
    out_args = common output argument dictionary from parseCommonArgs

    Returns:
    the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'update'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['VALUES'] = ','.join(values)
    log['UPDATES'] = ','.join(updates)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-update', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Update values if found
        for x, y in zip(values, updates):
            if rec[field] == x:
                rec[field] = y
                pass_count += 1

        # Write records
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['UPDATED'] = pass_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
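
# Usage sketch for sortDbFile and updateDbFile, with hypothetical values.
# sort holds the entire table in memory while sorting, so very large files
# may warrant an external sort; update rewrites exact matches of old values
# within a single field:
#
#   sortDbFile('db.tab', field='DUPCOUNT', numeric=True, descend=True)
#   updateDbFile('db.tab', field='SAMPLE', values=['S1'], updates=['sample-01'])
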

def getArgParser():
    """
    Defines the ArgumentParser

    Arguments:
    None

    Returns:
    an ArgumentParser object
    """
    # Define input and output field help message
    fields = dedent(
             '''
             output files:
                 sequences
                     FASTA formatted sequences output from the subcommands fasta and baseline.
                 <field>-<value>
                     database files partitioned by annotation <field> and <value>.
                 parse-<command>
                     output of the database modification functions where <command> is one of
                     the subcommands add, index, drop, delete, rename, select, sort or update.

             required fields:
                 SEQUENCE_ID

             optional fields:
                 JUNCTION, SEQUENCE_IMGT, SEQUENCE_VDJ, GERMLINE_IMGT, GERMLINE_VDJ,
                 GERMLINE_IMGT_D_MASK, GERMLINE_VDJ_D_MASK,
                 GERMLINE_IMGT_V_REGION, GERMLINE_VDJ_V_REGION

             output fields:
                 None
             ''')

    # Define ArgumentParser
    parser = ArgumentParser(description=__doc__, epilog=fields,
                            formatter_class=CommonHelpFormatter)
    parser.add_argument('--version', action='version',
                        version='%(prog)s:' + ' %s-%s' % (__version__, __date__))
    subparsers = parser.add_subparsers(title='subcommands', dest='command', metavar='',
                                       help='Database operation')
    # TODO: This is a temporary fix for Python issue 9253
    subparsers.required = True

    # Define parent parser
    parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True,
                                       failed=False, log=False)

    # Subparser to convert database entries to a sequence file
    parser_seq = subparsers.add_parser('fasta', parents=[parser_parent],
                                       formatter_class=CommonHelpFormatter,
                                       help='Creates a fasta file from database records.',
                                       description='Creates a fasta file from database records.')
    parser_seq.add_argument('--if', action='store', dest='id_field',
                            default=default_id_field,
                            help='The name of the field containing identifiers')
    parser_seq.add_argument('--sf', action='store', dest='seq_field',
                            default=default_seq_field,
                            help='The name of the field containing sequences')
    parser_seq.add_argument('--mf', nargs='+', action='store', dest='meta_fields',
                            help='List of annotation fields to add to the sequence description')
    parser_seq.set_defaults(func=convertDbFasta)

    # Subparser to convert database entries to a clip-fasta file
    parser_baseln = subparsers.add_parser('baseline', parents=[parser_parent],
                                          formatter_class=CommonHelpFormatter,
                                          description='Creates a BASELINe fasta file from database records.',
                                          help='''Creates a specially formatted fasta file
                                               from database records for input into the BASELINe
                                               website. The format groups clonally related sequences
                                               sequentially, with the germline sequence preceding
                                               each clone and denoted by headers starting with ">>".''')
    parser_baseln.add_argument('--if', action='store', dest='id_field',
                               default=default_id_field,
                               help='The name of the field containing identifiers')
    parser_baseln.add_argument('--sf', action='store', dest='seq_field',
                               default=default_seq_field,
                               help='The name of the field containing reads')
    parser_baseln.add_argument('--gf', action='store', dest='germ_field',
                               default=default_germ_field,
                               help='The name of the field containing germline sequences')
    parser_baseln.add_argument('--cf', action='store', dest='cluster_field', default=None,
                               help='The name of the field containing sorted clone IDs')
    parser_baseln.add_argument('--mf', nargs='+', action='store', dest='meta_fields',
                               help='List of annotation fields to add to the sequence description')
    parser_baseln.set_defaults(func=convertDbBaseline)

    # Subparser to partition files by annotation values
    parser_split = subparsers.add_parser('split', parents=[parser_parent],
                                         formatter_class=CommonHelpFormatter,
                                         help='Splits database files by field values.',
                                         description='Splits database files by field values.')
    parser_split.add_argument('-f', action='store', dest='field', type=str, required=True,
                              help='Annotation field by which to split database files.')
    parser_split.add_argument('--num', action='store', dest='num_split', type=float, default=None,
                              help='''Specify to define the field as numeric and group
                                   records by whether they are less than or at least
                                   (greater than or equal to) the specified value.''')
    parser_split.set_defaults(func=splitDbFile)

    # Subparser to add fields
    parser_add = subparsers.add_parser('add', parents=[parser_parent],
                                       formatter_class=CommonHelpFormatter,
                                       help='Adds field and value pairs.',
                                       description='Adds field and value pairs.')
    parser_add.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
                            help='The name of the fields to add.')
    parser_add.add_argument('-u', nargs='+', action='store', dest='values', required=True,
                            help='The value to assign to all rows for each field.')
    parser_add.set_defaults(func=addDbFile)

    # Subparser to delete records
    parser_delete = subparsers.add_parser('delete', parents=[parser_parent],
                                          formatter_class=CommonHelpFormatter,
                                          help='Deletes specific records.',
                                          description='Deletes specific records.')
    parser_delete.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
                               help='The name of the fields to check for deletion criteria.')
    parser_delete.add_argument('-u', nargs='+', action='store', dest='values', default=['', 'NA'],
                               help='''The values defining which records to delete. A value
                                    may appear in any of the fields specified with -f.''')
    parser_delete.add_argument('--logic', action='store', dest='logic',
                               choices=('any', 'all'), default='any',
                               help='''Defines whether a value may appear in any field (any)
                                    or whether it must appear in all fields (all).''')
    parser_delete.add_argument('--regex', action='store_true', dest='regex',
                               help='''If specified, treat values as regular expressions
                                    and allow partial string matches.''')
    parser_delete.set_defaults(func=deleteDbFile)

    # Subparser to drop fields
    parser_drop = subparsers.add_parser('drop', parents=[parser_parent],
                                        formatter_class=CommonHelpFormatter,
                                        help='Deletes entire fields.',
                                        description='Deletes entire fields.')
    parser_drop.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
                             help='The name of the fields to delete from the database.')
    parser_drop.set_defaults(func=dropDbFile)

    # Subparser to index fields
    parser_index = subparsers.add_parser('index', parents=[parser_parent],
                                         formatter_class=CommonHelpFormatter,
                                         help='Adds a numeric index field.',
                                         description='Adds a numeric index field.')
    parser_index.add_argument('-f', action='store', dest='field',
                              default=default_index_field,
                              help='The name of the index field to add to the database.')
    parser_index.set_defaults(func=indexDbFile)

    # Subparser to rename fields
    parser_rename = subparsers.add_parser('rename', parents=[parser_parent],
                                          formatter_class=CommonHelpFormatter,
                                          help='Renames fields.',
                                          description='Renames fields.')
    parser_rename.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
                               help='List of fields to rename.')
    parser_rename.add_argument('-k', nargs='+', action='store', dest='names', required=True,
                               help='List of new names for each field.')
    parser_rename.set_defaults(func=renameDbFile)

    # Subparser to select records
    parser_select = subparsers.add_parser('select', parents=[parser_parent],
                                          formatter_class=CommonHelpFormatter,
                                          help='Selects specific records.',
                                          description='Selects specific records.')
    parser_select.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
                               help='The name of the fields to check for selection criteria.')
    parser_select.add_argument('-u', nargs='+', action='store', dest='values', required=True,
                               help='''The values defining which records to select. A value
                                    may appear in any of the fields specified with -f.''')
    parser_select.add_argument('--logic', action='store', dest='logic',
                               choices=('any', 'all'), default='any',
                               help='''Defines whether a value may appear in any field (any)
                                    or whether it must appear in all fields (all).''')
    parser_select.add_argument('--regex', action='store_true', dest='regex',
                               help='''If specified, treat values as regular expressions
                                    and allow partial string matches.''')
    parser_select.set_defaults(func=selectDbFile)

    # Subparser to sort records by field values
    parser_sort = subparsers.add_parser('sort', parents=[parser_parent],
                                        formatter_class=CommonHelpFormatter,
                                        help='Sorts records by field values.',
                                        description='Sorts records by field values.')
    parser_sort.add_argument('-f', action='store', dest='field', type=str, required=True,
                             help='The annotation field by which to sort records.')
    parser_sort.add_argument('--num', action='store_true', dest='numeric', default=False,
                             help='''Specify to define the sort column as numeric rather
                                  than textual.''')
    parser_sort.add_argument('--descend', action='store_true', dest='descend',
                             help='''If specified, sort records in descending, rather
                                  than ascending, order by values in the target field.''')
    parser_sort.set_defaults(func=sortDbFile)

    # Subparser to update records
    parser_update = subparsers.add_parser('update', parents=[parser_parent],
                                          formatter_class=CommonHelpFormatter,
                                          help='Updates field and value pairs.',
                                          description='Updates field and value pairs.')
    parser_update.add_argument('-f', action='store', dest='field', required=True,
                               help='The name of the field to update.')
    parser_update.add_argument('-u', nargs='+', action='store', dest='values', required=True,
                               help='The values that will be replaced.')
    parser_update.add_argument('-t', nargs='+', action='store', dest='updates', required=True,
                               help='The new value to assign to each selected row.')
    parser_update.set_defaults(func=updateDbFile)

    return parser

if __name__ == '__main__':
    """
    Parses command line arguments and calls main function
    """
    # Parse arguments
    parser = getArgParser()
    checkArgs(parser)
    args = parser.parse_args()
    args_dict = parseCommonArgs(args)
    # Convert case of fields
    if 'id_field' in args_dict:
        args_dict['id_field'] = args_dict['id_field'].upper()
    if 'seq_field' in args_dict:
        args_dict['seq_field'] = args_dict['seq_field'].upper()
    if 'germ_field' in args_dict:
        args_dict['germ_field'] = args_dict['germ_field'].upper()
    if 'field' in args_dict:
        args_dict['field'] = args_dict['field'].upper()
    if 'cluster_field' in args_dict and args_dict['cluster_field'] is not None:
        args_dict['cluster_field'] = args_dict['cluster_field'].upper()
    if 'meta_fields' in args_dict and args_dict['meta_fields'] is not None:
        args_dict['meta_fields'] = [f.upper() for f in args_dict['meta_fields']]
    if 'fields' in args_dict:
        args_dict['fields'] = [f.upper() for f in args_dict['fields']]

    # Check modify_args arguments
    if args.command == 'add' and len(args_dict['fields']) != len(args_dict['values']):
        parser.error('You must specify exactly one value (-u) per field (-f)')
    elif args.command == 'rename' and len(args_dict['fields']) != len(args_dict['names']):
        parser.error('You must specify exactly one new name (-k) per field (-f)')
    elif args.command == 'update' and len(args_dict['values']) != len(args_dict['updates']):
        parser.error('You must specify exactly one value (-u) per replacement (-t)')

    # Call parser function for each database file
    del args_dict['command']
    del args_dict['func']
    del args_dict['db_files']
    for f in args.__dict__['db_files']:
        args_dict['db_file'] = f
        args.func(**args_dict)
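
# Command-line sketches matching the subcommands defined above; file and
# field names are hypothetical, and -d is the database-file option supplied
# by getCommonArgParser:
#
#   ParseDb.py fasta -d db.tab --if SEQUENCE_ID --sf SEQUENCE_IMGT
#   ParseDb.py baseline -d db.tab --cf CLONE
#   ParseDb.py split -d db.tab -f PRCONS
#   ParseDb.py delete -d db.tab -f FUNCTIONAL -u F --logic any
#   ParseDb.py rename -d db.tab -f PRCONS -k PRIMER
#   ParseDb.py sort -d db.tab -f DUPCOUNT --num --descend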