comparison ParseDb.py @ 0:183edf446dcf draft default tip

Uploaded
author davidvanzessen
date Mon, 17 Jul 2017 07:44:27 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:183edf446dcf
1 #!/usr/bin/env python3
2 """
3 Parses tab delimited database files
4 """
5 # Info
6 __author__ = 'Jason Anthony Vander Heiden'
7 from changeo import __version__, __date__
8
9 # Imports
10 import csv
11 import os
12 import re
13 from argparse import ArgumentParser
14 from collections import OrderedDict
15
16 from textwrap import dedent
17 from time import time
18 from Bio import SeqIO
19 from Bio.Seq import Seq
20 from Bio.SeqRecord import SeqRecord
21 from Bio.Alphabet import IUPAC
22
23 # Presto and changeo imports
24 from presto.Defaults import default_delimiter, default_out_args
25 from presto.Annotation import flattenAnnotation
26 from presto.IO import getOutputHandle, printLog, printProgress, printMessage
27 from changeo.Defaults import default_csv_size
28 from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
29 from changeo.IO import getDbWriter, readDbFile, countDbFile
30
31 # System settings
32 csv.field_size_limit(default_csv_size)
33
34 # Defaults
35 default_id_field = 'SEQUENCE_ID'
36 default_seq_field = 'SEQUENCE_IMGT'
37 default_germ_field = 'GERMLINE_IMGT_D_MASK'
38 default_index_field = 'INDEX'
39
40 # TODO: convert SQL-ish operations to modify_func() as per ParseHeaders
41
def getDbSeqRecord(db_record, id_field, seq_field, meta_fields=None,
                   delimiter=default_delimiter):
    """
    Converts a database record into a SeqRecord

    Arguments:
      db_record = a dictionary containing a database record
      id_field = the field containing identifiers
      seq_field = the field containing sequences
      meta_fields = a list of fields to add to sequence annotations
      delimiter = a tuple of delimiters for (fields, values, value lists)

    Returns:
      a SeqRecord, or None if the ID or sequence field is empty
    """
    # Skip records with a missing identifier or sequence
    if not (db_record[id_field] and db_record[seq_field]):
        return None

    # Build the annotation dictionary and flatten it into a description string
    annotations = OrderedDict()
    annotations['ID'] = db_record[id_field]
    if meta_fields is not None:
        for meta in meta_fields:
            if meta in db_record:
                annotations[meta] = db_record[meta]
    description = flattenAnnotation(annotations, delimiter=delimiter)

    # Assemble the SeqRecord using the flattened annotation as its ID
    return SeqRecord(Seq(db_record[seq_field], IUPAC.ambiguous_dna),
                     id=description, name=description, description='')
72
73
def splitDbFile(db_file, field, num_split=None, out_args=default_out_args):
    """
    Divides a tab-delimited database file into segments by description tags

    Arguments:
      db_file = filename of the tab-delimited database file to split
      field = the field name by which to split db_file
      num_split = the numerical threshold by which to group sequences;
                  if None treat field as textual
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      a list of output file names
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'split'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUM_SPLIT'] = num_split
    printLog(log)

    # Open IgRecord reader iter object
    reader = readDbFile(db_file, ig=False)

    # Determine total numbers of records
    rec_count = countDbFile(db_file)

    start_time = time()
    count = 0
    # Sort records into files based on textual field
    if num_split is None:
        # Create set of unique field tags
        tmp_iter = readDbFile(db_file, ig=False)
        tag_list = list(set([row[field] for row in tmp_iter]))

        # Forbidden characters in filenames and their replacements.
        # FIX: the original map used regex-style escapes ('\/', '\%', '\|', '\"')
        # as str.replace targets; those two-character sequences never occur in a
        # tag, so '/', '%', '|' and '"' were never actually replaced. Plain
        # characters are used here so the substitution works as intended.
        noGood = {'/': 'f', '\\': 'b', '?': 'q', '%': 'p', '*': 's', ':': 'c',
                  '|': 'pi', '"': 'dq', '\'': 'sq', '<': 'gt', '>': 'lt', ' ': '_'}
        # Replace forbidden characters in each tag to build a safe file label
        tag_dict = {}
        for tag in tag_list:
            label = tag
            for c, r in noGood.items():
                label = label.replace(c, r)
            tag_dict[tag] = label

        # Create output handles
        handles_dict = {tag: getOutputHandle(db_file,
                                             '%s-%s' % (field, label),
                                             out_type=out_args['out_type'],
                                             out_name=out_args['out_name'],
                                             out_dir=out_args['out_dir'])
                        for tag, label in tag_dict.items()}

        # Create Db writer instances
        writers_dict = {tag: getDbWriter(handles_dict[tag], db_file)
                        for tag in tag_dict}

        # Iterate over IgRecords
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            # Write row to appropriate file
            tag = row[field]
            writers_dict[tag].writerow(row)

    # Sort records into files based on numeric num_split
    else:
        num_split = float(num_split)

        # Create output handles
        handles_dict = {'under': getOutputHandle(db_file,
                                                 'under-%.1f' % num_split,
                                                 out_type=out_args['out_type'],
                                                 out_name=out_args['out_name'],
                                                 out_dir=out_args['out_dir']),
                        'atleast': getOutputHandle(db_file,
                                                   'atleast-%.1f' % num_split,
                                                   out_type=out_args['out_type'],
                                                   out_name=out_args['out_name'],
                                                   out_dir=out_args['out_dir'])}

        # Create Db writer instances
        writers_dict = {'under': getDbWriter(handles_dict['under'], db_file),
                        'atleast': getDbWriter(handles_dict['atleast'], db_file)}

        # Iterate over IgRecords
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            # Route each record by comparing its field value to the threshold
            tag = 'under' if float(row[field]) < num_split else 'atleast'
            writers_dict[tag].writerow(row)

    # Write log
    printProgress(count, rec_count, 0.05, start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['RECORDS'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'ParseDb'
    printLog(log)

    # Close output file handles
    for t in handles_dict:
        handles_dict[t].close()

    return [handles_dict[t].name for t in handles_dict]
182
183
184 # TODO: SHOULD ALLOW FOR UNSORTED CLUSTER COLUMN
185 # TODO: SHOULD ALLOW FOR GROUPING FIELDS
def convertDbBaseline(db_file, id_field=default_id_field, seq_field=default_seq_field,
                      germ_field=default_germ_field, cluster_field=None,
                      meta_fields=None, out_args=default_out_args):
    """
    Builds BASELINe formatted fasta files from database records

    Arguments:
      db_file = the database file name
      id_field = the field containing identifiers
      seq_field = the field containing sample sequences
      germ_field = the field containing germline sequences
      cluster_field = the field containing clonal groupings;
                      if None write the germline for each record
      meta_fields = a list of fields to add to sequence annotations
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'fasta'
    log['FILE'] = os.path.basename(db_file)
    log['ID_FIELD'] = id_field
    log['SEQ_FIELD'] = seq_field
    log['GERM_FIELD'] = germ_field
    log['CLUSTER_FIELD'] = cluster_field
    if meta_fields is not None: log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='clip')
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = germ_count = pass_count = fail_count = 0
    cluster_last = None
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Update cluster ID
        cluster = rec.get(cluster_field, None)

        # Get germline SeqRecord when needed
        if cluster_field is None:
            germ = getDbSeqRecord(rec, id_field, germ_field, meta_fields,
                                  delimiter=out_args['delimiter'])
        elif cluster != cluster_last:
            germ = getDbSeqRecord(rec, cluster_field, germ_field,
                                  delimiter=out_args['delimiter'])
        else:
            germ = None

        # Prefix the germline ID so its fasta header begins with '>>' (the
        # BASELINe germline marker). FIX: getDbSeqRecord returns None when the
        # ID or sequence field is empty; the original code dereferenced it
        # unconditionally and raised AttributeError for such records.
        if germ is not None:
            germ.id = '>' + germ.id

        # Get read SeqRecord
        seq = getDbSeqRecord(rec, id_field, seq_field, meta_fields,
                             delimiter=out_args['delimiter'])

        # Write germline
        if germ is not None:
            germ_count += 1
            SeqIO.write(germ, pass_handle, 'fasta')

        # Write sequences
        if seq is not None:
            pass_count += 1
            SeqIO.write(seq, pass_handle, 'fasta')
        else:
            fail_count += 1

        # Set last cluster ID
        cluster_last = cluster

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['GERMLINES'] = germ_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
281
282
def convertDbFasta(db_file, id_field=default_id_field, seq_field=default_seq_field,
                   meta_fields=None, out_args=default_out_args):
    """
    Builds fasta files from database records

    Arguments:
      db_file = the database file name
      id_field = the field containing identifiers
      seq_field = the field containing sequences
      meta_fields = a list of fields to add to sequence annotations
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Print console log
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'fasta'),
                       ('FILE', os.path.basename(db_file)),
                       ('ID_FIELD', id_field),
                       ('SEQ_FIELD', seq_field)])
    if meta_fields is not None:
        log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)

    # Open reader and output handle
    out_type = 'fasta'
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='sequences', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type=out_type)
    # Total record count for progress reporting
    total = countDbFile(db_file)

    # Convert each database row into a fasta record
    start_time = time()
    done = passed = failed = 0
    for row in reader:
        # Progress reflects the previous iteration
        printProgress(done, total, 0.05, start_time)
        done += 1

        record = getDbSeqRecord(row, id_field, seq_field, meta_fields, out_args['delimiter'])
        if record is None:
            failed += 1
        else:
            passed += 1
            SeqIO.write(record, out_handle, out_type)

    # Final progress and summary log
    printProgress(done, total, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('RECORDS', done),
                       ('PASS', passed),
                       ('FAIL', failed),
                       ('END', 'ParseDb')])
    printLog(log)

    out_handle.close()
    return out_handle.name
347
348
def addDbFile(db_file, fields, values, out_args=default_out_args):
    """
    Adds field and value pairs to a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to add
      values = a list of values to assign to all rows of each field
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Print console log
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'add'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELDS', ','.join(fields)),
                       ('VALUES', ','.join(values))])
    printLog(log)

    # Open reader, output handle and writer (with the new columns appended)
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='parse-add', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type='tab')
    writer = getDbWriter(out_handle, db_file, add_fields=fields)
    # Total record count for progress reporting
    total = countDbFile(db_file)

    # Only columns absent from the input file are actually added
    new_columns = {f: v for f, v in zip(fields, values) if f not in reader.fieldnames}

    # Append the constant values to every record
    start_time = time()
    done = 0
    for row in reader:
        # Progress reflects the previous iteration
        printProgress(done, total, 0.05, start_time)
        done += 1
        row.update(new_columns)
        writer.writerow(row)

    # Final progress and summary log
    printProgress(done, total, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('RECORDS', done),
                       ('END', 'ParseDb')])
    printLog(log)

    out_handle.close()
    return out_handle.name
404
405
def indexDbFile(db_file, field=default_index_field, out_args=default_out_args):
    """
    Adds an index column to a database file

    Arguments:
      db_file = the database file name
      field = the name of the index field to add
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Print console log
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'index'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELD', field)])
    printLog(log)

    # Open reader, output handle and writer (with the index column appended)
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='parse-index', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type='tab')
    writer = getDbWriter(out_handle, db_file, add_fields=field)
    # Total record count for progress reporting
    total = countDbFile(db_file)

    # Number each record sequentially
    start_time = time()
    done = 0
    for row in reader:
        # Progress reflects the previous iteration
        printProgress(done, total, 0.05, start_time)
        done += 1
        # The 1-based record number becomes the index value
        row[field] = done
        writer.writerow(row)

    # Final progress and summary log
    printProgress(done, total, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('RECORDS', done),
                       ('END', 'ParseDb')])
    printLog(log)

    out_handle.close()
    return out_handle.name
457
458
def dropDbFile(db_file, fields, out_args=default_out_args):
    """
    Deletes entire fields from a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to drop
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    # FIX: the log previously reported the command as 'add' (copy-paste error)
    log['COMMAND'] = 'drop'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    printLog(log)

    # Open file handles; the writer is configured to exclude the dropped fields
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-drop', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, exclude_fields=fields)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # Write row; the writer drops the excluded fields
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
508
509
def deleteDbFile(db_file, fields, values, logic='any', regex=False,
                 out_args=default_out_args):
    """
    Deletes records from a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to check for deletion criteria
      values = a list of values defining deletion targets
      logic = one of 'any' or 'all' defining whether one or all fields must have a match.
      regex = if False do exact full string matches; if True allow partial regex matches.
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name

    Raises:
      ValueError = if logic is not one of 'any' or 'all'
    """
    # Define string match function
    if regex:
        def _match_func(x, patterns): return any([re.search(p, x) for p in patterns])
    else:
        def _match_func(x, patterns): return x in patterns

    # Define logic function.
    # FIX: fail fast on an invalid argument instead of leaving _logic_func
    # undefined and raising NameError mid-iteration.
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all
    else:
        raise ValueError('logic must be one of "any" or "all"')

    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'delete'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    # Added for consistency with the select subcommand, which logs REGEX
    log['REGEX'] = regex
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-delete', out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'], out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)
    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Check for deletion values in all fields.
        # NOTE(review): a missing field contributes False, which never equals a
        # string value but would break re.search in regex mode — presumably the
        # named fields always exist in the input; confirm against callers.
        delete = _logic_func([_match_func(rec.get(f, False), values) for f in fields])

        # Write sequences
        if not delete:
            pass_count += 1
            pass_writer.writerow(rec)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['KEPT'] = pass_count
    log['DELETED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
586
587
def renameDbFile(db_file, fields, names, out_args=default_out_args):
    """
    Renames fields in a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to rename
      names = a list of new names for the fields
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Print console log
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'rename'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELDS', ','.join(fields)),
                       ('NAMES', ','.join(names))])
    printLog(log)

    # Open reader and output handle
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='parse-rename', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type='tab')

    # Read the header from a fresh reader and substitute the new names in place
    header = readDbFile(db_file, ig=False).fieldnames
    for old, new in zip(fields, names):
        header[header.index(old)] = new

    # Open writer and write new header
    # TODO: should modify getDbWriter to take a list of fields
    writer = csv.DictWriter(out_handle, fieldnames=header, dialect='excel-tab')
    writer.writeheader()

    # Total record count for progress reporting
    total = countDbFile(db_file)

    # Rewrite every record under the new field names
    start_time = time()
    done = 0
    for row in reader:
        # Progress reflects the previous iteration
        printProgress(done, total, 0.05, start_time)
        done += 1
        # TODO: repeating renaming is unnecessary. should had a non-dict reader/writer to DbCore
        # Move each value from its old key to its new key
        for old, new in zip(fields, names):
            row[new] = row.pop(old)
        writer.writerow(row)

    # Final progress and summary log
    printProgress(done, total, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('RECORDS', done),
                       ('END', 'ParseDb')])
    printLog(log)

    out_handle.close()
    return out_handle.name
654
655
def selectDbFile(db_file, fields, values, logic='any', regex=False,
                 out_args=default_out_args):
    """
    Selects records from a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to check for selection criteria
      values = a list of values defining selection targets
      logic = one of 'any' or 'all' defining whether one or all fields must have a match.
      regex = if False do exact full string matches; if True allow partial regex matches.
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Matching: either a partial regex search or exact membership in values
    if regex:
        def _match_func(x, patterns):
            return any(re.search(p, x) for p in patterns)
    else:
        def _match_func(x, patterns):
            return x in patterns

    # Per-field matches are combined with any() or all()
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all

    # Print console log
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'select'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELDS', ','.join(fields)),
                       ('VALUES', ','.join(values)),
                       ('REGEX', regex)])
    printLog(log)

    # Open reader, output handle and writer
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='parse-select', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type='tab')
    writer = getDbWriter(out_handle, db_file)
    # Total record count for progress reporting
    total = countDbFile(db_file)

    # Keep only records accepted by the match/logic combination
    start_time = time()
    done = kept = dropped = 0
    for row in reader:
        # Progress reflects the previous iteration
        printProgress(done, total, 0.05, start_time)
        done += 1

        # NOTE(review): a missing field contributes False, which never matches a
        # string value — presumably the named fields always exist in the input
        matches = [_match_func(row.get(f, False), values) for f in fields]
        if _logic_func(matches):
            kept += 1
            writer.writerow(row)
        else:
            dropped += 1

    # Final progress and summary log
    printProgress(done, total, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('RECORDS', done),
                       ('SELECTED', kept),
                       ('DISCARDED', dropped),
                       ('END', 'ParseDb')])
    printLog(log)

    out_handle.close()
    return out_handle.name
734
735
def sortDbFile(db_file, field, numeric=False, descend=False,
               out_args=default_out_args):
    """
    Sorts records by values in an annotation field

    Arguments:
      db_file = the database filename
      field = the field name to sort by
      numeric = if True sort field numerically;
                if False sort field alphabetically
      descend = if True sort in descending order;
                if False sort in ascending order
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Print console log
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'sort'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELD', field),
                       ('NUMERIC', numeric)])
    printLog(log)

    # Open reader, output handle and writer
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='parse-sort', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type='tab')
    writer = getDbWriter(out_handle, db_file)

    # Load every record into memory, keyed by its input position
    start_time = time()
    printMessage("Indexing: Running", start_time=start_time)
    records = dict(enumerate(reader))
    total = len(records)

    # Extract the sort key per record; empty numeric values sort as 0
    keys = {i: rec[field] for i, rec in records.items()}
    if numeric:
        keys = {i: float(v or 0) for i, v in keys.items()}
    order = sorted(keys, key=keys.get, reverse=descend)
    printMessage("Indexing: Done", start_time=start_time, end=True)

    # Write records in sorted order
    start_time = time()
    done = 0
    for i in order:
        # Progress reflects the previous iteration
        printProgress(done, total, 0.05, start_time)
        done += 1
        writer.writerow(records[i])

    # Final progress and summary log
    printProgress(done, total, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('RECORDS', done),
                       ('END', 'ParseDb')])
    printLog(log)

    out_handle.close()
    return out_handle.name
804
805
def updateDbFile(db_file, field, values, updates, out_args=default_out_args):
    """
    Updates field and value pairs to a database file

    Arguments:
      db_file = the database file name
      field = the field to update
      values = a list of values specifying which rows to update
      updates = a list of values to update each value with
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Print console log
    log = OrderedDict([('START', 'ParseDb'),
                       ('COMMAND', 'update'),
                       ('FILE', os.path.basename(db_file)),
                       ('FIELD', field),
                       ('VALUES', ','.join(values)),
                       ('UPDATES', ','.join(updates))])
    printLog(log)

    # Open reader, output handle and writer
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='parse-update', out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'], out_type='tab')
    writer = getDbWriter(out_handle, db_file)
    # Total record count for progress reporting
    total = countDbFile(db_file)

    # Substitute values and write every record
    start_time = time()
    done = updated = 0
    for row in reader:
        # Progress reflects the previous iteration
        printProgress(done, total, 0.05, start_time)
        done += 1

        # Apply each old->new pair in turn; later pairs see the result of
        # earlier substitutions, matching the original chained behavior
        for old, new in zip(values, updates):
            if row[field] == old:
                row[field] = new
                updated += 1

        writer.writerow(row)

    # Final progress and summary log
    printProgress(done, total, 0.05, start_time)
    log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                       ('RECORDS', done),
                       ('UPDATED', updated),
                       ('END', 'ParseDb')])
    printLog(log)

    out_handle.close()
    return out_handle.name
867
868
869 def getArgParser():
870 """
871 Defines the ArgumentParser
872
873 Arguments:
874 None
875
876 Returns:
877 an ArgumentParser object
878 """
879 # Define input and output field help message
880 fields = dedent(
881 '''
882 output files:
883 sequences
884 FASTA formatted sequences output from the subcommands fasta and clip.
885 <field>-<value>
886 database files partitioned by annotation <field> and <value>.
887 parse-<command>
888 output of the database modification functions where <command> is one of
889 the subcommands add, index, drop, delete, rename, select, sort or update.
890
891 required fields:
892 SEQUENCE_ID
893
894 optional fields:
895 JUNCTION, SEQUENCE_IMGT, SEQUENCE_VDJ, GERMLINE_IMGT, GERMLINE_VDJ,
896 GERMLINE_IMGT_D_MASK, GERMLINE_VDJ_D_MASK,
897 GERMLINE_IMGT_V_REGION, GERMLINE_VDJ_V_REGION
898
899 output fields:
900 None
901 ''')
902
903 # Define ArgumentParser
904 parser = ArgumentParser(description=__doc__, epilog=fields,
905 formatter_class=CommonHelpFormatter)
906 parser.add_argument('--version', action='version',
907 version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
908 subparsers = parser.add_subparsers(title='subcommands', dest='command', metavar='',
909 help='Database operation')
910 # TODO: This is a temporary fix for Python issue 9253
911 subparsers.required = True
912
913 # Define parent parser
914 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, db_in=True,
915 failed=False, log=False)
916
917 # Subparser to convert database entries to sequence file
918 parser_seq = subparsers.add_parser('fasta', parents=[parser_parent],
919 formatter_class=CommonHelpFormatter,
920 help='Creates a fasta file from database records.',
921 description='Creates a fasta file from database records.')
922 parser_seq.add_argument('--if', action='store', dest='id_field',
923 default=default_id_field,
924 help='The name of the field containing identifiers')
925 parser_seq.add_argument('--sf', action='store', dest='seq_field',
926 default=default_seq_field,
927 help='The name of the field containing sequences')
928 parser_seq.add_argument('--mf', nargs='+', action='store', dest='meta_fields',
929 help='List of annotation fields to add to the sequence description')
930 parser_seq.set_defaults(func=convertDbFasta)
931
932 # Subparser to convert database entries to clip-fasta file
933 parser_baseln = subparsers.add_parser('baseline', parents=[parser_parent],
934 formatter_class=CommonHelpFormatter,
935 description='Creates a BASELINe fasta file from database records.',
936 help='''Creates a specially formatted fasta file
937 from database records for input into the BASELINe
938 website. The format groups clonally related sequences
939 sequentially, with the germline sequence preceding
940 each clone and denoted by headers starting with ">>".''')
941 parser_baseln.add_argument('--if', action='store', dest='id_field',
942 default=default_id_field,
943 help='The name of the field containing identifiers')
944 parser_baseln.add_argument('--sf', action='store', dest='seq_field',
945 default=default_seq_field,
946 help='The name of the field containing reads')
947 parser_baseln.add_argument('--gf', action='store', dest='germ_field',
948 default=default_germ_field,
949 help='The name of the field containing germline sequences')
950 parser_baseln.add_argument('--cf', action='store', dest='cluster_field', default=None,
951 help='The name of the field containing containing sorted clone IDs')
952 parser_baseln.add_argument('--mf', nargs='+', action='store', dest='meta_fields',
953 help='List of annotation fields to add to the sequence description')
954 parser_baseln.set_defaults(func=convertDbBaseline)
955
956 # Subparser to partition files by annotation values
957 parser_split = subparsers.add_parser('split', parents=[parser_parent],
958 formatter_class=CommonHelpFormatter,
959 help='Splits database files by field values.',
960 description='Splits database files by field values')
961 parser_split.add_argument('-f', action='store', dest='field', type=str, required=True,
962 help='Annotation field by which to split database files.')
963 parser_split.add_argument('--num', action='store', dest='num_split', type=float, default=None,
964 help='''Specify to define the field as numeric and group
965 records by whether they are less than or at least
966 (greater than or equal to) the specified value.''')
967 parser_split.set_defaults(func=splitDbFile)
968
969 # Subparser to add records
970 parser_add = subparsers.add_parser('add', parents=[parser_parent],
971 formatter_class=CommonHelpFormatter,
972 help='Adds field and value pairs.',
973 description='Adds field and value pairs.')
974 parser_add.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
975 help='The name of the fields to add.')
976 parser_add.add_argument('-u', nargs='+', action='store', dest='values', required=True,
977 help='The value to assign to all rows for each field.')
978 parser_add.set_defaults(func=addDbFile)
979
980 # Subparser to delete records
981 parser_delete = subparsers.add_parser('delete', parents=[parser_parent],
982 formatter_class=CommonHelpFormatter,
983 help='Deletes specific records.',
984 description='Deletes specific records.')
985 parser_delete.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
986 help='The name of the fields to check for deletion criteria.')
987 parser_delete.add_argument('-u', nargs='+', action='store', dest='values', default=['', 'NA'],
988 help='''The values defining which records to delete. A value
989 may appear in any of the fields specified with -f.''')
990 parser_delete.add_argument('--logic', action='store', dest='logic',
991 choices=('any', 'all'), default='any',
992 help='''Defines whether a value may appear in any field (any)
993 or whether it must appear in all fields (all).''')
994 parser_delete.add_argument('--regex', action='store_true', dest='regex',
995 help='''If specified, treat values as regular expressions
996 and allow partial string matches.''')
997 parser_delete.set_defaults(func=deleteDbFile)
998
999 # Subparser to drop fields
1000 parser_drop = subparsers.add_parser('drop', parents=[parser_parent],
1001 formatter_class=CommonHelpFormatter,
1002 help='Deletes entire fields.',
1003 description='Deletes specific records.')
1004 parser_drop.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
1005 help='The name of the fields to delete from the database.')
1006 parser_drop.set_defaults(func=dropDbFile)
1007
1008 # Subparser to index fields
1009 parser_index = subparsers.add_parser('index', parents=[parser_parent],
1010 formatter_class=CommonHelpFormatter,
1011 help='Adds a numeric index field.',
1012 description='Adds a numeric index field.')
1013 parser_index.add_argument('-f', action='store', dest='field',
1014 default=default_index_field,
1015 help='The name of the index field to add to the database.')
1016 parser_index.set_defaults(func=indexDbFile)
1017
1018 # Subparser to rename fields
1019 parser_rename = subparsers.add_parser('rename', parents=[parser_parent],
1020 formatter_class=CommonHelpFormatter,
1021 help='Renames fields.',
1022 description='Renames fields.')
1023 parser_rename.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
1024 help='List of fields to rename.')
1025 parser_rename.add_argument('-k', nargs='+', action='store', dest='names', required=True,
1026 help='List of new names for each field.')
1027 parser_rename.set_defaults(func=renameDbFile)
1028
1029 # Subparser to select records
1030 parser_select = subparsers.add_parser('select', parents=[parser_parent],
1031 formatter_class=CommonHelpFormatter,
1032 help='Selects specific records.',
1033 description='Selects specific records.')
1034 parser_select.add_argument('-f', nargs='+', action='store', dest='fields', required=True,
1035 help='The name of the fields to check for selection criteria.')
1036 parser_select.add_argument('-u', nargs='+', action='store', dest='values', required=True,
1037 help='''The values defining with records to select. A value
1038 may appear in any of the fields specified with -f.''')
1039 parser_select.add_argument('--logic', action='store', dest='logic',
1040 choices=('any', 'all'), default='any',
1041 help='''Defines whether a value may appear in any field (any)
1042 or whether it must appear in all fields (all).''')
1043 parser_select.add_argument('--regex', action='store_true', dest='regex',
1044 help='''If specified, treat values as regular expressions
1045 and allow partial string matches.''')
1046 parser_select.set_defaults(func=selectDbFile)
1047
1048 # Subparser to sort file by records
1049 parser_sort = subparsers.add_parser('sort', parents=[parser_parent],
1050 formatter_class=CommonHelpFormatter,
1051 help='Sorts records by field values.',
1052 description='Sorts records by field values.')
1053 parser_sort.add_argument('-f', action='store', dest='field', type=str, required=True,
1054 help='The annotation field by which to sort records.')
1055 parser_sort.add_argument('--num', action='store_true', dest='numeric', default=False,
1056 help='''Specify to define the sort column as numeric rather
1057 than textual.''')
1058 parser_sort.add_argument('--descend', action='store_true', dest='descend',
1059 help='''If specified, sort records in descending, rather
1060 than ascending, order by values in the target field.''')
1061 parser_sort.set_defaults(func=sortDbFile)
1062
1063 # Subparser to update records
1064 parser_update = subparsers.add_parser('update', parents=[parser_parent],
1065 formatter_class=CommonHelpFormatter,
1066 help='Updates field and value pairs.',
1067 description='Updates field and value pairs.')
1068 parser_update.add_argument('-f', action='store', dest='field', required=True,
1069 help='The name of the field to update.')
1070 parser_update.add_argument('-u', nargs='+', action='store', dest='values', required=True,
1071 help='The values that will be replaced.')
1072 parser_update.add_argument('-t', nargs='+', action='store', dest='updates', required=True,
1073 help='''The new value to assign to each selected row.''')
1074 parser_update.set_defaults(func=updateDbFile)
1075
1076 return parser
1077
1078
if __name__ == '__main__':
    """
    Command line entry point: parses arguments and dispatches to the
    subcommand function for each input database file.
    """
    # Build and validate the argument parser, then parse the command line
    parser = getArgParser()
    checkArgs(parser)
    args = parser.parse_args()
    args_dict = parseCommonArgs(args)

    # Normalize field-name arguments to upper case.
    # Single-valued field arguments that are always set when present:
    for key in ('id_field', 'seq_field', 'germ_field', 'field'):
        if key in args_dict:
            args_dict[key] = args_dict[key].upper()
    # Optional arguments that may be None:
    if args_dict.get('cluster_field') is not None:
        args_dict['cluster_field'] = args_dict['cluster_field'].upper()
    if args_dict.get('meta_fields') is not None:
        args_dict['meta_fields'] = [x.upper() for x in args_dict['meta_fields']]
    # List-valued field argument:
    if 'fields' in args_dict:
        args_dict['fields'] = [x.upper() for x in args_dict['fields']]

    # Subcommands whose paired list arguments must be the same length:
    # command -> (first key, second key, error message)
    pair_checks = {
        'add': ('fields', 'values',
                'You must specify exactly one value (-u) per field (-f)'),
        'rename': ('fields', 'names',
                   'You must specify exactly one new name (-k) per field (-f)'),
        'update': ('values', 'updates',
                   'You must specify exactly one value (-u) per replacement (-t)'),
    }
    check = pair_checks.get(args.command)
    if check is not None:
        first, second, message = check
        if len(args_dict[first]) != len(args_dict[second]):
            parser.error(message)

    # Strip dispatch-only entries, then run the subcommand once per input file
    for key in ('command', 'func', 'db_files'):
        del args_dict[key]
    for db_file in args.db_files:
        args_dict['db_file'] = db_file
        args.func(**args_dict)