# HG changeset patch # User davidvanzessen # Date 1598990624 14400 # Node ID 98e3fecedd2be53cc941e824f7e7a35c4cc4d01b # Parent aff3ba86ef7a1cf74dd4da6ca18e140d401b48fa Uploaded diff -r aff3ba86ef7a -r 98e3fecedd2b change_o/DefineClones.py --- a/change_o/DefineClones.py Mon Aug 31 11:20:08 2020 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,739 +0,0 @@ -#!/usr/bin/env python3 -""" -Assign Ig sequences into clones -""" - -# Info -__author__ = 'Namita Gupta, Jason Anthony Vander Heiden, Gur Yaari, Mohamed Uduman' -from changeo import __version__, __date__ - -# Imports -import os -import re -import sys -from argparse import ArgumentParser -from collections import OrderedDict -from itertools import chain -from textwrap import dedent -from time import time -from Bio.Seq import translate - -# Presto and changeo imports -from presto.Defaults import default_out_args -from presto.IO import printLog, printProgress, printCount, printWarning, printError -from presto.Multiprocessing import manageProcesses -from changeo.Defaults import default_format, default_v_field, default_j_field, default_junction_field -from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs -from changeo.Distance import distance_models, calcDistances, formClusters -from changeo.IO import countDbFile, getDbFields, getFormatOperators, getOutputHandle, \ - AIRRWriter, ChangeoWriter -from changeo.Multiprocessing import DbResult, feedDbQueue, processDbQueue - -# Defaults -default_translate = False -default_distance = 0.0 -default_index_mode = 'gene' -default_index_action = 'set' -default_distance_model = 'ham' -default_norm = 'len' -default_sym = 'avg' -default_linkage = 'single' -default_max_missing=0 -choices_distance_model = ('ham', 'aa', 'hh_s1f', 'hh_s5f', - 'mk_rs1nf', 'mk_rs5nf', - 'hs1f_compat', 'm1n_compat') - - -def filterMissing(data, seq_field=default_junction_field, v_field=default_v_field, - j_field=default_j_field, max_missing=default_max_missing): - """ - Splits a set of sequence into passed and failed groups based on the number - of missing characters in the sequence - - Arguments: - data : changeo.Multiprocessing.DbData object. - seq_field : sequence field to filter on. - v_field : field containing the V call. - j_field : field containing the J call. - max_missing : maximum number of missing characters (non-ACGT) to permit before failing the record. - - Returns: - changeo.Multiprocessing.DbResult : objected containing filtered records. - """ - # Function to validate the sequence string - def _pass(seq): - if len(seq) > 0 and len(re.findall(r'[^ACGT]', seq)) <= max_missing: - return True - else: - return False - - # Define result object for iteration and get data records - result = DbResult(data.id, data.data) - - if not data: - result.data_pass = [] - result.data_fail = data.data - return result - - result.data_pass = [] - result.data_fail = [] - for rec in data.data: - seq = rec.getField(seq_field) - if _pass(seq): result.data_pass.append(rec) - else: result.data_fail.append(rec) - - # Add V(D)J to log - result.log['ID'] = ','.join([str(x) for x in data.id]) - result.log['VCALL'] = ','.join(set([(r.getVAllele(field=v_field) or '') for r in data.data])) - result.log['JCALL'] = ','.join(set([(r.getJAllele(field=j_field) or '') for r in data.data])) - result.log['JUNCLEN'] = ','.join(set([(str(len(r.junction)) or '0') for r in data.data])) - result.log['CLONED'] = len(result.data_pass) - result.log['FILTERED'] = len(result.data_fail) - - return result - - -def indexByIdentity(index, key, rec, group_fields=None): - """ - Updates a preclone index with a simple key - - Arguments: - index : preclone index from groupByGene - key : index key - rec : Receptor to add to the index - group_fields : additional annotation fields to use to group preclones; - if None use only V, J and junction length - - Returns: - None : Updates index with new key and records. - """ - index.setdefault(tuple(key), []).append(rec) - - -def indexByUnion(index, key, rec, group_fields=None): - """ - Updates a preclone index with the union of nested keys - - Arguments: - index : preclone index from groupByGene - key : index key - rec : Receptor to add to the index - group_fields : additional annotation fields to use to group preclones; - if None use only V, J and junction length - - Returns: - None : Updates index with new key and records. - """ - # List of values for this/new key - val = [rec] - f_range = list(range(2, 3 + (len(group_fields) if group_fields else 0))) - - # See if field/junction length combination exists in index - outer_dict = index - for field in f_range: - try: - outer_dict = outer_dict[key[field]] - except KeyError: - outer_dict = None - break - # If field combination exists, look through Js - j_matches = [] - if outer_dict is not None: - for j in outer_dict.keys(): - if not set(key[1]).isdisjoint(set(j)): - key[1] = tuple(set(key[1]).union(set(j))) - j_matches += [j] - # If J overlap exists, look through Vs for each J - for j in j_matches: - v_matches = [] - # Collect V matches for this J - for v in outer_dict[j].keys(): - if not set(key[0]).isdisjoint(set(v)): - key[0] = tuple(set(key[0]).union(set(v))) - v_matches += [v] - # If there are V overlaps for this J, pop them out - if v_matches: - val += list(chain(*(outer_dict[j].pop(v) for v in v_matches))) - # If the J dict is now empty, remove it - if not outer_dict[j]: - outer_dict.pop(j, None) - - # Add value(s) into index nested dictionary - # OMG Python pointers are the best! - # Add field dictionaries into index - outer_dict = index - for field in f_range: - outer_dict.setdefault(key[field], {}) - outer_dict = outer_dict[key[field]] - # Add J, then V into index - if key[1] in outer_dict: - outer_dict[key[1]].update({key[0]: val}) - else: - outer_dict[key[1]] = {key[0]: val} - - -def groupByGene(db_iter, group_fields=None, v_field=default_v_field, j_field=default_j_field, - mode=default_index_mode, action=default_index_action): - """ - Identifies preclonal groups by V, J and junction length - - Arguments: - db_iter : an iterator of Receptor objects defined by ChangeoReader - group_fields : additional annotation fields to use to group preclones; - if None use only V, J and junction length - mode : specificity of alignment call to use for assigning preclones; - one of ('allele', 'gene') - action : how to handle multiple value fields when assigning preclones; - one of ('first', 'set') - - Returns: - dict: dictionary of {(V, J, junction length):[Receptor]} - """ - # print(fields) - # Define functions for grouping keys - if mode == 'allele' and group_fields is None: - def _get_key(rec, act): - return [rec.getVAllele(act, field=v_field), rec.getJAllele(act, field=j_field), - None if rec.junction is None else len(rec.junction)] - elif mode == 'gene' and group_fields is None: - def _get_key(rec, act): - return [rec.getVGene(act, field=v_field), rec.getJGene(act, field=j_field), - None if rec.junction is None else len(rec.junction)] - elif mode == 'allele' and group_fields is not None: - def _get_key(rec, act): - vdj = [rec.getVAllele(act, field=v_field), rec.getJAllele(act, field=j_field), - None if rec.junction is None else len(rec.junction)] - ann = [rec.getField(k) for k in group_fields] - return list(chain(vdj, ann)) - elif mode == 'gene' and group_fields is not None: - def _get_key(rec, act): - vdj = [rec.getVGene(act, field=v_field), rec.getJGene(act, field=j_field), - None if rec.junction is None else len(rec.junction)] - ann = [rec.getField(k) for k in group_fields] - return list(chain(vdj, ann)) - - # Function to flatten nested dictionary - def _flatten_dict(d, parent_key=''): - items = [] - for k, v in d.items(): - new_key = parent_key + [k] if parent_key else [k] - if isinstance(v, dict): - items.extend(_flatten_dict(v, new_key).items()) - else: - items.append((new_key, v)) - flat_dict = {None if None in i[0] else tuple(i[0]): i[1] for i in items} - return flat_dict - - if action == 'first': - index_func = indexByIdentity - elif action == 'set': - index_func = indexByUnion - else: - sys.stderr.write('Unrecognized action: %s.\n' % action) - - start_time = time() - clone_index = {} - rec_count = 0 - for rec in db_iter: - key = _get_key(rec, action) - - # Print progress - printCount(rec_count, step=1000, start_time=start_time, task='Grouping sequences') - rec_count += 1 - - # Assigned passed preclone records to key and failed to index None - if all([k is not None and k != '' for k in key]): - # Update index dictionary - index_func(clone_index, key, rec, group_fields) - else: - clone_index.setdefault(None, []).append(rec) - - printCount(rec_count, step=1000, start_time=start_time, task='Grouping sequences', end=True) - - if action == 'set': - clone_index = _flatten_dict(clone_index) - - return clone_index - - -def distanceClones(result, seq_field=default_junction_field, model=default_distance_model, - distance=default_distance, dist_mat=None, norm=default_norm, sym=default_sym, - linkage=default_linkage): - """ - Separates a set of Receptor objects into clones - - Arguments: - result : a changeo.Multiprocessing.DbResult object with filtered records to clone - seq_field : sequence field used to calculate distance between records - model : substitution model used to calculate distance - distance : the distance threshold to assign clonal groups - dist_mat : pandas DataFrame of pairwise nucleotide or amino acid distances - norm : normalization method - sym : symmetry method - linkage : type of linkage - - Returns: - changeo.Multiprocessing.DbResult : an updated DbResult object - """ - # Get distance matrix if not provided - if dist_mat is None: - try: - dist_mat = distance_models[model] - except KeyError: - printError('Unrecognized distance model: %s' % args_dict['model']) - - # TODO: can be cleaned up with abstract model class - # Determine length of n-mers - if model in ['hs1f_compat', 'm1n_compat', 'aa', 'ham', 'hh_s1f', 'mk_rs1nf']: - nmer_len = 1 - elif model in ['hh_s5f', 'mk_rs5nf']: - nmer_len = 5 - else: - printError('Unrecognized distance model: %s.\n' % model) - - # Define unique junction mapping - seq_map = {} - for rec in result.data_pass: - seq = rec.getField(seq_field) - seq = re.sub('[\.-]', 'N', seq) - if model == 'aa': seq = translate(seq) - seq_map.setdefault(seq, []).append(rec) - - # Define sequences - sequences = list(seq_map.keys()) - - # Zero record case - if not sequences: - result.valid = False - result.log['CLONES'] = 0 - return result - - # Single record case - if len(sequences) == 1: - result.results = {1: result.data_pass} - result.valid = True - result.log['CLONES'] = 1 - return result - - # Calculate pairwise distance matrix - dists = calcDistances(sequences, nmer_len, dist_mat, sym=sym, norm=norm) - - # Perform hierarchical clustering - clusters = formClusters(dists, linkage, distance) - - # Turn clusters into clone dictionary - clone_dict = {} - for i, c in enumerate(clusters): - clone_dict.setdefault(c, []).extend(seq_map[sequences[i]]) - - if clone_dict: - result.results = clone_dict - result.valid = True - result.log['CLONES'] = len(clone_dict) - else: - result.log['CLONES'] = 0 - - return result - - -def collectQueue(alive, result_queue, collect_queue, db_file, fields, - writer=ChangeoWriter, out_file=None, out_args=default_out_args): - """ - Assembles results from a queue of individual sequence results and manages log/file I/O - - Arguments: - alive = a multiprocessing.Value boolean controlling whether processing continues - if False exit process - result_queue : a multiprocessing.Queue holding processQueue results - collect_queue : a multiprocessing.Queue to store collector return values - db_file : the input database file name - fields : list of output field names - writer : writer class. - out_file : output file name. Automatically generated from the input file if None. - out_args : common output argument dictionary from parseCommonArgs - - Returns: - None : Adds a dictionary with key value pairs to collect_queue containing - 'log' defining a log object along with the 'pass' and 'fail' output file names. - """ - # Wrapper for opening handles and writers - def _open(x, f, writer=writer, out_file=out_file): - if out_file is not None and x == 'pass': - handle = open(out_file, 'w') - else: - handle = getOutputHandle(db_file, - out_label='clone-%s' % x, - out_dir=out_args['out_dir'], - out_name=out_args['out_name'], - out_type=out_args['out_type']) - return handle, writer(handle, fields=f) - - # Open log file - try: - # Count input records - result_count = countDbFile(db_file) - - # Define log handle - if out_args['log_file'] is None: - log_handle = None - else: - log_handle = open(out_args['log_file'], 'w') - except: - #sys.stderr.write('Exception in collector file opening step\n') - alive.value = False - raise - - # Get results from queue and write to files - try: - # Initialize handles, writers and counters - pass_handle, pass_writer = None, None - fail_handle, fail_writer = None, None - rec_count, clone_count, pass_count, fail_count = 0, 0, 0, 0 - start_time = time() - - # Iterator over results queue until sentinel object reached - while alive.value: - # Get result from queue - if result_queue.empty(): continue - else: result = result_queue.get() - # Exit upon reaching sentinel - if result is None: break - - # Print progress for previous iteration and update record count - printProgress(rec_count, result_count, 0.05, start_time=start_time, task='Assigning clones') - rec_count += len(result.data) - - # Write passed and failed records - if result: - # Writing passing sequences - for clone in result.results.values(): - clone_count += 1 - for i, rec in enumerate(clone, start=1): - pass_count += 1 - rec.setField('clone', str(clone_count)) - result.log['CLONE%i-%i' % (clone_count, i)] = rec.junction - try: - pass_writer.writeReceptor(rec) - except AttributeError: - # Open pass file and define writer object - pass_handle, pass_writer = _open('pass', fields) - pass_writer.writeReceptor(rec) - - # Write failed sequences from passing sets - if result.data_fail: - # Write failed sequences - for i, rec in enumerate(result.data_fail, start=1): - fail_count += 1 - result.log['FAIL%i-%i' % (clone_count, i)] = rec.junction - if out_args['failed']: - try: - fail_writer.writeReceptor(rec) - except AttributeError: - # Open fail file and define writer object - fail_handle, fail_writer = _open('fail', fields) - fail_writer.writeReceptor(rec) - else: - # Write failing records - for i, rec in enumerate(result.data, start=1): - fail_count += 1 - result.log['CLONE0-%i' % (i)] = rec.junction - if out_args['failed']: - try: - fail_writer.writeReceptor(rec) - except AttributeError: - # Open fail file and define writer object - fail_handle, fail_writer = _open('fail', fields) - fail_writer.writeReceptor(rec) - - # Write log - printLog(result.log, handle=log_handle) - else: - sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \ - % os.getpid()) - return None - - # Print total counts - printProgress(rec_count, result_count, 0.05, start_time=start_time, task='Assigning clones') - - # Update return list - log = OrderedDict() - log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None - log['CLONES'] = clone_count - log['RECORDS'] = rec_count - log['PASS'] = pass_count - log['FAIL'] = fail_count - - # Close file handles and generate return data - collect_dict = {'log': log, 'pass': None, 'fail': None} - if pass_handle is not None: - collect_dict['pass'] = pass_handle.name - pass_handle.close() - if fail_handle is not None: - collect_dict['fail'] = fail_handle.name - fail_handle.close() - if log_handle is not None: - log_handle.close() - collect_queue.put(collect_dict) - except: - alive.value = False - raise - - return None - - -def defineClones(db_file, seq_field=default_junction_field, v_field=default_v_field, - j_field=default_j_field, max_missing=default_max_missing, - group_fields=None, group_func=groupByGene, group_args={}, - clone_func=distanceClones, clone_args={}, - format=default_format, out_file=None, out_args=default_out_args, - nproc=None, queue_size=None): - """ - Define clonally related sequences - - Arguments: - db_file : filename of input database. - seq_field : sequence field used to determine clones. - v_field : field containing the V call. - j_field : field containing the J call. - max_missing : maximum number of non-ACGT characters to allow in the junction sequence. - group_fields : additional annotation fields to use to group preclones; - if None use only V and J. - group_func : the function to use for assigning preclones. - group_args : a dictionary of arguments to pass to group_func. - clone_func : the function to use for determining clones within preclonal groups. - clone_args : a dictionary of arguments to pass to clone_func. - format : input and output format. - out_file : output file name. Automatically generated from the input file if None. - out_args : common output argument dictionary from parseCommonArgs. - nproc : the number of processQueue processes; - if None defaults to the number of CPUs. - queue_size : maximum size of the argument queue; - if None defaults to 2*nproc. - - Returns: - dict: dictionary of output pass and fail files. - """ - # Print parameter info - log = OrderedDict() - log['START'] = 'DefineClones' - log['FILE'] = os.path.basename(db_file) - log['SEQ_FIELD'] = seq_field - log['V_FIELD'] = v_field - log['J_FIELD'] = j_field - log['MAX_MISSING'] = max_missing - log['GROUP_FIELDS'] = ','.join(group_fields) if group_fields is not None else None - for k in sorted(group_args): - log[k.upper()] = group_args[k] - for k in sorted(clone_args): - if k != 'dist_mat': log[k.upper()] = clone_args[k] - log['NPROC'] = nproc - printLog(log) - - # Define format operators - try: - reader, writer, schema = getFormatOperators(format) - except ValueError: - printError('Invalid format %s.' % format) - - # Translate to Receptor attribute names - seq_field = schema.toReceptor(seq_field) - v_field = schema.toReceptor(v_field) - j_field = schema.toReceptor(j_field) - if group_fields is not None: - group_fields = [schema.toReceptor(f) for f in group_fields] - - # Define feeder function and arguments - group_args['group_fields'] = group_fields - group_args['v_field'] = v_field - group_args['j_field'] = j_field - feed_args = {'db_file': db_file, - 'reader': reader, - 'group_func': group_func, - 'group_args': group_args} - - # Define worker function and arguments - filter_args = {'seq_field': seq_field, - 'v_field': v_field, - 'j_field': j_field, - 'max_missing': max_missing} - clone_args['seq_field'] = seq_field - work_args = {'process_func': clone_func, - 'process_args': clone_args, - 'filter_func': filterMissing, - 'filter_args': filter_args} - - # Define collector function and arguments - out_fields = getDbFields(db_file, add=schema.fromReceptor('clone'), reader=reader) - out_args['out_type'] = schema.out_type - collect_args = {'db_file': db_file, - 'fields': out_fields, - 'writer': writer, - 'out_file': out_file, - 'out_args': out_args} - - # Call process manager - result = manageProcesses(feed_func=feedDbQueue, work_func=processDbQueue, collect_func=collectQueue, - feed_args=feed_args, work_args=work_args, collect_args=collect_args, - nproc=nproc, queue_size=queue_size) - - # Print log - result['log']['END'] = 'DefineClones' - printLog(result['log']) - output = {k: v for k, v in result.items() if k in ('pass', 'fail')} - - return output - - -def getArgParser(): - """ - Defines the ArgumentParser - - Arguments: - None - - Returns: - an ArgumentParser object - """ - # Define input and output fields - fields = dedent( - ''' - output files: - clone-pass - database with assigned clonal group numbers. - clone-fail - database with records failing clonal grouping. - - required fields: - SEQUENCE_ID, V_CALL, J_CALL, JUNCTION - - output fields: - CLONE - ''') - # Define argument parser - parser = ArgumentParser(description=__doc__, epilog=fields, - parents=[getCommonArgParser(format=False, multiproc=True)], - formatter_class=CommonHelpFormatter, add_help=False) - - # Distance cloning method - group = parser.add_argument_group('cloning arguments') - group.add_argument('--sf', action='store', dest='seq_field', default=default_junction_field, - help='Field to be used to calculate distance between records.') - group.add_argument('--vf', action='store', dest='v_field', default=default_v_field, - help='Field containing the germline V segment call.') - group.add_argument('--jf', action='store', dest='j_field', default=default_j_field, - help='Field containing the germline J segment call.') - group.add_argument('--gf', nargs='+', action='store', dest='group_fields', default=None, - help='Additional fields to use for grouping clones aside from V, J and junction length.') - group.add_argument('--mode', action='store', dest='mode', - choices=('allele', 'gene'), default=default_index_mode, - help='''Specifies whether to use the V(D)J allele or gene for - initial grouping.''') - group.add_argument('--act', action='store', dest='action', - choices=('first', 'set'), default=default_index_action, - help='''Specifies how to handle multiple V(D)J assignments for initial grouping. - The "first" action will use only the first gene listed. - The "set" action will use all gene assignments and construct a larger gene - grouping composed of any sequences sharing an assignment or linked to another - sequence by a common assignment (similar to single-linkage).''') - group.add_argument('--model', action='store', dest='model', - choices=choices_distance_model, - default=default_distance_model, - help='''Specifies which substitution model to use for calculating distance - between sequences. The "ham" model is nucleotide Hamming distance and - "aa" is amino acid Hamming distance. The "hh_s1f" and "hh_s5f" models are - human specific single nucleotide and 5-mer content models, respectively, - from Yaari et al, 2013. The "mk_rs1nf" and "mk_rs5nf" models are - mouse specific single nucleotide and 5-mer content models, respectively, - from Cui et al, 2016. The "m1n_compat" and "hs1f_compat" models are - deprecated models provided backwards compatibility with the "m1n" and - "hs1f" models in Change-O v0.3.3 and SHazaM v0.1.4. Both - 5-mer models should be considered experimental.''') - group.add_argument('--dist', action='store', dest='distance', type=float, - default=default_distance, - help='The distance threshold for clonal grouping') - group.add_argument('--norm', action='store', dest='norm', - choices=('len', 'mut', 'none'), default=default_norm, - help='''Specifies how to normalize distances. One of none - (do not normalize), len (normalize by length), - or mut (normalize by number of mutations between sequences).''') - group.add_argument('--sym', action='store', dest='sym', - choices=('avg', 'min'), default=default_sym, - help='''Specifies how to combine asymmetric distances. One of avg - (average of A->B and B->A) or min (minimum of A->B and B->A).''') - group.add_argument('--link', action='store', dest='linkage', - choices=('single', 'average', 'complete'), default=default_linkage, - help='''Type of linkage to use for hierarchical clustering.''') - group.add_argument('--maxmiss', action='store', dest='max_missing', type=int, - default=default_max_missing, - help='''The maximum number of non-ACGT characters (gaps or Ns) to - permit in the junction sequence before excluding the record - from clonal assignment. Note, under single linkage - non-informative positions can create artifactual links - between unrelated sequences. Use with caution.''') - parser.set_defaults(group_func=groupByGene) - parser.set_defaults(clone_func=distanceClones) - - return parser - - -if __name__ == '__main__': - """ - Parses command line arguments and calls main function - """ - # Parse arguments - parser = getArgParser() - checkArgs(parser) - args = parser.parse_args() - args_dict = parseCommonArgs(args) - - # # Set default fields if not specified. - # default_fields = {'seq_field': default_junction_field, - # 'v_field': default_v_field, - # 'j_field': default_j_field} - # - # # Default Change-O fields - # if args_dict['format'] == 'changeo': - # for f in default_fields: - # if args_dict[f] is None: args_dict[f] = default_fields[f] - # else: args_dict[f] = args_dict[f].upper() - # - # # Default AIRR fields - # if args_dict['format'] == 'airr': - # for f in default_fields: - # if args_dict[f] is None: args_dict[f] = ChangeoSchema.toAIRR(default_fields[f]) - # else: args_dict[f] = args_dict[f].lower() - - # Define grouping and cloning function arguments - args_dict['group_args'] = {'action': args_dict['action'], - 'mode':args_dict['mode']} - args_dict['clone_args'] = {'model': args_dict['model'], - 'distance': args_dict['distance'], - 'norm': args_dict['norm'], - 'sym': args_dict['sym'], - 'linkage': args_dict['linkage']} - - # Get distance matrix - try: - args_dict['clone_args']['dist_mat'] = distance_models[args_dict['model']] - except KeyError: - printError('Unrecognized distance model: %s' % args_dict['model']) - - # Clean argument dictionary - del args_dict['action'] - del args_dict['mode'] - del args_dict['model'] - del args_dict['distance'] - del args_dict['norm'] - del args_dict['sym'] - del args_dict['linkage'] - - # Clean arguments dictionary - del args_dict['db_files'] - if 'out_files' in args_dict: del args_dict['out_files'] - - # Call main function for each input file - for i, f in enumerate(args.__dict__['db_files']): - args_dict['db_file'] = f - args_dict['out_file'] = args.__dict__['out_files'][i] \ - if args.__dict__['out_files'] else None - defineClones(**args_dict) diff -r aff3ba86ef7a -r 98e3fecedd2b change_o/MakeDb.py --- a/change_o/MakeDb.py Mon Aug 31 11:20:08 2020 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,885 +0,0 @@ -#!/usr/bin/env python3 -""" -Create tab-delimited database file to store sequence alignment information -""" - -# Info -__author__ = 'Namita Gupta, Jason Anthony Vander Heiden' -from changeo import __version__, __date__ - -# Imports -import os -import re -import csv -from argparse import ArgumentParser -from collections import OrderedDict -from textwrap import dedent -from time import time -from Bio import SeqIO - -# Presto and changeo imports -from presto.Annotation import parseAnnotation -from presto.IO import countSeqFile, printLog, printMessage, printProgress, printError, printWarning, readSeqFile -from changeo.Defaults import default_format, default_out_args -from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs -from changeo.Alignment import RegionDefinition -from changeo.Gene import buildGermline -from changeo.IO import countDbFile, extractIMGT, readGermlines, getFormatOperators, getOutputHandle, \ - AIRRWriter, ChangeoWriter, IgBLASTReader, IgBLASTReaderAA, IMGTReader, IHMMuneReader -from changeo.Receptor import ChangeoSchema, AIRRSchema - -# 10X Receptor attributes -cellranger_base = ['cell', 'c_call', 'conscount', 'umicount'] -cellranger_extended = ['cell', 'c_call', 'conscount', 'umicount', - 'v_call_10x', 'd_call_10x', 'j_call_10x', - 'junction_10x', 'junction_10x_aa'] - -def readCellRanger(cellranger_file, fields=cellranger_base): - """ - Load a Cell Ranger annotation table - - Arguments: - cellranger_file (str): path to the annotation file. - fields (list): list of fields to keep. - - Returns: - dict: dict of dicts with contig_id as the primary key. - """ - # Mapping of 10X annotations to Receptor attributes - cellranger_map = {'cell': 'barcode', - 'c_call': 'c_gene', - 'locus': 'chain', - 'conscount': 'reads', - 'umicount': 'umis', - 'v_call_10x': 'v_gene', - 'd_call_10x': 'd_gene', - 'j_call_10x': 'j_gene', - 'junction_10x': 'cdr3_nt', - 'junction_10x_aa': 'cdr3'} - - # Function to parse individual fields - def _parse(x): - return '' if x == 'None' else x - - # Generate annotation dictionary - ann_dict = {} - with open(cellranger_file) as csv_file: - # Detect delimiters - dialect = csv.Sniffer().sniff(csv_file.readline()) - csv_file.seek(0) - # Read in annotation file - csv_reader = csv.DictReader(csv_file, dialect=dialect) - - # Generate annotation dictionary - for row in csv_reader: - ann_dict[row['contig_id']] = {f: _parse(row[cellranger_map[f]]) for f in fields} - - return ann_dict - - -def addGermline(receptor, references, amino_acid=False): - """ - Add full length germline to Receptor object - - Arguments: - receptor (changeo.Receptor.Receptor): Receptor object to modify. - references (dict): dictionary of IMGT-gapped references sequences. - amino_acid (bool): if True build amino acid germline, otherwise build nucleotide germline - - Returns: - changeo.Receptor.Receptor: modified Receptor with the germline sequence added. - """ - if amino_acid: - __, germlines, __ = buildGermline(receptor, references, seq_field='sequence_aa_imgt', - amino_acid=True) - germline_seq = None if germlines is None else germlines['full'] - receptor.setField('germline_aa_imgt', germline_seq) - else: - __, germlines, __ = buildGermline(receptor, references, amino_acid=False) - germline_seq = None if germlines is None else germlines['full'] - receptor.setField('germline_imgt', germline_seq) - - return receptor - - -def getIDforIMGT(seq_file): - """ - Create a sequence ID translation using IMGT truncation. - - Arguments: - seq_file : a fasta file of sequences input to IMGT. - - Returns: - dict : a dictionary of with the IMGT truncated ID as the key and the full sequence description as the value. - """ - - # Create a sequence ID translation using IDs truncate up to space or 50 chars - ids = {} - for rec in readSeqFile(seq_file): - if len(rec.description) <= 50: - id_key = rec.description - else: - id_key = re.sub('\||\s|!|&|\*|<|>|\?', '_', rec.description[:50]) - ids.update({id_key: rec.description}) - - return ids - - -def getSeqDict(seq_file): - """ - Create a dictionary from a sequence file. - - Arguments: - seq_file : sequence file. - - Returns: - dict : sequence description as keys with Bio.SeqRecords as values. - """ - seq_dict = SeqIO.to_dict(readSeqFile(seq_file), key_function=lambda x: x.description) - - return seq_dict - - -def writeDb(records, fields, aligner_file, total_count, id_dict=None, annotations=None, - amino_acid=False, partial=False, asis_id=True, regions='default', - writer=AIRRWriter, out_file=None, out_args=default_out_args): - """ - Writes parsed records to an output file - - Arguments: - records : a iterator of Receptor objects containing alignment data. - fields : a list of ordered field names to write. - aligner_file : input file name. - total_count : number of records (for progress bar). - id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID. - annotations : additional annotation dictionary. - amino_acid : if True do verification on amino acid fields. - partial : if True put incomplete alignments in the pass file. - asis_id : if ID is to be parsed for pRESTO output with default delimiters. - regions (str): name of the IMGT FWR/CDR region definitions to use. - writer : writer class. - out_file : output file name. Automatically generated from the input file if None. - out_args : common output argument dictionary from parseCommonArgs. - - Returns: - None - """ - # Wrapper for opening handles and writers - def _open(x, f, writer=writer, out_file=out_file): - if out_file is not None and x == 'pass': - handle = open(out_file, 'w') - else: - handle = getOutputHandle(aligner_file, - out_label='db-%s' % x, - out_dir=out_args['out_dir'], - out_name=out_args['out_name'], - out_type=out_args['out_type']) - return handle, writer(handle, fields=f) - - # Function to convert fasta header annotations to changeo columns - def _changeo(f, header): - h = [ChangeoSchema.fromReceptor(x) for x in header if x.upper() not in f] - f.extend(h) - return f - - def _airr(f, header): - h = [AIRRSchema.fromReceptor(x) for x in header if x.lower() not in f] - f.extend(h) - return f - - # Function to verify IMGT-gapped sequence and junction concur - def _imgt_check(rec): - try: - if amino_acid: - rd = RegionDefinition(rec.junction_aa_length, amino_acid=amino_acid, definition=regions) - x, y = rd.positions['junction'] - check = (rec.junction_aa == rec.sequence_aa_imgt[x:y]) - else: - rd = RegionDefinition(rec.junction_length, amino_acid=amino_acid, definition=regions) - x, y = rd.positions['junction'] - check = (rec.junction == rec.sequence_imgt[x:y]) - except (TypeError, AttributeError): - check = False - return check - - # Function to check for valid records strictly - def _strict(rec): - if amino_acid: - valid = [rec.v_call and rec.v_call != 'None', - rec.j_call and rec.j_call != 'None', - rec.functional is not None, - rec.sequence_aa_imgt, - rec.junction_aa, - _imgt_check(rec)] - else: - valid = [rec.v_call and rec.v_call != 'None', - rec.j_call and rec.j_call != 'None', - rec.functional is not None, - rec.sequence_imgt, - rec.junction, - _imgt_check(rec)] - return all(valid) - - # Function to check for valid records loosely - def _gentle(rec): - valid = [rec.v_call and rec.v_call != 'None', - rec.d_call and rec.d_call != 'None', - rec.j_call and rec.j_call != 'None'] - return any(valid) - - # Set writer class and annotation conversion function - if writer == ChangeoWriter: - _annotate = _changeo - elif writer == AIRRWriter: - _annotate = _airr - else: - printError('Invalid output writer.') - - # Additional annotation (e.g. 10X cell calls) - # _append_table = None - # if cellranger_file is not None: - # with open(cellranger_file) as csv_file: - # # Read in annotation file (use Sniffer to discover file delimiters) - # dialect = csv.Sniffer().sniff(csv_file.readline()) - # csv_file.seek(0) - # csv_reader = csv.DictReader(csv_file, dialect = dialect) - # - # # Generate annotation dictionary - # anntab_dict = {entry['contig_id']: {cellranger_map[field]: entry[field] \ - # for field in cellranger_map.keys()} for entry in csv_reader} - # - # fields = _annotate(fields, cellranger_map.values()) - # _append_table = lambda sequence_id: anntab_dict[sequence_id] - - # Set pass criteria - _pass = _gentle if partial else _strict - - # Define log handle - if out_args['log_file'] is None: - log_handle = None - else: - log_handle = open(out_args['log_file'], 'w') - - # Initialize handles, writers and counters - pass_handle, pass_writer = None, None - fail_handle, fail_writer = None, None - pass_count, fail_count = 0, 0 - start_time = time() - - # Validate and write output - printProgress(0, total_count, 0.05, start_time=start_time) - for i, record in enumerate(records, start=1): - # Replace sequence description with full string, if required - if id_dict is not None and record.sequence_id in id_dict: - record.sequence_id = id_dict[record.sequence_id] - - # Parse sequence description into new columns - if not asis_id: - try: - ann_raw = parseAnnotation(record.sequence_id) - record.sequence_id = ann_raw.pop('ID') - - # Convert to Receptor fields - ann_parsed = OrderedDict() - for k, v in ann_raw.items(): - ann_parsed[ChangeoSchema.toReceptor(k)] = v - - # Add annotations to Receptor and update field list - record.setDict(ann_parsed, parse=True) - if i == 1: fields = _annotate(fields, ann_parsed.keys()) - except IndexError: - # Could not parse pRESTO-style annotations so fall back to no parse - asis_id = True - printWarning('Sequence annotation format not recognized. Sequence headers will not be parsed.') - - # Add supplemental annotation fields - # if _append_table is not None: - # record.setDict(_append_table(record.sequence_id), parse=True) - if annotations is not None: - record.setDict(annotations[record.sequence_id], parse=True) - if i == 1: fields = _annotate(fields, annotations[record.sequence_id].keys()) - - # Count pass or fail and write to appropriate file - if _pass(record): - pass_count += 1 - # Write row to pass file - try: - pass_writer.writeReceptor(record) - except AttributeError: - # Open pass file and writer - pass_handle, pass_writer = _open('pass', fields) - pass_writer.writeReceptor(record) - else: - fail_count += 1 - # Write row to fail file if specified - if out_args['failed']: - try: - fail_writer.writeReceptor(record) - except AttributeError: - # Open fail file and writer - fail_handle, fail_writer = _open('fail', fields) - fail_writer.writeReceptor(record) - - # Write log - if log_handle is not None: - log = OrderedDict([('ID', record.sequence_id), - ('V_CALL', record.v_call), - ('D_CALL', record.d_call), - ('J_CALL', record.j_call), - ('PRODUCTIVE', record.functional)]) - if not _imgt_check(record) and not amino_acid: - log['ERROR'] = 'Junction does not match the sequence starting at position 310 in the IMGT numbered V(D)J sequence.' - printLog(log, log_handle) - - # Print progress - printProgress(i, total_count, 0.05, start_time=start_time) - - # Print console log - log = OrderedDict() - log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None - log['PASS'] = pass_count - log['FAIL'] = fail_count - log['END'] = 'MakeDb' - printLog(log) - - # Close file handles - output = {'pass': None, 'fail': None} - if pass_handle is not None: - output['pass'] = pass_handle.name - pass_handle.close() - if fail_handle is not None: - output['fail'] = fail_handle.name - fail_handle.close() - - return output - - -def parseIMGT(aligner_file, seq_file=None, repo=None, cellranger_file=None, partial=False, asis_id=True, - extended=False, format=default_format, out_file=None, out_args=default_out_args): - """ - Main for IMGT aligned sample sequences. - - Arguments: - aligner_file : zipped file or unzipped folder output by IMGT. - seq_file : FASTA file input to IMGT (from which to get seqID). - repo : folder with germline repertoire files. - partial : If True put incomplete alignments in the pass file. - asis_id : if ID is to be parsed for pRESTO output with default delimiters. - extended : if True add alignment score, FWR, CDR and junction fields to output file. - format : output format. one of 'changeo' or 'airr'. - out_file : output file name. Automatically generated from the input file if None. - out_args : common output argument dictionary from parseCommonArgs. - - Returns: - dict : names of the 'pass' and 'fail' output files. - """ - # Print parameter info - log = OrderedDict() - log['START'] = 'MakeDb' - log['COMMAND'] = 'imgt' - log['ALIGNER_FILE'] = aligner_file - log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' - log['ASIS_ID'] = asis_id - log['PARTIAL'] = partial - log['EXTENDED'] = extended - printLog(log) - - start_time = time() - printMessage('Loading files', start_time=start_time, width=20) - - # Extract IMGT files - temp_dir, imgt_files = extractIMGT(aligner_file) - - # Count records in IMGT files - total_count = countDbFile(imgt_files['summary']) - - # Get (parsed) IDs from fasta file submitted to IMGT - id_dict = getIDforIMGT(seq_file) if seq_file else {} - - # Load supplementary annotation table - if cellranger_file is not None: - f = cellranger_extended if extended else cellranger_base - annotations = readCellRanger(cellranger_file, fields=f) - else: - annotations = None - - printMessage('Done', start_time=start_time, end=True, width=20) - - # Define format operators - try: - __, writer, schema = getFormatOperators(format) - except ValueError: - printError('Invalid format %s.' % format) - out_args['out_type'] = schema.out_type - - # Define output fields - fields = list(schema.required) - if extended: - custom = IMGTReader.customFields(scores=True, regions=True, junction=True, schema=schema) - fields.extend(custom) - - # Parse IMGT output and write db - with open(imgt_files['summary'], 'r') as summary_handle, \ - open(imgt_files['gapped'], 'r') as gapped_handle, \ - open(imgt_files['ntseq'], 'r') as ntseq_handle, \ - open(imgt_files['junction'], 'r') as junction_handle: - - # Open parser - parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle) - - # Add germline sequence - if repo is None: - germ_iter = parse_iter - else: - references = readGermlines(repo) - # Check for IMGT-gaps in germlines - if all('...' not in x for x in references.values()): - printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.') - germ_iter = (addGermline(x, references) for x in parse_iter) - - # Write db - output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count, - annotations=annotations, id_dict=id_dict, asis_id=asis_id, partial=partial, - writer=writer, out_file=out_file, out_args=out_args) - - # Cleanup temp directory - temp_dir.cleanup() - - return output - - -def parseIgBLAST(aligner_file, seq_file, repo, amino_acid=False, cellranger_file=None, partial=False, - asis_id=True, asis_calls=False, extended=False, regions='default', - format='changeo', out_file=None, out_args=default_out_args): - """ - Main for IgBLAST aligned sample sequences. - - Arguments: - aligner_file (str): IgBLAST output file to process. - seq_file (str): fasta file input to IgBlast (from which to get sequence). - repo (str): folder with germline repertoire files. - amino_acid (bool): if True then the IgBLAST output files are results from igblastp. igblastn is assumed if False. - partial : If True put incomplete alignments in the pass file. - asis_id (bool): if ID is to be parsed for pRESTO output with default delimiters. - asis_calls (bool): if True do not parse gene calls for allele names. - extended (bool): if True add alignment scores, FWR regions, and CDR regions to the output. - regions (str): name of the IMGT FWR/CDR region definitions to use. - format (str): output format. one of 'changeo' or 'airr'. - out_file (str): output file name. Automatically generated from the input file if None. - out_args (dict): common output argument dictionary from parseCommonArgs. - - Returns: - dict : names of the 'pass' and 'fail' output files. - """ - # Print parameter info - log = OrderedDict() - log['START'] = 'MakeDB' - log['COMMAND'] = 'igblast-aa' if amino_acid else 'igblast' - log['ALIGNER_FILE'] = os.path.basename(aligner_file) - log['SEQ_FILE'] = os.path.basename(seq_file) - log['ASIS_ID'] = asis_id - log['ASIS_CALLS'] = asis_calls - log['PARTIAL'] = partial - log['EXTENDED'] = extended - printLog(log) - - # Set amino acid conditions - if amino_acid: - format = '%s-aa' % format - parser = IgBLASTReaderAA - else: - parser = IgBLASTReader - - # Start - start_time = time() - printMessage('Loading files', start_time=start_time, width=20) - - # Count records in sequence file - total_count = countSeqFile(seq_file) - - # Get input sequence dictionary - seq_dict = getSeqDict(seq_file) - - # Create germline repo dictionary - references = readGermlines(repo, asis=asis_calls) - - # Load supplementary annotation table - if cellranger_file is not None: - f = cellranger_extended if extended else cellranger_base - annotations = readCellRanger(cellranger_file, fields=f) - else: - annotations = None - - printMessage('Done', start_time=start_time, end=True, width=20) - - # Check for IMGT-gaps in germlines - if all('...' not in x for x in references.values()): - printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.') - - # Define format operators - try: - __, writer, schema = getFormatOperators(format) - except ValueError: - printError('Invalid format %s.' % format) - out_args['out_type'] = schema.out_type - - # Define output fields - fields = list(schema.required) - if extended: - custom = parser.customFields(schema=schema) - fields.extend(custom) - - # Parse and write output - with open(aligner_file, 'r') as f: - parse_iter = parser(f, seq_dict, references, regions=regions, asis_calls=asis_calls) - germ_iter = (addGermline(x, references, amino_acid=amino_acid) for x in parse_iter) - output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count, - annotations=annotations, amino_acid=amino_acid, partial=partial, asis_id=asis_id, - regions=regions, writer=writer, out_file=out_file, out_args=out_args) - - return output - - -def parseIHMM(aligner_file, seq_file, repo, cellranger_file=None, partial=False, asis_id=True, - extended=False, format=default_format, out_file=None, out_args=default_out_args): - """ - Main for iHMMuneAlign aligned sample sequences. - - Arguments: - aligner_file : iHMMune-Align output file to process. - seq_file : fasta file input to iHMMuneAlign (from which to get sequence). - repo : folder with germline repertoire files. - partial : If True put incomplete alignments in the pass file. - asis_id : if ID is to be parsed for pRESTO output with default delimiters. - extended : if True parse alignment scores, FWR and CDR region fields. - format : output format. One of 'changeo' or 'airr'. - out_file : output file name. Automatically generated from the input file if None. - out_args : common output argument dictionary from parseCommonArgs. - - Returns: - dict : names of the 'pass' and 'fail' output files. - """ - # Print parameter info - log = OrderedDict() - log['START'] = 'MakeDB' - log['COMMAND'] = 'ihmm' - log['ALIGNER_FILE'] = os.path.basename(aligner_file) - log['SEQ_FILE'] = os.path.basename(seq_file) - log['ASIS_ID'] = asis_id - log['PARTIAL'] = partial - log['EXTENDED'] = extended - printLog(log) - - start_time = time() - printMessage('Loading files', start_time=start_time, width=20) - - # Count records in sequence file - total_count = countSeqFile(seq_file) - - # Get input sequence dictionary - seq_dict = getSeqDict(seq_file) - - # Create germline repo dictionary - references = readGermlines(repo) - - # Load supplementary annotation table - if cellranger_file is not None: - f = cellranger_extended if extended else cellranger_base - annotations = readCellRanger(cellranger_file, fields=f) - else: - annotations = None - - printMessage('Done', start_time=start_time, end=True, width=20) - - # Check for IMGT-gaps in germlines - if all('...' not in x for x in references.values()): - printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.') - - # Define format operators - try: - __, writer, schema = getFormatOperators(format) - except ValueError: - printError('Invalid format %s.' % format) - out_args['out_type'] = schema.out_type - - # Define output fields - fields = list(schema.required) - if extended: - custom = IHMMuneReader.customFields(scores=True, regions=True, schema=schema) - fields.extend(custom) - - # Parse and write output - with open(aligner_file, 'r') as f: - parse_iter = IHMMuneReader(f, seq_dict, references) - germ_iter = (addGermline(x, references) for x in parse_iter) - output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count, - annotations=annotations, asis_id=asis_id, partial=partial, - writer=writer, out_file=out_file, out_args=out_args) - - return output - - -def getArgParser(): - """ - Defines the ArgumentParser. - - Returns: - argparse.ArgumentParser - """ - fields = dedent( - ''' - output files: - db-pass - database of alignment records with functionality information, - V and J calls, and a junction region. - db-fail - database with records that fail due to no productivity information, - no gene V assignment, no J assignment, or no junction region. - - universal output fields: - sequence_id, sequence, sequence_alignment, germline_alignment, - rev_comp, productive, stop_codon, vj_in_frame, locus, - v_call, d_call, j_call, junction, junction_length, junction_aa, - v_sequence_start, v_sequence_end, v_germline_start, v_germline_end, - d_sequence_start, d_sequence_end, d_germline_start, d_germline_end, - j_sequence_start, j_sequence_end, j_germline_start, j_germline_end, - np1_length, np2_length, fwr1, fwr2, fwr3, fwr4, cdr1, cdr2, cdr3 - - imgt specific output fields: - n1_length, n2_length, p3v_length, p5d_length, p3d_length, p5j_length, - d_frame, v_score, v_identity, d_score, d_identity, j_score, j_identity - - igblast specific output fields: - v_score, v_identity, v_support, v_cigar, - d_score, d_identity, d_support, d_cigar, - j_score, j_identity, j_support, j_cigar - - ihmm specific output fields: - vdj_score - - 10X specific output fields: - cell_id, c_call, consensus_count, umi_count, - v_call_10x, d_call_10x, j_call_10x, - junction_10x, junction_10x_aa - ''') - - # Define ArgumentParser - parser = ArgumentParser(description=__doc__, epilog=fields, - formatter_class=CommonHelpFormatter, add_help=False) - group_help = parser.add_argument_group('help') - group_help.add_argument('--version', action='version', - version='%(prog)s:' + ' %s %s' %(__version__, __date__)) - group_help.add_argument('-h', '--help', action='help', help='show this help message and exit') - subparsers = parser.add_subparsers(title='subcommands', dest='command', - help='Aligner used', metavar='') - # TODO: This is a temporary fix for Python issue 9253 - subparsers.required = True - - # Parent parser - parser_parent = getCommonArgParser(db_in=False) - - # igblastn output parser - parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent], - formatter_class=CommonHelpFormatter, add_help=False, - help='Process igblastn output.', - description='Process igblastn output.') - group_igblast = parser_igblast.add_argument_group('aligner parsing arguments') - group_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files', - required=True, - help='''IgBLAST output files in format 7 with query sequence - (igblastn argument \'-outfmt "7 std qseq sseq btop"\').''') - group_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True, - help='''List of folders and/or fasta files containing - the same germline set used in the IgBLAST alignment. These - reference sequences must contain IMGT-numbering spacers (gaps) - in the V segment.''') - group_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files', - required=True, - help='''List of input FASTA files (with .fasta, .fna or .fa - extension), containing sequences.''') - group_igblast.add_argument('--10x', action='store', nargs='+', dest='cellranger_file', - help='''Table file containing 10X annotations (with .csv or .tsv - extension).''') - group_igblast.add_argument('--asis-id', action='store_true', dest='asis_id', - help='''Specify to prevent input sequence headers from being parsed - to add new columns to database. Parsing of sequence headers requires - headers to be in the pRESTO annotation format, so this should be specified - when sequence headers are incompatible with the pRESTO annotation scheme. - Note, unrecognized header formats will default to this behavior.''') - group_igblast.add_argument('--asis-calls', action='store_true', dest='asis_calls', - help='''Specify to prevent gene calls from being parsed into standard allele names - in both the IgBLAST output and reference database. Note, this requires - the sequence identifiers in the reference sequence set and the IgBLAST - database to be exact string matches.''') - group_igblast.add_argument('--partial', action='store_true', dest='partial', - help='''If specified, include incomplete V(D)J alignments in - the pass file instead of the fail file. An incomplete alignment - is defined as a record for which a valid IMGT-gapped sequence - cannot be built or that is missing a V gene assignment, - J gene assignment, junction region, or productivity call.''') - group_igblast.add_argument('--extended', action='store_true', dest='extended', - help='''Specify to include additional aligner specific fields in the output. - Adds _score, _identity, _support, _cigar, - fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''') - group_igblast.add_argument('--regions', action='store', dest='regions', - choices=('default', 'rhesus-igl'), default='default', - help='''IMGT CDR and FWR boundary definition to use.''') - parser_igblast.set_defaults(func=parseIgBLAST, amino_acid=False) - - # igblastp output parser - parser_igblast_aa = subparsers.add_parser('igblast-aa', parents=[parser_parent], - formatter_class=CommonHelpFormatter, add_help=False, - help='Process igblastp output.', - description='Process igblastp output.') - group_igblast_aa = parser_igblast_aa.add_argument_group('aligner parsing arguments') - group_igblast_aa.add_argument('-i', nargs='+', action='store', dest='aligner_files', - required=True, - help='''IgBLAST output files in format 7 with query sequence - (igblastp argument \'-outfmt "7 std qseq sseq btop"\').''') - group_igblast_aa.add_argument('-r', nargs='+', action='store', dest='repo', required=True, - help='''List of folders and/or fasta files containing - the same germline set used in the IgBLAST alignment. These - reference sequences must contain IMGT-numbering spacers (gaps) - in the V segment.''') - group_igblast_aa.add_argument('-s', action='store', nargs='+', dest='seq_files', required=True, - help='''List of input FASTA files (with .fasta, .fna or .fa - extension), containing sequences.''') - group_igblast_aa.add_argument('--10x', action='store', nargs='+', dest='cellranger_file', - help='''Table file containing 10X annotations (with .csv or .tsv extension).''') - group_igblast_aa.add_argument('--asis-id', action='store_true', dest='asis_id', - help='''Specify to prevent input sequence headers from being parsed - to add new columns to database. Parsing of sequence headers requires - headers to be in the pRESTO annotation format, so this should be specified - when sequence headers are incompatible with the pRESTO annotation scheme. - Note, unrecognized header formats will default to this behavior.''') - group_igblast_aa.add_argument('--asis-calls', action='store_true', dest='asis_calls', - help='''Specify to prevent gene calls from being parsed into standard allele names - in both the IgBLAST output and reference database. Note, this requires - the sequence identifiers in the reference sequence set and the IgBLAST - database to be exact string matches.''') - group_igblast_aa.add_argument('--extended', action='store_true', dest='extended', - help='''Specify to include additional aligner specific fields in the output. - Adds v_score, v_identity, v_support, v_cigar, fwr1, fwr2, fwr3, cdr1 and cdr2.''') - group_igblast_aa.add_argument('--regions', action='store', dest='regions', - choices=('default', 'rhesus-igl'), default='default', - help='''IMGT CDR and FWR boundary definition to use.''') - parser_igblast_aa.set_defaults(func=parseIgBLAST, partial=True, amino_acid=True) - - - # IMGT aligner - parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent], - formatter_class=CommonHelpFormatter, add_help=False, - help='''Process IMGT/HighV-Quest output - (does not work with V-QUEST).''', - description='''Process IMGT/HighV-Quest output - (does not work with V-QUEST).''') - group_imgt = parser_imgt.add_argument_group('aligner parsing arguments') - group_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_files', - help='''Either zipped IMGT output files (.zip or .txz) or a - folder containing unzipped IMGT output files (which must - include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences, - and 6_Junction).''') - group_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', required=False, - help='''List of FASTA files (with .fasta, .fna or .fa - extension) that were submitted to IMGT/HighV-QUEST. - If unspecified, sequence identifiers truncated by IMGT/HighV-QUEST - will not be corrected.''') - group_imgt.add_argument('-r', nargs='+', action='store', dest='repo', required=False, - help='''List of folders and/or fasta files containing - the germline sequence set used by IMGT/HighV-QUEST. - These reference sequences must contain IMGT-numbering spacers (gaps) - in the V segment. If unspecified, the germline sequence reconstruction - will not be included in the output.''') - group_imgt.add_argument('--10x', action='store', nargs='+', dest='cellranger_file', - help='''Table file containing 10X annotations (with .csv or .tsv - extension).''') - group_imgt.add_argument('--asis-id', action='store_true', dest='asis_id', - help='''Specify to prevent input sequence headers from being parsed - to add new columns to database. Parsing of sequence headers requires - headers to be in the pRESTO annotation format, so this should be specified - when sequence headers are incompatible with the pRESTO annotation scheme. - Note, unrecognized header formats will default to this behavior.''') - group_imgt.add_argument('--partial', action='store_true', dest='partial', - help='''If specified, include incomplete V(D)J alignments in - the pass file instead of the fail file. An incomplete alignment - is defined as a record that is missing a V gene assignment, - J gene assignment, junction region, or productivity call.''') - group_imgt.add_argument('--extended', action='store_true', dest='extended', - help='''Specify to include additional aligner specific fields in the output. - Adds _score, _identity>, fwr1, fwr2, fwr3, fwr4, - cdr1, cdr2, cdr3, n1_length, n2_length, p3v_length, p5d_length, - p3d_length, p5j_length and d_frame.''') - parser_imgt.set_defaults(func=parseIMGT) - - # iHMMuneAlign Aligner - parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent], - formatter_class=CommonHelpFormatter, add_help=False, - help='Process iHMMune-Align output.', - description='Process iHMMune-Align output.') - group_ihmm = parser_ihmm.add_argument_group('aligner parsing arguments') - group_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_files', - required=True, - help='''iHMMune-Align output file.''') - group_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True, - help='''List of folders and/or FASTA files containing - the set of germline sequences used by iHMMune-Align. These - reference sequences must contain IMGT-numbering spacers (gaps) - in the V segment.''') - group_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files', - required=True, - help='''List of input FASTA files (with .fasta, .fna or .fa - extension) containing sequences.''') - group_ihmm.add_argument('--10x', action='store', nargs='+', dest='cellranger_file', - help='''Table file containing 10X annotations (with .csv or .tsv - extension).''') - group_ihmm.add_argument('--asis-id', action='store_true', dest='asis_id', - help='''Specify to prevent input sequence headers from being parsed - to add new columns to database. Parsing of sequence headers requires - headers to be in the pRESTO annotation format, so this should be specified - when sequence headers are incompatible with the pRESTO annotation scheme. - Note, unrecognized header formats will default to this behavior.''') - group_ihmm.add_argument('--partial', action='store_true', dest='partial', - help='''If specified, include incomplete V(D)J alignments in - the pass file instead of the fail file. An incomplete alignment - is defined as a record for which a valid IMGT-gapped sequence - cannot be built or that is missing a V gene assignment, - J gene assignment, junction region, or productivity call.''') - group_ihmm.add_argument('--extended', action='store_true', dest='extended', - help='''Specify to include additional aligner specific fields in the output. - Adds the path score of the iHMMune-Align hidden Markov model as vdj_score; - adds fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''') - parser_ihmm.set_defaults(func=parseIHMM) - - return parser - - -if __name__ == "__main__": - """ - Parses command line arguments and calls main - """ - parser = getArgParser() - checkArgs(parser) - args = parser.parse_args() - args_dict = parseCommonArgs(args, in_arg='aligner_files') - - # Set no ID parsing if sequence files are not provided - if 'seq_files' in args_dict and not args_dict['seq_files']: - args_dict['asis_id'] = True - - # Delete - if 'aligner_files' in args_dict: del args_dict['aligner_files'] - if 'seq_files' in args_dict: del args_dict['seq_files'] - if 'out_files' in args_dict: del args_dict['out_files'] - if 'command' in args_dict: del args_dict['command'] - if 'func' in args_dict: del args_dict['func'] - - # Call main - for i, f in enumerate(args.__dict__['aligner_files']): - args_dict['aligner_file'] = f - args_dict['seq_file'] = args.__dict__['seq_files'][i] \ - if args.__dict__['seq_files'] else None - args_dict['out_file'] = args.__dict__['out_files'][i] \ - if args.__dict__['out_files'] else None - args_dict['cellranger_file'] = args.__dict__['cellranger_file'][i] \ - if args.__dict__['cellranger_file'] else None - args.func(**args_dict) diff -r aff3ba86ef7a -r 98e3fecedd2b change_o/define_clones.sh --- a/change_o/define_clones.sh Mon Aug 31 11:20:08 2020 -0400 +++ b/change_o/define_clones.sh Tue Sep 01 16:03:44 2020 -0400 @@ -21,7 +21,7 @@ output=${10} output2=${11} - python3 $dir/DefineClones.py -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link + DefineClones.py -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1 else @@ -29,7 +29,7 @@ output=$4 output2=$5 - python3 $dir/DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method + DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1 fi diff -r aff3ba86ef7a -r 98e3fecedd2b change_o/makedb.sh --- a/change_o/makedb.sh Mon Aug 31 11:20:08 2020 -0400 +++ b/change_o/makedb.sh Tue Sep 01 16:03:44 2020 -0400 @@ -29,7 +29,7 @@ echo "makedb: $PWD/outdir" -python3 $dir/MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions +MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions mv $PWD/outdir/output_db-pass.tab $output