shm_csr: change_o/MakeDb.py comparison

comparison change_o/MakeDb.py @ 78:aff3ba86ef7a draft

Uploaded

author	davidvanzessen
date	Mon, 31 Aug 2020 11:20:08 -0400
parents
children

comparison

equal deleted inserted replaced

-:58d2377b507d
+:aff3ba86ef7a
+#!/usr/bin/env python3
+"""
+Create tab-delimited database file to store sequence alignment information
+"""
+# Info
+__author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
+from changeo import __version__, __date__
+# Imports
+import os
+import re
+import csv
+from argparse import ArgumentParser
+from collections import OrderedDict
+from textwrap import dedent
+from time import time
+from Bio import SeqIO
+# Presto and changeo imports
+from presto.Annotation import parseAnnotation
+from presto.IO import countSeqFile, printLog, printMessage, printProgress, printError, printWarning, readSeqFile
+from changeo.Defaults import default_format, default_out_args
+from changeo.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
+from changeo.Alignment import RegionDefinition
+from changeo.Gene import buildGermline
+from changeo.IO import countDbFile, extractIMGT, readGermlines, getFormatOperators, getOutputHandle, \
+AIRRWriter, ChangeoWriter, IgBLASTReader, IgBLASTReaderAA, IMGTReader, IHMMuneReader
+from changeo.Receptor import ChangeoSchema, AIRRSchema
+# 10X Receptor attributes
+cellranger_base = ['cell', 'c_call', 'conscount', 'umicount']
+cellranger_extended = ['cell', 'c_call', 'conscount', 'umicount',
+'v_call_10x', 'd_call_10x', 'j_call_10x',
+'junction_10x', 'junction_10x_aa']
+def readCellRanger(cellranger_file, fields=cellranger_base):
+"""
+Load a Cell Ranger annotation table
+Arguments:
+cellranger_file (str): path to the annotation file.
+fields (list): list of fields to keep.
+Returns:
+dict: dict of dicts with contig_id as the primary key.
+"""
+# Mapping of 10X annotations to Receptor attributes
+cellranger_map = {'cell':  'barcode',
+'c_call': 'c_gene',
+'locus': 'chain',
+'conscount': 'reads',
+'umicount': 'umis',
+'v_call_10x': 'v_gene',
+'d_call_10x': 'd_gene',
+'j_call_10x': 'j_gene',
+'junction_10x': 'cdr3_nt',
+'junction_10x_aa': 'cdr3'}
+# Function to parse individual fields
+def _parse(x):
+return '' if x == 'None' else x
+# Generate annotation dictionary
+ann_dict = {}
+with open(cellranger_file) as csv_file:
+# Detect delimiters
+dialect = csv.Sniffer().sniff(csv_file.readline())
+csv_file.seek(0)
+# Read in annotation file
+csv_reader = csv.DictReader(csv_file, dialect=dialect)
+# Generate annotation dictionary
+for row in csv_reader:
+ann_dict[row['contig_id']] = {f: _parse(row[cellranger_map[f]]) for f in fields}
+return ann_dict
+def addGermline(receptor, references, amino_acid=False):
+"""
+Add full length germline to Receptor object
+Arguments:
+receptor (changeo.Receptor.Receptor): Receptor object to modify.
+references (dict): dictionary of IMGT-gapped references sequences.
+amino_acid (bool): if True build amino acid germline, otherwise build nucleotide germline
+Returns:
+changeo.Receptor.Receptor: modified Receptor with the germline sequence added.
+"""
+if amino_acid:
+__, germlines, __ = buildGermline(receptor, references, seq_field='sequence_aa_imgt',
+amino_acid=True)
+germline_seq = None if germlines is None else germlines['full']
+receptor.setField('germline_aa_imgt', germline_seq)
+else:
+__, germlines, __ = buildGermline(receptor, references, amino_acid=False)
+germline_seq = None if germlines is None else germlines['full']
+receptor.setField('germline_imgt', germline_seq)
+return receptor
+def getIDforIMGT(seq_file):
+"""
+Create a sequence ID translation using IMGT truncation.
+Arguments:
+seq_file : a fasta file of sequences input to IMGT.
+Returns:
+dict : a dictionary of with the IMGT truncated ID as the key and the full sequence description as the value.
+"""
+# Create a sequence ID translation using IDs truncate up to space or 50 chars
+ids = {}
+for rec in readSeqFile(seq_file):
+if len(rec.description) <= 50:
+id_key = rec.description
+else:
+id_key = re.sub('\||\s|!|&|\*|<|>|\?', '_', rec.description[:50])
+ids.update({id_key: rec.description})
+return ids
+def getSeqDict(seq_file):
+"""
+Create a dictionary from a sequence file.
+Arguments:
+seq_file : sequence file.
+Returns:
+dict : sequence description as keys with Bio.SeqRecords as values.
+"""
+seq_dict = SeqIO.to_dict(readSeqFile(seq_file), key_function=lambda x: x.description)
+return seq_dict
+def writeDb(records, fields, aligner_file, total_count, id_dict=None, annotations=None,
+amino_acid=False, partial=False, asis_id=True, regions='default',
+writer=AIRRWriter, out_file=None, out_args=default_out_args):
+"""
+Writes parsed records to an output file
+Arguments:
+records : a iterator of Receptor objects containing alignment data.
+fields : a list of ordered field names to write.
+aligner_file : input file name.
+total_count : number of records (for progress bar).
+id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
+annotations : additional annotation dictionary.
+amino_acid : if True do verification on amino acid fields.
+partial : if True put incomplete alignments in the pass file.
+asis_id : if ID is to be parsed for pRESTO output with default delimiters.
+regions (str): name of the IMGT FWR/CDR region definitions to use.
+writer : writer class.
+out_file : output file name. Automatically generated from the input file if None.
+out_args : common output argument dictionary from parseCommonArgs.
+Returns:
+None
+"""
+# Wrapper for opening handles and writers
+def _open(x, f, writer=writer, out_file=out_file):
+if out_file is not None and x == 'pass':
+handle = open(out_file, 'w')
+else:
+handle = getOutputHandle(aligner_file,
+out_label='db-%s' % x,
+out_dir=out_args['out_dir'],
+out_name=out_args['out_name'],
+out_type=out_args['out_type'])
+return handle, writer(handle, fields=f)
+# Function to convert fasta header annotations to changeo columns
+def _changeo(f, header):
+h = [ChangeoSchema.fromReceptor(x) for x in header if x.upper() not in f]
+f.extend(h)
+return f
+def _airr(f, header):
+h = [AIRRSchema.fromReceptor(x) for x in header if x.lower() not in f]
+f.extend(h)
+return f
+# Function to verify IMGT-gapped sequence and junction concur
+def _imgt_check(rec):
+try:
+if amino_acid:
+rd = RegionDefinition(rec.junction_aa_length, amino_acid=amino_acid, definition=regions)
+x, y = rd.positions['junction']
+check = (rec.junction_aa == rec.sequence_aa_imgt[x:y])
+else:
+rd = RegionDefinition(rec.junction_length, amino_acid=amino_acid, definition=regions)
+x, y = rd.positions['junction']
+check = (rec.junction == rec.sequence_imgt[x:y])
+except (TypeError, AttributeError):
+check = False
+return check
+# Function to check for valid records strictly
+def _strict(rec):
+if amino_acid:
+valid = [rec.v_call and rec.v_call != 'None',
+rec.j_call and rec.j_call != 'None',
+rec.functional is not None,
+rec.sequence_aa_imgt,
+rec.junction_aa,
+_imgt_check(rec)]
+else:
+valid = [rec.v_call and rec.v_call != 'None',
+rec.j_call and rec.j_call != 'None',
+rec.functional is not None,
+rec.sequence_imgt,
+rec.junction,
+_imgt_check(rec)]
+return all(valid)
+# Function to check for valid records loosely
+def _gentle(rec):
+valid = [rec.v_call and rec.v_call != 'None',
+rec.d_call and rec.d_call != 'None',
+rec.j_call and rec.j_call != 'None']
+return any(valid)
+# Set writer class and annotation conversion function
+if writer == ChangeoWriter:
+_annotate = _changeo
+elif writer == AIRRWriter:
+_annotate = _airr
+else:
+printError('Invalid output writer.')
+# Additional annotation (e.g. 10X cell calls)
+# _append_table = None
+# if cellranger_file is not None:
+#     with open(cellranger_file) as csv_file:
+#         # Read in annotation file (use Sniffer to discover file delimiters)
+#         dialect = csv.Sniffer().sniff(csv_file.readline())
+#         csv_file.seek(0)
+#         csv_reader = csv.DictReader(csv_file, dialect = dialect)
+#
+#         # Generate annotation dictionary
+#         anntab_dict = {entry['contig_id']: {cellranger_map[field]: entry[field] \
+#                        for field in cellranger_map.keys()} for entry in csv_reader}
+#
+#     fields = _annotate(fields, cellranger_map.values())
+#     _append_table = lambda sequence_id: anntab_dict[sequence_id]
+# Set pass criteria
+_pass = _gentle if partial else _strict
+# Define log handle
+if out_args['log_file'] is None:
+log_handle = None
+else:
+log_handle = open(out_args['log_file'], 'w')
+# Initialize handles, writers and counters
+pass_handle, pass_writer = None, None
+fail_handle, fail_writer = None, None
+pass_count, fail_count = 0, 0
+start_time = time()
+# Validate and write output
+printProgress(0, total_count, 0.05, start_time=start_time)
+for i, record in enumerate(records, start=1):
+# Replace sequence description with full string, if required
+if id_dict is not None and record.sequence_id in id_dict:
+record.sequence_id = id_dict[record.sequence_id]
+# Parse sequence description into new columns
+if not asis_id:
+try:
+ann_raw = parseAnnotation(record.sequence_id)
+record.sequence_id = ann_raw.pop('ID')
+# Convert to Receptor fields
+ann_parsed = OrderedDict()
+for k, v in ann_raw.items():
+ann_parsed[ChangeoSchema.toReceptor(k)] = v
+# Add annotations to Receptor and update field list
+record.setDict(ann_parsed, parse=True)
+if i == 1:  fields = _annotate(fields, ann_parsed.keys())
+except IndexError:
+# Could not parse pRESTO-style annotations so fall back to no parse
+asis_id = True
+printWarning('Sequence annotation format not recognized. Sequence headers will not be parsed.')
+# Add supplemental annotation fields
+# if _append_table is not None:
+#     record.setDict(_append_table(record.sequence_id), parse=True)
+if annotations is not None:
+record.setDict(annotations[record.sequence_id], parse=True)
+if i == 1:  fields = _annotate(fields, annotations[record.sequence_id].keys())
+# Count pass or fail and write to appropriate file
+if _pass(record):
+pass_count += 1
+# Write row to pass file
+try:
+pass_writer.writeReceptor(record)
+except AttributeError:
+# Open pass file and writer
+pass_handle, pass_writer = _open('pass', fields)
+pass_writer.writeReceptor(record)
+else:
+fail_count += 1
+# Write row to fail file if specified
+if out_args['failed']:
+try:
+fail_writer.writeReceptor(record)
+except AttributeError:
+# Open fail file and writer
+fail_handle, fail_writer = _open('fail', fields)
+fail_writer.writeReceptor(record)
+# Write log
+if log_handle is not None:
+log = OrderedDict([('ID', record.sequence_id),
+('V_CALL', record.v_call),
+('D_CALL', record.d_call),
+('J_CALL', record.j_call),
+('PRODUCTIVE', record.functional)])
+if not _imgt_check(record) and not amino_acid:
+log['ERROR'] = 'Junction does not match the sequence starting at position 310 in the IMGT numbered V(D)J sequence.'
+printLog(log, log_handle)
+# Print progress
+printProgress(i, total_count, 0.05, start_time=start_time)
+# Print console log
+log = OrderedDict()
+log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
+log['PASS'] = pass_count
+log['FAIL'] = fail_count
+log['END'] = 'MakeDb'
+printLog(log)
+# Close file handles
+output = {'pass': None, 'fail': None}
+if pass_handle is not None:
+output['pass'] = pass_handle.name
+pass_handle.close()
+if fail_handle is not None:
+output['fail'] = fail_handle.name
+fail_handle.close()
+return output
+def parseIMGT(aligner_file, seq_file=None, repo=None, cellranger_file=None, partial=False, asis_id=True,
+extended=False, format=default_format, out_file=None, out_args=default_out_args):
+"""
+Main for IMGT aligned sample sequences.
+Arguments:
+aligner_file : zipped file or unzipped folder output by IMGT.
+seq_file : FASTA file input to IMGT (from which to get seqID).
+repo : folder with germline repertoire files.
+partial : If True put incomplete alignments in the pass file.
+asis_id : if ID is to be parsed for pRESTO output with default delimiters.
+extended : if True add alignment score, FWR, CDR and junction fields to output file.
+format : output format. one of 'changeo' or 'airr'.
+out_file : output file name. Automatically generated from the input file if None.
+out_args : common output argument dictionary from parseCommonArgs.
+Returns:
+dict : names of the 'pass' and 'fail' output files.
+"""
+# Print parameter info
+log = OrderedDict()
+log['START'] = 'MakeDb'
+log['COMMAND'] = 'imgt'
+log['ALIGNER_FILE'] = aligner_file
+log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
+log['ASIS_ID'] = asis_id
+log['PARTIAL'] = partial
+log['EXTENDED'] = extended
+printLog(log)
+start_time = time()
+printMessage('Loading files', start_time=start_time, width=20)
+# Extract IMGT files
+temp_dir, imgt_files = extractIMGT(aligner_file)
+# Count records in IMGT files
+total_count = countDbFile(imgt_files['summary'])
+# Get (parsed) IDs from fasta file submitted to IMGT
+id_dict = getIDforIMGT(seq_file) if seq_file else {}
+# Load supplementary annotation table
+if cellranger_file is not None:
+f = cellranger_extended if extended else cellranger_base
+annotations = readCellRanger(cellranger_file, fields=f)
+else:
+annotations = None
+printMessage('Done', start_time=start_time, end=True, width=20)
+# Define format operators
+try:
+__, writer, schema = getFormatOperators(format)
+except ValueError:
+printError('Invalid format %s.' % format)
+out_args['out_type'] = schema.out_type
+# Define output fields
+fields = list(schema.required)
+if extended:
+custom = IMGTReader.customFields(scores=True, regions=True, junction=True, schema=schema)
+fields.extend(custom)
+# Parse IMGT output and write db
+with open(imgt_files['summary'], 'r') as summary_handle, \
+open(imgt_files['gapped'], 'r') as gapped_handle, \
+open(imgt_files['ntseq'], 'r') as ntseq_handle, \
+open(imgt_files['junction'], 'r') as junction_handle:
+# Open parser
+parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle)
+# Add germline sequence
+if repo is None:
+germ_iter = parse_iter
+else:
+references = readGermlines(repo)
+# Check for IMGT-gaps in germlines
+if all('...' not in x for x in references.values()):
+printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
+germ_iter = (addGermline(x, references) for x in parse_iter)
+# Write db
+output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
+annotations=annotations, id_dict=id_dict, asis_id=asis_id, partial=partial,
+writer=writer, out_file=out_file, out_args=out_args)
+# Cleanup temp directory
+temp_dir.cleanup()
+return output
+def parseIgBLAST(aligner_file, seq_file, repo, amino_acid=False, cellranger_file=None, partial=False,
+asis_id=True, asis_calls=False, extended=False, regions='default',
+format='changeo', out_file=None, out_args=default_out_args):
+"""
+Main for IgBLAST aligned sample sequences.
+Arguments:
+aligner_file (str): IgBLAST output file to process.
+seq_file (str): fasta file input to IgBlast (from which to get sequence).
+repo (str): folder with germline repertoire files.
+amino_acid (bool): if True then the IgBLAST output files are results from igblastp. igblastn is assumed if False.
+partial : If True put incomplete alignments in the pass file.
+asis_id (bool): if ID is to be parsed for pRESTO output with default delimiters.
+asis_calls (bool): if True do not parse gene calls for allele names.
+extended (bool): if True add alignment scores, FWR regions, and CDR regions to the output.
+regions (str): name of the IMGT FWR/CDR region definitions to use.
+format (str): output format. one of 'changeo' or 'airr'.
+out_file (str): output file name. Automatically generated from the input file if None.
+out_args (dict): common output argument dictionary from parseCommonArgs.
+Returns:
+dict : names of the 'pass' and 'fail' output files.
+"""
+# Print parameter info
+log = OrderedDict()
+log['START'] = 'MakeDB'
+log['COMMAND'] = 'igblast-aa' if amino_acid else 'igblast'
+log['ALIGNER_FILE'] = os.path.basename(aligner_file)
+log['SEQ_FILE'] = os.path.basename(seq_file)
+log['ASIS_ID'] = asis_id
+log['ASIS_CALLS'] = asis_calls
+log['PARTIAL'] = partial
+log['EXTENDED'] = extended
+printLog(log)
+# Set amino acid conditions
+if amino_acid:
+format = '%s-aa' % format
+parser = IgBLASTReaderAA
+else:
+parser = IgBLASTReader
+# Start
+start_time = time()
+printMessage('Loading files', start_time=start_time, width=20)
+# Count records in sequence file
+total_count = countSeqFile(seq_file)
+# Get input sequence dictionary
+seq_dict = getSeqDict(seq_file)
+# Create germline repo dictionary
+references = readGermlines(repo, asis=asis_calls)
+# Load supplementary annotation table
+if cellranger_file is not None:
+f = cellranger_extended if extended else cellranger_base
+annotations = readCellRanger(cellranger_file, fields=f)
+else:
+annotations = None
+printMessage('Done', start_time=start_time, end=True, width=20)
+# Check for IMGT-gaps in germlines
+if all('...' not in x for x in references.values()):
+printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
+# Define format operators
+try:
+__, writer, schema = getFormatOperators(format)
+except ValueError:
+printError('Invalid format %s.' % format)
+out_args['out_type'] = schema.out_type
+# Define output fields
+fields = list(schema.required)
+if extended:
+custom = parser.customFields(schema=schema)
+fields.extend(custom)
+# Parse and write output
+with open(aligner_file, 'r') as f:
+parse_iter = parser(f, seq_dict, references, regions=regions, asis_calls=asis_calls)
+germ_iter = (addGermline(x, references, amino_acid=amino_acid) for x in parse_iter)
+output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
+annotations=annotations, amino_acid=amino_acid, partial=partial, asis_id=asis_id,
+regions=regions, writer=writer, out_file=out_file, out_args=out_args)
+return output
+def parseIHMM(aligner_file, seq_file, repo, cellranger_file=None, partial=False, asis_id=True,
+extended=False, format=default_format, out_file=None, out_args=default_out_args):
+"""
+Main for iHMMuneAlign aligned sample sequences.
+Arguments:
+aligner_file : iHMMune-Align output file to process.
+seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
+repo : folder with germline repertoire files.
+partial : If True put incomplete alignments in the pass file.
+asis_id : if ID is to be parsed for pRESTO output with default delimiters.
+extended : if True parse alignment scores, FWR and CDR region fields.
+format : output format. One of 'changeo' or 'airr'.
+out_file : output file name. Automatically generated from the input file if None.
+out_args : common output argument dictionary from parseCommonArgs.
+Returns:
+dict : names of the 'pass' and 'fail' output files.
+"""
+# Print parameter info
+log = OrderedDict()
+log['START'] = 'MakeDB'
+log['COMMAND'] = 'ihmm'
+log['ALIGNER_FILE'] = os.path.basename(aligner_file)
+log['SEQ_FILE'] = os.path.basename(seq_file)
+log['ASIS_ID'] = asis_id
+log['PARTIAL'] = partial
+log['EXTENDED'] = extended
+printLog(log)
+start_time = time()
+printMessage('Loading files', start_time=start_time, width=20)
+# Count records in sequence file
+total_count = countSeqFile(seq_file)
+# Get input sequence dictionary
+seq_dict = getSeqDict(seq_file)
+# Create germline repo dictionary
+references = readGermlines(repo)
+# Load supplementary annotation table
+if cellranger_file is not None:
+f = cellranger_extended if extended else cellranger_base
+annotations = readCellRanger(cellranger_file, fields=f)
+else:
+annotations = None
+printMessage('Done', start_time=start_time, end=True, width=20)
+# Check for IMGT-gaps in germlines
+if all('...' not in x for x in references.values()):
+printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
+# Define format operators
+try:
+__, writer, schema = getFormatOperators(format)
+except ValueError:
+printError('Invalid format %s.' % format)
+out_args['out_type'] = schema.out_type
+# Define output fields
+fields = list(schema.required)
+if extended:
+custom = IHMMuneReader.customFields(scores=True, regions=True, schema=schema)
+fields.extend(custom)
+# Parse and write output
+with open(aligner_file, 'r') as f:
+parse_iter = IHMMuneReader(f, seq_dict, references)
+germ_iter = (addGermline(x, references) for x in parse_iter)
+output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file, total_count=total_count,
+annotations=annotations, asis_id=asis_id, partial=partial,
+writer=writer, out_file=out_file, out_args=out_args)
+return output
+def getArgParser():
+"""
+Defines the ArgumentParser.
+Returns:
+argparse.ArgumentParser
+"""
+fields = dedent(
+'''
+output files:
+db-pass
+database of alignment records with functionality information,
+V and J calls, and a junction region.
+db-fail
+database with records that fail due to no productivity information,
+no gene V assignment, no J assignment, or no junction region.
+universal output fields:
+sequence_id, sequence, sequence_alignment, germline_alignment,
+rev_comp, productive, stop_codon, vj_in_frame, locus,
+v_call, d_call, j_call, junction, junction_length, junction_aa,
+v_sequence_start, v_sequence_end, v_germline_start, v_germline_end,
+d_sequence_start, d_sequence_end, d_germline_start, d_germline_end,
+j_sequence_start, j_sequence_end, j_germline_start, j_germline_end,
+np1_length, np2_length, fwr1, fwr2, fwr3, fwr4, cdr1, cdr2, cdr3
+imgt specific output fields:
+n1_length, n2_length, p3v_length, p5d_length, p3d_length, p5j_length,
+d_frame, v_score, v_identity, d_score, d_identity, j_score, j_identity
+igblast specific output fields:
+v_score, v_identity, v_support, v_cigar,
+d_score, d_identity, d_support, d_cigar,
+j_score, j_identity, j_support, j_cigar
+ihmm specific output fields:
+vdj_score
+10X specific output fields:
+cell_id, c_call, consensus_count, umi_count,
+v_call_10x, d_call_10x, j_call_10x,
+junction_10x, junction_10x_aa
+''')
+# Define ArgumentParser
+parser = ArgumentParser(description=__doc__, epilog=fields,
+formatter_class=CommonHelpFormatter, add_help=False)
+group_help = parser.add_argument_group('help')
+group_help.add_argument('--version', action='version',
+version='%(prog)s:' + ' %s %s' %(__version__, __date__))
+group_help.add_argument('-h', '--help', action='help', help='show this help message and exit')
+subparsers = parser.add_subparsers(title='subcommands', dest='command',
+help='Aligner used', metavar='')
+# TODO:  This is a temporary fix for Python issue 9253
+subparsers.required = True
+# Parent parser
+parser_parent = getCommonArgParser(db_in=False)
+# igblastn output parser
+parser_igblast = subparsers.add_parser('igblast', parents=[parser_parent],
+formatter_class=CommonHelpFormatter, add_help=False,
+help='Process igblastn output.',
+description='Process igblastn output.')
+group_igblast = parser_igblast.add_argument_group('aligner parsing arguments')
+group_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
+required=True,
+help='''IgBLAST output files in format 7 with query sequence
+(igblastn argument \'-outfmt "7 std qseq sseq btop"\').''')
+group_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
+help='''List of folders and/or fasta files containing
+the same germline set used in the IgBLAST alignment. These
+reference sequences must contain IMGT-numbering spacers (gaps)
+in the V segment.''')
+group_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
+required=True,
+help='''List of input FASTA files (with .fasta, .fna or .fa
+extension), containing sequences.''')
+group_igblast.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
+help='''Table file containing 10X annotations (with .csv or .tsv
+extension).''')
+group_igblast.add_argument('--asis-id', action='store_true', dest='asis_id',
+help='''Specify to prevent input sequence headers from being parsed
+to add new columns to database. Parsing of sequence headers requires
+headers to be in the pRESTO annotation format, so this should be specified
+when sequence headers are incompatible with the pRESTO annotation scheme.
+Note, unrecognized header formats will default to this behavior.''')
+group_igblast.add_argument('--asis-calls', action='store_true', dest='asis_calls',
+help='''Specify to prevent gene calls from being parsed into standard allele names
+in both the IgBLAST output and reference database. Note, this requires
+the sequence identifiers in the reference sequence set and the IgBLAST
+database to be exact string matches.''')
+group_igblast.add_argument('--partial', action='store_true', dest='partial',
+help='''If specified, include incomplete V(D)J alignments in
+the pass file instead of the fail file. An incomplete alignment
+is defined as a record for which a valid IMGT-gapped sequence
+cannot be built or that is missing a V gene assignment,
+J gene assignment, junction region, or productivity call.''')
+group_igblast.add_argument('--extended', action='store_true', dest='extended',
+help='''Specify to include additional aligner specific fields in the output.
+Adds <vdj>_score, <vdj>_identity, <vdj>_support, <vdj>_cigar,
+fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
+group_igblast.add_argument('--regions', action='store', dest='regions',
+choices=('default', 'rhesus-igl'), default='default',
+help='''IMGT CDR and FWR boundary definition to use.''')
+parser_igblast.set_defaults(func=parseIgBLAST, amino_acid=False)
+# igblastp output parser
+parser_igblast_aa = subparsers.add_parser('igblast-aa', parents=[parser_parent],
+formatter_class=CommonHelpFormatter, add_help=False,
+help='Process igblastp output.',
+description='Process igblastp output.')
+group_igblast_aa = parser_igblast_aa.add_argument_group('aligner parsing arguments')
+group_igblast_aa.add_argument('-i', nargs='+', action='store', dest='aligner_files',
+required=True,
+help='''IgBLAST output files in format 7 with query sequence
+(igblastp argument \'-outfmt "7 std qseq sseq btop"\').''')
+group_igblast_aa.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
+help='''List of folders and/or fasta files containing
+the same germline set used in the IgBLAST alignment. These
+reference sequences must contain IMGT-numbering spacers (gaps)
+in the V segment.''')
+group_igblast_aa.add_argument('-s', action='store', nargs='+', dest='seq_files', required=True,
+help='''List of input FASTA files (with .fasta, .fna or .fa
+extension), containing sequences.''')
+group_igblast_aa.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
+help='''Table file containing 10X annotations (with .csv or .tsv extension).''')
+group_igblast_aa.add_argument('--asis-id', action='store_true', dest='asis_id',
+help='''Specify to prevent input sequence headers from being parsed
+to add new columns to database. Parsing of sequence headers requires
+headers to be in the pRESTO annotation format, so this should be specified
+when sequence headers are incompatible with the pRESTO annotation scheme.
+Note, unrecognized header formats will default to this behavior.''')
+group_igblast_aa.add_argument('--asis-calls', action='store_true', dest='asis_calls',
+help='''Specify to prevent gene calls from being parsed into standard allele names
+in both the IgBLAST output and reference database. Note, this requires
+the sequence identifiers in the reference sequence set and the IgBLAST
+database to be exact string matches.''')
+group_igblast_aa.add_argument('--extended', action='store_true', dest='extended',
+help='''Specify to include additional aligner specific fields in the output.
+Adds v_score, v_identity, v_support, v_cigar, fwr1, fwr2, fwr3, cdr1 and cdr2.''')
+group_igblast_aa.add_argument('--regions', action='store', dest='regions',
+choices=('default', 'rhesus-igl'), default='default',
+help='''IMGT CDR and FWR boundary definition to use.''')
+parser_igblast_aa.set_defaults(func=parseIgBLAST, partial=True, amino_acid=True)
+# IMGT aligner
+parser_imgt = subparsers.add_parser('imgt', parents=[parser_parent],
+formatter_class=CommonHelpFormatter, add_help=False,
+help='''Process IMGT/HighV-Quest output
+(does not work with V-QUEST).''',
+description='''Process IMGT/HighV-Quest output
+(does not work with V-QUEST).''')
+group_imgt = parser_imgt.add_argument_group('aligner parsing arguments')
+group_imgt.add_argument('-i', nargs='+', action='store', dest='aligner_files',
+help='''Either zipped IMGT output files (.zip or .txz) or a
+folder containing unzipped IMGT output files (which must
+include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
+and 6_Junction).''')
+group_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files', required=False,
+help='''List of FASTA files (with .fasta, .fna or .fa
+extension) that were submitted to IMGT/HighV-QUEST.
+If unspecified, sequence identifiers truncated by IMGT/HighV-QUEST
+will not be corrected.''')
+group_imgt.add_argument('-r', nargs='+', action='store', dest='repo', required=False,
+help='''List of folders and/or fasta files containing
+the germline sequence set used by IMGT/HighV-QUEST.
+These reference sequences must contain IMGT-numbering spacers (gaps)
+in the V segment. If unspecified, the germline sequence reconstruction
+will not be included in the output.''')
+group_imgt.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
+help='''Table file containing 10X annotations (with .csv or .tsv
+extension).''')
+group_imgt.add_argument('--asis-id', action='store_true', dest='asis_id',
+help='''Specify to prevent input sequence headers from being parsed
+to add new columns to database. Parsing of sequence headers requires
+headers to be in the pRESTO annotation format, so this should be specified
+when sequence headers are incompatible with the pRESTO annotation scheme.
+Note, unrecognized header formats will default to this behavior.''')
+group_imgt.add_argument('--partial', action='store_true', dest='partial',
+help='''If specified, include incomplete V(D)J alignments in
+the pass file instead of the fail file. An incomplete alignment
+is defined as a record that is missing a V gene assignment,
+J gene assignment, junction region, or productivity call.''')
+group_imgt.add_argument('--extended', action='store_true', dest='extended',
+help='''Specify to include additional aligner specific fields in the output.
+Adds <vdj>_score, <vdj>_identity>, fwr1, fwr2, fwr3, fwr4,
+cdr1, cdr2, cdr3, n1_length, n2_length, p3v_length, p5d_length,
+p3d_length, p5j_length and d_frame.''')
+parser_imgt.set_defaults(func=parseIMGT)
+# iHMMuneAlign Aligner
+parser_ihmm = subparsers.add_parser('ihmm', parents=[parser_parent],
+formatter_class=CommonHelpFormatter, add_help=False,
+help='Process iHMMune-Align output.',
+description='Process iHMMune-Align output.')
+group_ihmm = parser_ihmm.add_argument_group('aligner parsing arguments')
+group_ihmm.add_argument('-i', nargs='+', action='store', dest='aligner_files',
+required=True,
+help='''iHMMune-Align output file.''')
+group_ihmm.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
+help='''List of folders and/or FASTA files containing
+the set of germline sequences used by iHMMune-Align. These
+reference sequences must contain IMGT-numbering spacers (gaps)
+in the V segment.''')
+group_ihmm.add_argument('-s', action='store', nargs='+', dest='seq_files',
+required=True,
+help='''List of input FASTA files (with .fasta, .fna or .fa
+extension) containing sequences.''')
+group_ihmm.add_argument('--10x', action='store', nargs='+', dest='cellranger_file',
+help='''Table file containing 10X annotations (with .csv or .tsv
+extension).''')
+group_ihmm.add_argument('--asis-id', action='store_true', dest='asis_id',
+help='''Specify to prevent input sequence headers from being parsed
+to add new columns to database. Parsing of sequence headers requires
+headers to be in the pRESTO annotation format, so this should be specified
+when sequence headers are incompatible with the pRESTO annotation scheme.
+Note, unrecognized header formats will default to this behavior.''')
+group_ihmm.add_argument('--partial', action='store_true', dest='partial',
+help='''If specified, include incomplete V(D)J alignments in
+the pass file instead of the fail file. An incomplete alignment
+is defined as a record for which a valid IMGT-gapped sequence
+cannot be built or that is missing a V gene assignment,
+J gene assignment, junction region, or productivity call.''')
+group_ihmm.add_argument('--extended', action='store_true', dest='extended',
+help='''Specify to include additional aligner specific fields in the output.
+Adds the path score of the iHMMune-Align hidden Markov model as vdj_score;
+adds fwr1, fwr2, fwr3, fwr4, cdr1, cdr2 and cdr3.''')
+parser_ihmm.set_defaults(func=parseIHMM)
+return parser
+if __name__ == "__main__":
+"""
+Parses command line arguments and calls main
+"""
+parser = getArgParser()
+checkArgs(parser)
+args = parser.parse_args()
+args_dict = parseCommonArgs(args, in_arg='aligner_files')
+# Set no ID parsing if sequence files are not provided
+if 'seq_files' in args_dict and not args_dict['seq_files']:
+args_dict['asis_id'] = True
+# Delete
+if 'aligner_files' in args_dict: del args_dict['aligner_files']
+if 'seq_files' in args_dict: del args_dict['seq_files']
+if 'out_files' in args_dict: del args_dict['out_files']
+if 'command' in args_dict: del args_dict['command']
+if 'func' in args_dict: del args_dict['func']
+# Call main
+for i, f in enumerate(args.__dict__['aligner_files']):
+args_dict['aligner_file'] = f
+args_dict['seq_file'] = args.__dict__['seq_files'][i] \
+if args.__dict__['seq_files'] else None
+args_dict['out_file'] = args.__dict__['out_files'][i] \
+if args.__dict__['out_files'] else None
+args_dict['cellranger_file'] = args.__dict__['cellranger_file'][i] \
+if args.__dict__['cellranger_file'] else None
+args.func(**args_dict)

Mercurial > repos > davidvanzessen > shm_csr

comparison change_o/MakeDb.py @ 78:aff3ba86ef7a draft