annotate change_o/MakeDb.py @ 4:5ffd52fc35c4 draft

Uploaded
author davidvanzessen
date Mon, 12 Dec 2016 05:22:37 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1 #!/usr/bin/env python3
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
2 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
3 Create tab-delimited database file to store sequence alignment information
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
4 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
5 # Info
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
6 __author__ = 'Namita Gupta, Jason Anthony Vander Heiden'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
7 from changeo import __version__, __date__
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
8
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
9 # Imports
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
10 import csv
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
11 import os
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
12 import re
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
13 import sys
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
14 import pandas as pd
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
15 import tarfile
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
16 import zipfile
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
17 from argparse import ArgumentParser
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
18 from collections import OrderedDict
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
19 from itertools import groupby
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
20 from shutil import rmtree
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
21 from tempfile import mkdtemp
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
22 from textwrap import dedent
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
23 from time import time
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
24 from Bio import SeqIO
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
25 from Bio.Seq import Seq
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
26 from Bio.Alphabet import IUPAC
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
27
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
28 # Presto and changeo imports
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
29 from presto.Defaults import default_out_args
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
30 from presto.Annotation import parseAnnotation
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
31 from presto.IO import countSeqFile, printLog, printProgress
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
32 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
33 from changeo.IO import getDbWriter, countDbFile, getRepo
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
34 from changeo.Receptor import IgRecord, parseAllele, v_allele_regex, d_allele_regex, \
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
35 j_allele_regex
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
36
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
37 # Default parameters
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
38 default_delimiter = ('\t', ',', '-')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
39
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
40
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
41 def gapV(ig_dict, repo_dict):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
42 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
43 Insert gaps into V region and update alignment information
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
44
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
45 Arguments:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
46 ig_dict : Dictionary of parsed IgBlast output
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
47 repo_dict : Dictionary of IMGT gapped germline sequences
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
48
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
49 Returns:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
50 dict : Updated with SEQUENCE_IMGT, V_GERM_START_IMGT, and V_GERM_LENGTH_IMGT fields
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
51 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
52
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
53 seq_imgt = '.' * (int(ig_dict['V_GERM_START_VDJ'])-1) + ig_dict['SEQUENCE_VDJ']
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
54
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
55 # Find gapped germline V segment
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
56 vgene = parseAllele(ig_dict['V_CALL'], v_allele_regex, 'first')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
57 vkey = (vgene, )
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
58 #TODO: Figure out else case
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
59 if vkey in repo_dict:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
60 vgap = repo_dict[vkey]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
61 # Iterate over gaps in the germline segment
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
62 gaps = re.finditer(r'\.', vgap)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
63 gapcount = int(ig_dict['V_GERM_START_VDJ'])-1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
64 for gap in gaps:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
65 i = gap.start()
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
66 # Break if gap begins after V region
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
67 if i >= ig_dict['V_GERM_LENGTH_VDJ'] + gapcount:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
68 break
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
69 # Insert gap into IMGT sequence
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
70 seq_imgt = seq_imgt[:i] + '.' + seq_imgt[i:]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
71 # Update gap counter
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
72 gapcount += 1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
73 ig_dict['SEQUENCE_IMGT'] = seq_imgt
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
74 # Update IMGT positioning information for V
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
75 ig_dict['V_GERM_START_IMGT'] = 1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
76 ig_dict['V_GERM_LENGTH_IMGT'] = ig_dict['V_GERM_LENGTH_VDJ'] + gapcount
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
77
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
78 return ig_dict
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
79
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
80
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
81 def getIMGTJunc(ig_dict, repo_dict):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
82 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
83 Identify junction region by IMGT definition
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
84
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
85 Arguments:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
86 ig_dict : Dictionary of parsed IgBlast output
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
87 repo_dict : Dictionary of IMGT gapped germline sequences
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
88
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
89 Returns:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
90 dict : Updated with JUNCTION_LENGTH_IMGT and JUNCTION_IMGT fields
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
91 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
92 # Find germline J segment
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
93 jgene = parseAllele(ig_dict['J_CALL'], j_allele_regex, 'first')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
94 jkey = (jgene, )
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
95 #TODO: Figure out else case
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
96 if jkey in repo_dict:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
97 # Get germline J sequence
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
98 jgerm = repo_dict[jkey]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
99 jgerm = jgerm[:ig_dict['J_GERM_START']+ig_dict['J_GERM_LENGTH']-1]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
100 # Look for (F|W)GXG aa motif in nt sequence
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
101 motif = re.search(r'T(TT|TC|GG)GG[ACGT]{4}GG[AGCT]', jgerm)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
102 aa_end = len(ig_dict['SEQUENCE_IMGT'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
103 #TODO: Figure out else case
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
104 if motif:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
105 # print('\n', motif.group())
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
106 aa_end = motif.start() - len(jgerm) + 3
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
107 # Add fields to dict
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
108 ig_dict['JUNCTION'] = ig_dict['SEQUENCE_IMGT'][309:aa_end]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
109 ig_dict['JUNCTION_LENGTH'] = len(ig_dict['JUNCTION'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
110
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
111 return ig_dict
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
112
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
113
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
114 def getRegions(ig_dict):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
115 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
116 Identify FWR and CDR regions by IMGT definition
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
117
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
118 Arguments:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
119 ig_dict : Dictionary of parsed alignment output
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
120
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
121 Returns:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
122 dict : Updated with FWR1_IMGT, FWR2_IMGT, FWR3_IMGT, FWR4_IMGT,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
123 CDR1_IMGT, CDR2_IMGT, and CDR3_IMGT fields
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
124 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
125 try:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
126 seq_len = len(ig_dict['SEQUENCE_IMGT'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
127 ig_dict['FWR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][0:min(78,seq_len)]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
128 except (KeyError, IndexError):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
129 return ig_dict
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
130
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
131 try: ig_dict['CDR1_IMGT'] = ig_dict['SEQUENCE_IMGT'][78:min(114, seq_len)]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
132 except (IndexError): return ig_dict
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
133
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
134 try: ig_dict['FWR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][114:min(165, seq_len)]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
135 except (IndexError): return ig_dict
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
136
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
137 try: ig_dict['CDR2_IMGT'] = ig_dict['SEQUENCE_IMGT'][165:min(195, seq_len)]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
138 except (IndexError): return ig_dict
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
139
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
140 try: ig_dict['FWR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][195:min(312, seq_len)]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
141 except (IndexError): return ig_dict
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
142
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
143 try:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
144 cdr3_end = 306 + ig_dict['JUNCTION_LENGTH']
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
145 ig_dict['CDR3_IMGT'] = ig_dict['SEQUENCE_IMGT'][312:cdr3_end]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
146 ig_dict['FWR4_IMGT'] = ig_dict['SEQUENCE_IMGT'][cdr3_end:]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
147 except (KeyError, IndexError):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
148 return ig_dict
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
149
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
150 return ig_dict
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
151
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
152
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
153 def getSeqforIgBlast(seq_file):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
154 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
155 Fetch input sequences for IgBlast queries
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
156
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
157 Arguments:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
158 seq_file = a fasta file of sequences input to IgBlast
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
159
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
160 Returns:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
161 a dictionary of {ID:Seq}
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
162 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
163
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
164 seq_dict = SeqIO.index(seq_file, "fasta", IUPAC.ambiguous_dna)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
165
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
166 # Create a seq_dict ID translation using IDs truncate up to space or 50 chars
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
167 seqs = {}
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
168 for seq in seq_dict.values():
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
169 seqs.update({seq.description:str(seq.seq)})
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
170
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
171 return seqs
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
172
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
173
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
174 def findLine(handle, query):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
175 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
176 Finds line with query string in file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
177
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
178 Arguments:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
179 handle = file handle in which to search for line
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
180 query = query string for which to search in file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
181
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
182 Returns:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
183 line from handle in which query string was found
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
184 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
185 for line in handle:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
186 if(re.match(query, line)):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
187 return line
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
188
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
189
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
190 def extractIMGT(imgt_output):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
191 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
192 Extract necessary files from IMGT results, zipped or unzipped
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
193
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
194 Arguments:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
195 imgt_output = zipped file or unzipped folder output by IMGT
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
196
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
197 Returns:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
198 sorted list of filenames from which information will be read
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
199 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
200 #file_ext = os.path.splitext(imgt_output)[1].lower()
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
201 imgt_flags = ('1_Summary', '2_IMGT-gapped', '3_Nt-sequences', '6_Junction')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
202 temp_dir = mkdtemp()
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
203 if zipfile.is_zipfile(imgt_output):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
204 # Open zip file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
205 imgt_zip = zipfile.ZipFile(imgt_output, 'r')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
206 # Extract required files
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
207 imgt_files = sorted([n for n in imgt_zip.namelist() \
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
208 if os.path.basename(n).startswith(imgt_flags)])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
209 imgt_zip.extractall(temp_dir, imgt_files)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
210 # Define file list
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
211 imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
212 elif os.path.isdir(imgt_output):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
213 # Find required files in folder
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
214 folder_files = []
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
215 for root, dirs, files in os.walk(imgt_output):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
216 folder_files.extend([os.path.join(os.path.abspath(root), f) for f in files])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
217 # Define file list
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
218 imgt_files = sorted([n for n in folder_files \
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
219 if os.path.basename(n).startswith(imgt_flags)])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
220 elif tarfile.is_tarfile(imgt_output):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
221 # Open zip file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
222 imgt_tar = tarfile.open(imgt_output, 'r')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
223 # Extract required files
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
224 imgt_files = sorted([n for n in imgt_tar.getnames() \
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
225 if os.path.basename(n).startswith(imgt_flags)])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
226 imgt_tar.extractall(temp_dir, [imgt_tar.getmember(n) for n in imgt_files])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
227 # Define file list
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
228 imgt_files = [os.path.join(temp_dir, f) for f in imgt_files]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
229 else:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
230 sys.exit('ERROR: Unsupported IGMT output file. Must be either a zipped file (.zip), LZMA compressed tarfile (.txz) or a folder.')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
231
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
232 if len(imgt_files) > len(imgt_flags): # e.g. multiple 1_Summary files
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
233 sys.exit('ERROR: Wrong files in IMGT output %s.' % imgt_output)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
234 elif len(imgt_files) < len(imgt_flags):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
235 sys.exit('ERROR: Missing necessary file IMGT output %s.' % imgt_output)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
236
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
237 return temp_dir, imgt_files
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
238
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
239
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
240 # TODO: return a dictionary with keys determined by the comment strings in the blocks, thus avoiding problems with missing blocks
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
241 def readOneIgBlastResult(block):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
242 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
243 Parse a single IgBLAST query result
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
244
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
245 Arguments:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
246 block = itertools groupby object of single result
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
247
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
248 Returns:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
249 None if no results, otherwise list of DataFrames for each result block
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
250 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
251 results = list()
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
252 i = 0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
253 for match, subblock in groupby(block, lambda l: l=='\n'):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
254 if not match:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
255 # Strip whitespace and comments
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
256 sub = [s.strip() for s in subblock if not s.startswith('#')]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
257
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
258 # Continue on empty block
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
259 if not sub: continue
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
260 else: i += 1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
261
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
262 # Split by tabs
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
263 sub = [s.split('\t') for s in sub]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
264
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
265 # Append list for "V-(D)-J rearrangement summary" (i == 1)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
266 # And "V-(D)-J junction details" (i == 2)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
267 # Otherwise append DataFrame of subblock
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
268 if i == 1 or i == 2:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
269 results.append(sub[0])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
270 else:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
271 df = pd.DataFrame(sub)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
272 if not df.empty: results.append(df)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
273
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
274 return results if results else None
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
275
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
276
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
277 # TODO: needs more speeds. pandas is probably to blame.
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
278 def readIgBlast(igblast_output, seq_dict, repo_dict,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
279 score_fields=False, region_fields=False):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
280 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
281 Reads IgBlast output
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
282
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
283 Arguments:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
284 igblast_output = IgBlast output file (format 7)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
285 seq_dict = a dictionary of {ID:Seq} from input fasta file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
286 repo_dict = dictionary of IMGT gapped germline sequences
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
287 score_fields = if True parse alignment scores
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
288 region_fields = if True add FWR and CDR region fields
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
289
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
290 Returns:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
291 a generator of dictionaries containing alignment data
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
292 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
293
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
294 # Open IgBlast output file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
295 with open(igblast_output) as f:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
296 # Iterate over individual results (separated by # IGBLASTN)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
297 for k1, block in groupby(f, lambda x: re.match('# IGBLASTN', x)):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
298 block = list(block)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
299 if not k1:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
300 # TODO: move query name extraction into block parser readOneIgBlastResult().
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
301 # Extract sequence ID
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
302 query_name = ' '.join(block[0].strip().split(' ')[2:])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
303 # Initialize db_gen to have ID and input sequence
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
304 db_gen = {'SEQUENCE_ID': query_name,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
305 'SEQUENCE_INPUT': seq_dict[query_name]}
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
306
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
307 # Parse further sub-blocks
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
308 block_list = readOneIgBlastResult(block)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
309
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
310 # TODO: this is indented pretty far. should be a separate function. or several functions.
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
311 # If results exist, parse further to obtain full db_gen
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
312 if block_list is not None:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
313 # Parse quality information
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
314 db_gen['STOP'] = 'T' if block_list[0][-4] == 'Yes' else 'F'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
315 db_gen['IN_FRAME'] = 'T' if block_list[0][-3] == 'In-frame' else 'F'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
316 db_gen['FUNCTIONAL'] = 'T' if block_list[0][-2] == 'Yes' else 'F'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
317 if block_list[0][-1] == '-':
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
318 db_gen['SEQUENCE_INPUT'] = str(Seq(db_gen['SEQUENCE_INPUT'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
319 IUPAC.ambiguous_dna).reverse_complement())
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
320
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
321 # Parse V, D, and J calls
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
322 call_str = ' '.join(block_list[0])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
323 v_call = parseAllele(call_str, v_allele_regex, action='list')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
324 d_call = parseAllele(call_str, d_allele_regex, action='list')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
325 j_call = parseAllele(call_str, j_allele_regex, action='list')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
326 db_gen['V_CALL'] = ','.join(v_call) if v_call is not None else 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
327 db_gen['D_CALL'] = ','.join(d_call) if d_call is not None else 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
328 db_gen['J_CALL'] = ','.join(j_call) if j_call is not None else 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
329
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
330 # Parse junction sequence
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
331 # db_gen['JUNCTION_VDJ'] = re.sub('(N/A)|\[|\(|\)|\]', '', ''.join(block_list[1]))
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
332 # db_gen['JUNCTION_LENGTH_VDJ'] = len(db_gen['JUNCTION_VDJ'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
333
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
334 # TODO: IgBLAST does a stupid and doesn't output block #3 sometimes. why?
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
335 # TODO: maybe we should fail these. they look craptastic.
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
336 #pd.set_option('display.width', 500)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
337 #print query_name, len(block_list), hit_idx
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
338 #for i, x in enumerate(block_list):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
339 # print '[%i]' % i
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
340 # print x
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
341
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
342 # Parse segment start and stop positions
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
343 hit_df = block_list[-1]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
344
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
345 # Alignment info block
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
346 # 0: segment
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
347 # 1: query id
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
348 # 2: subject id
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
349 # 3: % identity
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
350 # 4: alignment length
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
351 # 5: mismatches
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
352 # 6: gap opens
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
353 # 7: gaps
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
354 # 8: q. start
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
355 # 9: q. end
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
356 # 10: s. start
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
357 # 11: s. end
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
358 # 12: evalue
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
359 # 13: bit score
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
360 # 14: query seq
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
361 # 15: subject seq
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
362 # 16: btop
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
363
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
364 # If V call exists, parse V alignment information
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
365 seq_vdj = ''
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
366 if v_call is not None:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
367 v_align = hit_df[hit_df[0] == 'V'].iloc[0]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
368 # Germline positions
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
369 db_gen['V_GERM_START_VDJ'] = int(v_align[10])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
370 db_gen['V_GERM_LENGTH_VDJ'] = int(v_align[11]) - db_gen['V_GERM_START_VDJ'] + 1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
371 # Query sequence positions
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
372 db_gen['V_SEQ_START'] = int(v_align[8])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
373 db_gen['V_SEQ_LENGTH'] = int(v_align[9]) - db_gen['V_SEQ_START'] + 1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
374
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
375 if int(v_align[6]) == 0:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
376 db_gen['INDELS'] = 'F'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
377 else:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
378 db_gen['INDELS'] = 'T'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
379 # Set functional to none so record gets tossed (junction will be wrong)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
380 # db_gen['FUNCTIONAL'] = None
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
381
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
382 # V alignment scores
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
383 if score_fields:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
384 try: db_gen['V_SCORE'] = float(v_align[13])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
385 except (TypeError, ValueError): db_gen['V_SCORE'] = 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
386
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
387 try: db_gen['V_IDENTITY'] = float(v_align[3]) / 100.0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
388 except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
389
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
390 try: db_gen['V_EVALUE'] = float(v_align[12])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
391 except (TypeError, ValueError): db_gen['V_EVALUE'] = 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
392
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
393 try: db_gen['V_BTOP'] = v_align[16]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
394 except (TypeError, ValueError): db_gen['V_BTOP'] = 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
395
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
396 # Update VDJ sequence, removing insertions
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
397 start = 0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
398 for m in re.finditer(r'-', v_align[15]):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
399 ins = m.start()
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
400 seq_vdj += v_align[14][start:ins]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
401 start = ins + 1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
402 seq_vdj += v_align[14][start:]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
403
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
404 # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them.
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
405 # If D call exists, parse D alignment information
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
406 if d_call is not None:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
407 d_align = hit_df[hit_df[0] == 'D'].iloc[0]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
408
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
409 # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though.
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
410 # Determine N-region length and amount of J overlap with V or D alignment
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
411 overlap = 0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
412 if v_call is not None:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
413 n1_len = int(d_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
414 if n1_len < 0:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
415 db_gen['N1_LENGTH'] = 0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
416 overlap = abs(n1_len)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
417 else:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
418 db_gen['N1_LENGTH'] = n1_len
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
419 n1_start = (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH']-1)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
420 n1_end = int(d_align[8])-1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
421 seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
422
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
423 # Query sequence positions
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
424 db_gen['D_SEQ_START'] = int(d_align[8]) + overlap
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
425 db_gen['D_SEQ_LENGTH'] = max(int(d_align[9]) - db_gen['D_SEQ_START'] + 1, 0)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
426
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
427 # Germline positions
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
428 db_gen['D_GERM_START'] = int(d_align[10]) + overlap
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
429 db_gen['D_GERM_LENGTH'] = max(int(d_align[11]) - db_gen['D_GERM_START'] + 1, 0)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
430
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
431 # Update VDJ sequence, removing insertions
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
432 start = overlap
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
433 for m in re.finditer(r'-', d_align[15]):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
434 ins = m.start()
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
435 seq_vdj += d_align[14][start:ins]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
436 start = ins + 1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
437 seq_vdj += d_align[14][start:]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
438
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
439 # TODO: needs to check that the V results are present before trying to determine N1_LENGTH from them.
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
440 # If J call exists, parse J alignment information
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
441 if j_call is not None:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
442 j_align = hit_df[hit_df[0] == 'J'].iloc[0]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
443
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
444 # TODO: this is kinda gross. not sure how else to fix the alignment overlap problem though.
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
445 # Determine N-region length and amount of J overlap with V or D alignment
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
446 overlap = 0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
447 if d_call is not None:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
448 n2_len = int(j_align[8]) - (db_gen['D_SEQ_START'] + db_gen['D_SEQ_LENGTH'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
449 if n2_len < 0:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
450 db_gen['N2_LENGTH'] = 0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
451 overlap = abs(n2_len)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
452 else:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
453 db_gen['N2_LENGTH'] = n2_len
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
454 n2_start = (db_gen['D_SEQ_START']+db_gen['D_SEQ_LENGTH']-1)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
455 n2_end = int(j_align[8])-1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
456 seq_vdj += db_gen['SEQUENCE_INPUT'][n2_start:n2_end]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
457 elif v_call is not None:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
458 n1_len = int(j_align[8]) - (db_gen['V_SEQ_START'] + db_gen['V_SEQ_LENGTH'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
459 if n1_len < 0:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
460 db_gen['N1_LENGTH'] = 0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
461 overlap = abs(n1_len)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
462 else:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
463 db_gen['N1_LENGTH'] = n1_len
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
464 n1_start = (db_gen['V_SEQ_START']+db_gen['V_SEQ_LENGTH']-1)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
465 n1_end = int(j_align[8])-1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
466 seq_vdj += db_gen['SEQUENCE_INPUT'][n1_start:n1_end]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
467 else:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
468 db_gen['N1_LENGTH'] = 0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
469
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
470 # Query positions
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
471 db_gen['J_SEQ_START'] = int(j_align[8]) + overlap
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
472 db_gen['J_SEQ_LENGTH'] = max(int(j_align[9]) - db_gen['J_SEQ_START'] + 1, 0)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
473
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
474 # Germline positions
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
475 db_gen['J_GERM_START'] = int(j_align[10]) + overlap
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
476 db_gen['J_GERM_LENGTH'] = max(int(j_align[11]) - db_gen['J_GERM_START'] + 1, 0)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
477
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
478 # J alignment scores
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
479 if score_fields:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
480 try: db_gen['J_SCORE'] = float(j_align[13])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
481 except (TypeError, ValueError): db_gen['J_SCORE'] = 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
482
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
483 try: db_gen['J_IDENTITY'] = float(j_align[3]) / 100.0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
484 except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
485
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
486 try: db_gen['J_EVALUE'] = float(j_align[12])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
487 except (TypeError, ValueError): db_gen['J_EVALUE'] = 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
488
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
489 try: db_gen['J_BTOP'] = j_align[16]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
490 except (TypeError, ValueError): db_gen['J_BTOP'] = 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
491
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
492 # Update VDJ sequence, removing insertions
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
493 start = overlap
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
494 for m in re.finditer(r'-', j_align[15]):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
495 ins = m.start()
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
496 seq_vdj += j_align[14][start:ins]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
497 start = ins + 1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
498 seq_vdj += j_align[14][start:]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
499
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
500 db_gen['SEQUENCE_VDJ'] = seq_vdj
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
501
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
502 # Create IMGT-gapped sequence and infer IMGT junction
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
503 if v_call is not None:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
504 db_gen = gapV(db_gen, repo_dict)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
505 if j_call is not None:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
506 db_gen = getIMGTJunc(db_gen, repo_dict)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
507
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
508 # FWR and CDR regions
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
509 if region_fields: getRegions(db_gen)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
510
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
511 yield IgRecord(db_gen)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
512
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
513
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
514 # TODO: should be more readable
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
515 def readIMGT(imgt_files, score_fields=False, region_fields=False):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
516 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
517 Reads IMGT/HighV-Quest output
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
518
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
519 Arguments:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
520 imgt_files = IMGT/HighV-Quest output files 1, 2, 3, and 6
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
521 score_fields = if True parse alignment scores
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
522 region_fields = if True add FWR and CDR region fields
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
523
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
524 Returns:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
525 a generator of dictionaries containing alignment data
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
526 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
527 imgt_iters = [csv.DictReader(open(f, 'rU'), delimiter='\t') for f in imgt_files]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
528 # Create a dictionary for each sequence alignment and yield its generator
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
529 for sm, gp, nt, jn in zip(*imgt_iters):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
530 if len(set([sm['Sequence ID'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
531 gp['Sequence ID'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
532 nt['Sequence ID'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
533 jn['Sequence ID']])) != 1:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
534 sys.exit('Error: IMGT files are corrupt starting with Summary file record %s' \
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
535 % sm['Sequence ID'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
536
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
537 db_gen = {'SEQUENCE_ID': sm['Sequence ID'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
538 'SEQUENCE_INPUT': sm['Sequence']}
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
539
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
540 if 'No results' not in sm['Functionality']:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
541 db_gen['FUNCTIONAL'] = ['?','T','F'][('productive' in sm['Functionality']) +
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
542 ('unprod' in sm['Functionality'])]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
543 db_gen['IN_FRAME'] = ['?','T','F'][('in-frame' in sm['JUNCTION frame']) +
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
544 ('out-of-frame' in sm['JUNCTION frame'])],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
545 db_gen['STOP'] = ['F','?','T'][('stop codon' in sm['Functionality comment']) +
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
546 ('unprod' in sm['Functionality'])]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
547 db_gen['MUTATED_INVARIANT'] = ['F','?','T'][(any(('missing' in sm['Functionality comment'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
548 'missing' in sm['V-REGION potential ins/del']))) +
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
549 ('unprod' in sm['Functionality'])]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
550 db_gen['INDELS'] = ['F','T'][any((sm['V-REGION potential ins/del'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
551 sm['V-REGION insertions'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
552 sm['V-REGION deletions']))]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
553
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
554 db_gen['SEQUENCE_VDJ'] = nt['V-D-J-REGION'] if nt['V-D-J-REGION'] else nt['V-J-REGION']
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
555 db_gen['SEQUENCE_IMGT'] = gp['V-D-J-REGION'] if gp['V-D-J-REGION'] else gp['V-J-REGION']
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
556
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
557 db_gen['V_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['V-GENE and allele']))
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
558 db_gen['D_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['D-GENE and allele']))
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
559 db_gen['J_CALL'] = re.sub('\sor\s', ',', re.sub(',', '', gp['J-GENE and allele']))
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
560
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
561 v_seq_length = len(nt['V-REGION']) if nt['V-REGION'] else 0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
562 db_gen['V_SEQ_START'] = nt['V-REGION start']
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
563 db_gen['V_SEQ_LENGTH'] = v_seq_length
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
564 db_gen['V_GERM_START_IMGT'] = 1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
565 db_gen['V_GERM_LENGTH_IMGT'] = len(gp['V-REGION']) if gp['V-REGION'] else 0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
566
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
567 db_gen['N1_LENGTH'] = sum(int(i) for i in [jn["P3'V-nt nb"],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
568 jn['N-REGION-nt nb'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
569 jn['N1-REGION-nt nb'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
570 jn["P5'D-nt nb"]] if i)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
571 db_gen['D_SEQ_START'] = sum(int(i) for i in [1, v_seq_length,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
572 jn["P3'V-nt nb"],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
573 jn['N-REGION-nt nb'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
574 jn['N1-REGION-nt nb'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
575 jn["P5'D-nt nb"]] if i)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
576 db_gen['D_SEQ_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
577 db_gen['D_GERM_START'] = int(jn["5'D-REGION trimmed-nt nb"] or 0) + 1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
578 db_gen['D_GERM_LENGTH'] = int(jn["D-REGION-nt nb"] or 0)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
579 db_gen['N2_LENGTH'] = sum(int(i) for i in [jn["P3'D-nt nb"],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
580 jn['N2-REGION-nt nb'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
581 jn["P5'J-nt nb"]] if i)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
582
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
583 db_gen['J_SEQ_START_IMGT'] = sum(int(i) for i in [1, v_seq_length,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
584 jn["P3'V-nt nb"],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
585 jn['N-REGION-nt nb'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
586 jn['N1-REGION-nt nb'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
587 jn["P5'D-nt nb"],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
588 jn["D-REGION-nt nb"],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
589 jn["P3'D-nt nb"],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
590 jn['N2-REGION-nt nb'],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
591 jn["P5'J-nt nb"]] if i)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
592 db_gen['J_SEQ_LENGTH'] = len(nt['J-REGION']) if nt['J-REGION'] else 0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
593 db_gen['J_GERM_START'] = int(jn["5'J-REGION trimmed-nt nb"] or 0) + 1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
594 db_gen['J_GERM_LENGTH'] = len(gp['J-REGION']) if gp['J-REGION'] else 0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
595
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
596 db_gen['JUNCTION_LENGTH'] = len(jn['JUNCTION']) if jn['JUNCTION'] else 0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
597 db_gen['JUNCTION'] = jn['JUNCTION']
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
598
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
599 # Alignment scores
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
600 if score_fields:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
601 try: db_gen['V_SCORE'] = float(sm['V-REGION score'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
602 except (TypeError, ValueError): db_gen['V_SCORE'] = 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
603
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
604 try: db_gen['V_IDENTITY'] = float(sm['V-REGION identity %']) / 100.0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
605 except (TypeError, ValueError): db_gen['V_IDENTITY'] = 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
606
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
607 try: db_gen['J_SCORE'] = float(sm['J-REGION score'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
608 except (TypeError, ValueError): db_gen['J_SCORE'] = 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
609
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
610 try: db_gen['J_IDENTITY'] = float(sm['J-REGION identity %']) / 100.0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
611 except (TypeError, ValueError): db_gen['J_IDENTITY'] = 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
612
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
613 # FWR and CDR regions
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
614 if region_fields: getRegions(db_gen)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
615 else:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
616 db_gen['V_CALL'] = 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
617 db_gen['D_CALL'] = 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
618 db_gen['J_CALL'] = 'None'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
619
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
620 yield IgRecord(db_gen)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
621
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
622
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
623 def getIDforIMGT(seq_file):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
624 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
625 Create a sequence ID translation using IMGT truncation
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
626
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
627 Arguments:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
628 seq_file = a fasta file of sequences input to IMGT
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
629
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
630 Returns:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
631 a dictionary of {truncated ID: full seq description}
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
632 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
633
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
634 # Create a seq_dict ID translation using IDs truncate up to space or 50 chars
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
635 ids = {}
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
636 for i, rec in enumerate(SeqIO.parse(seq_file, 'fasta', IUPAC.ambiguous_dna)):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
637 if len(rec.description) <= 50:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
638 id_key = rec.description
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
639 else:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
640 id_key = re.sub('\||\s|!|&|\*|<|>|\?','_',rec.description[:50])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
641 ids.update({id_key:rec.description})
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
642
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
643 return ids
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
644
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
645
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
646 def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
647 score_fields=False, region_fields=False, out_args=default_out_args):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
648 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
649 Writes tab-delimited database file in output directory
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
650
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
651 Arguments:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
652 db_gen = a generator of IgRecord objects containing alignment data
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
653 file_prefix = directory and prefix for CLIP tab-delim file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
654 total_count = number of records (for progress bar)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
655 id_dict = a dictionary of {IMGT ID: full seq description}
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
656 no_parse = if ID is to be parsed for pRESTO output with default delimiters
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
657 score_fields = if True add alignment score fields to output file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
658 region_fields = if True add FWR and CDR region fields to output file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
659 out_args = common output argument dictionary from parseCommonArgs
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
660
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
661 Returns:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
662 None
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
663 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
664 pass_file = "%s_db-pass.tab" % file_prefix
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
665 fail_file = "%s_db-fail.tab" % file_prefix
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
666 ordered_fields = ['SEQUENCE_ID',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
667 'SEQUENCE_INPUT',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
668 'FUNCTIONAL',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
669 'IN_FRAME',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
670 'STOP',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
671 'MUTATED_INVARIANT',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
672 'INDELS',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
673 'V_CALL',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
674 'D_CALL',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
675 'J_CALL',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
676 'SEQUENCE_VDJ',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
677 'SEQUENCE_IMGT',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
678 'V_SEQ_START',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
679 'V_SEQ_LENGTH',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
680 'V_GERM_START_VDJ',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
681 'V_GERM_LENGTH_VDJ',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
682 'V_GERM_START_IMGT',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
683 'V_GERM_LENGTH_IMGT',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
684 'N1_LENGTH',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
685 'D_SEQ_START',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
686 'D_SEQ_LENGTH',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
687 'D_GERM_START',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
688 'D_GERM_LENGTH',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
689 'N2_LENGTH',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
690 'J_SEQ_START',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
691 'J_SEQ_LENGTH',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
692 'J_GERM_START',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
693 'J_GERM_LENGTH',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
694 'JUNCTION_LENGTH',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
695 'JUNCTION']
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
696
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
697 if score_fields:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
698 ordered_fields.extend(['V_SCORE',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
699 'V_IDENTITY',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
700 'V_EVALUE',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
701 'V_BTOP',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
702 'J_SCORE',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
703 'J_IDENTITY',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
704 'J_EVALUE',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
705 'J_BTOP'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
706
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
707 if region_fields:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
708 ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
709 'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
710
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
711
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
712 # TODO: This is not the best approach. should pass in output fields.
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
713 # Initiate passed handle
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
714 pass_handle = None
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
715
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
716 # Open failed file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
717 if out_args['failed']:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
718 fail_handle = open(fail_file, 'wt')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
719 fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
720 else:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
721 fail_handle = None
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
722 fail_writer = None
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
723
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
724 # Initialize counters and file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
725 pass_writer = None
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
726 start_time = time()
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
727 rec_count = pass_count = fail_count = 0
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
728 for record in db_gen:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
729 #printProgress(i + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
730 printProgress(rec_count, total_count, 0.05, start_time)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
731 rec_count += 1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
732
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
733 # Count pass or fail
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
734 if (record.v_call == 'None' and record.j_call == 'None') or \
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
735 record.functional is None or \
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
736 not record.seq_vdj or \
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
737 not record.junction:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
738 # print(record.v_call, record.j_call, record.functional, record.junction)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
739 fail_count += 1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
740 if fail_writer is not None: fail_writer.writerow(record.toDict())
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
741 continue
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
742 else:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
743 pass_count += 1
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
744
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
745 # Build sample sequence description
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
746 if record.id in id_dict:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
747 record.id = id_dict[record.id]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
748
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
749 # Parse sequence description into new columns
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
750 if not no_parse:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
751 record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
752 record.id = record.annotations['ID']
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
753 del record.annotations['ID']
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
754
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
755 # TODO: This is not the best approach. should pass in output fields.
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
756 # If first sequence, use parsed description to create new columns and initialize writer
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
757 if pass_writer is None:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
758 if not no_parse: ordered_fields.extend(list(record.annotations.keys()))
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
759 pass_handle = open(pass_file, 'wt')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
760 pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
761
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
762 # Write row to tab-delim CLIP file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
763 pass_writer.writerow(record.toDict())
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
764
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
765 # Print log
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
766 #printProgress(i+1 + (total_count/2 if id_dict else 0), total_count, 0.05, start_time)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
767 printProgress(rec_count, total_count, 0.05, start_time)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
768
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
769 log = OrderedDict()
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
770 log['OUTPUT'] = pass_file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
771 log['PASS'] = pass_count
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
772 log['FAIL'] = fail_count
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
773 log['END'] = 'MakeDb'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
774 printLog(log)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
775
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
776 if pass_handle is not None: pass_handle.close()
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
777 if fail_handle is not None: fail_handle.close()
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
778
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
779
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
780 # TODO: may be able to merge with parseIMGT
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
781 def parseIgBlast(igblast_output, seq_file, repo, no_parse=True, score_fields=False,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
782 region_fields=False, out_args=default_out_args):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
783 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
784 Main for IgBlast aligned sample sequences
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
785
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
786 Arguments:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
787 igblast_output = IgBlast output file to process
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
788 seq_file = fasta file input to IgBlast (from which to get sequence)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
789 repo = folder with germline repertoire files
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
790 no_parse = if ID is to be parsed for pRESTO output with default delimiters
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
791 score_fields = if True add alignment score fields to output file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
792 region_fields = if True add FWR and CDR region fields to output file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
793 out_args = common output argument dictionary from parseCommonArgs
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
794
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
795 Returns:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
796 None
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
797 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
798 # Print parameter info
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
799 log = OrderedDict()
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
800 log['START'] = 'MakeDB'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
801 log['ALIGNER'] = 'IgBlast'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
802 log['ALIGN_RESULTS'] = os.path.basename(igblast_output)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
803 log['SEQ_FILE'] = os.path.basename(seq_file)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
804 log['NO_PARSE'] = no_parse
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
805 log['SCORE_FIELDS'] = score_fields
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
806 log['REGION_FIELDS'] = region_fields
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
807 printLog(log)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
808
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
809 # Get input sequence dictionary
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
810 seq_dict = getSeqforIgBlast(seq_file)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
811
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
812 # Formalize out_dir and file-prefix
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
813 if not out_args['out_dir']:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
814 out_dir = os.path.split(igblast_output)[0]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
815 else:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
816 out_dir = os.path.abspath(out_args['out_dir'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
817 if not os.path.exists(out_dir): os.mkdir(out_dir)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
818 if out_args['out_name']:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
819 file_prefix = out_args['out_name']
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
820 else:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
821 file_prefix = os.path.basename(os.path.splitext(igblast_output)[0])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
822 file_prefix = os.path.join(out_dir, file_prefix)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
823
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
824 total_count = countSeqFile(seq_file)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
825
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
826 # Create
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
827 repo_dict = getRepo(repo)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
828 igblast_dict = readIgBlast(igblast_output, seq_dict, repo_dict,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
829 score_fields=score_fields, region_fields=region_fields)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
830 writeDb(igblast_dict, file_prefix, total_count, no_parse=no_parse,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
831 score_fields=score_fields, region_fields=region_fields, out_args=out_args)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
832
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
833
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
834 # TODO: may be able to merge with parseIgBlast
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
835 def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
836 region_fields=False, out_args=default_out_args):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
837 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
838 Main for IMGT aligned sample sequences
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
839
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
840 Arguments:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
841 imgt_output = zipped file or unzipped folder output by IMGT
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
842 seq_file = FASTA file input to IMGT (from which to get seqID)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
843 no_parse = if ID is to be parsed for pRESTO output with default delimiters
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
844 score_fields = if True add alignment score fields to output file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
845 region_fields = if True add FWR and CDR region fields to output file
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
846 out_args = common output argument dictionary from parseCommonArgs
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
847
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
848 Returns:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
849 None
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
850 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
851 # Print parameter info
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
852 log = OrderedDict()
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
853 log['START'] = 'MakeDb'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
854 log['ALIGNER'] = 'IMGT'
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
855 log['ALIGN_RESULTS'] = imgt_output
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
856 log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
857 log['NO_PARSE'] = no_parse
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
858 log['SCORE_FIELDS'] = score_fields
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
859 log['REGION_FIELDS'] = region_fields
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
860 printLog(log)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
861
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
862 # Get individual IMGT result files
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
863 temp_dir, imgt_files = extractIMGT(imgt_output)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
864
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
865 # Formalize out_dir and file-prefix
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
866 if not out_args['out_dir']:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
867 out_dir = os.path.dirname(os.path.abspath(imgt_output))
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
868 else:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
869 out_dir = os.path.abspath(out_args['out_dir'])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
870 if not os.path.exists(out_dir): os.mkdir(out_dir)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
871 if out_args['out_name']:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
872 file_prefix = out_args['out_name']
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
873 else:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
874 file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
875 file_prefix = os.path.join(out_dir, file_prefix)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
876
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
877 total_count = countDbFile(imgt_files[0])
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
878
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
879 # Get (parsed) IDs from fasta file submitted to IMGT
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
880 id_dict = getIDforIMGT(seq_file) if seq_file else {}
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
881
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
882 # Create
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
883 imgt_dict = readIMGT(imgt_files, score_fields=score_fields,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
884 region_fields=region_fields)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
885 writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
886 score_fields=score_fields, region_fields=region_fields, out_args=out_args)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
887
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
888 # Delete temp directory
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
889 rmtree(temp_dir)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
890
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
891
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
892 def getArgParser():
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
893 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
894 Defines the ArgumentParser
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
895
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
896 Arguments:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
897 None
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
898
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
899 Returns:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
900 an ArgumentParser object
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
901 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
902 fields = dedent(
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
903 '''
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
904 output files:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
905 db-pass
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
906 database of parsed alignment records.
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
907 db-fail
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
908 database with records failing alignment.
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
909
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
910 output fields:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
911 SEQUENCE_ID, SEQUENCE_INPUT, FUNCTIONAL, IN_FRAME, STOP, MUTATED_INVARIANT,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
912 INDELS, V_CALL, D_CALL, J_CALL, SEQUENCE_VDJ and/or SEQUENCE_IMGT,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
913 V_SEQ_START, V_SEQ_LENGTH, V_GERM_START_VDJ and/or V_GERM_START_IMGT,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
914 V_GERM_LENGTH_VDJ and/or V_GERM_LENGTH_IMGT, N1_LENGTH,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
915 D_SEQ_START, D_SEQ_LENGTH, D_GERM_START, D_GERM_LENGTH, N2_LENGTH,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
916 J_SEQ_START, J_SEQ_LENGTH, J_GERM_START, J_GERM_LENGTH,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
917 JUNCTION_LENGTH, JUNCTION, V_SCORE, V_IDENTITY, V_EVALUE, V_BTOP,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
918 J_SCORE, J_IDENTITY, J_EVALUE, J_BTOP, FWR1_IMGT, FWR2_IMGT, FWR3_IMGT,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
919 FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, CDR3_IMGT
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
920 ''')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
921
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
922 # Define ArgumentParser
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
923 parser = ArgumentParser(description=__doc__, epilog=fields,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
924 formatter_class=CommonHelpFormatter)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
925 parser.add_argument('--version', action='version',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
926 version='%(prog)s:' + ' %s-%s' %(__version__, __date__))
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
927 subparsers = parser.add_subparsers(title='subcommands', dest='command',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
928 help='Aligner used', metavar='')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
929 # TODO: This is a temporary fix for Python issue 9253
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
930 subparsers.required = True
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
931
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
932 # Parent parser
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
933 parser_parent = getCommonArgParser(seq_in=False, seq_out=False, log=False)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
934
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
935 # IgBlast Aligner
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
936 parser_igblast = subparsers.add_parser('igblast', help='Process IgBlast output',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
937 parents=[parser_parent],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
938 formatter_class=CommonHelpFormatter)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
939 parser_igblast.set_defaults(func=parseIgBlast)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
940 parser_igblast.add_argument('-i', nargs='+', action='store', dest='aligner_files',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
941 required=True,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
942 help='''IgBLAST output files in format 7 with query sequence
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
943 (IgBLAST argument \'-outfmt "7 std qseq sseq btop"\').''')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
944 parser_igblast.add_argument('-r', nargs='+', action='store', dest='repo', required=True,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
945 help='''List of folders and/or fasta files containing
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
946 IMGT-gapped germline sequences corresponding to the
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
947 set of germlines used in the IgBLAST alignment.''')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
948 parser_igblast.add_argument('-s', action='store', nargs='+', dest='seq_files',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
949 required=True,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
950 help='List of input FASTA files containing sequences')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
951 parser_igblast.add_argument('--noparse', action='store_true', dest='no_parse',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
952 help='''Specify if input IDs should not be parsed to add
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
953 new columns to database.''')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
954 parser_igblast.add_argument('--scores', action='store_true', dest='score_fields',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
955 help='''Specify if alignment score metrics should be
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
956 included in the output. Adds the V_SCORE, V_IDENTITY,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
957 V_EVALUE, V_BTOP, J_SCORE, J_IDENTITY,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
958 J_BTOP, and J_EVALUE columns.''')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
959 parser_igblast.add_argument('--regions', action='store_true', dest='region_fields',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
960 help='''Specify if IMGT framework and CDR regions should be
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
961 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
962 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
963 CDR3_IMGT columns.''')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
964
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
965 # IMGT aligner
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
966 parser_imgt = subparsers.add_parser('imgt', help='Process IMGT/HighV-Quest output',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
967 parents=[parser_parent],
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
968 formatter_class=CommonHelpFormatter)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
969 imgt_arg_group = parser_imgt.add_mutually_exclusive_group(required=True)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
970 imgt_arg_group.add_argument('-i', nargs='+', action='store', dest='aligner_files',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
971 help='''Either zipped IMGT output files (.zip) or a folder
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
972 containing unzipped IMGT output files (which must
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
973 include 1_Summary, 2_IMGT-gapped, 3_Nt-sequences,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
974 and 6_Junction).''')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
975 parser_imgt.add_argument('-s', nargs='*', action='store', dest='seq_files',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
976 required=False,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
977 help='List of input FASTA files containing sequences')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
978 parser_imgt.add_argument('--noparse', action='store_true', dest='no_parse',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
979 help='''Specify if input IDs should not be parsed to add new
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
980 columns to database.''')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
981 parser_imgt.add_argument('--scores', action='store_true', dest='score_fields',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
982 help='''Specify if alignment score metrics should be
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
983 included in the output. Adds the V_SCORE, V_IDENTITY,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
984 J_SCORE and J_IDENTITY. Note, this will also add
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
985 the columns V_EVALUE, V_BTOP, J_EVALUE and J_BTOP,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
986 but they will be empty for IMGT output.''')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
987 parser_imgt.add_argument('--regions', action='store_true', dest='region_fields',
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
988 help='''Specify if IMGT framework and CDR regions should be
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
989 included in the output. Adds the FWR1_IMGT, FWR2_IMGT,
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
990 FWR3_IMGT, FWR4_IMGT, CDR1_IMGT, CDR2_IMGT, and
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
991 CDR3_IMGT columns.''')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
992 parser_imgt.set_defaults(func=parseIMGT)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
993
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
994 return parser
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
995
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
996
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
997 if __name__ == "__main__":
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
998 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
999 Parses command line arguments and calls main
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1000 """
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1001 parser = getArgParser()
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1002 args = parser.parse_args()
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1003 args_dict = parseCommonArgs(args, in_arg='aligner_files')
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1004
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1005 # Set no ID parsing if sequence files are not provided
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1006 if 'seq_files' in args_dict and not args_dict['seq_files']:
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1007 args_dict['no_parse'] = True
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1008
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1009 # Delete
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1010 if 'seq_files' in args_dict: del args_dict['seq_files']
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1011 if 'aligner_files' in args_dict: del args_dict['aligner_files']
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1012 if 'command' in args_dict: del args_dict['command']
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1013 if 'func' in args_dict: del args_dict['func']
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1014
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1015 if args.command == 'imgt':
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1016 for i in range(len(args.__dict__['aligner_files'])):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1017 args_dict['imgt_output'] = args.__dict__['aligner_files'][i]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1018 args_dict['seq_file'] = args.__dict__['seq_files'][i] \
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1019 if args.__dict__['seq_files'] else None
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1020 args.func(**args_dict)
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1021 elif args.command == 'igblast':
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1022 for i in range(len(args.__dict__['aligner_files'])):
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1023 args_dict['igblast_output'] = args.__dict__['aligner_files'][i]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1024 args_dict['seq_file'] = args.__dict__['seq_files'][i]
5ffd52fc35c4 Uploaded
davidvanzessen
parents:
diff changeset
1025 args.func(**args_dict)