Mercurial > repos > public-health-bioinformatics > aggregate_linelisting
comparison aggregate_linelisting.py @ 0:515c0c885f5d draft default tip
planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit b96b6e06f6eaa6ae8ef4c24630dbb72a4aed7dbe
author | public-health-bioinformatics |
---|---|
date | Thu, 04 Jul 2019 19:40:13 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:515c0c885f5d |
---|---|
1 #!/usr/bin/env python | |
2 '''Reads in a fasta file of antigenic maps and one with the reference antigenic map as | |
3 protein SeqRecords. Compares amino acids of sample antigenic maps to corresponding sites | |
4 in the reference and masks identical amino acids with dots. Writes headers (including | |
5 amino acid position numbers read from the respective index array), the reference amino | |
6 acid sequence and column headings required for both non-aggregated and aggregated line lists. | |
7 Outputs all headers and modified (i.e. dotted) sample sequences to a csv file.''' | |
8 | |
9 '''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Jan 2018''' | |
10 | |
11 import sys,string,os, time, Bio, re, argparse | |
12 from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord | |
13 from Bio.SeqRecord import SeqRecord | |
14 from Bio.Alphabet import IUPAC | |
15 from Bio.Seq import Seq | |
16 | |
17 inputAntigenicMaps = sys.argv[1] #batch fasta file with antigenic map sequences | |
18 refAntigenicMap = sys.argv[2] #fasta file of reference antigenic map sequence | |
19 antigenicSiteIndexArray = sys.argv[3] #antigenic site index array csv file | |
20 cladeDefinitionFile = sys.argv[4] #clade definition csv file | |
21 outFileHandle = sys.argv[5] #user-specifed output filename | |
22 | |
23 agg_lineListFile = open(outFileHandle,'w') #open a writable output file | |
24 | |
25 indicesLine = "" #comma-separated antigenic site positions | |
26 cladeList = [] #list of clade names read from clade definition file | |
27 ref_seq = "" #reference antigenic map (protein sequence) | |
28 seqList = [] #list of aa sequences to compare to reference | |
29 | |
30 BC_list = [] #empty list for BC samples | |
31 AB_list = [] #empty list for AB samples | |
32 ON_list = [] #empty list for ON samples | |
33 QC_list = [] #empty list for QC samples | |
34 nonprov_list = [] #empty list for samples not in above 4 provinces | |
35 #dictionary for location-separated sequence lists | |
36 prov_lists = {'1_BC':BC_list,'2_AB':AB_list,'3_ON':ON_list,'4_QC': QC_list, '5_nonprov': nonprov_list} | |
37 | |
38 def replace_matching_aa_with_dot(record): | |
39 """Compare amino acids in record to reference, mask identical symbols with dots, and return modified record.""" | |
40 orig_seq = str(record.seq) #sequence string from SeqRecord | |
41 mod_seq = "" | |
42 #replace only those aa's matching the reference with dots | |
43 for i in range(0, len(orig_seq)): | |
44 if (orig_seq[i] == ref_seq[i]): | |
45 mod_seq = mod_seq + '.' | |
46 else: | |
47 mod_seq = mod_seq + orig_seq[i] | |
48 #assign modified sequence to new SeqRecord and return it | |
49 rec = SeqRecord(Seq(mod_seq,IUPAC.protein), id = record.id, name = "", description = "") | |
50 return rec | |
51 | |
52 def extract_clade(record): | |
53 """Extract clade name (or 'No_Match') from sequence name and return as clade name. """ | |
54 if record.id.endswith('No_Match'): | |
55 clade_name = 'No_Match' | |
56 else: # | |
57 for clade in cladeList: | |
58 if record.id.endswith(clade): | |
59 clade_name = clade | |
60 return clade_name | |
61 | |
62 def extract_sample_name(record, clade): | |
63 """Extract sample name from sequence name and return sample name. """ | |
64 end_index = record.id.index(clade) | |
65 sample_name = record.id[:end_index -1] | |
66 #return sample name as sequence name minus underscore and clade name | |
67 return sample_name | |
68 | |
69 def sort_by_location(record): | |
70 """Search sequence name for province name or 2-letter province code and add SeqRecord to | |
71 province-specific dictionary.""" | |
72 seq_name = record.id | |
73 if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name): | |
74 BC_list.append(record) #add Sequence record to BC_list | |
75 elif ('-AB-' in seq_name) or ('/Alberta/' in seq_name): | |
76 AB_list.append(record) #add Sequence record to AB_list | |
77 elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name): | |
78 ON_list.append(record) #add Sequence record to ON_list | |
79 elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name): | |
80 QC_list.append(record) #add Sequence record to QC_list | |
81 else: | |
82 nonprov_list.append(record) #add Sequence record to nonprov_list | |
83 return | |
84 | |
85 def extract_province(record): | |
86 """Search sequence name for province name or 2-letter province code and return province.""" | |
87 seq_name = record.id | |
88 if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name): | |
89 province = 'British Columbia' | |
90 elif ('-AB-' in seq_name) or ('Alberta' in seq_name): | |
91 province = '/Alberta/' | |
92 elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name): | |
93 province = 'Ontario' | |
94 elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name): | |
95 province = 'Quebec' | |
96 else: | |
97 province = "other" | |
98 return province | |
99 | |
100 def get_sequence_length(record): | |
101 """Return length of sequence in a SeqRecord.""" | |
102 sequenceLength = len(str((record.seq))) | |
103 return sequenceLength | |
104 | |
105 def get_antigenic_site_substitutions(record): | |
106 """Count number of non-dotted amino acids in SeqRecord sequence and return as substitutions.""" | |
107 sequenceLength = get_sequence_length(record) | |
108 seqString = str(record.seq) | |
109 matches = seqString.count('.') | |
110 substitutions = sequenceLength - matches | |
111 return substitutions | |
112 | |
113 def calculate_percent_id(record, substitutions): | |
114 """Calculate percent sequence identity to reference sequence, based on substitutions | |
115 and sequence length and return percent id as a ratio (i.e. 0.90 no 90%).""" | |
116 sequenceLength = get_sequence_length(record) | |
117 percentID = (1.00 - (float(substitutions)/float(sequenceLength))) | |
118 return percentID | |
119 | |
120 def output_aggregated_linelist(a_list): | |
121 """Output aggregated line list of SeqRecords in csv format.""" | |
122 sequevars = {} #dict of sequevar: SeqRecord list | |
123 firstRecordID = None | |
124 #examine dotted/masked sequences in list and assign unique ones as dict keys | |
125 for rec in a_list: | |
126 rec = replace_matching_aa_with_dot(rec) | |
127 sequence =str(rec.seq) | |
128 #if the sequence is a key in the dict, add SeqRecord to list | |
129 if sequence in sequevars: | |
130 #if sequence already in dict as a key, increment the value | |
131 sequevars[sequence].append(rec) | |
132 else: | |
133 #if sequence not in dict, add is as new key with list of 1 SeqRecord | |
134 sequevars[sequence] = [rec] | |
135 #get list of sorted unique sequence keys | |
136 sorted_unique_seq_keys = sorted(sequevars.keys()) | |
137 #process each list of SeqRecords sharing a unique sequence | |
138 for u in sorted_unique_seq_keys: | |
139 #access list of sequences by unique sequence | |
140 listOfSeqs = sequevars[u] | |
141 #sort this list of SeqRecords by record.id (i.e. name) | |
142 listOfSeqs = [f for f in sorted(listOfSeqs, key = lambda x : x.id)] | |
143 N = len(listOfSeqs) | |
144 #output details of first SeqRecord to csv | |
145 firstRecord = listOfSeqs[0] | |
146 province = extract_province(firstRecord) | |
147 clade = extract_clade(firstRecord) | |
148 substitutions = get_antigenic_site_substitutions(firstRecord) | |
149 percentID = calculate_percent_id(firstRecord,substitutions) | |
150 name = extract_sample_name(firstRecord, clade) | |
151 name_part = name.rstrip() + ',' | |
152 N_part = str(N) + ',' | |
153 clade_part = clade + ',' | |
154 substitutions_part = str(substitutions) + ',' | |
155 percID_part = str(percentID) + ',' | |
156 col = " ," #empty column | |
157 sequence = str(firstRecord.seq).strip() | |
158 csv_seq = ",".join(sequence) +"," | |
159 comma_sep_output = name_part + N_part + clade_part + col + csv_seq + substitutions_part + percID_part + "\n" | |
160 #write first member of unique sequence list to csv | |
161 agg_lineListFile.write(comma_sep_output) | |
162 #print sequence records in sequevar to console | |
163 print("\n\t\t%i SeqRecords matching Sequevar: %s" % (len(listOfSeqs), u)) | |
164 | |
165 #to uncollapse sequevar group, print each member of the sequevar list to csv output | |
166 '''for i in range(1,len(listOfSeqs)): | |
167 currentRec = listOfSeqs[i] | |
168 province = extract_province(currentRec) | |
169 clade = extract_clade(currentRec) | |
170 substitutions = get_antigenic_site_substitutions(currentRec) | |
171 percentID = calculate_percent_id(currentRec,substitutions) | |
172 name_part = (currentRec.id).rstrip() + ',' | |
173 N_part = "n/a" + ',' | |
174 clade_part = clade + ',' | |
175 substitutions_part = str(substitutions) + ',' | |
176 percID_part = str(percentID) + ',' | |
177 col = " ," #empty column | |
178 sequence = str(currentRec.seq).strip() | |
179 csv_seq = ",".join(sequence) +"," | |
180 comma_sep_output = name_part + N_part + clade_part + col + csv_seq + substitutions_part + percID_part + "\n" | |
181 agg_lineListFile.write(comma_sep_output) ''' | |
182 return | |
183 | |
184 with open (antigenicSiteIndexArray,'r') as siteIndices: | |
185 """Read amino acid positions from antigenic site index array and print as header after one empty row.""" | |
186 col = "," #empty column | |
187 #read amino acid positions and remove trailing whitespace | |
188 for line in siteIndices: | |
189 #remove whitespace from the end of each line | |
190 indicesLine = line.rstrip() | |
191 row1 = "\n" | |
192 #add comma-separated AA positions to header line | |
193 row2 = col + col + col + col + indicesLine + "\n" | |
194 #write first (empty) and 2nd (amino acid position) lines to output file | |
195 agg_lineListFile.write(row1) | |
196 agg_lineListFile.write(row2) | |
197 | |
198 with open (refAntigenicMap,'r') as refMapFile: | |
199 """Read reference antigenic map from fasta and output amino acids, followed by column headers.""" | |
200 #read sequences from fasta to SeqRecord, uppercase, and store sequence string to ref_seq | |
201 record = SeqIO.read(refMapFile,"fasta",alphabet=IUPAC.protein) | |
202 record = record.upper() | |
203 ref_seq = str(record.seq).strip() #store sequence in variable for comparison to sample seqs | |
204 col = "," #empty column | |
205 name_part = (record.id).rstrip() + ',' | |
206 sequence = str(record.seq).strip() | |
207 csv_seq = ",".join(sequence) | |
208 #output row with reference sequence displayed above sample sequences | |
209 row3 = name_part + col + col + col + csv_seq + "\n" | |
210 agg_lineListFile.write(row3) | |
211 positions = indicesLine.split(',') | |
212 numPos = len(positions) | |
213 empty_indicesLine = ',' * numPos | |
214 #print column headers for sample sequences | |
215 row4 = "Sequence Name,N,Clade,Extra Substitutions," + empty_indicesLine + "Number of Amino Acid Substitutions in Antigenic Sites,% Identity of Antigenic Site Residues\n" | |
216 agg_lineListFile.write(row4) | |
217 print("\nREFERENCE ANTIGENIC MAP: '%s' (%i amino acids)" % (record.id, len(record))) | |
218 | |
219 with open(cladeDefinitionFile,'r') as cladeFile: | |
220 """Read clade definition file and store clade names in list.""" | |
221 #remove whitespace from the end of each line and split elements at commas | |
222 for line in cladeFile: | |
223 elementList = line.rstrip().split(',') | |
224 name = elementList[0] #move 1st element to name field | |
225 cladeList.append(name) | |
226 | |
227 with open(inputAntigenicMaps,'r') as extrAntigMapFile: | |
228 """Read antigenic maps as protein SeqRecords and add to list.""" | |
229 #read Sequences from fasta file, uppercase and add to seqList | |
230 for record in SeqIO.parse(extrAntigMapFile, "fasta", alphabet=IUPAC.protein): | |
231 record = record.upper() | |
232 seqList.append(record) #add Seq to list of Sequences | |
233 | |
234 #print number of sequences to be processed as user check | |
235 print("\nCOMPARING %i flu antigenic map sequences to the reference..." % len(seqList)) | |
236 for record in seqList: | |
237 #assign SeqRecords to province-specific dictionaries | |
238 sort_by_location(record) | |
239 | |
240 #access prov segregated lists in order | |
241 sorted_prov_keys = sorted(prov_lists.keys()) | |
242 print("\nSequence Lists Sorted by Province: ") | |
243 for prov in sorted_prov_keys: | |
244 current_list = prov_lists[prov] | |
245 #mask AA's identical to reference sequence with dot | |
246 masked_list = [] # empty temporary list to park masked sequences | |
247 for record in current_list: | |
248 masked_rec = replace_matching_aa_with_dot(record) | |
249 masked_list.append(masked_rec) | |
250 prov_lists[prov] = masked_list #replace original SeqRecord list with masked list | |
251 | |
252 #group sequences in province-sorted list into clades | |
253 for prov in sorted_prov_keys: | |
254 prov_list = prov_lists[prov] | |
255 by_clades_dict = {} #empty dict for clade:seqRecord list groups | |
256 print("\n'%s' List (Amino Acids identical to Reference are Masked): " % (prov)) | |
257 for rec in prov_list: | |
258 clade = extract_clade(rec) | |
259 if clade in by_clades_dict: | |
260 #if clade already in dict as key, append record to list (value) | |
261 by_clades_dict[clade].append(rec) | |
262 else: #add clade as key to dict, value is list of 1 SeqRecord | |
263 by_clades_dict[clade] = [rec] | |
264 #get list of alphabetically sorted clade keys | |
265 sorted_clade_keys = sorted(by_clades_dict.keys()) | |
266 print("\tNumber of clades: ", len(by_clades_dict)) | |
267 #group each list of sequences in clade by sequevars | |
268 for key in sorted_clade_keys: | |
269 print("\n\tCLADE: %s Number of Members: %i" % (key, len(by_clades_dict[key]))) | |
270 a_list = by_clades_dict[key] | |
271 for seqrec in a_list: | |
272 print("\t %s: %s" %(seqrec.id,str(seqrec.seq))) | |
273 #output the list to csv as aggregated linelist | |
274 output_aggregated_linelist(a_list) | |
275 | |
276 print("Aggregated Linelist written to file: '%s\n'" % (outFileHandle)) | |
277 extrAntigMapFile.close() | |
278 refMapFile.close() | |
279 agg_lineListFile.close() |