comparison aggregate_linelisting.py @ 0:515c0c885f5d draft default tip

planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit b96b6e06f6eaa6ae8ef4c24630dbb72a4aed7dbe
author public-health-bioinformatics
date Thu, 04 Jul 2019 19:40:13 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:515c0c885f5d
1 #!/usr/bin/env python
2 '''Reads in a fasta file of antigenic maps and one with the reference antigenic map as
3 protein SeqRecords. Compares amino acids of sample antigenic maps to corresponding sites
4 in the reference and masks identical amino acids with dots. Writes headers (including
5 amino acid position numbers read from the respective index array), the reference amino
6 acid sequence and column headings required for both non-aggregated and aggregated line lists.
7 Outputs all headers and modified (i.e. dotted) sample sequences to a csv file.'''
8
9 '''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Jan 2018'''
10
11 import sys,string,os, time, Bio, re, argparse
12 from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord
13 from Bio.SeqRecord import SeqRecord
14 from Bio.Alphabet import IUPAC
15 from Bio.Seq import Seq
16
17 inputAntigenicMaps = sys.argv[1] #batch fasta file with antigenic map sequences
18 refAntigenicMap = sys.argv[2] #fasta file of reference antigenic map sequence
19 antigenicSiteIndexArray = sys.argv[3] #antigenic site index array csv file
20 cladeDefinitionFile = sys.argv[4] #clade definition csv file
21 outFileHandle = sys.argv[5] #user-specifed output filename
22
23 agg_lineListFile = open(outFileHandle,'w') #open a writable output file
24
25 indicesLine = "" #comma-separated antigenic site positions
26 cladeList = [] #list of clade names read from clade definition file
27 ref_seq = "" #reference antigenic map (protein sequence)
28 seqList = [] #list of aa sequences to compare to reference
29
30 BC_list = [] #empty list for BC samples
31 AB_list = [] #empty list for AB samples
32 ON_list = [] #empty list for ON samples
33 QC_list = [] #empty list for QC samples
34 nonprov_list = [] #empty list for samples not in above 4 provinces
35 #dictionary for location-separated sequence lists
36 prov_lists = {'1_BC':BC_list,'2_AB':AB_list,'3_ON':ON_list,'4_QC': QC_list, '5_nonprov': nonprov_list}
37
38 def replace_matching_aa_with_dot(record):
39 """Compare amino acids in record to reference, mask identical symbols with dots, and return modified record."""
40 orig_seq = str(record.seq) #sequence string from SeqRecord
41 mod_seq = ""
42 #replace only those aa's matching the reference with dots
43 for i in range(0, len(orig_seq)):
44 if (orig_seq[i] == ref_seq[i]):
45 mod_seq = mod_seq + '.'
46 else:
47 mod_seq = mod_seq + orig_seq[i]
48 #assign modified sequence to new SeqRecord and return it
49 rec = SeqRecord(Seq(mod_seq,IUPAC.protein), id = record.id, name = "", description = "")
50 return rec
51
52 def extract_clade(record):
53 """Extract clade name (or 'No_Match') from sequence name and return as clade name. """
54 if record.id.endswith('No_Match'):
55 clade_name = 'No_Match'
56 else: #
57 for clade in cladeList:
58 if record.id.endswith(clade):
59 clade_name = clade
60 return clade_name
61
62 def extract_sample_name(record, clade):
63 """Extract sample name from sequence name and return sample name. """
64 end_index = record.id.index(clade)
65 sample_name = record.id[:end_index -1]
66 #return sample name as sequence name minus underscore and clade name
67 return sample_name
68
69 def sort_by_location(record):
70 """Search sequence name for province name or 2-letter province code and add SeqRecord to
71 province-specific dictionary."""
72 seq_name = record.id
73 if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name):
74 BC_list.append(record) #add Sequence record to BC_list
75 elif ('-AB-' in seq_name) or ('/Alberta/' in seq_name):
76 AB_list.append(record) #add Sequence record to AB_list
77 elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name):
78 ON_list.append(record) #add Sequence record to ON_list
79 elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name):
80 QC_list.append(record) #add Sequence record to QC_list
81 else:
82 nonprov_list.append(record) #add Sequence record to nonprov_list
83 return
84
85 def extract_province(record):
86 """Search sequence name for province name or 2-letter province code and return province."""
87 seq_name = record.id
88 if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name):
89 province = 'British Columbia'
90 elif ('-AB-' in seq_name) or ('Alberta' in seq_name):
91 province = '/Alberta/'
92 elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name):
93 province = 'Ontario'
94 elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name):
95 province = 'Quebec'
96 else:
97 province = "other"
98 return province
99
100 def get_sequence_length(record):
101 """Return length of sequence in a SeqRecord."""
102 sequenceLength = len(str((record.seq)))
103 return sequenceLength
104
105 def get_antigenic_site_substitutions(record):
106 """Count number of non-dotted amino acids in SeqRecord sequence and return as substitutions."""
107 sequenceLength = get_sequence_length(record)
108 seqString = str(record.seq)
109 matches = seqString.count('.')
110 substitutions = sequenceLength - matches
111 return substitutions
112
113 def calculate_percent_id(record, substitutions):
114 """Calculate percent sequence identity to reference sequence, based on substitutions
115 and sequence length and return percent id as a ratio (i.e. 0.90 no 90%)."""
116 sequenceLength = get_sequence_length(record)
117 percentID = (1.00 - (float(substitutions)/float(sequenceLength)))
118 return percentID
119
120 def output_aggregated_linelist(a_list):
121 """Output aggregated line list of SeqRecords in csv format."""
122 sequevars = {} #dict of sequevar: SeqRecord list
123 firstRecordID = None
124 #examine dotted/masked sequences in list and assign unique ones as dict keys
125 for rec in a_list:
126 rec = replace_matching_aa_with_dot(rec)
127 sequence =str(rec.seq)
128 #if the sequence is a key in the dict, add SeqRecord to list
129 if sequence in sequevars:
130 #if sequence already in dict as a key, increment the value
131 sequevars[sequence].append(rec)
132 else:
133 #if sequence not in dict, add is as new key with list of 1 SeqRecord
134 sequevars[sequence] = [rec]
135 #get list of sorted unique sequence keys
136 sorted_unique_seq_keys = sorted(sequevars.keys())
137 #process each list of SeqRecords sharing a unique sequence
138 for u in sorted_unique_seq_keys:
139 #access list of sequences by unique sequence
140 listOfSeqs = sequevars[u]
141 #sort this list of SeqRecords by record.id (i.e. name)
142 listOfSeqs = [f for f in sorted(listOfSeqs, key = lambda x : x.id)]
143 N = len(listOfSeqs)
144 #output details of first SeqRecord to csv
145 firstRecord = listOfSeqs[0]
146 province = extract_province(firstRecord)
147 clade = extract_clade(firstRecord)
148 substitutions = get_antigenic_site_substitutions(firstRecord)
149 percentID = calculate_percent_id(firstRecord,substitutions)
150 name = extract_sample_name(firstRecord, clade)
151 name_part = name.rstrip() + ','
152 N_part = str(N) + ','
153 clade_part = clade + ','
154 substitutions_part = str(substitutions) + ','
155 percID_part = str(percentID) + ','
156 col = " ," #empty column
157 sequence = str(firstRecord.seq).strip()
158 csv_seq = ",".join(sequence) +","
159 comma_sep_output = name_part + N_part + clade_part + col + csv_seq + substitutions_part + percID_part + "\n"
160 #write first member of unique sequence list to csv
161 agg_lineListFile.write(comma_sep_output)
162 #print sequence records in sequevar to console
163 print("\n\t\t%i SeqRecords matching Sequevar: %s" % (len(listOfSeqs), u))
164
165 #to uncollapse sequevar group, print each member of the sequevar list to csv output
166 '''for i in range(1,len(listOfSeqs)):
167 currentRec = listOfSeqs[i]
168 province = extract_province(currentRec)
169 clade = extract_clade(currentRec)
170 substitutions = get_antigenic_site_substitutions(currentRec)
171 percentID = calculate_percent_id(currentRec,substitutions)
172 name_part = (currentRec.id).rstrip() + ','
173 N_part = "n/a" + ','
174 clade_part = clade + ','
175 substitutions_part = str(substitutions) + ','
176 percID_part = str(percentID) + ','
177 col = " ," #empty column
178 sequence = str(currentRec.seq).strip()
179 csv_seq = ",".join(sequence) +","
180 comma_sep_output = name_part + N_part + clade_part + col + csv_seq + substitutions_part + percID_part + "\n"
181 agg_lineListFile.write(comma_sep_output) '''
182 return
183
184 with open (antigenicSiteIndexArray,'r') as siteIndices:
185 """Read amino acid positions from antigenic site index array and print as header after one empty row."""
186 col = "," #empty column
187 #read amino acid positions and remove trailing whitespace
188 for line in siteIndices:
189 #remove whitespace from the end of each line
190 indicesLine = line.rstrip()
191 row1 = "\n"
192 #add comma-separated AA positions to header line
193 row2 = col + col + col + col + indicesLine + "\n"
194 #write first (empty) and 2nd (amino acid position) lines to output file
195 agg_lineListFile.write(row1)
196 agg_lineListFile.write(row2)
197
198 with open (refAntigenicMap,'r') as refMapFile:
199 """Read reference antigenic map from fasta and output amino acids, followed by column headers."""
200 #read sequences from fasta to SeqRecord, uppercase, and store sequence string to ref_seq
201 record = SeqIO.read(refMapFile,"fasta",alphabet=IUPAC.protein)
202 record = record.upper()
203 ref_seq = str(record.seq).strip() #store sequence in variable for comparison to sample seqs
204 col = "," #empty column
205 name_part = (record.id).rstrip() + ','
206 sequence = str(record.seq).strip()
207 csv_seq = ",".join(sequence)
208 #output row with reference sequence displayed above sample sequences
209 row3 = name_part + col + col + col + csv_seq + "\n"
210 agg_lineListFile.write(row3)
211 positions = indicesLine.split(',')
212 numPos = len(positions)
213 empty_indicesLine = ',' * numPos
214 #print column headers for sample sequences
215 row4 = "Sequence Name,N,Clade,Extra Substitutions," + empty_indicesLine + "Number of Amino Acid Substitutions in Antigenic Sites,% Identity of Antigenic Site Residues\n"
216 agg_lineListFile.write(row4)
217 print("\nREFERENCE ANTIGENIC MAP: '%s' (%i amino acids)" % (record.id, len(record)))
218
219 with open(cladeDefinitionFile,'r') as cladeFile:
220 """Read clade definition file and store clade names in list."""
221 #remove whitespace from the end of each line and split elements at commas
222 for line in cladeFile:
223 elementList = line.rstrip().split(',')
224 name = elementList[0] #move 1st element to name field
225 cladeList.append(name)
226
227 with open(inputAntigenicMaps,'r') as extrAntigMapFile:
228 """Read antigenic maps as protein SeqRecords and add to list."""
229 #read Sequences from fasta file, uppercase and add to seqList
230 for record in SeqIO.parse(extrAntigMapFile, "fasta", alphabet=IUPAC.protein):
231 record = record.upper()
232 seqList.append(record) #add Seq to list of Sequences
233
234 #print number of sequences to be processed as user check
235 print("\nCOMPARING %i flu antigenic map sequences to the reference..." % len(seqList))
236 for record in seqList:
237 #assign SeqRecords to province-specific dictionaries
238 sort_by_location(record)
239
240 #access prov segregated lists in order
241 sorted_prov_keys = sorted(prov_lists.keys())
242 print("\nSequence Lists Sorted by Province: ")
243 for prov in sorted_prov_keys:
244 current_list = prov_lists[prov]
245 #mask AA's identical to reference sequence with dot
246 masked_list = [] # empty temporary list to park masked sequences
247 for record in current_list:
248 masked_rec = replace_matching_aa_with_dot(record)
249 masked_list.append(masked_rec)
250 prov_lists[prov] = masked_list #replace original SeqRecord list with masked list
251
252 #group sequences in province-sorted list into clades
253 for prov in sorted_prov_keys:
254 prov_list = prov_lists[prov]
255 by_clades_dict = {} #empty dict for clade:seqRecord list groups
256 print("\n'%s' List (Amino Acids identical to Reference are Masked): " % (prov))
257 for rec in prov_list:
258 clade = extract_clade(rec)
259 if clade in by_clades_dict:
260 #if clade already in dict as key, append record to list (value)
261 by_clades_dict[clade].append(rec)
262 else: #add clade as key to dict, value is list of 1 SeqRecord
263 by_clades_dict[clade] = [rec]
264 #get list of alphabetically sorted clade keys
265 sorted_clade_keys = sorted(by_clades_dict.keys())
266 print("\tNumber of clades: ", len(by_clades_dict))
267 #group each list of sequences in clade by sequevars
268 for key in sorted_clade_keys:
269 print("\n\tCLADE: %s Number of Members: %i" % (key, len(by_clades_dict[key])))
270 a_list = by_clades_dict[key]
271 for seqrec in a_list:
272 print("\t %s: %s" %(seqrec.id,str(seqrec.seq)))
273 #output the list to csv as aggregated linelist
274 output_aggregated_linelist(a_list)
275
276 print("Aggregated Linelist written to file: '%s\n'" % (outFileHandle))
277 extrAntigMapFile.close()
278 refMapFile.close()
279 agg_lineListFile.close()