# HG changeset patch
# User public-health-bioinformatics
# Date 1562283613 14400
# Node ID 515c0c885f5d0181df2c71b4f8471940211db4db
planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit b96b6e06f6eaa6ae8ef4c24630dbb72a4aed7dbe
diff -r 000000000000 -r 515c0c885f5d aggregate_linelisting.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/aggregate_linelisting.py Thu Jul 04 19:40:13 2019 -0400
@@ -0,0 +1,302 @@
+#!/usr/bin/env python
+'''Reads in a fasta file of antigenic maps and one with the reference antigenic map as
+protein SeqRecords. Compares amino acids of sample antigenic maps to corresponding sites
+in the reference and masks identical amino acids with dots. Writes headers (including
+amino acid position numbers read from the respective index array), the reference amino
+acid sequence and column headings required for both non-aggregated and aggregated line lists.
+Outputs all headers and modified (i.e. dotted) sample sequences to a csv file.'''
+
+'''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Jan 2018'''
+
+import sys,string,os, time, Bio, re, argparse
+from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord
+from Bio.SeqRecord import SeqRecord
+from Bio.Alphabet import IUPAC
+from Bio.Seq import Seq
+
+inputAntigenicMaps = sys.argv[1] #batch fasta file with antigenic map sequences
+refAntigenicMap = sys.argv[2] #fasta file of reference antigenic map sequence
+antigenicSiteIndexArray = sys.argv[3] #antigenic site index array csv file
+cladeDefinitionFile = sys.argv[4] #clade definition csv file
+outFileHandle = sys.argv[5] #user-specified output filename
+
+agg_lineListFile = open(outFileHandle,'w') #open a writable output file
+
+indicesLine = "" #comma-separated antigenic site positions
+cladeList = [] #list of clade names read from clade definition file
+ref_seq = "" #reference antigenic map (protein sequence)
+seqList = [] #list of aa sequences to compare to reference
+
+BC_list = [] #empty list for BC samples
+AB_list = [] #empty list for AB samples
+ON_list = [] #empty list for ON samples
+QC_list = [] #empty list for QC samples
+nonprov_list = [] #empty list for samples not in above 4 provinces
+#dictionary for location-separated sequence lists
+prov_lists = {'1_BC':BC_list,'2_AB':AB_list,'3_ON':ON_list,'4_QC': QC_list, '5_nonprov': nonprov_list}
+
+def replace_matching_aa_with_dot(record):
+    """Compare amino acids in record to reference, mask identical symbols with dots, and return modified record.
+
+    Positions beyond the end of the reference (sample longer than reference) are kept
+    unmasked instead of raising an IndexError."""
+    orig_seq = str(record.seq) #sequence string from SeqRecord
+    ref_len = len(ref_seq)
+    #replace only those aa's matching the reference with dots
+    masked = []
+    for i, aa in enumerate(orig_seq):
+        if i < ref_len and aa == ref_seq[i]:
+            masked.append('.')
+        else:
+            masked.append(aa)
+    mod_seq = "".join(masked)
+    #assign modified sequence to new SeqRecord and return it
+    rec = SeqRecord(Seq(mod_seq,IUPAC.protein), id = record.id, name = "", description = "")
+    return rec
+
+def extract_clade(record):
+    """Extract clade name (or 'No_Match') from sequence name and return as clade name.
+
+    When several clade names are suffixes of the sequence name, the longest (most
+    specific) one wins. Returns 'No_Match' if no known clade name is found."""
+    clade_name = 'No_Match'
+    if record.id.endswith('No_Match'):
+        return clade_name
+    best_len = 0
+    for clade in cladeList:
+        #skip empty names (e.g. from blank lines): every string ends with ''
+        if clade and len(clade) > best_len and record.id.endswith(clade):
+            clade_name = clade
+            best_len = len(clade)
+    return clade_name
+
+def extract_sample_name(record, clade):
+    """Extract sample name from sequence name and return sample name.
+
+    The sample name is the sequence name minus the trailing clade name and the single
+    separator character (underscore) in front of it. If the sequence name does not end
+    with the clade name, the full sequence name is returned."""
+    seq_name = record.id
+    #strip from the end so a clade-like substring earlier in the name is not mistaken for the clade
+    if clade and seq_name.endswith(clade) and len(seq_name) > len(clade):
+        return seq_name[:len(seq_name) - len(clade) - 1]
+    return seq_name
+
+def sort_by_location(record):
+    """Search sequence name for province name or 2-letter province code and add SeqRecord to
+    province-specific list."""
+    seq_name = record.id
+    if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name):
+        BC_list.append(record) #add Sequence record to BC_list
+    elif ('-AB-' in seq_name) or ('/Alberta/' in seq_name):
+        AB_list.append(record) #add Sequence record to AB_list
+    elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name):
+        ON_list.append(record) #add Sequence record to ON_list
+    elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name):
+        QC_list.append(record) #add Sequence record to QC_list
+    else:
+        nonprov_list.append(record) #add Sequence record to nonprov_list
+    return
+
+def extract_province(record):
+    """Search sequence name for province name or 2-letter province code and return province."""
+    seq_name = record.id
+    if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name):
+        province = 'British Columbia'
+    elif ('-AB-' in seq_name) or ('/Alberta/' in seq_name):
+        province = 'Alberta'
+    elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name):
+        province = 'Ontario'
+    elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name):
+        province = 'Quebec'
+    else:
+        province = "other"
+    return province
+
+def get_sequence_length(record):
+    """Return length of sequence in a SeqRecord."""
+    sequenceLength = len(str((record.seq)))
+    return sequenceLength
+
+def get_antigenic_site_substitutions(record):
+    """Count number of non-dotted amino acids in SeqRecord sequence and return as substitutions."""
+    sequenceLength = get_sequence_length(record)
+    seqString = str(record.seq)
+    matches = seqString.count('.')
+    substitutions = sequenceLength - matches
+    return substitutions
+
+def calculate_percent_id(record, substitutions):
+    """Calculate percent sequence identity to reference sequence, based on substitutions
+    and sequence length and return percent id as a ratio (i.e. 0.90 not 90%).
+    Returns 0.0 for an empty sequence rather than dividing by zero."""
+    sequenceLength = get_sequence_length(record)
+    if sequenceLength == 0:
+        return 0.0
+    percentID = (1.00 - (float(substitutions)/float(sequenceLength)))
+    return percentID
+
+def output_aggregated_linelist(a_list):
+    """Output aggregated line list of SeqRecords in csv format.
+
+    Records are masked against the reference, grouped by identical masked sequence
+    (sequevar), and one csv row is written per sequevar: the name of its first member
+    (sorted by record id), the group size N, clade, an empty column, the masked amino
+    acids, the substitution count and the identity ratio."""
+    sequevars = {} #dict of sequevar: SeqRecord list
+    #examine dotted/masked sequences in list and assign unique ones as dict keys
+    for rec in a_list:
+        rec = replace_matching_aa_with_dot(rec)
+        sequence =str(rec.seq)
+        #if the sequence is a key in the dict, add SeqRecord to list
+        if sequence in sequevars:
+            #if sequence already in dict as a key, append the SeqRecord to its list
+            sequevars[sequence].append(rec)
+        else:
+            #if sequence not in dict, add it as new key with list of 1 SeqRecord
+            sequevars[sequence] = [rec]
+    #get list of sorted unique sequence keys
+    sorted_unique_seq_keys = sorted(sequevars.keys())
+    #process each list of SeqRecords sharing a unique sequence
+    for u in sorted_unique_seq_keys:
+        #access list of sequences by unique sequence
+        listOfSeqs = sequevars[u]
+        #sort this list of SeqRecords by record.id (i.e. name)
+        listOfSeqs = sorted(listOfSeqs, key = lambda x : x.id)
+        N = len(listOfSeqs)
+        #output details of first SeqRecord to csv
+        firstRecord = listOfSeqs[0]
+        clade = extract_clade(firstRecord)
+        substitutions = get_antigenic_site_substitutions(firstRecord)
+        percentID = calculate_percent_id(firstRecord,substitutions)
+        name = extract_sample_name(firstRecord, clade)
+        name_part = name.rstrip() + ','
+        N_part = str(N) + ','
+        clade_part = clade + ','
+        substitutions_part = str(substitutions) + ','
+        percID_part = str(percentID) + ','
+        col = " ," #empty column
+        sequence = str(firstRecord.seq).strip()
+        csv_seq = ",".join(sequence) +","
+        comma_sep_output = name_part + N_part + clade_part + col + csv_seq + substitutions_part + percID_part + "\n"
+        #write first member of unique sequence list to csv
+        agg_lineListFile.write(comma_sep_output)
+        #print sequence records in sequevar to console
+        print("\n\t\t%i SeqRecords matching Sequevar: %s" % (len(listOfSeqs), u))
+
+        #to uncollapse sequevar group, print each member of the sequevar list to csv output
+        '''for i in range(1,len(listOfSeqs)):
+            currentRec = listOfSeqs[i]
+            province = extract_province(currentRec)
+            clade = extract_clade(currentRec)
+            substitutions = get_antigenic_site_substitutions(currentRec)
+            percentID = calculate_percent_id(currentRec,substitutions)
+            name_part = (currentRec.id).rstrip() + ','
+            N_part = "n/a" + ','
+            clade_part = clade + ','
+            substitutions_part = str(substitutions) + ','
+            percID_part = str(percentID) + ','
+            col = " ," #empty column
+            sequence = str(currentRec.seq).strip()
+            csv_seq = ",".join(sequence) +","
+            comma_sep_output = name_part + N_part + clade_part + col + csv_seq + substitutions_part + percID_part + "\n"
+            agg_lineListFile.write(comma_sep_output) '''
+    return
+
+with open (antigenicSiteIndexArray,'r') as siteIndices:
+    """Read amino acid positions from antigenic site index array and print as header after one empty row."""
+    col = "," #empty column
+    #read amino acid positions and remove trailing whitespace
+    for line in siteIndices:
+        #remove trailing whitespace; skip blank lines so a trailing empty line cannot erase the positions
+        if line.strip():
+            indicesLine = line.rstrip()
+    row1 = "\n"
+    #add comma-separated AA positions to header line
+    row2 = col + col + col + col + indicesLine + "\n"
+    #write first (empty) and 2nd (amino acid position) lines to output file
+    agg_lineListFile.write(row1)
+    agg_lineListFile.write(row2)
+
+with open (refAntigenicMap,'r') as refMapFile:
+    """Read reference antigenic map from fasta and output amino acids, followed by column headers."""
+    #read sequences from fasta to SeqRecord, uppercase, and store sequence string to ref_seq
+    record = SeqIO.read(refMapFile,"fasta",alphabet=IUPAC.protein)
+    record = record.upper()
+    ref_seq = str(record.seq).strip() #store sequence in variable for comparison to sample seqs
+    col = "," #empty column
+    name_part = (record.id).rstrip() + ','
+    sequence = str(record.seq).strip()
+    csv_seq = ",".join(sequence)
+    #output row with reference sequence displayed above sample sequences
+    row3 = name_part + col + col + col + csv_seq + "\n"
+    agg_lineListFile.write(row3)
+    positions = indicesLine.split(',')
+    numPos = len(positions)
+    empty_indicesLine = ',' * numPos
+    #print column headers for sample sequences
+    row4 = "Sequence Name,N,Clade,Extra Substitutions," + empty_indicesLine + "Number of Amino Acid Substitutions in Antigenic Sites,% Identity of Antigenic Site Residues\n"
+    agg_lineListFile.write(row4)
+    print("\nREFERENCE ANTIGENIC MAP: '%s' (%i amino acids)" % (record.id, len(record)))
+
+with open(cladeDefinitionFile,'r') as cladeFile:
+    """Read clade definition file and store clade names in list."""
+    #remove whitespace from the end of each line and split elements at commas
+    for line in cladeFile:
+        elementList = line.rstrip().split(',')
+        name = elementList[0].strip() #move 1st element to name field
+        #skip blank lines: an empty clade name would match every sequence name
+        if name:
+            cladeList.append(name)
+
+with open(inputAntigenicMaps,'r') as extrAntigMapFile:
+    """Read antigenic maps as protein SeqRecords and add to list."""
+    #read Sequences from fasta file, uppercase and add to seqList
+    for record in SeqIO.parse(extrAntigMapFile, "fasta", alphabet=IUPAC.protein):
+        record = record.upper()
+        seqList.append(record) #add Seq to list of Sequences
+
+#print number of sequences to be processed as user check
+print("\nCOMPARING %i flu antigenic map sequences to the reference..." % len(seqList))
+for record in seqList:
+    #assign SeqRecords to province-specific lists
+    sort_by_location(record)
+
+#access prov segregated lists in order
+sorted_prov_keys = sorted(prov_lists.keys())
+print("\nSequence Lists Sorted by Province: ")
+for prov in sorted_prov_keys:
+    current_list = prov_lists[prov]
+    #mask AA's identical to reference sequence with dot
+    masked_list = [] # empty temporary list to park masked sequences
+    for record in current_list:
+        masked_rec = replace_matching_aa_with_dot(record)
+        masked_list.append(masked_rec)
+    prov_lists[prov] = masked_list #replace original SeqRecord list with masked list
+
+#group sequences in province-sorted list into clades
+for prov in sorted_prov_keys:
+    prov_list = prov_lists[prov]
+    by_clades_dict = {} #empty dict for clade:seqRecord list groups
+    print("\n'%s' List (Amino Acids identical to Reference are Masked): " % (prov))
+    for rec in prov_list:
+        clade = extract_clade(rec)
+        if clade in by_clades_dict:
+            #if clade already in dict as key, append record to list (value)
+            by_clades_dict[clade].append(rec)
+        else: #add clade as key to dict, value is list of 1 SeqRecord
+            by_clades_dict[clade] = [rec]
+    #get list of alphabetically sorted clade keys
+    sorted_clade_keys = sorted(by_clades_dict.keys())
+    print("\tNumber of clades: ", len(by_clades_dict))
+    #group each list of sequences in clade by sequevars
+    for key in sorted_clade_keys:
+        print("\n\tCLADE: %s Number of Members: %i" % (key, len(by_clades_dict[key])))
+        a_list = by_clades_dict[key]
+        for seqrec in a_list:
+            print("\t %s: %s" %(seqrec.id,str(seqrec.seq)))
+        #output the list to csv as aggregated linelist
+        output_aggregated_linelist(a_list)
+
+print("Aggregated Linelist written to file: '%s'\n" % (outFileHandle))
+agg_lineListFile.close()
diff -r 000000000000 -r 515c0c885f5d aggregate_linelisting.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/aggregate_linelisting.xml Thu Jul 04 19:40:13 2019 -0400
@@ -0,0 +1,35 @@
+ + + biopython + + + + + + + + + + + + + + + + + + + + + + + +
diff -r 000000000000 -r 515c0c885f5d test-data/FluA_H3_antigenic_aa_indices.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/FluA_H3_antigenic_aa_indices.csv Thu Jul 04 19:40:13 2019 -0400
@@ -0,0 +1,1 @@
+44,45,46,47,48,50,51,53,54,57,59,62,63,67,75,78,80,81,82,83,86,87,88,91,92,94,96,102,103,109,117,121,122,124,126,128,129,130,131,132,133,135,137,138,140,142,143,144,145,146,150,152,155,156,157,158,159,160,163,164,165,167,168,170,171,172,173,174,175,176,177,179,182,186,187,188,189,190,192,193,194,196,197,198,201,203,207,208,209,212,213,214,215,216,217,218,219,226,227,228,229,230,238,240,242,244,246,247,248,260,261,262,265,273,275,276,278,279,280,294,297,299,300,304,305,307,308,309,310,311,312
diff -r 000000000000 -r 515c0c885f5d test-data/Flu_Clade_Definitions_H3_20171121.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Flu_Clade_Definitions_H3_20171121.csv Thu Jul 04 19:40:13 2019 -0400
@@ -0,0 +1,12 @@
+3C.2a,1,3,I,144,S,145,S,159,Y,160,T,225,D,311,H,489,N,,,,,,,,,,,,,,,,,,,,
+3C.2a_+_T131K_+_R142K_+_R261Q,2,3,I,131,K,142,K,144,S,145,S,159,Y,160,T,225,D,261,Q,311,H,489,N,,,,,,,,,,,,,,
+3C.2a_+_N121K_+_S144K,2,3,I,121,K,144,K,145,S,159,Y,160,T,225,D,311,H,489,N,,,,,,,,,,,,,,,,,,
+3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H,2,3,I,31,S,53,N,142,G,144,R,145,S,159,Y,160,T,171,K,192,T,197,H,225,D,311,H,489,N,,,,,,,,
+3C.2a1,2,3,I,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,,,,,,,,,, +3C.2a1_+_N121K,3,3,I,121,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,,,,,,,, +3C.2a1_+_N121K_+_K92R_+_H311Q,4,3,I,92,R,121,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,Q,406,V,484,E,489,N,,,,,,,,,, +3C.2a1_+_N121K_+_T135K,4,3,I,121,K,135,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,479,E,484,E,489,N,,,,,,,, +3C.2a1_+_N121K_+_I140M,4,3,I,121,K,140,M,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,479,E,484,E,489,N,,,,,,,, +3C.2a1_+_N121K_+_R142G,4,3,I,121,K,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,,,,,, +3C.2a1_+_N121K_+_R142G_+_I242V,5,3,I,121,K,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,242,V,311,H,406,V,479,E,484,E,489,N,,,,,, +3C.3a,1,128,A,138,S,142,G,145,S,159,S,225,D,,,,,,,,,,,,,,,,,,,,,,,, diff -r 000000000000 -r 515c0c885f5d test-data/MAP_3C.2a_A_Hong_Kong_4801_2014_X-263B_EGG.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/MAP_3C.2a_A_Hong_Kong_4801_2014_X-263B_EGG.fasta Thu Jul 04 19:40:13 2019 -0400 @@ -0,0 +1,4 @@ +>Clade_3C.2a_A/Hong_Kong/4801/2014_X-263B_EGG +QNSSIEIDSQLENIQGQNKKLFVSKYSVPRTNNSNTGVTQNTSAIRSSSSRNTHLNYKAL +NTMNNEQFDKLIVGTDKDIFPAQSRXKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKHS diff -r 000000000000 -r 515c0c885f5d test-data/fluA_H3_clade_assigned_antigenic_sites_extracted.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fluA_H3_clade_assigned_antigenic_sites_extracted.fasta Thu Jul 04 19:40:13 2019 -0400 @@ -0,0 +1,94 @@ +>A-ON-314-2017_3C.3a +QNSSIEIDSQLENIQGQNKKLFVNKYNVPRTNNSNAGVTQNTSSIGSKSSRNTHLNSKAL +NTMNNEQFDKLIVGTDKDISLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKQS + +>A-AB-399-2017_3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H +QNSSIEINSQLENIQGQNKKLFVSKYNVPRTNNSNTGVTQNTSAIGSRSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDTFLAHSRTKRSAVIPNIGSIPSRIKGLLNSTIRSSPGKKSEF +VRIACRYVKHS + 
+>A-QC-303-2017_3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H +QNSSIEINSQLENIQGQNKKLFVSKYNVPRTNNSNTGVTQNTSAIGSRSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDTFLAHSRTKRSAVIPNIGSIPSRIKGLLNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-AB-319-2017_3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H +QNSSIEINSQLENIQGQNKKLFVSKYNVPRTNNSNTGVTQNTSAIGSRSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDTFLAHSRTKRSAVIPNIGSIPSRIKGLLNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-AB-308-2017_3C.2a1_+_N121K_+_T135K +QNSSIEIDSQLENIQDQNKKLFVSKHNVPRTKDSNTGVTQNKSAIRSSSSRNTHLNYTAL +NTMNKEQFDKLIIGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-AB-341-2017_3C.2a1_+_N121K_+_T135K +QNSSIEIDSQLENIQDQNKKLFVSKHNVPRTKDSNTGVTQNKSAIRSSSSRNTHLNYTAL +NTMNKEQFDKLIIGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-BC-024-2018_3C.2a1_+_N121K_+_T135K +QNSSIEIDSQLENIQDQNKKLFVSKHNVPRTKNSNTGVTQNKSAIRSSSSRNTHLNYTAL +NTMNKEQFDKLIIGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-QC-309-2017_3C.2a1_+_N121K_+_K92R_+_H311Q +QNSSIEIDSQLGNIQDQNKKLFVSRYNVPRTKDSNTGVTQNKSAIGSSSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKQS + +>A-BC-324-2017_3C.2a1_+_N121K_+_K92R_+_H311Q +QNSSMEIDSQLGNIQGQNKKLFVSRYNVPRTKNSNTGVTQNKSAIGSSSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKQS + +>A-QC-315-2017_3C.2a1_+_N121K_+_K92R_+_H311Q +QNSSIEIDSQLGNIQGQNKKLFVSRYNVPRTKNSNAGVTQNKSAIGSSSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKQS + +>A-ON-016-2018_3C.2a_+_N121K_+_S144K +QNSSIEIDSQLENIQGQNKKLFVSKYNVPRTKNSNTGVTQNKSAIRSKSSKNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-BC-325-2017_3C.2a_+_N121K_+_S144K +QNSSIEIDSQLENIQGQNKKLFVSKYNVPRTKNSNTGVTQNKSAIRSKSSKNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-ON-003-2018_3C.2a_+_N121K_+_S144K +QNSSIEIDSQLENIQGQNKKLFVSKYNVPRTKNSNTGVTQNKSAIRSKSSKNTHLNYTAL 
+NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-ON-309-2017_No_Match +QNSSIEIDSQLENIQGQNKKLFVSKYNVPRTNNSNTGVKQNTSAIKSSSSRNTHLNYKAL +NTMNNEQFDKLIVGTDKDIFLAQSKTKISAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-BC-330-2017_No_Match +QNSSIEIDSQLENIQGQNKKLFVSKYNVPRTNNSNTGVKQNTSAIKSRSSRNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-AB-415-2017_3C.2a_+_T131K_+_R142K_+_R261Q +QNSSIEIDSQLENIQGQNKKLFVSRYNVPRTNNSNTGVKQNTSAIKSSSSRNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGFIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-AB-400-2017_3C.2a_+_T131K_+_R142K_+_R261Q +QNSSIEIDSQLENIQGQNKKLFVSRYNVPRTNNSNTGVKQNTSAIKSSSSRNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-AB-416-2017_3C.2a_+_T131K_+_R142K_+_R261Q +QNSSIEIDSQLENIQGQNKKLFVSRYNVPRTNNSNTGVKQNTSAIKSSSSRNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-QC-316-2017_3C.2a_+_T131K_+_R142K_+_R261Q +QNSSIEIDSQLENIQGQNKKLFVSRYNVPRTNNSNTGVKQNTSAIKSSSSRNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS diff -r 000000000000 -r 515c0c885f5d test-data/test_output.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_output.csv Thu Jul 04 19:40:13 2019 -0400 @@ -0,0 +1,19 @@ + +,,,,44,45,46,47,48,50,51,53,54,57,59,62,63,67,75,78,80,81,82,83,86,87,88,91,92,94,96,102,103,109,117,121,122,124,126,128,129,130,131,132,133,135,137,138,140,142,143,144,145,146,150,152,155,156,157,158,159,160,163,164,165,167,168,170,171,172,173,174,175,176,177,179,182,186,187,188,189,190,192,193,194,196,197,198,201,203,207,208,209,212,213,214,215,216,217,218,219,226,227,228,229,230,238,240,242,244,246,247,248,260,261,262,265,273,275,276,278,279,280,294,297,299,300,304,305,307,308,309,310,311,312 
+Clade_3C.2a_A/Hong_Kong/4801/2014_X-263B_EGG,,,,Q,N,S,S,I,E,I,D,S,Q,L,E,N,I,Q,G,Q,N,K,K,L,F,V,S,K,Y,S,V,P,R,T,N,N,S,N,T,G,V,T,Q,N,T,S,A,I,R,S,S,S,S,R,N,T,H,L,N,Y,K,A,L,N,T,M,N,N,E,Q,F,D,K,L,I,V,G,T,D,K,D,I,F,P,A,Q,S,R,X,K,R,S,A,V,I,P,N,I,G,S,I,P,S,R,I,K,G,I,L,N,S,T,I,R,S,S,P,G,K,K,S,E,F,V,R,I,A,C,R,Y,V,K,H,S +Sequence Name,N,Clade,Extra Substitutions,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Number of Amino Acid Substitutions in Antigenic Sites,% Identity of Antigenic Site Residues +A-BC-324-2017,1,3C.2a1_+_N121K_+_K92R_+_H311Q, ,.,.,.,.,M,.,.,.,.,.,.,G,.,.,.,.,.,.,.,.,.,.,.,.,R,.,N,.,.,.,.,K,.,.,.,.,.,.,.,.,.,K,.,.,.,G,.,.,.,.,.,.,.,.,.,.,.,T,.,.,.,.,.,.,K,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,L,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,Q,.,12,0.9083969465648855, +A-BC-024-2018,1,3C.2a1_+_N121K_+_T135K, ,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,D,.,.,.,.,.,.,.,.,.,H,N,.,.,.,.,K,.,.,.,.,.,.,.,.,.,K,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,T,.,.,.,.,.,.,K,.,.,.,.,.,.,.,I,.,.,.,.,.,.,.,L,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,10,0.9236641221374046, +A-BC-325-2017,1,3C.2a_+_N121K_+_S144K, ,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,N,.,.,.,.,K,.,.,.,.,.,.,.,.,.,K,.,.,.,.,.,K,.,.,K,.,.,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,L,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,Q,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,9,0.9312977099236641, +A-BC-330-2017,1,No_Match, ,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,N,.,.,.,.,.,.,.,.,.,.,.,K,.,.,.,.,.,.,K,.,R,.,.,.,.,.,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,L,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,Q,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,8,0.9389312977099237, +A-AB-308-2017,2,3C.2a1_+_N121K_+_T135K, 
,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,D,.,.,.,.,.,.,.,.,.,H,N,.,.,.,.,K,D,.,.,.,.,.,.,.,.,K,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,T,.,.,.,.,.,.,K,.,.,.,.,.,.,.,I,.,.,.,.,.,.,.,L,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,11,0.916030534351145, +A-AB-319-2017,2,3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H, ,.,.,.,.,.,.,.,N,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,N,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,G,.,R,.,.,.,.,.,.,.,.,.,T,.,.,.,.,.,.,K,.,.,.,.,.,.,.,.,.,.,.,.,.,T,.,L,.,H,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,L,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,11,0.916030534351145, +A-AB-400-2017,2,3C.2a_+_T131K_+_R142K_+_R261Q, ,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,R,.,N,.,.,.,.,.,.,.,.,.,.,.,K,.,.,.,.,.,.,K,.,.,.,.,.,.,.,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,L,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,Q,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,8,0.9389312977099237, +A-AB-415-2017,1,3C.2a_+_T131K_+_R142K_+_R261Q, ,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,R,.,N,.,.,.,.,.,.,.,.,.,.,.,K,.,.,.,.,.,.,K,.,.,.,.,.,.,.,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,L,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,F,.,.,.,.,.,.,.,.,.,.,.,.,.,Q,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,9,0.9312977099236641, +A-ON-003-2018,2,3C.2a_+_N121K_+_S144K, ,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,N,.,.,.,.,K,.,.,.,.,.,.,.,.,.,K,.,.,.,.,.,K,.,.,K,.,.,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,L,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,Q,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,9,0.9312977099236641, +A-ON-314-2017,1,3C.3a, ,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,N,.,.,N,.,.,.,.,.,.,.,.,A,.,.,.,.,.,.,.,S,.,G,.,K,.,.,.,.,.,.,.,.,S,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,S,L,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,Q,.,11,0.916030534351145, +A-ON-309-2017,1,No_Match, 
,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,N,.,.,.,.,.,.,.,.,.,.,.,K,.,.,.,.,.,.,K,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,L,.,.,.,K,T,.,I,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,Q,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,8,0.9389312977099237, +A-QC-315-2017,1,3C.2a1_+_N121K_+_K92R_+_H311Q, ,.,.,.,.,.,.,.,.,.,.,.,G,.,.,.,.,.,.,.,.,.,.,.,.,R,.,N,.,.,.,.,K,.,.,.,A,.,.,.,.,.,K,.,.,.,G,.,.,.,.,.,.,.,.,.,.,.,T,.,.,.,.,.,.,K,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,L,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,Q,.,12,0.9083969465648855, +A-QC-309-2017,1,3C.2a1_+_N121K_+_K92R_+_H311Q, ,.,.,.,.,.,.,.,.,.,.,.,G,.,.,.,D,.,.,.,.,.,.,.,.,R,.,N,.,.,.,.,K,D,.,.,.,.,.,.,.,.,K,.,.,.,G,.,.,.,.,.,.,.,.,.,.,.,T,.,.,.,.,.,.,K,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,L,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,Q,.,13,0.9007633587786259, +A-QC-303-2017,1,3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H, ,.,.,.,.,.,.,.,N,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,N,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,G,.,R,.,.,.,.,.,.,.,.,.,T,.,.,.,.,.,.,K,.,.,.,.,.,.,.,.,.,.,.,.,.,T,.,L,.,H,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,L,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,11,0.916030534351145, +A-QC-316-2017,1,3C.2a_+_T131K_+_R142K_+_R261Q, ,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,R,.,N,.,.,.,.,.,.,.,.,.,.,.,K,.,.,.,.,.,.,K,.,.,.,.,.,.,.,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,L,.,.,.,.,T,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,Q,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,8,0.9389312977099237,