Mercurial > repos > public-health-bioinformatics > assign_clades

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/assign_clades.py	Thu Jul 04 19:34:32 2019 -0400
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+
+'''Accepts fasta files containing amino acid sequence, reading them in as
+amino acid sequence objects.  Reads influenza clade defintions (i.e. amino
+acids at certain positions) from .csv file into dictionary structure. Searches
+each of the amino acid sequence objects for a list of matching clades, assigns
+the most 'evolved' (i.e. child as opposed to parent clade) to the sequence. Appends
+"_cladename" to the Sequence name and generates a fasta file of original sequences with
+modified names.'''
+
+'''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Oct 2017'''
+
+import sys,string,os, time, Bio
+from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord
+from Bio.SeqRecord import SeqRecord
+from Bio.Alphabet import IUPAC
+from Bio.Seq import Seq
+
+localtime = time.asctime(time.localtime(time.time())) #date and time of analysis
+inFileHandle1 = sys.argv[1] #batch fasta file with sequences to be parsed
+inFileHandle2 = sys.argv[2] # .csv file containing clade definitions and "depth"
+outFileHandle = sys.argv[3] #user-specified name for output file of aa seq's with clade suffixes
+outFile= open(outFileHandle,'w') #open a writable, appendable output file
+seqList = [] #list of aa sequence objects to parse for clade definitions
+cladeList = [] #empty list to hold clade tuples i.e. ("3C.3a", 1 ,{"3":"I", "9":"V"..})
+
+'''Searches record for required amino acids at defined positions. If found, assigns
+clade name to sequence name by appending underscore and clade name to record id.'''
+def call_clade(record):
+    print("---------------------------------------------------------------------")
+    print("Parsing %s for matching flu clade definitions..." % (record.id))
+    matchList = [] #empty list to hold clades that match 100%
+    #iterate over each tuple in the clade list
+    for clade in cladeList:
+        cladeName = clade[0] #temp variable for name
+        depth = clade[1] #temp variable for depth
+        sites = clade[2] #temp variable for aa def dictionary
+        shouldFind = len(sites) #number of sites that should match
+        found = 0 #a counter to hold matches to antigenic sites
+        #iterate over each position in sites dictionary
+        for pos, aa in sites.items():
+            #translate pos to corresponding index in target sequence
+            index = int(pos) - 1
+            #if record at index has same amino acid as 'aa', increment 'found'
+            if record[index] == aa:
+                found += 1
+        if (found == shouldFind):
+            #add the matching clade tuple to the list of matches
+            matchList.append(clade)
+    return matchList
+
+'''Compares depth level of clades in a list and returns the most granular one'''
+def decide_clade_by_depth(matchList):
+    #empty variable for maximum depth encountered
+    max_depth = 0
+    best_match_name = '' #variable to hold most granular clade
+    #for each matching clade, check depth of the corresponding tuple
+    for clade in matchList:
+        #if the current clade is 'deeper' than the one before it
+        if clade[1] > max_depth:
+            #store this depth
+            max_depth = clade[1]
+            #store name of the clade
+            best_match_name = clade[0]
+    return best_match_name
+
+'''opens the .csv file of clade definitions and clade "depth" '''
+with open (inFileHandle2, 'r') as clade_file:
+    #remove whitespace from the end of each line and split elements at commas
+    for line in clade_file:
+        #print "Current Line in File:" + line
+        sites={} #initialize a dictionary for clade
+        elementList = line.rstrip().split(',')
+        new_list = [] #start a new list to put non-empty strings into
+        #remove empty stings in list
+        for item in elementList:
+            if item != '':
+                new_list.append(item)
+        name = new_list.pop(0) #move 1st element to name field
+        depth = int(new_list.pop(0)) #move 2nd element to depth field
+        #read remaining pairs of non-null elements into clade def dictionary
+        for i in range(0, len(new_list), 2):
+            #move next 2 items from the list into the dict
+            pos = new_list[i]
+            aa = new_list[i + 1]
+            sites[pos] = aa
+        #add the clade info as a tuple to the cladeList[]
+        oneClade =(name, depth, sites)
+        cladeList.append(oneClade)
+    print("The List of Clades:")
+    for clade in cladeList:
+        print("Clade Name: %s Depth: %i Antigenic Sites: %i" % (clade[0], clade[1], len(clade[2])))
+        for pos, aa in clade[2].items():
+            print("Pos: %s\tAA: %s" % (pos,aa))
+
+'''opens readable input file of sequences to parse using filename from cmd line,
+    instantiates as AA Sequence objects, with ppercase sequences'''
+with open(inFileHandle1,'r') as inFile:
+    #read in Sequences from fasta file, uppercase and add to seqList
+    for record in SeqIO.parse(inFile, "fasta", alphabet=IUPAC.protein):
+        record = record.upper()
+        seqList.append(record) #add Seq to list of Sequences
+    print("\n%i flu HA sequences will be compared to current clade definitions..." % len(seqList))
+    #parse each target sequence object
+    for record in seqList:
+        clade_call = '' #empty variale for final clade call on sequence
+        matchingCladeList = call_clade(record) #holds matching clade tuples
+        #if there is more than one clade match
+        if len(matchingCladeList) > 1:
+            #choose the most granular clade based on depth
+            clade_call = decide_clade_by_depth(matchingCladeList)
+        #if there is only one clade call
+        elif len(matchingCladeList) > 0:
+            clade = matchingCladeList[0] #take the first tuple in the list
+            clade_call = clade[0] #clade name is the first item in the tuple
+        #empty list return, no matches
+        else:
+            clade_call = "No_Match"
+        print(clade_call)
+        seq_name = record.id
+        mod_name = seq_name + "_" + clade_call
+        print("New Sequence Name: " + mod_name)
+        record.id = mod_name
+
+
+#output fasta file with clade calls appended to sequence names
+SeqIO.write(seqList,outFile,"fasta")
+
+#print("\n%i Sequences Extracted to Output file: %s"  % ((len(extractedSeqList),outFileHandle)))
+inFile.close()
+clade_file.close()
+outFile.close()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/assign_clades.xml	Thu Jul 04 19:34:32 2019 -0400
@@ -0,0 +1,29 @@
+<tool id="assign_clades" name="Assign Clades" version="0.0.1">
+  <requirements>
+    <requirement type="package" version="1.70">biopython</requirement>
+  </requirements>
+  <command detect_errors="exit_code"><![CDATA[
+    python $__tool_directory__/assign_clades.py
+    '$input_fasta'
+    '$clade_definitions'
+    '$output_file'
+  ]]></command>
+  <inputs>
+    <param name="input_fasta" format="fasta" type="data" />
+    <param name="clade_definitions" format="csv" type="data" />
+  </inputs>
+  <outputs>
+    <data name="output_file" format="fasta"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input_fasta" value="input_fasta.fasta" />
+      <param name="clade_definitions" value="clades.csv" />
+      <output name="output_file" value="output.fasta" />
+    </test>
+  </tests>
+  <help><![CDATA[
+  ]]></help>
+  <citations>
+  </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/clades.csv	Thu Jul 04 19:34:32 2019 -0400
@@ -0,0 +1,15 @@
+3C.2a,1,3,I,144,S,145,S,159,Y,160,T,225,D,311,H,489,N,,,,,,,,,,,,,,
+3C.2a_+_T131K_+_R142K_+_R261Q,2,3,I,131,K,142,K,144,S,145,S,159,Y,160,T,225,D,261,Q,311,H,489,N,,,,,,,,
+3C.2a_+_N121K_+_S144K,2,3,I,121,K,144,K,145,S,159,Y,160,T,225,D,311,H,489,N,,,,,,,,,,,,
+3C.2a_+_Q197K_+_R261Q,2,3,I,144,S,145,S,159,Y,160,T,197,K,225,D,261,Q,311,H,489,N,,,,,,,,,,
+3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H_,2,3,I,31,S,53,N,142,G,144,R,145,S,159,Y,160,T,171,K,192,T,197,H,225,D,311,H,489,N,,
+3C.2a1_(w/o_N121K),3,3,I,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,,,,
+3C.2a1_+_N121K,4,3,I,121,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,,
+3C.2a1_+_N121K_+_R142G_+_I242V,4,3,I,121,K,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,242,V,311,H,406,V,479,E,484,E,489,N
+3C.2a1_+_N121K_+_R142G,4,3,I,121,K,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,
+3C.2a1_+_N121K_+_T135K,4,3,I,121,K,135,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,479,E,484,E,489,N,,
+3C.2a1_+_N121K_+_K92R_+_H311Q,4,3,I,92,R,121,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,Q,406,V,484,E,489,N,,,,
+3C.2a1_+_N121K_+_I140M,4,3,I,121,K,140,M,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,479,E,484,E,489,N,,
+3C.2a1_+_R142G,4,3,I,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,,
+3C.2a1_+_S47T_+_G78S,4,3,I,47,T,78,S,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,
+3C.3a,1,128,A,138,S,142,G,145,S,159,S,225,D,,,,,,,,,,,,,,,,,,
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input_fasta.fasta	Thu Jul 04 19:34:32 2019 -0400
@@ -0,0 +1,2 @@
+>test
+QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERNKAYSSCYPYDVPDYASLRSLVASSGTLEFNNESFNWAGVTQNGTSSSCIRGSKSSFFSRLNWLTHLNSKYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQISLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPERQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDW
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output.fasta	Thu Jul 04 19:34:32 2019 -0400
@@ -0,0 +1,10 @@
+>test_3C.3a test
+QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILD
+GENCTLIDALLGDPQCDGFQNKKWDLFVERNKAYSSCYPYDVPDYASLRSLVASSGTLEF
+NNESFNWAGVTQNGTSSSCIRGSKSSFFSRLNWLTHLNSKYPALNVTMPNNEQFDKLYIW
+GVHHPGTDKDQISLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPG
+DILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRI
+TYGACPRYVKQSTLKLATGMRNVPERQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEG
+RGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDL
+WSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGS
+IRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDW