comparison antigenic_site_extraction.py @ 0:a1b46e339580 draft default tip

planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit b96b6e06f6eaa6ae8ef4c24630dbb72a4aed7dbe
author public-health-bioinformatics
date Thu, 04 Jul 2019 19:36:38 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a1b46e339580
1 #!/usr/bin/env python
2
3 '''Accepts fasta files of amino acid sequence, extracts specific amino acids (defined in a csv index array),
4 and outputs extracted sequences - representing flu antigenic sites - to fasta (default) or csv.'''
5
6 '''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory,Sept 2017'''
7
8 import sys,string,os, time, Bio, argparse
9 from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord
10 from Bio.SeqRecord import SeqRecord
11 from Bio.Alphabet import IUPAC
12 from Bio.Seq import Seq
13
14 #parse command line arguments
15 parser = argparse.ArgumentParser()
16 parser.add_argument("-c","--csv",help="export extracted antigenic sites to csv file",action="store_true")
17 parser.add_argument("inFileHandle1") #batch fasta file with sequences to be parsed
18 parser.add_argument("inFileHandle2") # .csv file containing positions of aa's to extract
19 parser.add_argument("outFileHandle") #user-specified name for output file of extracted aa seq's
20 args = parser.parse_args()
21
22 #inFileHandle1 = sys.argv[1] #batch fasta file with sequences to be parsed
23 #inFileHandle2 = sys.argv[2] # .csv file containing positions of aa's to extract
24 #outFileHandle = sys.argv[3] #user-specified name for output file of extracted aa seq's
25
26 outFile= open(args.outFileHandle,'w') #open a writable, appendable output file
27 localtime = time.asctime(time.localtime(time.time())) #date and time of analysis
28 seqList = [] #list of aa sequence objects to parse for oligo sequences
29 indexArray = [] # .csv list of aa's corresponding to antigenic site positions
30 extractedSeqList = [] #list of extracted antigenic sites extracted from seqList
31
32 def extract_aa_from_sequence(record):
33 """Extract specific amino acids from SeqRecord, create new SeqRecord and append to list."""
34 original_sequence = str(record.seq) #pull out the SeqRecord's Seq object and ToString it
35 new_sequence = "" #set variable to empty
36 new_id = record.id #store the same sequence id as the original sequence
37 #iterate over each position in index array, extract corresponding aa and add to string
38 for pos in indexArray:
39 char = original_sequence[pos-1] #aa positions must be zero indexed
40 new_sequence = new_sequence + char
41 rec = SeqRecord(Seq(new_sequence,IUPAC.protein), id = record.id, name = "", description = "")
42 extractedSeqList.append(rec) #add new SeqRecord object to the list
43
44 with open (args.inFileHandle2,'r') as inFile2:
45 '''Open csv file containing amino acid positions to extract and add to list.'''
46 #read items separated by comma's to position list
47 positionList = ""
48 for line in inFile2:
49 #remove whitespace from the end of each line
50 strippedLine = line.rstrip()
51 #split the line at commas and assigned the returned list as indexArray
52 positionList = strippedLine.split(',')
53 #Convert string items in positionList from strings to int and add to indexArray
54 for item in positionList:
55 indexArray.append(int(item))
56 #print number of amino acids to extract and array to console as user check
57 print("Amino Acid positions to extract: %i " %(len(indexArray)))
58 print(indexArray)
59
60 with open(args.inFileHandle1,'r') as inFile:
61 '''Open fasta of amino acid sequences to parse, uppercase and add to protein Sequence list.'''
62 #read in Sequences from fasta file, uppercase and add to seqList
63 for record in SeqIO.parse(inFile, "fasta", alphabet=IUPAC.protein):
64 record = record.upper()
65 seqList.append(record) #add Seq to list of Sequences
66 #print number of sequences to be process as user check
67 print("\n%i flu sequences will be extracted for antigenic sites..." % len(seqList))
68 #parse each target sequence object
69 for record in seqList:
70 extract_aa_from_sequence(record)
71
72 #print original and extracted sequence
73 for x in range(0, len(seqList)):
74 print("Original %s: %i amino acids,\tExtracted: %i" % (seqList[x].id,len(seqList[x]),len(extractedSeqList[x])))
75
76 #determine if output format is fasta (default) or csv
77 if args.csv:
78 #write csv file of extracted antigenic sits
79 for record in extractedSeqList:
80 #outFile.write(record.id),","
81 name_part = (record.id).rstrip() + ','
82 sequence = str(record.seq).strip()
83 csv_seq = ",".join(sequence)
84 comma_separated_sequence = name_part + csv_seq + "\n"
85 print(comma_separated_sequence)
86 outFile.write(comma_separated_sequence)
87 else:
88 #write fasta file of extracted antigenic sites
89 SeqIO.write(extractedSeqList,outFile,"fasta")
90
91 print("\n%i Sequences Extracted to Output file: %s" % ((len(extractedSeqList),args.outFileHandle)))
92 inFile.close()
93 inFile2.close()
94 outFile.close()
95