Mercurial > repos > public-health-bioinformatics > antigenic_site_extraction
comparison antigenic_site_extraction.py @ 0:a1b46e339580 draft default tip
planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit b96b6e06f6eaa6ae8ef4c24630dbb72a4aed7dbe
| author | public-health-bioinformatics |
|---|---|
| date | Thu, 04 Jul 2019 19:36:38 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:a1b46e339580 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 '''Accepts fasta files of amino acid sequence, extracts specific amino acids (defined in a csv index array), | |
| 4 and outputs extracted sequences - representing flu antigenic sites - to fasta (default) or csv.''' | |
| 5 | |
| 6 '''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory,Sept 2017''' | |
| 7 | |
| 8 import sys,string,os, time, Bio, argparse | |
| 9 from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord | |
| 10 from Bio.SeqRecord import SeqRecord | |
| 11 from Bio.Alphabet import IUPAC | |
| 12 from Bio.Seq import Seq | |
| 13 | |
| 14 #parse command line arguments | |
| 15 parser = argparse.ArgumentParser() | |
| 16 parser.add_argument("-c","--csv",help="export extracted antigenic sites to csv file",action="store_true") | |
| 17 parser.add_argument("inFileHandle1") #batch fasta file with sequences to be parsed | |
| 18 parser.add_argument("inFileHandle2") # .csv file containing positions of aa's to extract | |
| 19 parser.add_argument("outFileHandle") #user-specified name for output file of extracted aa seq's | |
| 20 args = parser.parse_args() | |
| 21 | |
| 22 #inFileHandle1 = sys.argv[1] #batch fasta file with sequences to be parsed | |
| 23 #inFileHandle2 = sys.argv[2] # .csv file containing positions of aa's to extract | |
| 24 #outFileHandle = sys.argv[3] #user-specified name for output file of extracted aa seq's | |
| 25 | |
| 26 outFile= open(args.outFileHandle,'w') #open a writable, appendable output file | |
| 27 localtime = time.asctime(time.localtime(time.time())) #date and time of analysis | |
| 28 seqList = [] #list of aa sequence objects to parse for oligo sequences | |
| 29 indexArray = [] # .csv list of aa's corresponding to antigenic site positions | |
| 30 extractedSeqList = [] #list of extracted antigenic sites extracted from seqList | |
| 31 | |
| 32 def extract_aa_from_sequence(record): | |
| 33 """Extract specific amino acids from SeqRecord, create new SeqRecord and append to list.""" | |
| 34 original_sequence = str(record.seq) #pull out the SeqRecord's Seq object and ToString it | |
| 35 new_sequence = "" #set variable to empty | |
| 36 new_id = record.id #store the same sequence id as the original sequence | |
| 37 #iterate over each position in index array, extract corresponding aa and add to string | |
| 38 for pos in indexArray: | |
| 39 char = original_sequence[pos-1] #aa positions must be zero indexed | |
| 40 new_sequence = new_sequence + char | |
| 41 rec = SeqRecord(Seq(new_sequence,IUPAC.protein), id = record.id, name = "", description = "") | |
| 42 extractedSeqList.append(rec) #add new SeqRecord object to the list | |
| 43 | |
| 44 with open (args.inFileHandle2,'r') as inFile2: | |
| 45 '''Open csv file containing amino acid positions to extract and add to list.''' | |
| 46 #read items separated by comma's to position list | |
| 47 positionList = "" | |
| 48 for line in inFile2: | |
| 49 #remove whitespace from the end of each line | |
| 50 strippedLine = line.rstrip() | |
| 51 #split the line at commas and assigned the returned list as indexArray | |
| 52 positionList = strippedLine.split(',') | |
| 53 #Convert string items in positionList from strings to int and add to indexArray | |
| 54 for item in positionList: | |
| 55 indexArray.append(int(item)) | |
| 56 #print number of amino acids to extract and array to console as user check | |
| 57 print("Amino Acid positions to extract: %i " %(len(indexArray))) | |
| 58 print(indexArray) | |
| 59 | |
| 60 with open(args.inFileHandle1,'r') as inFile: | |
| 61 '''Open fasta of amino acid sequences to parse, uppercase and add to protein Sequence list.''' | |
| 62 #read in Sequences from fasta file, uppercase and add to seqList | |
| 63 for record in SeqIO.parse(inFile, "fasta", alphabet=IUPAC.protein): | |
| 64 record = record.upper() | |
| 65 seqList.append(record) #add Seq to list of Sequences | |
| 66 #print number of sequences to be process as user check | |
| 67 print("\n%i flu sequences will be extracted for antigenic sites..." % len(seqList)) | |
| 68 #parse each target sequence object | |
| 69 for record in seqList: | |
| 70 extract_aa_from_sequence(record) | |
| 71 | |
| 72 #print original and extracted sequence | |
| 73 for x in range(0, len(seqList)): | |
| 74 print("Original %s: %i amino acids,\tExtracted: %i" % (seqList[x].id,len(seqList[x]),len(extractedSeqList[x]))) | |
| 75 | |
| 76 #determine if output format is fasta (default) or csv | |
| 77 if args.csv: | |
| 78 #write csv file of extracted antigenic sits | |
| 79 for record in extractedSeqList: | |
| 80 #outFile.write(record.id),"," | |
| 81 name_part = (record.id).rstrip() + ',' | |
| 82 sequence = str(record.seq).strip() | |
| 83 csv_seq = ",".join(sequence) | |
| 84 comma_separated_sequence = name_part + csv_seq + "\n" | |
| 85 print(comma_separated_sequence) | |
| 86 outFile.write(comma_separated_sequence) | |
| 87 else: | |
| 88 #write fasta file of extracted antigenic sites | |
| 89 SeqIO.write(extractedSeqList,outFile,"fasta") | |
| 90 | |
| 91 print("\n%i Sequences Extracted to Output file: %s" % ((len(extractedSeqList),args.outFileHandle))) | |
| 92 inFile.close() | |
| 93 inFile2.close() | |
| 94 outFile.close() | |
| 95 |
