5
|
1
|
1
|
2 """
|
|
3 Python-code: Merge Scaffold Samples Report files
|
|
4 @author = Brent Kuenzi
|
|
5 @email = Brent.Kuenzi@moffitt.org
|
|
6 """
|
|
7 #######################################################################################
|
|
8 ## Description: ##
|
|
9 # This program will filter a fasta file using a data file
|
|
10 # containing an "Accession" or "Accession Number" column
|
|
11 ## Required input: ##
|
|
12 # 1) fasta file to be filtered
|
|
13 # 2) data file containing a "Accession" or "Accession Number" column
|
|
14
|
|
15
|
|
16 import sys
|
|
17 import os
|
|
18 def readtab(infile): # read in tab-delim text
|
|
19 with open(infile,'r') as x:
|
|
20 output = []
|
|
21 for line in x:
|
|
22 line = line.strip()
|
|
23 temp = line.split('\t')
|
|
24 output.append(temp)
|
|
25 return output
|
|
26 def getAccessions(infile): # get list of protein accessions from your data
|
|
27 data = readtab(infile)
|
|
28 cnt = 0
|
|
29 header_start = 0
|
|
30 prot_start = 0
|
|
31 for i in data:
|
10
|
32 if "Accession Number" in i: # finds the start of header
|
1
|
33 header_start = cnt
|
|
34 break
|
|
35 cnt += 1
|
|
36 header = data[header_start]
|
|
37 if "Accession Number" in header:
|
|
38 prot_start = header.index("Accession Number")
|
|
39 elif "Accession" in header:
|
|
40 prot_start = header.index("Accession")
|
|
41 proteins = []
|
10
|
42 for protein in data[header_start:]:
|
|
43 if len(protein) > prot_start:
|
|
44 proteins.append(protein[prot_start])
|
1
|
45 return proteins
|
|
46 def FilterFastaSeq(infile,accession): # fasta file and UniprotID/SwissprotID
|
|
47 input_data = readtab(infile)
|
|
48 seq=[]
|
|
49 header=[]
|
|
50 temp=[]
|
|
51 flag = 0
|
|
52 cnt = 0
|
|
53 for i in input_data:
|
|
54 cnt+=1
|
|
55 if flag == 1: # once we have a hit, start adding the sequences
|
|
56 if ">" not in i[0]: # don't add the headers to the sequence
|
|
57 temp.append(i[0])
|
|
58 if i[0].startswith(">"): # is it a fasta header?
|
|
59 if temp != []: # if it is a continued fasta header, add old sequences to the sequence list
|
|
60 # will this cutoff the last on of the file?
|
|
61 merged = "\n".join(temp)
|
|
62 if merged!="":
|
|
63 seq.append(merged)
|
|
64 temp=[]
|
|
65 lst = i[0].split('|')
|
|
66 ID1 = lst[1]
|
|
67 lst2 = lst[2].split(' ')
|
|
68 ID2 = lst2[0]
|
|
69 flag = 0
|
|
70 if ID1 in accession: # are the IDs part of your data?
|
|
71 flag+=1
|
|
72 header.append(i[0]) # if it is then append it
|
|
73 if ID2 in accession: # are the IDs part of your data?
|
|
74 flag+=1
|
|
75 header.append(i[0]) # if it is then append it
|
|
76 if cnt == len(input_data): # account for last fasta sequence in file
|
|
77 if temp != []:
|
|
78 merged = "\n".join(temp)
|
|
79 if merged!="":
|
|
80 seq.append(merged)
|
|
81 cnt=0
|
9
|
82 x = open("output.txt","w")
|
7
|
83 for i in header:
|
|
84 x.write(i+'\n'+seq[cnt]+'\n')
|
|
85 cnt+=1
|
|
86 x.close()
|
10
|
87 #fasta = sys.argv[1] # fasta file to filter
|
|
88 #data = sys.argv[2] # scaffold report #2 -- filename
|
|
89 fasta = r"C:\Users\Owner\Desktop\APOSTL\SAINT_preprocessing\SwissProt_HUMAN_2014_08.fasta"
|
|
90 data = r"C:\Users\Owner\Desktop\APOSTL\Scaffold\scaffold_EGFR.txt"
|
1
|
91
|
|
92 FilterFastaSeq(fasta,getAccessions(data))
|
10
|
93 os.rename("output.txt", "output1.txt") |