annotate filter_fasta.py @ 15:3833aebf8d3a draft default tip

Uploaded
author bornea
date Wed, 24 Aug 2016 13:00:17 -0400
parents d66c62c6c56f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
af454e5a9ef5 Uploaded
bornea
parents: 1
diff changeset
1
1
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
2 """
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
3 Python-code: Merge Scaffold Samples Report files
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
4 @author = Brent Kuenzi
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
5 @email = Brent.Kuenzi@moffitt.org
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
6 """
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
7 #######################################################################################
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
8 ## Description: ##
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
9 # This program will filter a fasta file using a data file
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
10 # containing an "Accession" or "Accession Number" column
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
11 ## Required input: ##
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
12 # 1) fasta file to be filtered
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
13 # 2) data file containing a "Accession" or "Accession Number" column
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
14
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
15
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
16 import sys
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
17 import os
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
18 def readtab(infile): # read in tab-delim text
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
19 with open(infile,'r') as x:
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
20 output = []
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
21 for line in x:
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
22 line = line.strip()
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
23 temp = line.split('\t')
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
24 output.append(temp)
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
25 return output
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
26 def getAccessions(infile): # get list of protein accessions from your data
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
27 data = readtab(infile)
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
28 cnt = 0
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
29 header_start = 0
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
30 prot_start = 0
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
31 for i in data:
10
c9e6e2e7697c Uploaded
bornea
parents: 9
diff changeset
32 if "Accession Number" in i: # finds the start of header
1
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
33 header_start = cnt
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
34 break
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
35 cnt += 1
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
36 header = data[header_start]
14
d66c62c6c56f Uploaded
bornea
parents: 12
diff changeset
37 for i in header:
d66c62c6c56f Uploaded
bornea
parents: 12
diff changeset
38 if i == "Accession":
d66c62c6c56f Uploaded
bornea
parents: 12
diff changeset
39 prot_start = header.index("Accession")
d66c62c6c56f Uploaded
bornea
parents: 12
diff changeset
40 if i == "Accession Number":
d66c62c6c56f Uploaded
bornea
parents: 12
diff changeset
41 prot_start = header.index("Accession Number")
d66c62c6c56f Uploaded
bornea
parents: 12
diff changeset
42 if i == "Main Accession":
d66c62c6c56f Uploaded
bornea
parents: 12
diff changeset
43 prot_start = header.index("Main Accession")
1
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
44 proteins = []
10
c9e6e2e7697c Uploaded
bornea
parents: 9
diff changeset
45 for protein in data[header_start:]:
c9e6e2e7697c Uploaded
bornea
parents: 9
diff changeset
46 if len(protein) > prot_start:
c9e6e2e7697c Uploaded
bornea
parents: 9
diff changeset
47 proteins.append(protein[prot_start])
1
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
48 return proteins
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
49 def FilterFastaSeq(infile,accession): # fasta file and UniprotID/SwissprotID
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
50 input_data = readtab(infile)
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
51 seq=[]
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
52 header=[]
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
53 temp=[]
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
54 flag = 0
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
55 cnt = 0
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
56 for i in input_data:
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
57 cnt+=1
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
58 if flag == 1: # once we have a hit, start adding the sequences
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
59 if ">" not in i[0]: # don't add the headers to the sequence
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
60 temp.append(i[0])
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
61 if i[0].startswith(">"): # is it a fasta header?
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
62 if temp != []: # if it is a continued fasta header, add old sequences to the sequence list
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
63 # will this cutoff the last on of the file?
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
64 merged = "\n".join(temp)
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
65 if merged!="":
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
66 seq.append(merged)
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
67 temp=[]
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
68 lst = i[0].split('|')
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
69 ID1 = lst[1]
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
70 lst2 = lst[2].split(' ')
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
71 ID2 = lst2[0]
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
72 flag = 0
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
73 if ID1 in accession: # are the IDs part of your data?
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
74 flag+=1
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
75 header.append(i[0]) # if it is then append it
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
76 if ID2 in accession: # are the IDs part of your data?
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
77 flag+=1
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
78 header.append(i[0]) # if it is then append it
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
79 if cnt == len(input_data): # account for last fasta sequence in file
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
80 if temp != []:
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
81 merged = "\n".join(temp)
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
82 if merged!="":
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
83 seq.append(merged)
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
84 cnt=0
9
6794a638a504 Uploaded
bornea
parents: 8
diff changeset
85 x = open("output.txt","w")
7
e01b49c112b0 Uploaded
bornea
parents: 6
diff changeset
86 for i in header:
e01b49c112b0 Uploaded
bornea
parents: 6
diff changeset
87 x.write(i+'\n'+seq[cnt]+'\n')
e01b49c112b0 Uploaded
bornea
parents: 6
diff changeset
88 cnt+=1
e01b49c112b0 Uploaded
bornea
parents: 6
diff changeset
89 x.close()
11
573c36ff075f Uploaded
bornea
parents: 10
diff changeset
90 fasta = sys.argv[1] # fasta file to filter
573c36ff075f Uploaded
bornea
parents: 10
diff changeset
91 data = sys.argv[2] # scaffold report #2 -- filename
1
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
92
b25b0f210e10 Uploaded
bornea
parents:
diff changeset
93 FilterFastaSeq(fasta,getAccessions(data))
12
05d48b118f81 Uploaded
bornea
parents: 11
diff changeset
94 os.rename("output.txt", sys.argv[3])