Mercurial > repos > bornea > saint_preprocessing
comparison mzID_process2.py @ 65:a551998e1068 draft
Uploaded
author | bornea |
---|---|
date | Sat, 27 Aug 2016 22:13:31 -0400 |
parents | |
children | b8169cc0b860 |
comparison
equal
deleted
inserted
replaced
64:e382e24af909 | 65:a551998e1068 |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """ | |
3 Python-code: Preprocess mzIdentML | |
4 @author = Brent Kuenzi | |
5 @email = Brent.Kuenzi@moffitt.org | |
6 """ | |
7 ####################################################################################### | |
8 ## Description: ## | |
9 # This program will create inter, prey, and bait files from mzIdentML files | |
10 ## Required input: ## | |
11 # 1) mzIdentML file to be reformatted | |
12 # 2) minimum PSM for quantification | |
13 | |
14 | |
15 import sys | |
16 import os | |
17 | |
18 ins_path = sys.argv[5] | |
19 | |
20 class ReturnValue1(object): | |
21 def __init__(self, sequence, gene): | |
22 self.seqlength = sequence | |
23 self.genename = gene | |
24 class ReturnValue2(object): | |
25 def __init__(self, inter, accessions): | |
26 self.inter = inter | |
27 self.accessions = accessions | |
28 def read_tab(infile): | |
29 with open(infile,'r') as x: | |
30 output = [] | |
31 for line in x: | |
32 line = line.strip() | |
33 temp = line.split('\t') | |
34 output.append(temp) | |
35 return output | |
36 def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 100): | |
37 """ | |
38 Call in a loop to create terminal progress bar | |
39 @params: | |
40 iteration - Required : current iteration (Int) | |
41 total - Required : total iterations (Int) | |
42 prefix - Optional : prefix string (Str) | |
43 suffix - Optional : suffix string (Str) | |
44 decimals - Optional : positive number of decimals in percent complete (Int) | |
45 barLength - Optional : character length of bar (Int) | |
46 """ | |
47 formatStr = "{0:." + str(decimals) + "f}" | |
48 percents = formatStr.format(100 * (iteration / float(total))) | |
49 filledLength = int(round(barLength * iteration / float(total))) | |
50 bar = '=' * filledLength + '-' * (barLength - filledLength) | |
51 sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix)), | |
52 sys.stdout.flush() | |
53 if iteration == total: | |
54 sys.stdout.write('\n') | |
55 sys.stdout.flush() | |
56 def get_info(uniprot_accession_in,fasta_db): | |
57 # Get aminoacid lengths and gene name. | |
58 error = open('error proteins.txt', 'a+') | |
59 data = open(fasta_db, 'r') | |
60 data_lines = data.readlines() | |
61 db_len = len(data_lines) | |
62 seqlength = 0 | |
63 count = 0 | |
64 last_line = data_lines[-1] | |
65 for data_line in data_lines: | |
66 if ">sp" in data_line: | |
67 namer = data_line.split("|")[2] | |
68 if uniprot_accession_in == data_line.split("|")[1]: | |
69 match = count + 1 | |
70 if 'GN=' in data_line: | |
71 lst = data_line.split('GN=') | |
72 lst2 = lst[1].split(' ') | |
73 genename = lst2[0] | |
74 if 'GN=' not in data_line: | |
75 genename = 'NA' | |
76 while ">sp" not in data_lines[match]: | |
77 if match <= db_len: | |
78 seqlength = seqlength + len(data_lines[match].strip()) | |
79 if data_lines[match] == last_line: | |
80 break | |
81 match = match + 1 | |
82 else: | |
83 break | |
84 return ReturnValue1(seqlength, genename) | |
85 if uniprot_accession_in == namer.split(" ")[0]: | |
86 match = count + 1 | |
87 # Ensures consistent spacing throughout. | |
88 if 'GN=' in data_line: | |
89 lst = data_line.split('GN=') | |
90 lst2 = lst[1].split(' ') | |
91 genename = lst2[0] | |
92 if 'GN=' not in data_line: | |
93 genename = 'NA' | |
94 while ">sp" not in data_lines[match]: | |
95 if match <= db_len: | |
96 seqlength = seqlength + len(data_lines[match].strip()) | |
97 if data_lines[match] == last_line: | |
98 break | |
99 match = match + 1 | |
100 else: | |
101 break | |
102 return ReturnValue1(seqlength, genename) | |
103 count = count + 1 | |
104 if seqlength == 0: | |
105 error.write(uniprot_accession_in + '\t' + "Uniprot not in Fasta" + '\n') | |
106 error.close | |
107 seqlength = 'NA' | |
108 genename = 'NA' | |
109 return ReturnValue1(seqlength, genename) | |
110 def make_inter(mzIdentML,replicate,grouping): | |
111 accession_index = mzIdentML[0].index("accession") | |
112 PSMs = {} | |
113 accessions = [] | |
114 cnt = 0 | |
115 unique_lines = [mzIdentML[1:]] | |
116 for i in mzIdentML[1:]: | |
117 PSMs[i[accession_index]] = 0 | |
118 if i[accession_index] not in accessions: | |
119 accessions.append(i[accession_index]) | |
120 if i not in unique_lines: | |
121 unique_lines.append(i) | |
122 for i in accessions: | |
123 for j in unique_lines[1:]: | |
124 if j[accession_index] == i: | |
125 PSMs[j[accession_index]] +=1 | |
126 inter = "" | |
127 for i in accessions: | |
128 inter = inter + replicate + "\t" + grouping + "\t" + i + "\t" + str(PSMs[i]) + "\n" | |
129 return ReturnValue2(inter,accessions) | |
130 | |
131 | |
132 files = sys.argv[1] | |
133 file_list = files.split(",") | |
134 bait = read_tab(sys.argv[2]) | |
135 make_prey = sys.argv[3] | |
136 db = sys.argv[4] | |
137 if db == "None": | |
138 db = str(ins_path) + "/SwissProt_HUMAN_2015_12.fasta" | |
139 make_bait = sys.argv[6] | |
140 bait_bool = sys.argv[7] | |
141 prey_file = sys.argv[8] | |
142 bait_out = sys.argv[9] | |
143 inter_out = sys.argv[10] | |
144 | |
145 def bait_create(baits, infile): | |
146 # Verifies the Baits are valid in the Scaffold file and writes the Bait.txt. | |
147 baits = make_bait.split() | |
148 i = 0 | |
149 bait_file_tmp = open("bait.txt", "w") | |
150 order = [] | |
151 bait_cache = [] | |
152 while i < len(baits): | |
153 if baits[i+2] == "true": | |
154 T_C = "C" | |
155 else: | |
156 T_C = "T" | |
157 bait_line = baits[i] + "\t" + baits[i+1] + "\t" + T_C + "\n" | |
158 bait_cache.append(str(bait_line)) | |
159 i = i + 3 | |
160 | |
161 for cache_line in bait_cache: | |
162 bait_file_tmp.write(cache_line) | |
163 | |
164 bait_file_tmp.close() | |
165 | |
166 if bait_bool == 'false': | |
167 bait_create(make_bait, infile) | |
168 bait = "bait.txt" | |
169 else: | |
170 bait_temp_file = open(sys.argv[9], 'r') | |
171 bait_cache = bait_temp_file.readlines() | |
172 bait_file_tmp = open("bait.txt", "wr") | |
173 for cache_line in bait_cache: | |
174 bait_file_tmp.write(cache_line) | |
175 bait_file_tmp.close() | |
176 bait = "bait.txt" | |
177 | |
178 inter = "" | |
179 cnt = 0 | |
180 accessions = [] | |
181 for i in file_list: | |
182 cmd = (r"Rscript "+ str(ins_path) +"flatten_mzIdentML.R " + i) | |
183 os.system(cmd) | |
184 mzIdentML = read_tab("flat_mzIdentML.txt") | |
185 inter = inter + make_inter(mzIdentML,bait[cnt][0],bait[cnt][1]).inter | |
186 accessions.append(make_inter(mzIdentML,bait[cnt][0],bait[cnt][1]).accessions) | |
187 cnt+=1 | |
188 | |
189 with open("inter.txt","w") as x: | |
190 x.write(inter) | |
191 if make_prey == "Y": | |
192 unique_accessions = [] | |
193 prey = "" | |
194 for i in accessions: | |
195 for j in i: | |
196 if j not in unique_accessions: | |
197 unique_accessions.append(j) | |
198 start = 0 | |
199 end = len(unique_accessions) | |
200 printProgress(start,end,prefix = "Making Prey File:",suffix = "Complete",barLength=50) | |
201 | |
202 for i in unique_accessions: | |
203 prey = prey + i + "\t" + str(get_info(i,db).seqlength) + "\t" + get_info(i,db).genename + "\n" | |
204 start+=1 | |
205 printProgress(start, end) | |
206 with open("prey.txt","w") as x: | |
207 x.write(prey) | |
208 | |
209 os.rename("bait.txt", sys.argv[2]) | |
210 os.rename("inter.txt", sys.argv[10]) | |
211 if str(prey) != "None": | |
212 os.rename("prey.txt", sys.argv[11]) |