# mirgene_functions.py @ 5:810e789ffeab (draft)
# author: glogobyte
# date:   Wed, 13 Oct 2021 16:04:54 +0000
import itertools
import urllib.request
from collections import OrderedDict
import copy

########################################################################################################################################################

""" Read a file and return its lines as a list """

def read(path, flag):
    # flag 0: keep the trailing newline of every line
    if flag == 0:
        with open(path) as fp:
            file = fp.readlines()
        return file

    # flag 1: strip the trailing newlines
    if flag == 1:
        with open(path) as fp:
            file = fp.read().splitlines()
        return file

# Write a list of rows to a tab-separated text file
def write(path, rows):
    with open(path, 'w') as fp:
        for x in rows:
            fp.write(str("\t".join(x[1:-1])))

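# Illustrative usage of the two helpers above (file names are hypothetical):
#
#   sam_lines = read("sample.sam", 0)          # flag 0 -> lines with newlines kept
#   rows = [["id", "name", "sequence", "10"]]
#   write("out.txt", rows)                     # writes "name\tsequence" (first and last fields are dropped)
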
########################################################################################################################################################

""" Detect the longest common substring between two miRNA sequences """

def longestSubstring(str1, str2):

    from difflib import SequenceMatcher

    # Initialize a SequenceMatcher object with the two input strings
    seqMatch = SequenceMatcher(None, str1, str2)

    # Find the longest matching block, e.g. Match(a=0, b=0, size=5)
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))

    # Return the longest common substring, if one exists
    if match.size != 0:
        return str1[match.a: match.a + match.size]
    else:
        print('No longest common sub-string found')

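# Illustrative call: the second sequence is fully contained in the first,
# so the whole of it is returned.
#
#   longestSubstring("TGAGGTAGTAGGTTGTATAGTT", "GAGGTAGTAGGTTGTATAG")
#   # -> "GAGGTAGTAGGTTGTATAG"
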
#################################################################################################################################################################

"""

Reads the SAM files from the alignment tool and does the following:

1) Keeps the mapped reads
2) Keeps all sequences with a length between 18 and 26 nucleotides
3) Detects the reference and templated miRNAs
4) Names the templated miRNAs based on the reference miRNAs

"""

def sam_edit(mature_mirnas, path, file, case, l, samples, data, file_order, unmap_seq, names_n_seqs, deseq, mirna_names, ini_sample, unmap_counts):

    # Read the SAM file
    ini_sam = read(path, 0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in main_sam if x[1] == '0' and len(x[9]) >= 18 and len(x[9]) <= 26]
    filter_sam = [[x[0], x[1], x[2], len(x[9])] for x in main_sam]
    sorted_uni_arms = []

    # Detection of differences between the canonical miRNA and the detected miRNA
    for i in range(len(mature_mirnas)):
        tmp_count_reads = 0  # total number of reads
        tmp_count_seq = 0    # total number of sequences
        for j in range(len(unique_seq)):

            if mature_mirnas[i] == unique_seq[j][2]:

                temp_mature = mature_mirnas[i+1]
                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                # Handling of some special miRNAs (e.g. hsa-miR-8485)
                if mat_diff[1] != 0 and unique_diff[1] != 0:
                    unique_seq[j] = 1
                    pre_pos = 0
                    post_pos = 0

                elif mat_diff[0] != 0 and unique_diff[0] != 0:
                    unique_seq[j] = 1
                    pre_pos = 0
                    post_pos = 0

                else:
                    # Keep the findings
                    pre_pos = mat_diff[0] - unique_diff[0]
                    post_pos = unique_diff[1] - mat_diff[1]
                    tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                    tmp_count_seq = tmp_count_seq + 1

                # Store the detected miRNAs with new names according to the findings
                if pre_pos != 0 or post_pos != 0:
                    if pre_pos == 0:
                        unique_seq[j][2] = unique_seq[j][2] + "_t_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos)
                    elif post_pos == 0:
                        unique_seq[j][2] = unique_seq[j][2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
                    else:
                        unique_seq[j][2] = unique_seq[j][2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + '{:+d}'.format(post_pos)

        # Remove the placeholder values "1" left by the special-miRNA handling (e.g. hsa-miR-8485)
        for x in range(unique_seq.count(1)):
            unique_seq.remove(1)

        # Metrics for the production of the database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads])

    # Sort the metrics for the database
    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)

    # Correct the metrics after the collapsing and removal of duplicates for the production of the database
    for y in sorted_uni_arms:
        counts = 0
        seqs = 0
        for x in unique_seq:
            if y[0] == x[2].split("_")[0] + "_" + x[2].split("_")[1]:
                counts += int(x[0].split("-")[1])
                seqs += 1

        y[1] = seqs
        y[2] = counts

    # Output variables
    temp_mirna_names = []

    l.acquire()
    if case == "c" or case == "t":
        temp_mirna_names.extend(z[2] for z in unique_seq)
        names_n_seqs.extend([[y[2], y[9]] for y in unique_seq])
        deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq])
        mirna_names.extend(temp_mirna_names)
        unmap_seq.value += sum([1 for x in main_sam if x[1] == '4'])
        unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4'])
        file_order.append(file)
        samples.append(unique_seq)
        data.append([case, file, unique_seq, sorted_uni_arms])
        ini_sample.append(filter_sam)
    l.release()

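# Naming sketch (illustrative values): a mapped read assigned to hsa-let-7a-5p with a
# 5' difference of +1 and a 3' difference of -2 relative to the canonical sequence is
# renamed "hsa-let-7a-5p_t_+1_-2"; a zero offset is written without a sign
# (e.g. "hsa-let-7a-5p_t_0_+2").
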
######################################################################################################################################
"""

Reads a SAM file from Bowtie and does the following:

1) Keeps the unmapped reads
2) Keeps all sequences with a length between 18 and 26 nucleotides
3) Detects the non-templated isomiRs
4) Names the isomiRs based on the reference miRNAs

"""

def non_sam_edit(mature_mirnas, path, file, case, l, data, file_order, n_deseq, names_n_seqs):

    # Read the SAM file
    ini_sam = read(path, 0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in main_sam if x[1] == '4' and len(x[9]) >= 18 and len(x[9]) <= 26]
    uni_seq = []

    # Calculate the shifted positions for every isomiR and add them to its name
    sorted_uni_arms = []
    for i in range(1, len(mature_mirnas), 2):
        tmp_count_reads = 0  # total number of reads
        tmp_count_seq = 0    # total number of sequences

        for j in range(len(unique_seq)):

            temp_mature = mature_mirnas[i].strip().replace("U", "T")

            # Detection of differences between the canonical miRNA and the detected non-templated miRNA
            if temp_mature in unique_seq[j][9]:

                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                if len(unique_diff) <= 2:
                    unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                    pre_pos = mat_diff[0] - unique_diff[0]
                    post_pos = unique_diff[1] - mat_diff[1]

                    lengthofmir = len(off_part) + post_pos
                    if pre_pos == 0 and post_pos < 4:
                        tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                        tmp_count_seq = tmp_count_seq + 1

                        # New name: reference name + "_nont_" + 5' offset + signed 3' offset + added 3' bases
                        t_name = copy.deepcopy(unique_seq[j])
                        t_name[2] = mature_mirnas[i - 1] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
                        uni_seq.append(t_name)

        # Metrics for the production of the database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i-1], tmp_count_seq, tmp_count_reads])

    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple, uni_seq))))

    # Output variables

    l.acquire()
    if case == "c" or case == "t":
        names_n_seqs.extend([[y[2], y[9]] for y in unique_seq if y[2] != "*"])
        n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2] != "*"])
        file_order.append(file)
        data.append([case, file, unique_seq, sorted_uni_arms])
    l.release()

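# Naming sketch (illustrative values): an unmapped read containing the full canonical
# sequence of a miRNA plus two trailing non-templated bases "TT" is renamed
# "<mirna_name>_nont_0_+2_TT", i.e. 5' offset, signed 3' offset, then the added 3' bases.
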
#################################################################################################################################################################################################################

"""

This function detects the miRNAs that differ between the two groups (control, treated).
It then copies the miRNAs missing from one group into the other, with zeros as counts,
so that the two groups end up with the same set of miRNAs.

"""

def black_white(mirna_names_1, mirna_names_2, group, manager):

    # miRNAs present in the first group but missing from the second
    add_names = [x for x in mirna_names_1 if x not in mirna_names_2]
    add_names.sort()
    add_names = list(add_names for add_names, _ in itertools.groupby(add_names))

    group.sort()
    group = list(group for group, _ in itertools.groupby(group))

    # Pad the missing miRNAs with zero counts (one zero per sample column)
    zeros = ["0"] * (len(group[0]) - 2)
    [add_names[i].extend(zeros) for i, _ in enumerate(add_names)]
    group = group + add_names

    manager.extend(group)

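# Illustrative example (hypothetical entries), assuming each name entry is a [name, sequence] pair:
# if ["hsa-miR-21-5p", "TAGCTTATCAGACTGATGTTGA"] appears in mirna_names_1 but not in mirna_names_2,
# the padded row ["hsa-miR-21-5p", "TAGCTTATCAGACTGATGTTGA", "0", "0", ...] is appended to the
# count matrix passed as `group`.
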
#################################################################################################################################################################################################################

"""

This function collapses miRNAs that have the same sequence but different names into one entry
by merging all the different names into a single name separated by the character "/"

"""

def merging_dupes(group, f_dupes):

    dupes = []
    temp_mat = []

    # Collect the sequences that appear more than once
    for num, _ in enumerate(group):

        if group[num][1] not in temp_mat and group[num][0] not in temp_mat:
            temp_mat.append(group[num][1])
            temp_mat.append(group[num][0])
        else:
            dupes.append(group[num][1])

    dupes = list(set(dupes))

    dupes = [[x] for x in dupes]

    # Gather every name attached to each duplicated sequence,
    # keeping only the shortest name per parent miRNA
    for x in group:
        for y in dupes:
            if x[1] == y[0]:
                fl = 0
                if len(y) == 1:
                    y.append(x[0])
                else:
                    for i in range(1, len(y)):
                        if y[i].split("_")[0] + "_" + y[i].split("_")[1] == x[0].split("_")[0] + "_" + x[0].split("_")[1]:
                            fl = 1
                            if len(x[0]) < len(y[i]):
                                del y[i]
                                y.append(x[0])
                            break

                    if fl == 0:
                        y.append((x[0]))

    # Join all the collected names into one "/"-separated name
    for y in dupes:
        if len(y) > 2:
            for i in range(len(y) - 1, 1, -1):
                y[1] = y[1] + "/" + y[i]
                del y[i]

    f_dupes.extend(dupes)

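# Illustrative example (hypothetical rows): if two count-matrix rows share the sequence
# "TCTTTGGTTATCTAGCTGTATGA" under the names "Hsa-Mir-9-P1_5p_t_+1_0" and "Hsa-Mir-7-P1_5p",
# the resulting dupes record is
#   ["TCTTTGGTTATCTAGCTGTATGA", "Hsa-Mir-9-P1_5p_t_+1_0/Hsa-Mir-7-P1_5p"],
# which apply_merging_dupes() below writes back into the name column of the matrix.
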
#################################################################################################################################################################################################################

"""

This function removes the duplicated sequences based on the output of the function merging_dupes

"""

def apply_merging_dupes(group, dupes, manager):

    # Replace the name of every duplicated sequence with its merged "/"-separated name
    for x in group:
        for y in dupes:
            if x[1] == y[0]:
                x[0] = y[1]

    # Remove the resulting duplicate rows
    group.sort()
    group = list(group for group, _ in itertools.groupby(group))
    manager.extend(group)

#################################################################################################################################################################################################################

"""

This optional function filters out low-count miRNAs based on a read-count threshold
and a percentage of samples, according to the user's preferences

"""

def filter_low_counts(c_group, t_group, fil_c_group, fil_t_group, per, counts):

    t_group_new = []
    c_group_new = []

    # Minimum number of samples (per group) that must reach the count threshold
    percent = int(per) / 100
    c_col_filter = round(percent * (len(c_group[1]) - 2))
    t_col_filter = round(percent * (len(t_group[1]) - 2))

    for i, _ in enumerate(c_group):
        c_cols = 0
        t_cols = 0

        # Number of samples in each group with at least `counts` reads for this miRNA
        c_cols = sum([1 for j in range(len(c_group[i]) - 2) if int(c_group[i][j + 2]) >= int(counts)])
        t_cols = sum([1 for j in range(len(t_group[i]) - 2) if int(t_group[i][j + 2]) >= int(counts)])

        # Keep the miRNA if either group passes the filter
        if c_cols >= c_col_filter or t_cols >= t_col_filter:
            t_group_new.append(t_group[i])
            c_group_new.append(c_group[i])

    fil_c_group.extend(c_group_new)
    fil_t_group.extend(t_group_new)

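# Illustrative example: with per = 50 and counts = 10, a miRNA is kept when at least half
# of the control samples or at least half of the treated samples have 10 or more reads for it.
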
##################################################################################################################################################################################################################

"""

This function exports the count matrices for every group (control, treated)
and condition (reference and templated miRNAs, non-templated miRNAs)

"""

def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, n1, n2, per):

    # Filtered templated counts
    if flag == 1 and int(per) != -1:
        fp = open('Counts/Filtered ' + n2 + ' Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in tre_file_order:
            fp.write("\t" + y)

        for x in fil_tre:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

        fp = open('Counts/Filtered ' + n1 + ' Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in con_file_order:
            fp.write("\t" + y)

        for x in fil_con:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

    # Filtered non-templated counts
    if flag == 2 and int(per) != -1:
        fp = open('Counts/Filtered ' + n2 + ' Non-Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in tre_file_order:
            fp.write("\t" + y)

        for x in fil_tre:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

        fp = open('Counts/Filtered ' + n1 + ' Non-Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in con_file_order:
            fp.write("\t" + y)

        for x in fil_con:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

    # Raw templated counts
    if flag == 1:
        fp = open('Counts/Raw ' + n2 + ' Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in tre_file_order:
            fp.write("\t" + y)

        for x in raw_tre:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

        fp = open('Counts/Raw ' + n1 + ' Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in con_file_order:
            fp.write("\t" + y)

        for x in raw_con:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

    # Raw non-templated counts
    if flag == 2:
        fp = open('Counts/Raw ' + n2 + ' Non-Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in tre_file_order:
            fp.write("\t" + y)

        for x in raw_tre:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

        fp = open('Counts/Raw ' + n1 + ' Non-Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in con_file_order:
            fp.write("\t" + y)

        for x in raw_con:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

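# Layout sketch of every exported count matrix (tab-separated; sample columns follow
# the order of con_file_order / tre_file_order; values are illustrative):
#
#   Name      Sequence     sample_1   sample_2   ...
#   <miRNA>   <sequence>   <counts>   <counts>   ...
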
####################################################################################################################################################################################################################

"""

This function exports the database files with all the information
about every type of detected miRNA for every sample

"""

def DB_write(con, name, unique_seq, sorted_uni_arms, f):

    # Templated miRNAs / isomiRs
    if f == 1:
        if con == "c":
            fp = open('split1/' + name, 'w')
            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads", "Name of isomir", "Sequence"))
        if con == "t":
            fp = open('split2/' + name, 'w')
            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads", "Name of isomir", "Sequence"))

        for i in range(len(sorted_uni_arms)):
            temp = []
            for j in range(len(unique_seq)):

                if sorted_uni_arms[i][0] in (unique_seq[j][2].split("_")[0] + "_" + unique_seq[j][2].split("_")[1]):
                    temp.append(unique_seq[j])

            temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
            fp.write("*********************************************************************************************************\n")
            fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|", str(sorted_uni_arms[i][0]), "Sequence count = " + str(sorted_uni_arms[i][1]), "Total reads = " + str(sorted_uni_arms[i][2]), "|"))
            fp.write("*********************************************************************************************************\n\n")
            [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2], x[9])) for x in temp]
            fp.write("\n" + "\n")
        fp.close()

    # Non-templated isomiRs
    if f == 2:

        if con == "c":
            fp = open('split3/' + name, 'w')
            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads", "Name of isomir", "Sequence"))
        if con == "t":
            fp = open('split4/' + name, 'w')
            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads", "Name of isomir", "Sequence"))

        for i in range(len(sorted_uni_arms)):
            temp = []
            for j in range(len(unique_seq)):
                if sorted_uni_arms[i][0] == unique_seq[j][2].split("_nont_")[0]:
                    temp.append(unique_seq[j])
            if temp != []:
                temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
                fp.write("*********************************************************************************************************\n")
                fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|", str(sorted_uni_arms[i][0]), "Sequence count = " + str(sorted_uni_arms[i][1]), "Total reads = " + str(sorted_uni_arms[i][2]), "|"))
                fp.write("*********************************************************************************************************\n\n")
                [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2], x[9])) for x in temp]
                fp.write("\n" + "\n")
        fp.close()

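# Output sketch for one miRNA arm in a database file (names and values are illustrative):
#
#   *********************************************************************************
#   |    Hsa-Mir-21_5p    Sequence count = 12    Total reads = 3456    |
#   *********************************************************************************
#
#   180    Hsa-Mir-21_5p_t_0_+1    TAGCTTATCAGACTGATGTTGAC
#   ...
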
#########################################################################################################################

"""

This function merges the different names for the same miRNA sequence per group (control, treated) to avoid duplicates

"""

def merging_names(ini_mat, new):

    dupes = []
    temp_mat = []

    # Collect the sequences that appear more than once
    for num in range(len(ini_mat)):

        if ini_mat[num][1] not in temp_mat and ini_mat[num][0] not in temp_mat:
            temp_mat.append(ini_mat[num][1])
            temp_mat.append(ini_mat[num][0])
        else:
            dupes.append(ini_mat[num][1])

    dupes = list(set(dupes))

    for i in range(len(dupes)):
        dupes[i] = [dupes[i]]

    # Gather every name attached to each duplicated sequence,
    # keeping only the shortest name per parent miRNA
    for x in ini_mat:
        for y in dupes:
            if x[1] == y[0]:
                fl = 0
                if len(y) == 1:
                    y.append(x[0])
                else:
                    for i in range(1, len(y)):
                        if y[i].split("_")[0] + "_" + y[i].split("_")[1] == x[0].split("_")[0] + "_" + x[0].split("_")[1]:
                            fl = 1
                            if len(x[0]) < len(y[i]):
                                del y[i]
                                y.append(x[0])
                            break

                    if fl == 0:
                        y.append((x[0]))

    # Join all the collected names into one "/"-separated name
    for y in dupes:
        if len(y) > 2:
            for i in range(len(y) - 1, 1, -1):
                y[1] = y[1] + "/" + y[i]
                del y[i]

    # Apply the merged names and remove the duplicate rows
    for x in ini_mat:
        for y in dupes:
            if x[1] == y[0]:
                x[0] = y[1]

    ini_mat.sort()
    ini_mat = list(ini_mat for ini_mat, _ in itertools.groupby(ini_mat))
    new.extend(ini_mat)

####################################################################################################################################################################################################################

"""

This function exports the count matrices for differential expression
if the user chose the analysis with non-templated miRNA detection

"""

def nontemp_counts_to_diff(tem_names, tem_samp, non_names, non_samp, folder, pro):

    # One count file per sample: templated counts first, then the matching non-templated counts
    for i in range(2, len(tem_samp[0])):

        fp = open(folder + tem_names[i-2] + '.txt', 'w')
        fp.write("miRNA id" + "\t" + tem_names[i-2] + "\n")

        for x in tem_samp:
            fp.write("%s" % "\t".join([x[0], x[i]]) + "\n")

        for j in range(len(non_names)):
            if non_names[j] == tem_names[i-2]:
                for x in non_samp:
                    fp.write("%s" % "\t".join([x[0], x[j+2]]) + "\n")
        fp.close()

#################################################################################################################################################################################################################

"""

This function exports the count matrices for differential expression
if the user chose the analysis with templated miRNA detection only

"""

def temp_counts_to_diff(names, samp, folder, pro):

    # One count file per sample with the templated counts only
    for i in range(2, len(samp[0])):

        fp = open(folder + names[i-2] + '.txt', 'w')
        fp.write("miRNA id" + "\t" + names[i-2] + "\n")

        for x in samp:
            fp.write("%s" % "\t".join([x[0], x[i]]) + "\n")
        fp.close()

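# Layout sketch of one per-sample file produced by the two functions above
# (names and counts are illustrative):
#
#   miRNA id                sample_1
#   Hsa-Mir-21_5p           3456
#   Hsa-Mir-21_5p_t_0_+1    180
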
#################################################################################################################################################################################################################

"""

This function downloads the FASTA files with the reference (mature) and star miRNA
sequences from the MirGeneDB site and merges them into one list

"""

def download_matures(matures, org_name):

    mature_mir = []

    mat_url = 'http://mirgenedb.org/fasta/' + org_name + '?mat=1'
    star_url = 'http://mirgenedb.org/fasta/' + org_name + '?star=1'

    # Mature sequences
    data = urllib.request.urlopen(mat_url).read()
    file_mirna = data.decode('utf-8')
    mature_mir = file_mirna.split("\n")
    mature_mir = [x.replace(">", "") for x in mature_mir]
    del mature_mir[-1]

    # Star sequences
    data = urllib.request.urlopen(star_url).read()
    file_mirna = data.decode('utf-8')
    star_mir = file_mirna.split("\n")
    star_mir = [x.replace(">", "") for x in star_mir]
    del star_mir[-1]

    mature_mir.extend(star_mir)

    # Convert the RNA sequences to DNA (U -> T)
    for i in range(1, len(mature_mir), 2):
        mature_mir[i] = mature_mir[i].replace("U", "T")

    matures.extend(mature_mir)

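# Illustrative result (the exact header text depends on MirGeneDB): the returned list
# alternates FASTA headers (">" stripped) and DNA sequences (U converted to T), e.g.
#   ["Hsa-Let-7-P1_5p", "TGAGGTAGTAGGTTGTATAGTT", ...]
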
###################################################################################################################

"""

This function collects the templated isoforms from the first part of the analysis.
These isoforms and the reference miRNAs will be used for the detection of non-templated miRNAs

"""

def non_template_ref(sc, st, all_isoforms):

    pre_uni_seq_con = list(sc)
    pre_uni_seq_tre = list(st)

    # Collect the name and sequence of every templated isoform from the control samples
    for x in pre_uni_seq_con:
        for y in x:
            if y[2] not in all_isoforms and "_t_" in y[2]:
                all_isoforms.append(y[2])
                all_isoforms.append(y[9])

    # ... and from the treated samples
    for x in pre_uni_seq_tre:
        for y in x:
            if y[2] not in all_isoforms and "_t_" in y[2]:
                all_isoforms.append(y[2])
                all_isoforms.append(y[9])

################################################################################################################################################################################################
689 | |
"""

This function adds the miRNAs that were not detected in every sample, with zeros as counts

"""

def uncommon_mirnas(sample, mir_names, l, new_d, sample_name, sample_order):

    for y in mir_names:
        flag = 0
        for x in sample:
            if y[0] == x[0]:  # check if the miRNA already exists in the sample
                flag = 1
                break
        if flag == 0:
            sample.append([y[0], "0", y[1]])  # add the missing miRNA to the sample with zero counts and its sequence

    # Sort and remove duplicates
    sample.sort(key=lambda x: x[0])
    sample = list(sample for sample, _ in itertools.groupby(sample))

    # Return the updated sample
    l.acquire()
    new_d.append(sample)
    sample_order.append(sample_name)
    l.release()

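# Illustrative example: if mir_names contains ["Hsa-Mir-21_5p", "TAGCTTATCAGACTGATGTTGA"]
# and the sample has no row with that name, the row ["Hsa-Mir-21_5p", "0", "TAGCTTATCAGACTGATGTTGA"]
# is appended before the sample is sorted and stored.
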
###############################################################################################################################################################################################