comparison mirbase_functions.py @ 2:47232a73a46b draft

Uploaded
author glogobyte
date Wed, 13 Oct 2021 16:04:11 +0000
parents
children 77ba8dde6fb7
comparison of 1:f10c8f43f010 with 2:47232a73a46b
import itertools
import re
import urllib.request
import gzip
import copy
from collections import OrderedDict
from difflib import SequenceMatcher


# Read a file and return its contents as a list of lines
def read(path, flag):
    if flag == 0:
        with open(path) as fp:
            file = fp.readlines()
        return file

    if flag == 1:
        with open(path) as fp:
            file = fp.read().splitlines()
        return file

# Write a list of rows to a tab-separated txt file
def write(path, rows):
    with open(path, 'w') as fp:
        for x in rows:
            fp.write("\t".join(x[1:-1]))


#################################################################################################################

# Detect the longest common substring between two miRNA sequences
def longestSubstring(str1, str2):

    # initialize the SequenceMatcher object with the two input sequences
    seqMatch = SequenceMatcher(None, str1, str2)

    # find the longest matching block; the result looks like Match(a=0, b=0, size=5)
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))

    # return the longest common substring
    if match.size != 0:
        return str1[match.a: match.a + match.size]
    else:
        print('No longest common sub-string found')

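# Hedged usage sketch (illustrative sequences, not taken from real data):
#
#   longestSubstring("TGAGGTAGTAGGTTGTATAGTT", "AGAGGTAGTAGGTTGTA")
#   # -> "GAGGTAGTAGGTTGTA", the longest block the two strings share
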
#################################################################################################################################################################################################################

"""

This function merges miRNAs which are generated from different chromosomes into a single entry
and eliminates the duplicate miRNAs in every sample

input: detected miRNAs
output: collapsed miRNAs without duplicates

"""


def remove_duplicates(mirnas):

    # Detection of canonical miRNAs which are generated from different chromosomes
    dupes = [[x[9], x[0], x[2]] for x in mirnas]

    for x in mirnas:
        for y in dupes:
            if x[9] == y[0] and x[0] == y[1] and x[2].split("_")[0] == y[2].split("_")[0] and x[2] != y[2]:
                y.append(x[2])

    # Detection of the different chromosomes for every miRNA
    chr_order = []
    for x in dupes:
        temp = []
        for i in range(2, len(x)):
            if x[i].split("chr")[1].split("(")[0].isdigit():
                temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0]))
            else:
                temp.append(x[i].split("chr")[1][0:4])

        for z in temp:
            if 'X(-)' == z or 'Y(-)' == z or 'X(+)' == z or 'Y(+)' == z:
                temp = [str(j) for j in temp]
        temp = list(set(temp))
        temp.sort()
        chr_order.append(temp)

    # Collapsing of the miRNAs with the same sequence from different chromosomes
    collapsed_dupes = []
    for i in range(len(dupes)):
        collapsed_dupes.append([dupes[i][0], dupes[i][2].split("_")[0], dupes[i][1]])
        for x in chr_order[i]:
            chr_check = re.match(r"[-+]?\d+$", str(x))  # numeric chromosomes were stored as signed integers, 'X'/'Y' stay as strings
            if chr_check is not None:
                if int(x) < 0:  # check the strand, (+) or (-)
                    collapsed_dupes[i][1] = collapsed_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)"
                else:
                    collapsed_dupes[i][1] = collapsed_dupes[i][1]+"_chr"+str(abs(int(x)))+"(+)"
            else:
                collapsed_dupes[i][1] = collapsed_dupes[i][1]+"_chr"+str(x)

    # Remove duplicates from collapsed_dupes
    collapsed_dupes.sort()
    collapsed_dupes = list(collapsed_dupes for collapsed_dupes, _ in itertools.groupby(collapsed_dupes))

    for i in range(len(mirnas)):
        for x in collapsed_dupes:

            # Naming of template isomiRs (adding the shifted positions to the names)
            if mirnas[i][9] == x[0] and mirnas[i][0] == x[2] and len(mirnas[i][2].split("_")) > 3 and mirnas[i][2].split("_")[0] == x[1].split("_")[0]:
                gg = str("_t_"+mirnas[i][2].split("_")[-2]+"_"+mirnas[i][2].split("_")[-1])
                mirnas[i][2] = x[1]+gg
                break

            # Naming of canonical miRNAs (collapsed names)
            if mirnas[i][9] == x[0] and mirnas[i][0] == x[2] and len(mirnas[i][2].split("_")) == 3 and mirnas[i][2].split("_")[0] == x[1].split("_")[0]:
                mirnas[i][2] = x[1]
                break

    # Remove duplicates
    mirnas.sort()
    mirnas = list(mirnas for mirnas, _ in itertools.groupby(mirnas))

    return mirnas

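# Hedged note (inferred from the indexing above, not from separate documentation):
# each entry of `mirnas` is expected to be a SAM-style record in which field 0
# looks like "<id>-<readCount>", field 2 holds the miRNA/isomiR name (prefixed
# with the reference miRNA, e.g. "hsa-miR-x_..."), and field 9 holds the read
# sequence, so that x[0].split("-")[1] yields the counts used for the metrics.
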
#############################################################################################################################################################################################################

"""

This function identifies and classifies the miRNAs which are detected by the alignment tool.

"""

def sam_edit(mature_mirnas, path, file, case, l, samples, data, file_order, unmap_seq, names_n_seqs, deseq, mirna_names, ini_sample, unmap_counts):

    # Read the SAM file
    ini_sam = read(path, 0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]  # skip the header lines
    unique_seq = [x for x in main_sam if x[1] == '0' and len(x[9]) >= 18 and len(x[9]) <= 26]  # keep only forward-strand reads of miRNA length (18-26 nt)
    filter_sam = [[x[0], x[1], x[2], len(x[9])] for x in main_sam]  # keep only the necessary SAM fields (read name, flag, reference name, read length)

    sorted_uni_arms = []

    for i in range(0, len(mature_mirnas), 2):
        tmp_count_reads = 0  # total number of reads for this arm
        tmp_count_seq = 0    # total number of sequences for this arm
        for j in range(len(unique_seq)):

            if "{" in unique_seq[j][2].split("_")[0]:  # checks if a miRNA is generated from two different loci on the same chromosome
                mirna = unique_seq[j][2].split("_")[0][:-4]
            else:
                mirna = unique_seq[j][2].split("_")[0]

            # Detection of differences between the canonical miRNA and the detected miRNA
            if mature_mirnas[i].split(" ")[0][1:] == mirna:

                temp_mature = mature_mirnas[i+1].strip().replace("U", "T")
                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                # Handling of some special miRNAs like hsa-miR-8485
                if mat_diff[1] != 0 and unique_diff[1] != 0:
                    unique_seq[j] = 1
                    pre_pos = 0
                    post_pos = 0

                elif mat_diff[0] != 0 and unique_diff[0] != 0:
                    unique_seq[j] = 1
                    pre_pos = 0
                    post_pos = 0

                else:
                    # Keep the findings
                    pre_pos = mat_diff[0]-unique_diff[0]
                    post_pos = unique_diff[1]-mat_diff[1]
                    tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                    tmp_count_seq = tmp_count_seq+1

                    # Store the detected miRNAs with new names according to the findings
                    if pre_pos != 0 or post_pos != 0:
                        if pre_pos == 0:
                            unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2]+"_t_"+str(pre_pos)+"_"+'{:+d}'.format(post_pos)
                        elif post_pos == 0:
                            unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2]+"_t_"+'{:+d}'.format(pre_pos)+"_"+str(post_pos)
                        else:
                            unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2]+"_t_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)

        # Remove the placeholder value "1" left by the handling of the special miRNAs (hsa-miR-8485)
        for x in range(unique_seq.count(1)):
            unique_seq.remove(1)

        # Metrics for the production of the database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])

    # Sorting of the metrics for the database
    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)

    # Collapsing of miRNAs and removal of duplicates
    collapsed_mirnas = remove_duplicates(unique_seq)

    # Correction of the metrics after the collapsing and the removal of duplicates
    for y in sorted_uni_arms:
        counts = 0
        seqs = 0
        for x in collapsed_mirnas:
            if y[0] == x[2].split("_")[0]:
                counts += int(x[0].split("-")[1])
                seqs += 1

        y[1] = seqs
        y[2] = counts


    # Output variables
    temp_mirna_names = []

    l.acquire()

    if case == "c" or case == "t":
        temp_mirna_names.extend(z[2] for z in collapsed_mirnas)
        names_n_seqs.extend([[y[2], y[9]] for y in collapsed_mirnas])
        deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in collapsed_mirnas])
        mirna_names.extend(temp_mirna_names)
        unmap_seq.value += sum([1 for x in main_sam if x[1] == '4'])  # number of unmapped unique sequences (used for a graph)
        unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4'])  # read counts of the unmapped sequences (used for a graph)
        file_order.append(file)  # keeps the names of the SAM files in the order they were read by the function (avoids problems due to multiprocessing)
        samples.append(collapsed_mirnas)  # the processed detected miRNAs
        data.append([case, file, collapsed_mirnas, sorted_uni_arms])
        ini_sample.append(filter_sam)  # the filtered SAM file

    l.release()

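# Hedged usage sketch (not part of the original tool; names are illustrative):
# the extra arguments are expected to be multiprocessing-safe containers, e.g.
#
#   import multiprocessing as mp
#   manager = mp.Manager()
#   lock = mp.Lock()
#   samples, data, file_order = manager.list(), manager.list(), manager.list()
#   names_n_seqs, deseq, mirna_names, ini_sample = (manager.list() for _ in range(4))
#   unmap_seq, unmap_counts = mp.Value('i', 0), mp.Value('i', 0)
#   p = mp.Process(target=sam_edit, args=(mature_mirnas, "sample1.sam", "sample1", "c",
#                                         lock, samples, data, file_order, unmap_seq,
#                                         names_n_seqs, deseq, mirna_names, ini_sample,
#                                         unmap_counts))
#   p.start(); p.join()
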
######################################################################################################################################


"""

Read a SAM file from Bowtie and do the following:

1) Remove reads mapped to the reverse strand
2) Remove unmapped reads
3) Remove all sequences with fewer than 11 reads
4) Sort the arms in decreasing order of their number of sequences
5) Sort the sequences of every arm in decreasing order of their reads
6) Calculate the total number of sequences of every arm
7) Calculate the total number of reads of the sequences of every arm
8) Store all the information in a txt file

"""

def non_sam_edit(mature_mirnas, path, file, case, l, data, file_order, n_deseq, names_n_seqs):

    # Read the SAM file
    ini_sam = read(path, 0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]  # skip the header lines
    unique_seq = [x for x in main_sam if x[1] == '4' and len(x[9]) >= 18 and len(x[9]) <= 26]  # keep only unmapped reads of miRNA length (18-26 nt)

    uni_seq = []

    # Calculate the shifted positions for every non-template miRNA and add them to its name
    sorted_uni_arms = []
    for i in range(1, len(mature_mirnas), 2):
        tmp_count_reads = 0  # total number of reads for this arm
        tmp_count_seq = 0    # total number of sequences for this arm

        for j in range(len(unique_seq)):

            temp_mature = mature_mirnas[i].strip().replace("U", "T")

            # Detection of differences between the canonical miRNA and the detected non-template miRNA
            if temp_mature in unique_seq[j][9]:

                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                if len(unique_diff) <= 2:
                    unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                    pre_pos = mat_diff[0]-unique_diff[0]
                    post_pos = unique_diff[1]-mat_diff[1]

                    lengthofmir = len(off_part) + post_pos
                    if pre_pos == 0 and post_pos < 4:
                        tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                        tmp_count_seq = tmp_count_seq + 1

                        t_name = unique_seq[j].copy()
                        t_name[2] = mature_mirnas[i - 1].split(" ")[0][1:] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
                        uni_seq.append(t_name)

        # Metrics for the production of the database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])

    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple, uni_seq))))

    # Output variables
    l.acquire()
    if case == "c" or case == "t":
        names_n_seqs.extend([[y[2], y[9]] for y in unique_seq if y[2] != "*"])
        n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2] != "*"])
        file_order.append(file)
        data.append([case, file, unique_seq, sorted_uni_arms])
    l.release()

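# Hedged note (derived from the renaming line above): non-template isomiRs are
# labelled as "<arm>_nont_<5p-shift>_<3p-shift>_<extra 3' bases>", e.g. a read
# matching an arm exactly but carrying two extra untemplated 3' bases would end
# up with a name of the form "<arm>_nont_0_+2_TT" (illustrative values).
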
#################################################################################################################################################################################################################

# Append to `group` the miRNAs of mirna_names_1 that are missing from mirna_names_2,
# padded with zero counts, so that both groups end up with the same miRNA entries
def black_white(mirna_names_1, mirna_names_2, group, manager):

    add_names = [x for x in mirna_names_1 if x not in mirna_names_2]
    add_names.sort()
    add_names = list(add_names for add_names, _ in itertools.groupby(add_names))

    group.sort()
    group = list(group for group, _ in itertools.groupby(group))

    zeros = ["0"]*(len(group[0])-2)
    [add_names[i].extend(zeros) for i, _ in enumerate(add_names)]
    group = group+add_names

    manager.extend(group)

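# Hedged worked example (illustrative shapes, inferred from the list operations
# above): an entry of mirna_names_1 such as ["mir-x", "SEQ"] that has no match
# in mirna_names_2 is extended with one "0" per count column and appended to
# `group`, so both count tables end up with the same set of row names.
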
################################################################################################################################################################################################################################

# Find the entries of `group` that share the same sequence and merge their names into a single "name1/name2" record
def merging_dupes(group, f_dupes):

    dupes = []
    final_mat = []

    for num, _ in enumerate(group):

        if group[num][1] not in final_mat and group[num][0] not in final_mat:
            final_mat.append(group[num][1])
            final_mat.append(group[num][0])
        else:
            dupes.append(group[num][1])

    dupes = list(set(dupes))

    dupes = [[x] for x in dupes]

    for x in group:
        for y in dupes:
            if x[1] == y[0]:
                fl = 0
                if len(y) == 1:
                    y.append(x[0])
                else:
                    for i in range(1, len(y)):
                        if y[i].split("_")[0] == x[0].split("_")[0]:
                            fl = 1
                            if len(x[0]) < len(y[i]):
                                del y[i]
                                y.append(x[0])
                            break

                    if fl == 0:
                        y.append(x[0])

    for y in dupes:
        if len(y) > 2:
            for i in range(len(y)-1, 1, -1):
                y[1] = y[1]+"/"+y[i]
                del y[i]

    f_dupes.extend(dupes)

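# Hedged worked example (illustrative names): if two rows of `group` carry the
# same sequence under the names "mir-a_chr1(+)" and "mir-b_chr2(-)", the entry
# appended to f_dupes looks like
#     ["<shared sequence>", "mir-a_chr1(+)/mir-b_chr2(-)"]
# which apply_merging_dupes() later uses to rewrite the row names.
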
##########################################################################################################################################################################################################################################

# Replace the names of the duplicated entries in `group` with the merged "name1/name2" labels produced by merging_dupes()
def apply_merging_dupes(group, dupes, manager):

    for x in group:
        for y in dupes:
            if x[1] == y[0]:
                x[0] = y[1]

    group.sort()
    group = list(group for group, _ in itertools.groupby(group))
    manager.extend(group)

###############################################################################################################################################################################################################################


# Filter out the miRNAs whose counts reach the `counts` threshold in fewer than `per`% of the samples of both groups
def filter_low_counts(c_group, t_group, fil_c_group, fil_t_group, per, counts):

    t_group_new = []
    c_group_new = []

    percent = int(per)/100
    c_col_filter = round(percent*(len(c_group[1])-2))
    t_col_filter = round(percent*(len(t_group[1])-2))

    for i, _ in enumerate(c_group):
        c_cols = sum([1 for j in range(len(c_group[i])-2) if int(c_group[i][j+2]) >= int(counts)])
        t_cols = sum([1 for j in range(len(t_group[i])-2) if int(t_group[i][j+2]) >= int(counts)])

        if c_cols >= c_col_filter or t_cols >= t_col_filter:
            t_group_new.append(t_group[i])
            c_group_new.append(c_group[i])

    fil_c_group.extend(c_group_new)
    fil_t_group.extend(t_group_new)

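# Hedged worked example (illustrative numbers): with per=50, counts=10 and six
# count columns per group, c_col_filter = round(0.5*6) = 3, so a miRNA is kept
# when at least 3 samples of the control group (or 3 of the treated group)
# report 10 or more reads for it; otherwise the row is dropped from both groups.
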
##################################################################################################################################################################################################################


# Write the raw and, when a filtering percentage is given, the filtered count matrices of both groups
# to tab-separated files (flag == 1: templated counts, flag == 2: non-templated counts)
def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, group_name1, group_name2, per):

    if flag == 1 and int(per) != -1:
        fp = open('Counts/Filtered '+group_name2+' Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in tre_file_order:
            fp.write("\t"+y)

        for x in fil_tre:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

        fp = open('Counts/Filtered '+group_name1+' Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in con_file_order:
            fp.write("\t"+y)

        for x in fil_con:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

    if flag == 2 and int(per) != -1:
        fp = open('Counts/Filtered '+group_name2+' Non-Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in tre_file_order:
            fp.write("\t"+y)

        for x in fil_tre:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

        fp = open('Counts/Filtered '+group_name1+' Non-Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in con_file_order:
            fp.write("\t"+y)

        for x in fil_con:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

    if flag == 1:
        fp = open('Counts/Raw '+group_name2+' Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in tre_file_order:
            fp.write("\t"+y)

        for x in raw_tre:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

        fp = open('Counts/Raw '+group_name1+' Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in con_file_order:
            fp.write("\t"+y)

        for x in raw_con:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

    if flag == 2:
        fp = open('Counts/Raw '+group_name2+' Non-Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in tre_file_order:
            fp.write("\t"+y)

        for x in raw_tre:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

        fp = open('Counts/Raw '+group_name1+' Non-Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in con_file_order:
            fp.write("\t"+y)

        for x in raw_con:
            fp.write("\n%s" % "\t".join(x))
        fp.close()


#########################################################################################################################################

# Write one two-column file (miRNA id, counts) per sample, as input for the differential expression analysis
def temp_counts_to_diff(names, samp, folder):

    for i in range(2, len(samp[0])):

        fp = open(folder+names[i-2]+'.txt', 'w')
        fp.write("miRNA id"+"\t"+names[i-2]+"\n")

        for x in samp:
            fp.write("%s" % "\t".join([x[0], x[i]])+"\n")
        fp.close()

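# Hedged sketch of the resulting per-sample file (illustrative names and values):
#
#   miRNA id <TAB> sample1
#   hsa-miR-x_... <TAB> 154
#   hsa-miR-y_... <TAB> 12
#
# i.e. a header row followed by one tab-separated "name, count" row per miRNA.
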
##################################################################################################################

# Write the detailed per-arm report (template isomiRs for f == 1, non-template isomiRs for f == 2) to a txt file
def DB_write(con, name, unique_seq, sorted_uni_arms, f):

    if f == 1:
        # Write a txt file with all the information
        if con == "c":
            fp = open('split1/'+name, 'w')
            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads", "Name of isomir", "Sequence"))
        if con == "t":
            fp = open('split2/'+name, 'w')
            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads", "Name of isomir", "Sequence"))

        for i in range(len(sorted_uni_arms)):
            temp = []
            for j in range(len(unique_seq)):

                if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]:

                    temp.append(unique_seq[j])

            temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
            fp.write("*********************************************************************************************************\n")
            fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|", str(sorted_uni_arms[i][0]), "Sequence count = "+str(sorted_uni_arms[i][1]), "Total reads = "+str(sorted_uni_arms[i][2]), "|"))
            fp.write("*********************************************************************************************************\n\n")
            [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2], x[9])) for x in temp]
            fp.write("\n" + "\n")
        fp.close()

    if f == 2:

        if con == "c":
            fp = open('split3/'+name, 'w')
            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads", "Name of isomir", "Sequence"))
        if con == "t":
            fp = open('split4/'+name, 'w')
            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads", "Name of isomir", "Sequence"))

        for i in range(len(sorted_uni_arms)):
            temp = []
            for j in range(len(unique_seq)):
                if sorted_uni_arms[i][0] == unique_seq[j][2].split("_nont_")[0]:
                    temp.append(unique_seq[j])
            if temp != []:
                temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
                fp.write("*********************************************************************************************************\n")
                fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|", str(sorted_uni_arms[i][0]), "Sequence count = "+str(sorted_uni_arms[i][1]), "Total reads = "+str(sorted_uni_arms[i][2]), "|"))
                fp.write("*********************************************************************************************************\n\n")
                [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2], x[9])) for x in temp]
                fp.write("\n" + "\n")
        fp.close()


##########################################################################################################################

# Add the detected isomiRs (prefixed with ">") to the list of reference mature miRNAs
def new_mat_seq(pre_unique_seq, mat_mirnas, l):

    unique_iso = []
    for x in pre_unique_seq:
        if len(x[2].split("_")) == 3:
            for y in pre_unique_seq:
                if x[2] in y[2] and int(x[0].split("-")[1]) < int(y[0].split("-")[1]):
                    if not any(y[2] in lst2 for lst2 in unique_iso):
                        y[2] = ">"+y[2]
                        unique_iso.append(y)
    l.acquire()
    for x in unique_iso:
        mat_mirnas.append(x[2])
        mat_mirnas.append(x[9])
    l.release()

#########################################################################################################################

# Merge the names of the entries of ini_mat that share the same sequence
# (same logic as merging_dupes followed by apply_merging_dupes)
def merging_names(ini_mat, new):

    dupes = []
    final_mat = []

    for num in range(len(ini_mat)):

        if ini_mat[num][1] not in final_mat and ini_mat[num][0] not in final_mat:
            final_mat.append(ini_mat[num][1])
            final_mat.append(ini_mat[num][0])
        else:
            dupes.append(ini_mat[num][1])

    dupes = list(set(dupes))

    for i in range(len(dupes)):
        dupes[i] = [dupes[i]]

    for x in ini_mat:
        for y in dupes:
            if x[1] == y[0]:
                fl = 0
                if len(y) == 1:
                    y.append(x[0])
                else:
                    for i in range(1, len(y)):
                        if y[i].split("_")[0] == x[0].split("_")[0]:
                            fl = 1
                            if len(x[0]) < len(y[i]):
                                del y[i]
                                y.append(x[0])
                            break

                    if fl == 0:
                        y.append(x[0])

    for y in dupes:
        if len(y) > 2:
            for i in range(len(y)-1, 1, -1):
                y[1] = y[1]+"/"+y[i]
                del y[i]

    for x in ini_mat:
        for y in dupes:
            if x[1] == y[0]:
                x[0] = y[1]

    ini_mat.sort()
    ini_mat = list(ini_mat for ini_mat, _ in itertools.groupby(ini_mat))

    new.extend(ini_mat)


######################################################################################################################################################

# Write one file per sample containing both the template and the non-template counts, as input for the differential expression analysis
def nontemp_counts_to_diff(tem_names, tem_samp, non_names, non_samp, folder):

    for i in range(2, len(tem_samp[0])):

        fp = open(folder+tem_names[i-2]+'.txt', 'w')
        fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n")

        for x in tem_samp:
            fp.write("%s" % "\t".join([x[0], x[i]])+"\n")

        for j in range(len(non_names)):
            if non_names[j] == tem_names[i-2]:
                for x in non_samp:
                    fp.write("%s" % "\t".join([x[0], x[j+2]])+"\n")
        fp.close()

###################################################################################################################################################################################################################

"""

This function downloads the mature miRNAs of all species from miRBase
and filters them by the requested organism

input : organism
output: a list with the miRNA sequences in FASTA format

"""

def download_matures(matures, org_name):

    url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'
    data = urllib.request.urlopen(url).read()
    file_mirna = gzip.decompress(data).decode('utf-8')
    file_mirna = file_mirna.split("\n")

    # Every record consists of a header line followed by a sequence line
    for i in range(0, len(file_mirna)-1, 2):

        if org_name in file_mirna[i]:
            matures.append(file_mirna[i])
            matures.append(file_mirna[i+1])

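# Hedged usage sketch (the organism prefix "hsa" for human is an assumption of
# the example; a shared list is only needed when the call runs in a separate process):
#
#   matures = []
#   download_matures(matures, "hsa")
#   # matures now alternates header lines (">hsa-...") and their sequences
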
###################################################################################################################################################################################################################


"""

This function keeps all miRNA isoforms detected in the SAM files from the first part of the analysis.
These isoforms will be used, together with the canonical (reference) miRNAs, as reference sequences
for the detection of non-template miRNAs.

"""


def non_template_ref(c_samples, t_samples, all_isoforms):

    pre_uni_seq_con = list(c_samples)
    pre_uni_seq_tre = list(t_samples)

    for x in pre_uni_seq_con:
        for y in x:
            # if ">"+y[2] not in all_isoforms and ")_" in y[2]:
            if ">"+y[2] not in all_isoforms and "_t_" in y[2]:
                all_isoforms.append(">"+y[2])
                all_isoforms.append(y[9])

    for x in pre_uni_seq_tre:
        for y in x:
            # if ">"+y[2] not in all_isoforms and ")_" in y[2]:
            if ">"+y[2] not in all_isoforms and "_t_" in y[2]:
                all_isoforms.append(">"+y[2])
                all_isoforms.append(y[9])

################################################################################################################################################################################################

"""

This function adds to every sample the detected miRNAs that it does not share with the other samples.
As a result, all samples will have the same length.

"""

def uncommon_mirnas(sample, mir_names, l, new_d, sample_name, sample_order):

    for y in mir_names:
        flag = 0
        for x in sample:
            if y[0] == x[0]:  # check if the miRNA already exists in the sample
                flag = 1
                break
        if flag == 0:
            sample.append([y[0], "0", y[1]])  # add the missing miRNA to the sample with zero counts and its sequence

    # Sorting and removal of duplicates
    sample.sort(key=lambda x: x[0])
    sample = list(sample for sample, _ in itertools.groupby(sample))

    # Return the updated sample
    l.acquire()
    new_d.append(sample)
    sample_order.append(sample_name)
    l.release()

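# Hedged worked example (illustrative rows): if `sample` holds
# [["mir-a", "15", "SEQA"]] and mir_names also lists ("mir-b", "SEQB"), the
# updated sample becomes [["mir-a", "15", "SEQA"], ["mir-b", "0", "SEQB"]],
# i.e. every sample ends up reporting the full, shared set of miRNA names.
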
###############################################################################################################################################################################################