5
|
1 import itertools
|
|
2 import urllib.request
|
|
3 from collections import OrderedDict
|
|
4 import copy
|
|
5
|
|
6 ########################################################################################################################################################
|
|
7
|
|
8 """ Read a file and return it as a list """
|
|
9
|
|
10 def read(path, flag):
|
|
11 if flag == 0:
|
|
12 with open(path) as fp:
|
|
13 file=fp.readlines()
|
|
14 fp.close()
|
|
15 return file
|
|
16
|
|
17 if flag == 1:
|
|
18 with open(path) as fp:
|
|
19 file = fp.read().splitlines()
|
|
20 fp.close()
|
|
21 return file
|
|
22
|
|
23 # Write a list to a txt file
|
|
24 def write(path, list):
|
|
25 with open(path,'w') as fp:
|
|
26 for x in list:
|
|
27 fp.write(str("\t".join(x[1:-1])))
|
|
28 fp.close()
|
|
29
|
|
30 ########################################################################################################################################################
|
|
31
|
|
32 """ Detect the longest common substring sequence between two mirnas """
|
|
33
|
|
34 def longestSubstring(str1, str2):
|
|
35
|
|
36 from difflib import SequenceMatcher
|
|
37 # initialize SequenceMatcher object with
|
|
38 # input string
|
|
39 seqMatch = SequenceMatcher(None, str1, str2)
|
|
40
|
|
41 # find match of longest sub-string
|
|
42 # output will be like Match(a=0, b=0, size=5)
|
|
43 match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
|
|
44
|
|
45 # print longest substring
|
|
46 if (match.size != 0):
|
|
47 return str1[match.a: match.a + match.size]
|
|
48 else:
|
|
49 print('No longest common sub-string found')
|
|
50
|
|
51
|
|
52 #################################################################################################################################################################
|
|
53
|
|
54 """
|
|
55
|
|
56 Read the sam files from alignment tool and do the followings:
|
|
57
|
|
58 1) Keep mapped reads
|
|
59 2) Keep all sequences with length between 18 and 26 nucleotides
|
|
60 3) Detects the ref and templated miRNAs
|
|
61 4) Gives names to templated miRNAs based on ref miRNAs
|
|
62
|
|
63 """
|
|
64
|
|
65 def sam_edit(mature_mirnas,path,file,case,l,samples,data,file_order,unmap_seq,names_n_seqs,deseq,mirna_names,ini_sample,unmap_counts):
|
|
66
|
|
67 # read the sam file
|
|
68 ini_sam=read(path,0)
|
|
69 main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
|
|
70 unique_seq = [x for x in main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]
|
|
71 filter_sam = [[x[0],x[1],x[2],len(x[9])] for x in main_sam]
|
|
72 sorted_uni_arms = []
|
|
73
|
|
74 # Detection of differences between the canonical miRNA and the detected miRNA
|
|
75 for i in range(len(mature_mirnas)):
|
|
76 tmp_count_reads = 0 # calculate the total number of reads
|
|
77 tmp_count_seq = 0 # calculate the total number of sequences
|
|
78 for j in range(len(unique_seq)):
|
|
79
|
|
80 if mature_mirnas[i] == unique_seq[j][2]:
|
|
81
|
|
82 temp_mature = mature_mirnas[i+1]
|
|
83 off_part = longestSubstring(temp_mature, unique_seq[j][9])
|
|
84
|
|
85 mat_diff = temp_mature.split(off_part)
|
|
86 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
|
|
87
|
|
88 unique_diff = unique_seq[j][9].split(off_part)
|
|
89 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
|
|
90
|
|
91 # Handling of some special mirnas like (hsa-miR-8485)
|
|
92 if mat_diff[1]!=0 and unique_diff[1]!=0:
|
|
93 unique_seq[j]=1
|
|
94 pre_pos = 0
|
|
95 post_pos = 0
|
|
96
|
|
97 elif mat_diff[0]!=0 and unique_diff[0]!=0:
|
|
98 unique_seq[j]=1
|
|
99 pre_pos = 0
|
|
100 post_pos = 0
|
|
101
|
|
102 else:
|
|
103 # Keep the findings
|
|
104 pre_pos = mat_diff[0]-unique_diff[0]
|
|
105 post_pos = unique_diff[1]-mat_diff[1]
|
|
106 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
|
|
107 tmp_count_seq = tmp_count_seq+1
|
|
108
|
|
109 # Store the detected miRNAs with new names according to the findings
|
|
110 if pre_pos != 0 or post_pos != 0:
|
|
111 if pre_pos == 0:
|
|
112 unique_seq[j][2] = unique_seq[j][2] + "_t_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
|
|
113 elif post_pos == 0:
|
|
114 unique_seq[j][2] = unique_seq[j][2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
|
|
115 else:
|
|
116 unique_seq[j][2] = unique_seq[j][2]+"_t_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)
|
|
117
|
|
118 # Remove the values "1" from the handling of special mirnas (hsa-miR-8485)
|
|
119 for x in range(unique_seq.count(1)):
|
|
120 unique_seq.remove(1)
|
|
121
|
|
122 # metrics for the production of database
|
|
123 if tmp_count_reads != 0 and tmp_count_seq != 0:
|
|
124 sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads])
|
|
125
|
|
126 # Sorting of the metrics for database
|
|
127 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
|
|
128
|
|
129 # Correction of metrics due to the collapsing and removing of duplicates for the production of Database
|
|
130 for y in sorted_uni_arms:
|
|
131 counts=0
|
|
132 seqs=0
|
|
133 for x in unique_seq:
|
|
134 if y[0]==x[2].split("_")[0]+"_"+x[2].split("_")[1]:
|
|
135 counts+=int(x[0].split("-")[1])
|
|
136 seqs+=1
|
|
137
|
|
138 y[1]=seqs
|
|
139 y[2]=counts
|
|
140
|
|
141 # Output variables
|
|
142 temp_mirna_names=[]
|
|
143
|
|
144 l.acquire()
|
|
145 if case == "c" or case == "t":
|
|
146 temp_mirna_names.extend(z[2] for z in unique_seq)
|
|
147 names_n_seqs.extend([[y[2],y[9]] for y in unique_seq])
|
|
148 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq])
|
|
149 mirna_names.extend(temp_mirna_names)
|
|
150 unmap_seq.value += sum([1 for x in main_sam if x[1] == '4'])
|
|
151 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4'])
|
|
152 file_order.append(file)
|
|
153 samples.append(unique_seq)
|
|
154 data.append([case,file,unique_seq,sorted_uni_arms])
|
|
155 ini_sample.append(filter_sam)
|
|
156 l.release()
|
|
157
|
|
158
|
|
159 ######################################################################################################################################
|
|
160 """
|
|
161
|
|
162 Read a sam file from Bowtie and do the followings:
|
|
163
|
|
164 1) Keep unmapped reads
|
|
165 2) Keep all sequences with length between 18 and 26 nucleotides
|
|
166 3) Detects the non-template isomirs
|
|
167 4) Gives names to isomir's based on ref miRNAs
|
|
168
|
|
169 """
|
|
170
|
|
171 def non_sam_edit(mature_mirnas,path,file,case,l,data,file_order,n_deseq,names_n_seqs):
|
|
172
|
|
173 # read the sam file
|
|
174 ini_sam=read(path,0)
|
|
175 main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
|
|
176 unique_seq=[]
|
|
177 unique_seq = [x for x in main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]
|
|
178 uni_seq=[]
|
|
179
|
|
180 # Calculate the shifted positions for every isomir and add them to the name of it
|
|
181 sorted_uni_arms = []
|
|
182 for i in range(1,len(mature_mirnas),2):
|
|
183 tmp_count_reads = 0 # calculate the total number of reads
|
|
184 tmp_count_seq = 0 # calculate the total number of sequences
|
|
185
|
|
186 for j in range(len(unique_seq)):
|
|
187
|
|
188 temp_mature = mature_mirnas[i].strip().replace("U", "T")
|
|
189
|
|
190 # Detection of differences between the canonical miRNA and the detected non template miRNA
|
|
191 if temp_mature in unique_seq[j][9]:
|
|
192
|
|
193 off_part = longestSubstring(temp_mature, unique_seq[j][9])
|
|
194
|
|
195 mat_diff = temp_mature.split(off_part)
|
|
196 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
|
|
197
|
|
198 unique_diff = unique_seq[j][9].split(off_part)
|
|
199 if len(unique_diff)<=2:
|
|
200 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
|
|
201
|
|
202 pre_pos = mat_diff[0]-unique_diff[0]
|
|
203 post_pos = unique_diff[1]-mat_diff[1]
|
|
204
|
|
205 lengthofmir = len(off_part) + post_pos
|
|
206 if pre_pos == 0 and post_pos<4:
|
|
207 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
|
|
208 tmp_count_seq = tmp_count_seq + 1
|
|
209
|
|
210 t_name=copy.deepcopy(unique_seq[j])
|
|
211 t_name[2]=mature_mirnas[i - 1] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
|
|
212 uni_seq.append(t_name)
|
|
213
|
|
214 # metrics for the production of database
|
|
215 if tmp_count_reads != 0 and tmp_count_seq != 0:
|
|
216 sorted_uni_arms.append([mature_mirnas[i-1], tmp_count_seq, tmp_count_reads])
|
|
217
|
|
218
|
|
219 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
|
|
220 unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))
|
|
221
|
|
222 # Output variables
|
|
223
|
|
224 l.acquire()
|
|
225 if case == "c" or case == "t":
|
|
226 names_n_seqs.extend([[y[2],y[9]] for y in unique_seq if y[2]!="*"])
|
|
227 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
|
|
228 file_order.append(file)
|
|
229 data.append([case,file,unique_seq,sorted_uni_arms])
|
|
230 l.release()
|
|
231
|
|
232 #################################################################################################################################################################################################################
|
|
233
|
|
234 """
|
|
235
|
|
236 This function detects the differences between the two groups (control, treated).
|
|
237 Then copy the undetected miRNAs from one group to other and add zeros as counts.
|
|
238 With this way the two groups will have the same number of miRNAs.
|
|
239
|
|
240 """
|
|
241
|
|
242 def black_white(mirna_names_1,mirna_names_2,group,manager):
|
|
243
|
|
244 add_names = [x for x in mirna_names_1 if x not in mirna_names_2]
|
|
245 add_names.sort()
|
|
246 add_names = list(add_names for add_names,_ in itertools.groupby(add_names))
|
|
247
|
|
248 group.sort()
|
|
249 group = list(group for group,_ in itertools.groupby(group))
|
|
250
|
|
251 zeros=["0"]*(len(group[0])-2)
|
|
252 [add_names[i].extend(zeros) for i,_ in enumerate(add_names)]
|
|
253 group=group+add_names
|
|
254
|
|
255 manager.extend(group)
|
|
256
|
|
257 ########################################################################################################>
|
|
258
|
|
259 """
|
|
260
|
|
261 This function collapses the miRNAs with same sequences and different names into one entry
|
|
262 by merging all the different names into one and are separated with the character "/"
|
|
263
|
|
264 """
|
|
265
|
|
266 def merging_dupes(group,f_dupes):
|
|
267
|
|
268 dupes=[]
|
|
269 temp_mat =[]
|
|
270
|
|
271 for num,_ in enumerate(group):
|
|
272
|
|
273 if group[num][1] not in temp_mat and group[num][0] not in temp_mat:
|
|
274 temp_mat.append(group[num][1])
|
|
275 temp_mat.append(group[num][0])
|
|
276 else:
|
|
277 dupes.append(group[num][1])
|
|
278
|
|
279
|
|
280 dupes=list(set(dupes))
|
|
281
|
|
282 dupes=[[x] for x in dupes]
|
|
283
|
|
284 for x in group:
|
|
285 for y in dupes:
|
|
286 if x[1]==y[0]:
|
|
287 fl=0
|
|
288 if len(y)==1:
|
|
289 y.append(x[0])
|
|
290 else:
|
|
291 for i in range(1,len(y)):
|
|
292 if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]:
|
|
293 fl=1
|
|
294 if len(x[0])<len(y[i]):
|
|
295 del y[i]
|
|
296 y.append(x[0])
|
|
297 break
|
|
298
|
|
299 if fl==0:
|
|
300 y.append((x[0]))
|
|
301
|
|
302 for y in dupes:
|
|
303 if len(y)>2:
|
|
304 for i in range(len(y)-1,1,-1):
|
|
305 y[1]=y[1]+"/"+y[i]
|
|
306 del y[i]
|
|
307
|
|
308 f_dupes.extend(dupes)
|
|
309
|
|
310
|
|
311 ########################################################################################################>
|
|
312
|
|
313 """
|
|
314
|
|
315 This function removes the duplications of sequences based on output from the fuction merging_dupes
|
|
316
|
|
317 """
|
|
318
|
|
319 def apply_merging_dupes(group,dupes,managger):
|
|
320
|
|
321 for x in group:
|
|
322 for y in dupes:
|
|
323 if x[1]==y[0]:
|
|
324 x[0]=y[1]
|
|
325
|
|
326 group.sort()
|
|
327 group=list(group for group,_ in itertools.groupby(group))
|
|
328 managger.extend(group)
|
|
329
|
|
330 ########################################################################################################>
|
|
331
|
|
332 """
|
|
333
|
|
334 This function is optional and performs a filter for low counts miRNAs based on
|
|
335 number of counts and the percentage of the samples, according to user preferences
|
|
336
|
|
337 """
|
|
338
|
|
339 def filter_low_counts(c_group,t_group,fil_c_group,fil_t_group,per,counts):
|
|
340
|
|
341 t_group_new=[]
|
|
342 c_group_new=[]
|
|
343
|
|
344 percent=int(per)/100
|
|
345 c_col_filter=round(percent*(len(c_group[1])-2))
|
|
346 t_col_filter=round(percent*(len(t_group[1])-2))
|
|
347
|
|
348 for i, _ in enumerate(c_group):
|
|
349 c_cols=0
|
|
350 t_cols=0
|
|
351
|
|
352 c_cols=sum([1 for j in range(len(c_group[i])-2) if int(c_group[i][j+2])>=int(counts)])
|
|
353 t_cols=sum([1 for j in range(len(t_group[i])-2) if int(t_group[i][j+2])>=int(counts)])
|
|
354
|
|
355 if c_cols>=c_col_filter or t_cols>=t_col_filter:
|
|
356 t_group_new.append(t_group[i])
|
|
357 c_group_new.append(c_group[i])
|
|
358
|
|
359 fil_c_group.extend(c_group_new)
|
|
360 fil_t_group.extend(t_group_new)
|
|
361
|
|
362 ##################################################################################################################################################################################################################
|
|
363
|
|
364 """
|
|
365
|
|
366 This function exports the count matrices for every group (controls, treated)
|
|
367 and condition (ref and templated miRNAs, non-templated miRNAs)
|
|
368
|
|
369 """
|
|
370
|
|
371 def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, n1, n2, per):
|
|
372
|
|
373 if flag == 1 and int(per)!=-1:
|
|
374 fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w')
|
|
375 fp.write("Name\t")
|
|
376 fp.write("Sequence")
|
|
377 for y in tre_file_order:
|
|
378 fp.write("\t"+y)
|
|
379
|
|
380 for x in fil_tre:
|
|
381 fp.write("\n%s" % "\t".join(x))
|
|
382 fp.close()
|
|
383
|
|
384 fp = open('Counts/Filtered '+n1+' Templated Counts', 'w')
|
|
385 fp.write("Name\t")
|
|
386 fp.write("Sequence")
|
|
387 for y in con_file_order:
|
|
388 fp.write("\t"+y)
|
|
389
|
|
390 for x in fil_con:
|
|
391 fp.write("\n%s" % "\t".join(x))
|
|
392 fp.close()
|
|
393
|
|
394
|
|
395 if flag == 2 and int(per)!=-1:
|
|
396 fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w')
|
|
397 fp.write("Name\t")
|
|
398 fp.write("Sequence")
|
|
399 for y in tre_file_order:
|
|
400 fp.write("\t"+y)
|
|
401
|
|
402
|
|
403 for x in fil_tre:
|
|
404 fp.write("\n%s" % "\t".join(x))
|
|
405 fp.close()
|
|
406
|
|
407 fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w')
|
|
408 fp.write("Name\t")
|
|
409 fp.write("Sequence")
|
|
410 for y in con_file_order:
|
|
411 fp.write("\t"+y)
|
|
412
|
|
413 for x in fil_con:
|
|
414 fp.write("\n%s" % "\t".join(x))
|
|
415 fp.close()
|
|
416
|
|
417
|
|
418 if flag == 1:
|
|
419 fp = open('Counts/Raw '+n2+' Templated Counts', 'w')
|
|
420 fp.write("Name\t")
|
|
421 fp.write("Sequence")
|
|
422 for y in tre_file_order:
|
|
423 fp.write("\t"+y)
|
|
424
|
|
425 for x in raw_tre:
|
|
426 fp.write("\n%s" % "\t".join(x))
|
|
427 fp.close()
|
|
428
|
|
429 fp = open('Counts/Raw '+n1+' Templated Counts', 'w')
|
|
430 fp.write("Name\t")
|
|
431 fp.write("Sequence")
|
|
432 for y in con_file_order:
|
|
433 fp.write("\t"+y)
|
|
434
|
|
435 for x in raw_con:
|
|
436 fp.write("\n%s" % "\t".join(x))
|
|
437 fp.close()
|
|
438
|
|
439 if flag == 2:
|
|
440 fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w')
|
|
441 fp.write("Name\t")
|
|
442 fp.write("Sequence")
|
|
443 for y in tre_file_order:
|
|
444 fp.write("\t"+y)
|
|
445
|
|
446
|
|
447 for x in raw_tre:
|
|
448 fp.write("\n%s" % "\t".join(x))
|
|
449 fp.close()
|
|
450
|
|
451 fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w')
|
|
452 fp.write("Name\t")
|
|
453 fp.write("Sequence")
|
|
454 for y in con_file_order:
|
|
455 fp.write("\t"+y)
|
|
456
|
|
457 for x in raw_con:
|
|
458 fp.write("\n%s" % "\t".join(x))
|
|
459 fp.close()
|
|
460
|
|
461 ####################################################################################################################################################################################################################
|
|
462
|
|
463 """
|
|
464
|
|
465 This function exports the files of the database with all the info
|
|
466 about every type of the detected miRNAs for every sample
|
|
467
|
|
468 """
|
|
469
|
|
470 def DB_write(con,name,unique_seq,sorted_uni_arms,f):
|
|
471
|
|
472 if f==1:
|
|
473 if con=="c":
|
|
474 fp = open('split1/'+name, 'w')
|
|
475
|
|
476 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
|
|
477 if con=="t":
|
|
478 fp = open('split2/'+name, 'w')
|
|
479 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
|
|
480
|
|
481 for i in range(len(sorted_uni_arms)):
|
|
482 temp = []
|
|
483 for j in range(len(unique_seq)):
|
|
484
|
|
485 if sorted_uni_arms[i][0] in (unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[1]):
|
|
486
|
|
487 temp.append(unique_seq[j])
|
|
488
|
|
489 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
|
|
490 fp.write("*********************************************************************************************************\n")
|
|
491 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
|
|
492 fp.write("*********************************************************************************************************\n\n")
|
|
493 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
|
|
494 fp.write("\n" + "\n")
|
|
495 fp.close()
|
|
496
|
|
497 if f==2:
|
|
498
|
|
499 if con=="c":
|
|
500 fp = open('split3/'+name, 'w')
|
|
501 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
|
|
502 if con=="t":
|
|
503 fp = open('split4/'+name, 'w')
|
|
504 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
|
|
505
|
|
506 for i in range(len(sorted_uni_arms)):
|
|
507 temp = []
|
|
508 for j in range(len(unique_seq)):
|
|
509 if sorted_uni_arms[i][0]==unique_seq[j][2].split("_nont_")[0]:
|
|
510 temp.append(unique_seq[j])
|
|
511 if temp!=[]:
|
|
512 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
|
|
513 fp.write("*********************************************************************************************************\n")
|
|
514 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
|
|
515 fp.write("*********************************************************************************************************\n\n")
|
|
516 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
|
|
517 fp.write("\n" + "\n")
|
|
518 fp.close()
|
|
519
|
|
520
|
|
521 #########################################################################################################################
|
|
522
|
|
523 """
|
|
524
|
|
525 This function merges the different names for the same mirna sequence per group (controls, treated) to avoid duplicates
|
|
526
|
|
527 """
|
|
528
|
|
529 def merging_names(ini_mat,new):
|
|
530
|
|
531 dupes=[]
|
|
532 temp_mat =[]
|
|
533
|
|
534 for num in range(len(ini_mat)):
|
|
535
|
|
536 if ini_mat[num][1] not in temp_mat and ini_mat[num][0] not in temp_mat:
|
|
537 temp_mat.append(ini_mat[num][1])
|
|
538 temp_mat.append(ini_mat[num][0])
|
|
539 else:
|
|
540 dupes.append(ini_mat[num][1])
|
|
541
|
|
542 dupes=list(set(dupes))
|
|
543
|
|
544 for i in range(len(dupes)):
|
|
545 dupes[i]=[dupes[i]]
|
|
546
|
|
547 for x in ini_mat:
|
|
548 for y in dupes:
|
|
549 if x[1]==y[0]:
|
|
550 fl=0
|
|
551 if len(y)==1:
|
|
552 y.append(x[0])
|
|
553 else:
|
|
554 for i in range(1,len(y)):
|
|
555 if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]:
|
|
556 fl=1
|
|
557 if len(x[0])<len(y[i]):
|
|
558 del y[i]
|
|
559 y.append(x[0])
|
|
560 break
|
|
561
|
|
562 if fl==0:
|
|
563 y.append((x[0]))
|
|
564
|
|
565 for y in dupes:
|
|
566 if len(y)>2:
|
|
567 for i in range(len(y)-1,1,-1):
|
|
568 y[1]=y[1]+"/"+y[i]
|
|
569 del y[i]
|
|
570
|
|
571
|
|
572 for x in ini_mat:
|
|
573 for y in dupes:
|
|
574 if x[1]==y[0]:
|
|
575 x[0]=y[1]
|
|
576
|
|
577 ini_mat.sort()
|
|
578 ini_mat=list(ini_mat for ini_mat,_ in itertools.groupby(ini_mat))
|
|
579 new.extend(ini_mat)
|
|
580
|
|
581 ####################################################################################################################################################################################################################
|
|
582
|
|
583 """
|
|
584
|
|
585 This function exports the count matrices for differential expresion
|
|
586 if user chose analysis with non-templated miRNAs detection
|
|
587
|
|
588 """
|
|
589
|
|
590 def nontemp_counts_to_diff(tem_names,tem_samp,non_names,non_samp,folder,pro):
|
|
591
|
|
592 for i in range(2,len(tem_samp[0])):
|
|
593
|
|
594 fp = open(folder+tem_names[i-2]+'.txt','w')
|
|
595 fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n")
|
|
596
|
|
597 for x in tem_samp:
|
|
598 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
|
|
599
|
|
600 for j in range(len(non_names)):
|
|
601 if non_names[j]==tem_names[i-2]:
|
|
602 for x in non_samp:
|
|
603 fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n")
|
|
604 fp.close()
|
|
605
|
|
606 #################################################################################################################################################################################################################
|
|
607
|
|
608 """
|
|
609
|
|
610 This function exports the count matrices for differential expresion
|
|
611 if user chose analysis only with templated miRNAs detection
|
|
612
|
|
613 """
|
|
614
|
|
615 def temp_counts_to_diff(names,samp,folder,pro):
|
|
616
|
|
617 for i in range(2,len(samp[0])):
|
|
618
|
|
619 fp = open(folder+names[i-2]+'.txt','w')
|
|
620 fp.write("miRNA id"+"\t"+names[i-2]+"\n")
|
|
621
|
|
622 for x in samp:
|
|
623 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
|
|
624 fp.close()
|
|
625
|
|
626 #################################################################################################################################################################################################################
|
|
627
|
|
628 """
|
|
629
|
|
630 This function downloads the fasta files from MirGene site
|
|
631 with ref miRNAs and star miRNAs sequences and merges them
|
|
632 into one list
|
|
633
|
|
634 """
|
|
635
|
|
636 def download_matures(matures,org_name):
|
|
637
|
|
638 mature_mir=[]
|
|
639
|
|
640 mat_url = 'http://mirgenedb.org/fasta/'+org_name+'?mat=1'
|
|
641 star_url = 'http://mirgenedb.org/fasta/'+org_name+'?star=1'
|
|
642
|
|
643 data = urllib.request.urlopen(mat_url).read()
|
|
644 file_mirna = data.decode('utf-8')
|
|
645 mature_mir = file_mirna.split("\n")
|
|
646 mature_mir = [x.replace(">","") for x in mature_mir]
|
|
647 del mature_mir[-1]
|
|
648
|
|
649 data = urllib.request.urlopen(star_url).read()
|
|
650 file_mirna = data.decode('utf-8')
|
|
651 star_mir = file_mirna.split("\n")
|
|
652 star_mir = [x.replace(">","") for x in star_mir]
|
|
653 del star_mir[-1]
|
|
654
|
|
655 mature_mir.extend(star_mir)
|
|
656
|
|
657 for i in range(1,len(mature_mir),2):
|
|
658 mature_mir[i]=mature_mir[i].replace("U","T")
|
|
659
|
|
660 matures.extend(mature_mir)
|
|
661
|
|
662 ###################################################################################################################
|
|
663
|
|
664 """
|
|
665
|
|
666 This function detects the templated isoforms from the 1st part of analysis
|
|
667 These isoforms and ref miRNAs will be used for the detection of non-templated miRNAs
|
|
668
|
|
669 """
|
|
670
|
|
671 def non_template_ref(sc,st,all_isoforms):
|
|
672
|
|
673 pre_uni_seq_con = list(sc)
|
|
674 pre_uni_seq_tre = list(st)
|
|
675
|
|
676 for x in pre_uni_seq_con:
|
|
677 for y in x:
|
|
678 if y[2] not in all_isoforms and "_t_" in y[2]:
|
|
679 all_isoforms.append(y[2])
|
|
680 all_isoforms.append(y[9])
|
|
681
|
|
682 for x in pre_uni_seq_tre:
|
|
683 for y in x:
|
|
684 if y[2] not in all_isoforms and "_t_" in y[2]:
|
|
685 all_isoforms.append(y[2])
|
|
686 all_isoforms.append(y[9])
|
|
687
|
|
688 ################################################################################################################################################################################################
|
|
689
|
|
690 """
|
|
691
|
|
692 This function adds uncommon detected mirnas among the samples with zeros as counts
|
|
693
|
|
694 """
|
|
695
|
|
696 def uncommon_mirnas(sample,mir_names,l,new_d,sample_name,sample_order):
|
|
697
|
|
698 for y in mir_names:
|
|
699 flag=0
|
|
700 for x in sample:
|
|
701 if y[0]==x[0]: # check if miRNA exists in the sample
|
|
702 flag=1
|
|
703 break
|
|
704 if flag==0:
|
|
705 sample.append([y[0],"0",y[1]]) # add the name of mirna to the sample with zero counts and its sequence
|
|
706
|
|
707 # sorting and remove duplicates
|
|
708 sample.sort(key=lambda x: x[0])
|
|
709 sample=list(sample for sample,_ in itertools.groupby(sample))
|
|
710
|
|
711 # Return the updated sample
|
|
712 l.acquire()
|
|
713 new_d.append(sample)
|
|
714 sample_order.append(sample_name)
|
|
715 l.release()
|
|
716
|
|
717 ###############################################################################################################################################################################################
|