2
|
1 import itertools
|
|
2 import re
|
|
3 import urllib.request
|
|
4 import gzip
|
|
5 import copy
|
|
6 from collections import OrderedDict
|
|
7
|
|
8
|
|
9
|
|
# Read a file and return it as a list
def read(path, flag):
    """Read the text file at *path* and return its lines as a list.

    flag == 0: keep trailing newlines (``readlines`` behaviour).
    flag == 1: strip line endings (``splitlines`` behaviour).
    Any other flag returns None, matching the historical behaviour.
    """
    # 'with' already closes the file on every exit path; the old explicit
    # fp.close() inside the with-block was redundant.
    if flag == 0:
        with open(path) as fp:
            return fp.readlines()
    if flag == 1:
        with open(path) as fp:
            return fp.read().splitlines()
|
|
23
|
|
# Write a list to a txt file
def write(path, list):
    """Write each record of *list*, minus its first and last field, to *path*.

    Fields are tab separated.  NOTE(review): no newline is written between
    records and ``x[1:-1]`` drops the first and last element of every
    record -- this mirrors the historical behaviour, so callers that rely
    on it are unaffected.  (The parameter name shadows the builtin ``list``
    but is kept for interface compatibility.)
    """
    with open(path, 'w') as fp:
        for x in list:
            # join() already returns a str; the old str(...) wrapper and the
            # explicit fp.close() inside the with-block were redundant.
            fp.write("\t".join(x[1:-1]))
|
|
30
|
|
31
|
|
32 #################################################################################################################>
|
|
33
|
|
34 # Detect the longest common substring sequence between two mirnas
|
|
# Detect the longest common substring sequence between two mirnas
def longestSubstring(str1, str2):
    """Return the longest contiguous substring shared by *str1* and *str2*.

    When the strings share nothing at all, a message is printed and None
    is returned (implicitly), exactly as before.
    """
    from difflib import SequenceMatcher

    # SequenceMatcher reports the longest common block of the two full
    # strings, e.g. Match(a=0, b=0, size=5).
    matcher = SequenceMatcher(None, str1, str2)
    best = matcher.find_longest_match(0, len(str1), 0, len(str2))

    if best.size == 0:
        print('No longest common sub-string found')
    else:
        return str1[best.a: best.a + best.size]
|
|
51
|
|
52 #################################################################################################################################################################################################################
|
|
53
|
|
54 """
|
|
55
|
|
56 This function concatenates miRNAs which are generated from different chromosomes
|
|
57 and eliminates the duplications of miRNAs on every sample
|
|
58
|
|
59 input: detected miRNAs
|
|
60 output: collapsed miRNAs without duplicates
|
|
61
|
|
62 """
|
|
63
|
|
64
|
|
def remove_duplicates(mirnas):
    """Collapse miRNAs detected on several chromosomes and drop duplicates.

    Each record in *mirnas* is a SAM-derived list where x[0] is the read
    name ("<name>-<counts>"), x[2] the miRNA name (containing a
    "_chrN(strand)" part) and x[9] the sequence.  Records that share the
    same sequence, read name and miRNA base name but differ in chromosome
    get one merged name listing every chromosome; the deduplicated list
    is returned.
    """

    # Detection of canonical miRNAs which are generated from different chromosomes:
    # one [sequence, read-name, mirna-name] row per record, extended below with
    # every alternative chromosome-bearing name found for the same sequence.
    dupes=[[x[9],x[0],x[2]] for x in mirnas]

    for x in mirnas:
        for y in dupes:
            if x[9] == y[0] and x[0] == y[1] and x[2].split("_")[0] == y[2].split("_")[0] and x[2] != y[2]:
                y.append(x[2])

    # Detection of different chromosomes for every miRNA.
    # Numeric chromosomes are encoded as signed ints (the "(+)"/"(-)" strand
    # char becomes the sign, e.g. "7(-)" -> -7); X/Y keep a 4-char string form.
    chr_order = []
    for x in dupes:
        temp = []
        for i in range(2,len(x)):
            if x[i].split("chr")[1].split("(")[0].isdigit():
                temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0]))
            else:
                temp.append(x[i].split("chr")[1][0:4])

        # If any sex chromosome is present, fall back to string sorting for all
        for z in temp:
            if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z:
                temp = [str(j) for j in temp]
        temp = list(set(temp))
        temp.sort()
        chr_order.append(temp)

    # Collapsing the miRNAs with the same sequence from different chromosomes:
    # rebuild one name carrying every chromosome, e.g. base_chr7(+)_chr12(-).
    collapsed_dupes=[]
    for i in range(len(dupes)):
        collapsed_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]])
        for x in chr_order[i]:
            chr_check = re.match("[-+]?\d+$", str(x)) # check if chromosome is 'X' or 'Y'
            if chr_check is not None:
                if int(x)<0: # Check the strand (+) or (-)
                    collapsed_dupes[i][1]= collapsed_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)"
                else:
                    collapsed_dupes[i][1] = collapsed_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)"
            else:
                collapsed_dupes[i][1] = collapsed_dupes[i][1] + "_chr" + str(x)

    # Remove duplicates from collapsed_dupes (sort + groupby keeps one copy of
    # each identical row)
    collapsed_dupes.sort()
    collapsed_dupes = list(collapsed_dupes for collapsed_dupes,_ in itertools.groupby(collapsed_dupes))

    for i in range(len(mirnas)):
        for x in collapsed_dupes:

            # Naming of template isomirs (adding positions in the names):
            # names with more than 3 "_" parts carry "_t_<pre>_<post>" shifts
            if mirnas[i][9] == x[0] and mirnas[i][0] == x[2] and len(mirnas[i][2].split("_")) >3 and mirnas[i][2].split("_")[0]==x[1].split("_")[0]:
                gg=str("_t_"+mirnas[i][2].split("_")[-2]+"_"+mirnas[i][2].split("_")[-1])
                mirnas[i][2] = x[1]+gg
                break

            # Naming of canonical miRNAs (collapsed names)
            if mirnas[i][9]==x[0] and mirnas[i][0]== x[2] and len(mirnas[i][2].split("_"))==3 and mirnas[i][2].split("_")[0]==x[1].split("_")[0]:
                mirnas[i][2] = x[1]
                break

    # Remove duplicates
    mirnas.sort()
    mirnas=list(mirnas for mirnas,_ in itertools.groupby(mirnas))

    return mirnas
|
|
129
|
|
130 #############################################################################################################################################################################################################
|
|
131
|
|
132 """
|
|
133
|
|
134 This function identifies and classifies the miRNAs which are detected by the alignment tool.
|
|
135
|
|
136 """
|
|
137
|
|
def sam_edit(mature_mirnas,path,file,case,l,samples,data,file_order,unmap_seq,names_n_seqs,deseq,mirna_names,ini_sample,unmap_counts):
    """Parse one Bowtie SAM file and classify the templated miRNAs in it.

    mature_mirnas : flat fasta-like list alternating header / sequence
    path, file    : path of the SAM file and its display name
    case          : "c" (control) or "t" (treated); anything else is ignored
    l             : lock guarding the shared output containers
    remaining args: shared lists/values that receive the results (this
                    function returns nothing; it appends under the lock)
    """

    # read the sam file
    ini_sam=read(path,0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] # remove introduction
    unique_seq = [x for x in main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] # keeps only the functional miRNAs
    filter_sam = [[x[0],x[1],x[2],len(x[9])] for x in main_sam] # keeps only the necessary info of miRNAs from sam files (name, sequence, counts, etc)

    sorted_uni_arms = []

    # mature_mirnas alternates header/sequence, hence the step of 2
    for i in range(0,len(mature_mirnas,),2):
        tmp_count_reads = 0 # calculate the total number of reads
        tmp_count_seq = 0 # calculate the total number of sequences
        for j in range(len(unique_seq)):

            if "{" in unique_seq[j][2].split("_")[0]: # checks if a miRNA is generated from two different locis on the same chromosome
                mirna=unique_seq[j][2].split("_")[0][:-4]
            else:
                mirna=unique_seq[j][2].split("_")[0]

            # Detection of differences between the canonical miRNA and the detected miRNA
            if mature_mirnas[i].split(" ")[0][1:] == mirna:

                # RNA -> DNA alphabet so it can be compared with the read
                temp_mature = mature_mirnas[i+1].strip().replace("U", "T")
                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                # lengths of the flanks around the common core, on each side
                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                # Handling of some special mirnas like (hsa-miR-8485):
                # the record is overwritten with the sentinel 1 and purged below
                if mat_diff[1]!=0 and unique_diff[1]!=0:
                    unique_seq[j]=1
                    pre_pos = 0
                    post_pos = 0

                elif mat_diff[0]!=0 and unique_diff[0]!=0:
                    unique_seq[j]=1
                    pre_pos = 0
                    post_pos = 0

                else:
                    # Keep the findings (5' and 3' shifts relative to canonical)
                    pre_pos = mat_diff[0]-unique_diff[0]
                    post_pos = unique_diff[1]-mat_diff[1]
                    tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                    tmp_count_seq = tmp_count_seq+1

                # Store the detected miRNAs with new names according to the findings
                if pre_pos != 0 or post_pos != 0:
                    if pre_pos == 0:
                        unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2]+ "_t_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
                    elif post_pos == 0:
                        unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
                    else:
                        unique_seq[j][2] = unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[2]+"_t_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)

        # Remove the values "1" from the handling of special mirnas (hsa-miR-8485)
        # (must happen inside the arm loop so the next pass never indexes a 1)
        for x in range(unique_seq.count(1)):
            unique_seq.remove(1)

        # metrics for the production of database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])

    # Sorting of the metrics for database (by sequence count, descending)
    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)

    # Collapsing of miRNAs and removing of duplicates
    collapsed_mirnas = remove_duplicates(unique_seq)

    # Correction of metrics due to the collapsing and removing of duplicates for the production of Database
    for y in sorted_uni_arms:
        counts=0
        seqs=0
        for x in collapsed_mirnas:
            if y[0]==x[2].split("_")[0]:
                counts+=int(x[0].split("-")[1])
                seqs+=1

        y[1]=seqs
        y[2]=counts


    # Output variables
    temp_mirna_names=[]

    l.acquire()

    if case == "c" or case == "t":
        temp_mirna_names.extend(z[2] for z in collapsed_mirnas)
        names_n_seqs.extend([[y[2],y[9]] for y in collapsed_mirnas])
        deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in collapsed_mirnas])
        mirna_names.extend(temp_mirna_names)
        unmap_seq.value += sum([1 for x in main_sam if x[1] == '4']) # Keeps the unmapped unique sequences for the production of a graph
        unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4']) # Keeps the unmapped counts of sequences for the production of a graph
        file_order.append(file) # Keeps the names of SAM files in the order they were read (avoids problems due to multiprocessing)
        samples.append(collapsed_mirnas) # return the processed detected miRNAs
        data.append([case,file,collapsed_mirnas,sorted_uni_arms])
        ini_sample.append(filter_sam) # returns the filtered sam file

    l.release()
|
|
242
|
|
243
|
|
244 ######################################################################################################################################
|
|
245
|
|
246
|
|
247 """
|
|
248
|
|
249 Read a sam file from Bowtie and do the followings:
|
|
250
|
|
251 1) Remove reverse stranded mapped reads
|
|
252 2) Remove unmapped reads
|
|
253 3) Remove all sequences with reads less than 11 reads
|
|
254 4) Sort the arms with the most sequences in decreasing rate
|
|
255 5) Sort the sequences of every arm with the most reads in decreasing rate
|
|
256 6) Calculate total number of sequences of every arm
|
|
257 7) Calculate total number of reads of sequences of every arm.
|
|
258 8) Store all the informations in a txt file
|
|
259
|
|
260 """
|
|
261
|
|
def non_sam_edit(mature_mirnas,path,file,case,l,data,file_order,n_deseq,names_n_seqs):
    """Detect non-templated isomiRs among the reads Bowtie left unmapped.

    mature_mirnas : flat fasta-like list alternating header / sequence
    path, file    : path of the SAM file and its display name
    case          : "c" or "t"; anything else is ignored
    l             : lock guarding the shared output containers
    data, file_order, n_deseq, names_n_seqs : shared lists that receive
    the results (nothing is returned).
    """

    # read the sam file
    ini_sam=read(path,0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq=[]
    # flag '4' = unmapped reads; keep only plausible miRNA lengths (18-26 nt)
    unique_seq = [x for x in main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]

    uni_seq=[]

    # Calculate the shifted positions for every non template mirna and add them to the name of it
    sorted_uni_arms = []
    # odd indices of mature_mirnas are the sequences, even ones the headers
    for i in range(1,len(mature_mirnas),2):
        tmp_count_reads = 0 # calculate the total number of reads
        tmp_count_seq = 0 # calculate the total number of sequences

        for j in range(len(unique_seq)):

            temp_mature = mature_mirnas[i].strip().replace("U", "T")

            # Detection of differences between the canonical miRNA and the detected non template miRNA
            if temp_mature in unique_seq[j][9]:

                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                # more than 2 pieces means the core repeats inside the read;
                # such reads are skipped entirely
                if len(unique_diff)<=2:
                    unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                    pre_pos = mat_diff[0]-unique_diff[0]
                    post_pos = unique_diff[1]-mat_diff[1]

                    lengthofmir = len(off_part) + post_pos  # NOTE(review): unused
                    # accept only 3' additions shorter than 4 nt
                    if pre_pos == 0 and post_pos<4:
                        tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                        tmp_count_seq = tmp_count_seq + 1

                        # rename a copy: <arm>_nont_<pre>_<post>_<added tail>
                        t_name=unique_seq[j].copy()
                        t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
                        uni_seq.append(t_name)
        # metrics for the production of database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])

    # sort arms by sequence count; dedupe while preserving insertion order
    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))

    # Output variables
    l.acquire()
    if case=="c" or case=="t":
        names_n_seqs.extend([[y[2],y[9]] for y in unique_seq if y[2]!="*"])
        n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
        file_order.append(file)
        data.append([case,file,unique_seq,sorted_uni_arms])
    l.release()
|
|
320
|
|
321 #################################################################################################################################################################################################################
|
|
322
|
|
def black_white(mirna_names_1,mirna_names_2,group,manager):
    """Pad *group* with the miRNAs present in group 1 but absent from group 2.

    mirna_names_1 / mirna_names_2 : lists of [name, sequence] records
    group   : count table rows ([name, sequence, count, count, ...])
    manager : shared list receiving the sorted, deduplicated, padded table

    Missing miRNAs are appended with "0" in every count column so both
    condition tables end up with the same set of rows.
    """
    # records of group 1 that group 2 lacks, deduplicated after sorting
    add_names = [x for x in mirna_names_1 if x not in mirna_names_2]
    add_names.sort()
    add_names = [rec for rec, _ in itertools.groupby(add_names)]

    group.sort()
    group = [rec for rec, _ in itertools.groupby(group)]

    # one "0" per count column (first two columns are name and sequence)
    zeros = ["0"] * (len(group[0]) - 2)
    # plain loop instead of the old side-effect list comprehension
    for rec in add_names:
        rec.extend(zeros)

    manager.extend(group + add_names)
|
|
337
|
|
338 ################################################################################################################################################################################################################################
|
|
339
|
|
def merging_dupes(group,f_dupes):
    """Build merged names for sequences that appear under several miRNA names.

    group   : rows of [name, sequence, ...]
    f_dupes : shared list receiving [sequence, "name1/name2/..."] pairs for
              every sequence seen more than once (nothing is returned).
    """

    dupes=[]
    final_mat =[]

    # a sequence (x[1]) seen again once itself or its name is already
    # registered is a duplicate
    for num,_ in enumerate(group):

        if group[num][1] not in final_mat and group[num][0] not in final_mat:
            final_mat.append(group[num][1])
            final_mat.append(group[num][0])
        else:
            dupes.append(group[num][1])


    dupes=list(set(dupes))

    # one bucket per duplicated sequence: [sequence, name, name, ...]
    dupes=[[x] for x in dupes]

    for x in group:
        for y in dupes:
            if x[1]==y[0]:
                fl=0
                if len(y)==1:
                    y.append(x[0])
                else:
                    for i in range(1,len(y)):
                        # same base name already stored: keep the shorter one
                        if y[i].split("_")[0]==x[0].split("_")[0]:
                            fl=1
                            if len(x[0])<len(y[i]):
                                del y[i]
                                y.append(x[0])
                            break

                    # base name not seen yet in this bucket: add it
                    if fl==0:
                        y.append((x[0]))

    # fold every bucket down to [sequence, "name1/name2/..."]
    for y in dupes:
        if len(y)>2:
            for i in range(len(y)-1,1,-1):
                y[1]=y[1]+"/"+y[i]
                del y[i]

    f_dupes.extend(dupes)
|
|
383
|
|
384 ##########################################################################################################################################################################################################################################
|
|
385
|
|
def apply_merging_dupes(group,dupes,managger):
    """Replace every row's name with its merged name, then deduplicate.

    group    : rows of [name, sequence, ...]
    dupes    : [sequence, merged-name] pairs produced by merging_dupes
    managger : shared list receiving the sorted, deduplicated rows
    """
    for row in group:
        for pair in dupes:
            if row[1] == pair[0]:
                row[0] = pair[1]

    group.sort()
    deduped = [row for row, _ in itertools.groupby(group)]
    managger.extend(deduped)
|
|
396
|
|
397 ###############################################################################################################################################################################################################################
|
|
398
|
|
399
|
|
def filter_low_counts(c_group,t_group,fil_c_group,fil_t_group,per,counts):
    """Drop miRNAs whose counts are low in both conditions.

    c_group / t_group : paired count tables ([name, sequence, count, ...]);
                        row i of both tables describes the same miRNA
    fil_c_group / fil_t_group : shared lists receiving the surviving rows
    per    : percentage (string/int) of samples that must reach the threshold
    counts : minimum count (string/int) a sample must reach

    A row pair survives when either group reaches the threshold in at
    least ``per`` percent of its samples.
    """
    c_group_new = []
    t_group_new = []

    # hoist the conversions out of the loop (they were repeated per cell)
    min_count = int(counts)
    percent = int(per) / 100
    # first two columns are name and sequence; the rest are per-sample counts
    c_col_filter = round(percent * (len(c_group[1]) - 2))
    t_col_filter = round(percent * (len(t_group[1]) - 2))

    for i, _ in enumerate(c_group):
        # number of samples in each group reaching the count threshold
        c_cols = sum(1 for v in c_group[i][2:] if int(v) >= min_count)
        t_cols = sum(1 for v in t_group[i][2:] if int(v) >= min_count)

        if c_cols >= c_col_filter or t_cols >= t_col_filter:
            t_group_new.append(t_group[i])
            c_group_new.append(c_group[i])

    fil_c_group.extend(c_group_new)
    fil_t_group.extend(t_group_new)
|
|
422
|
|
423 ##################################################################################################################################################################################################################
|
|
424
|
|
425
|
|
def _write_counts_table(path, file_order, rows):
    # One tab-separated table: header "Name<TAB>Sequence<TAB><sample>..."
    # followed by one row per miRNA (rows are newline-prefixed, as before).
    with open(path, 'w') as fp:
        fp.write("Name\t")
        fp.write("Sequence")
        for y in file_order:
            fp.write("\t"+y)
        for x in rows:
            fp.write("\n%s" % "\t".join(x))


def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, group_name1, group_name2, per):
    """Write the raw (and, when filtering is enabled, filtered) count tables.

    raw_* / fil_* : raw and filtered count tables for the two groups
    *_file_order  : sample names, in processing order, for the header row
    flag          : 1 -> templated counts, 2 -> non-templated counts
    group_name1/2 : display names of the two groups
    per           : filtering percentage; -1 means filtering is disabled,
                    so only the raw tables are written

    All files go into the existing 'Counts/' directory.  The eight
    copy-pasted write blocks of the original are collapsed into calls to
    a single helper; files are now closed via 'with'.
    """
    filtered = int(per) != -1

    if flag == 1 and filtered:
        _write_counts_table('Counts/Filtered '+group_name2+' Templated Counts', tre_file_order, fil_tre)
        _write_counts_table('Counts/Filtered '+group_name1+' Templated Counts', con_file_order, fil_con)

    if flag == 2 and filtered:
        _write_counts_table('Counts/Filtered '+group_name2+' Non-Templated Counts', tre_file_order, fil_tre)
        _write_counts_table('Counts/Filtered '+group_name1+' Non-Templated Counts', con_file_order, fil_con)

    if flag == 1:
        _write_counts_table('Counts/Raw '+group_name2+' Templated Counts', tre_file_order, raw_tre)
        _write_counts_table('Counts/Raw '+group_name1+' Templated Counts', con_file_order, raw_con)

    if flag == 2:
        _write_counts_table('Counts/Raw '+group_name2+' Non-Templated Counts', tre_file_order, raw_tre)
        _write_counts_table('Counts/Raw '+group_name1+' Non-Templated Counts', con_file_order, raw_con)
|
|
516
|
|
517
|
|
518 #########################################################################################################################################
|
|
519
|
|
def temp_counts_to_diff(names,samp,folder):
    """Write one per-sample count file for differential-expression input.

    names  : sample names; names[k] labels count column k+2 of every row
    samp   : rows of [mirna, sequence, count, count, ...]
    folder : output directory prefix (must end with a path separator)

    Each file <folder><name>.txt contains "miRNA id<TAB><name>" followed
    by one "<mirna><TAB><count>" line per row.
    """
    # columns 0 and 1 hold the miRNA name and sequence; counts start at 2
    for i in range(2, len(samp[0])):
        with open(folder + names[i-2] + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + names[i-2] + "\n")
            for x in samp:
                fp.write("\t".join([x[0], x[i]]) + "\n")
|
|
530
|
|
531 ##################################################################################################################
|
|
532
|
|
def DB_write(con,name,unique_seq,sorted_uni_arms,f):
    """Write a human-readable per-arm isomiR report for one sample.

    con  : "c" -> split1/ (templated) or split3/ (non-templated);
           "t" -> split2/ or split4/
    f    : 1 -> templated isomiRs, 2 -> non-templated isomiRs
    unique_seq      : SAM-style records (x[0] read name, x[2] isomiR name,
                      x[9] sequence)
    sorted_uni_arms : [arm name, sequence count, read count] rows

    NOTE(review): fp is only bound when con is "c" or "t"; any other value
    would raise NameError at the first write -- confirm callers only pass
    these two values.
    """

    if f==1:
        # Write a txt file with all the information
        if con=="c":
            fp = open('split1/'+name, 'w')

            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
        if con=="t":
            fp = open('split2/'+name, 'w')
            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))


        # one banner + sequence listing per arm
        for i in range(len(sorted_uni_arms)):
            temp = []
            for j in range(len(unique_seq)):

                if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]:

                    temp.append(unique_seq[j])

            # most abundant sequences first (read count after the "-")
            temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
            fp.write("*********************************************************************************************************\n")
            fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
            fp.write("*********************************************************************************************************\n\n")
            [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
            fp.write("\n" + "\n")
        fp.close()

    if f==2:

        if con=="c":
            fp = open('split3/'+name, 'w')
            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
        if con=="t":
            fp = open('split4/'+name, 'w')
            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))


        for i in range(len(sorted_uni_arms)):
            temp = []
            for j in range(len(unique_seq)):
                # non-templated names match on the exact arm before "_nont_"
                if sorted_uni_arms[i][0]==unique_seq[j][2].split("_nont_")[0]:
                    temp.append(unique_seq[j])
            if temp!=[]:
                temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
                fp.write("*********************************************************************************************************\n")
                fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
                fp.write("*********************************************************************************************************\n\n")
                [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
                fp.write("\n" + "\n")
        fp.close()
|
|
585
|
|
586
|
|
587 ##########################################################################################################################
|
|
588
|
|
def new_mat_seq(pre_unique_seq,mat_mirnas,l):
    """Promote well-supported isoforms into the shared reference list.

    pre_unique_seq : SAM-style records (x[0] "name-count", x[2] isomiR
                     name, x[9] sequence)
    mat_mirnas     : shared fasta-like list extended in place (header then
                     sequence) under the lock *l*

    An isoform y is promoted when its name contains the name of a
    canonical record x (a name with exactly 3 "_" parts) and it has more
    reads than x; each promoted name is prefixed with ">".
    """

    unique_iso = []
    for x in pre_unique_seq:
        if len(x[2].split("_"))==3:
            for y in pre_unique_seq:
                if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]):
                    # only promote a name once
                    if any(y[2] in lst2 for lst2 in unique_iso)==False:
                        y[2]=">"+y[2]
                        unique_iso.append(y)
    l.acquire()
    for x in unique_iso:
        mat_mirnas.append(x[2])
        mat_mirnas.append(x[9])
    l.release()
|
|
604
|
|
605 #########################################################################################################################
|
|
606
|
|
def merging_names(ini_mat,new):
    """Merge rows of *ini_mat* whose sequences carry several names.

    Like merging_dupes + apply_merging_dupes in one pass: builds the
    merged "name1/name2/..." labels, rewrites every row's name, then
    appends the sorted, deduplicated table to the shared list *new*.
    """

    dupes=[]
    final_mat =[]

    # a sequence (ini_mat[num][1]) seen again once itself or its name is
    # already registered is a duplicate
    for num in range(len(ini_mat)):

        if ini_mat[num][1] not in final_mat and ini_mat[num][0] not in final_mat:
            final_mat.append(ini_mat[num][1])
            final_mat.append(ini_mat[num][0])
        else:
            dupes.append(ini_mat[num][1])

    dupes=list(set(dupes))

    # one bucket per duplicated sequence: [sequence, name, name, ...]
    for i in range(len(dupes)):
        dupes[i]=[dupes[i]]

    for x in ini_mat:
        for y in dupes:
            if x[1]==y[0]:
                fl=0
                if len(y)==1:
                    y.append(x[0])
                else:
                    for i in range(1,len(y)):
                        # same base name already stored: keep the shorter one
                        if y[i].split("_")[0]==x[0].split("_")[0]:
                            fl=1
                            if len(x[0])<len(y[i]):
                                del y[i]
                                y.append(x[0])
                            break

                    # base name not seen yet in this bucket: add it
                    if fl==0:
                        y.append((x[0]))

    # fold every bucket down to [sequence, "name1/name2/..."]
    for y in dupes:
        if len(y)>2:
            for i in range(len(y)-1,1,-1):
                y[1]=y[1]+"/"+y[i]
                del y[i]


    # rewrite every row's name with its merged label
    for x in ini_mat:
        for y in dupes:
            if x[1]==y[0]:
                x[0]=y[1]

    ini_mat.sort()
    ini_mat=list(ini_mat for ini_mat,_ in itertools.groupby(ini_mat))

    new.extend(ini_mat)
|
|
659
|
|
660
|
|
661 ######################################################################################################################################################
|
|
662
|
|
def nontemp_counts_to_diff(tem_names,tem_samp,non_names,non_samp,folder):
    """Write per-sample count files combining templated and non-templated counts.

    tem_names / tem_samp : sample names and count rows for templated miRNAs
    non_names / non_samp : sample names and count rows for non-templated miRNAs
    folder               : output directory prefix (must end with a separator)

    Each file <folder><name>.txt holds "miRNA id<TAB><name>", the templated
    rows, then the non-templated rows of the sample with the same name.
    """
    # columns 0 and 1 hold the miRNA name and sequence; counts start at 2
    for i in range(2, len(tem_samp[0])):
        with open(folder + tem_names[i-2] + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + tem_names[i-2] + "\n")
            for x in tem_samp:
                fp.write("\t".join([x[0], x[i]]) + "\n")

            # append the non-templated counts of the matching sample
            for j in range(len(non_names)):
                if non_names[j] == tem_names[i-2]:
                    for x in non_samp:
                        fp.write("\t".join([x[0], x[j+2]]) + "\n")
|
|
678
|
|
679 ###################################################################################################################################################################################################################
|
|
680
|
|
681 """
|
|
682
|
|
683 This function downloads all the miRNAs of all the species from MirBase
|
|
684 and filters them by the requested organism
|
|
685
|
|
686 input : Organism
|
|
687 output: A list with the miRNA sequences in fasta format
|
|
688
|
|
689 """
|
|
690
|
|
def download_matures(matures,org_name):
    """Download every mature miRNA from miRBase and keep those of *org_name*.

    Appends the matching fasta header and sequence lines (alternating) to
    the shared list *matures*; nothing is returned.

    NOTE(review): the FTP endpoint may be outdated -- miRBase has moved its
    downloads to HTTPS; confirm this URL is still reachable.
    """

    url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'
    data = urllib.request.urlopen(url).read()
    file_mirna = gzip.decompress(data).decode('utf-8')
    file_mirna = file_mirna.split("\n")

    # fasta layout: header on even indices, sequence on the following line
    for i in range(0,len(file_mirna)-1,2):

        # organism prefix (e.g. "hsa") appears in the header line
        if org_name in file_mirna[i]:
            matures.append(file_mirna[i])
            matures.append(file_mirna[i+1])
|
|
703
|
|
704 ###################################################################################################################################################################################################################
|
|
705
|
|
706
|
|
707 """
|
|
708
|
|
709 This function keeps all mirna isoforms which are detected on SAM files from the first part of the analysis
|
|
710 These isoforms will be used as refence sequences with canonical (ref) mirnas for the detection of non-template
|
|
711 mirnas
|
|
712
|
|
713 """
|
|
714
|
|
715
|
|
def non_template_ref(c_samples,t_samples,all_isoforms):
    """Collect every templated isomiR as a reference for non-template mapping.

    c_samples / t_samples : per-sample lists of SAM-style records
    all_isoforms          : shared list extended with ">name" / sequence
                            pairs (fasta-like), skipping names already there

    Only names containing "_t_" (templated isomiRs) are kept.  Control
    samples are scanned before treated samples, preserving the original
    output order.  The two previously duplicated loops are collapsed into
    one chained pass; dead commented-out conditions were removed.
    """
    for sample in itertools.chain(list(c_samples), list(t_samples)):
        for y in sample:
            # y[2] is the isomiR name, y[9] its sequence
            if ">" + y[2] not in all_isoforms and "_t_" in y[2]:
                all_isoforms.append(">" + y[2])
                all_isoforms.append(y[9])
|
|
734
|
|
735 ################################################################################################################################################################################################
|
|
736
|
|
737 """
|
|
738
|
|
739 This function adds the uncommon detected miRNAs among samples.
|
|
740 As a result all samples will have the same length.
|
|
741
|
|
742 """
|
|
743
|
|
def uncommon_mirnas(sample,mir_names,l,new_d,sample_name,sample_order):
    """Pad *sample* with every miRNA of *mir_names* it is missing.

    mir_names entries are [name, sequence]; missing miRNAs are appended
    as [name, "0", sequence].  The padded, sorted, deduplicated sample and
    its name are published to the shared lists under the lock *l*.
    """
    for entry in mir_names:
        # add the miRNA with zero counts when the sample lacks it
        if not any(row[0] == entry[0] for row in sample):
            sample.append([entry[0], "0", entry[1]])

    # sorting and removing duplicates
    sample.sort(key=lambda row: row[0])
    sample = [row for row, _ in itertools.groupby(sample)]

    # Return the updated sample through the shared containers
    l.acquire()
    new_d.append(sample)
    sample_order.append(sample_name)
    l.release()
|
|
764
|
|
765 ###############################################################################################################################################################################################
|
|
766
|