Mercurial > repos > glogobyte > isoread

comparison: mirbase_functions.py @ 2:47232a73a46b (draft)
description: Uploaded

author:   glogobyte
date:     Wed, 13 Oct 2021 16:04:11 +0000
parents:
children: 77ba8dde6fb7
comparing revision 1:f10c8f43f010 with 2:47232a73a46b
import itertools
import re
import urllib.request
import gzip
import copy
from collections import OrderedDict


# Read a file and return its contents as a list of lines
def read(path, flag):
    if flag == 0:
        with open(path) as fp:
            file = fp.readlines()
        return file

    if flag == 1:
        with open(path) as fp:
            file = fp.read().splitlines()
        return file

# Write a list of records to a tab-separated txt file
def write(path, records):
    with open(path, 'w') as fp:
        for x in records:
            fp.write(str("\t".join(x[1:-1])))


####################################################################################################

# Detect the longest common substring shared by two miRNA sequences
def longestSubstring(str1, str2):

    from difflib import SequenceMatcher
    # Initialize a SequenceMatcher object with the two input strings
    seqMatch = SequenceMatcher(None, str1, str2)

    # Find the longest matching block, e.g. Match(a=0, b=0, size=5)
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))

    # Return the longest common substring, if one exists
    if match.size != 0:
        return str1[match.a: match.a + match.size]
    else:
        print('No common substring found')

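# Minimal usage sketch (hypothetical sequences, not taken from the pipeline):
#   longestSubstring("TGAGGTAGTAGGTTGTATAGTT", "AGGTAGTAGGTTGTATAG")
#   -> "AGGTAGTAGGTTGTATAG", the longest stretch shared by the two reads.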

#################################################################################################################################################################################################################

"""

This function concatenates miRNAs which are generated from different chromosomes
and eliminates the duplicated miRNAs in every sample.

input:  detected miRNAs
output: collapsed miRNAs without duplicates

"""


def remove_duplicates(mirnas):

    # Detection of canonical miRNAs which are generated from different chromosomes
    dupes = [[x[9], x[0], x[2]] for x in mirnas]

    for x in mirnas:
        for y in dupes:
            if x[9] == y[0] and x[0] == y[1] and x[2].split("_")[0] == y[2].split("_")[0] and x[2] != y[2]:
                y.append(x[2])

    # Detection of the different chromosomes for every miRNA
    chr_order = []
    for x in dupes:
        temp = []
        for i in range(2, len(x)):
            if x[i].split("chr")[1].split("(")[0].isdigit():
                temp.append(int(x[i].split("chr")[1].split("(")[1][0] + x[i].split("chr")[1].split("(")[0]))
            else:
                temp.append(x[i].split("chr")[1][0:4])

        for z in temp:
            if 'X(-)' == z or 'Y(-)' == z or 'X(+)' == z or 'Y(+)' == z:
                temp = [str(j) for j in temp]
        temp = list(set(temp))
        temp.sort()
        chr_order.append(temp)

    # Collapse the miRNAs that have the same sequence but come from different chromosomes
    collapsed_dupes = []
    for i in range(len(dupes)):
        collapsed_dupes.append([dupes[i][0], dupes[i][2].split("_")[0], dupes[i][1]])
        for x in chr_order[i]:
            chr_check = re.match(r"[-+]?\d+$", str(x))  # numeric chromosome, as opposed to 'X' or 'Y'
            if chr_check is not None:
                if int(x) < 0:  # check the strand: (+) or (-)
                    collapsed_dupes[i][1] = collapsed_dupes[i][1] + "_chr" + str(abs(int(x))) + "(-)"
                else:
                    collapsed_dupes[i][1] = collapsed_dupes[i][1] + "_chr" + str(abs(int(x))) + "(+)"
            else:
                collapsed_dupes[i][1] = collapsed_dupes[i][1] + "_chr" + str(x)

    # Remove duplicates from collapsed_dupes
    collapsed_dupes.sort()
    collapsed_dupes = list(collapsed_dupes for collapsed_dupes, _ in itertools.groupby(collapsed_dupes))

    for i in range(len(mirnas)):
        for x in collapsed_dupes:

            # Naming of templated isomiRs (adding the shifted positions to the names)
            if mirnas[i][9] == x[0] and mirnas[i][0] == x[2] and len(mirnas[i][2].split("_")) > 3 and mirnas[i][2].split("_")[0] == x[1].split("_")[0]:
                gg = str("_t_" + mirnas[i][2].split("_")[-2] + "_" + mirnas[i][2].split("_")[-1])
                mirnas[i][2] = x[1] + gg
                break

            # Naming of canonical miRNAs (collapsed names)
            if mirnas[i][9] == x[0] and mirnas[i][0] == x[2] and len(mirnas[i][2].split("_")) == 3 and mirnas[i][2].split("_")[0] == x[1].split("_")[0]:
                mirnas[i][2] = x[1]
                break

    # Remove duplicates
    mirnas.sort()
    mirnas = list(mirnas for mirnas, _ in itertools.groupby(mirnas))

    return mirnas

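# Illustrative sketch (hypothetical SAM-style rows; only the fields used above matter:
# x[0] = "tag-readcount", x[2] = miRNA name, x[9] = read sequence):
#
#   rows = [
#       ['s1-250', '0', 'hsa-miR-21-5p_chr17(+)_5p', '', '', '', '', '', '', 'TAGCTTATCAGACTGATGTTGA'],
#       ['s1-250', '0', 'hsa-miR-21-5p_chr9(+)_5p',  '', '', '', '', '', '', 'TAGCTTATCAGACTGATGTTGA'],
#   ]
#   remove_duplicates(rows)
#   # -> a single row whose name lists both loci, roughly 'hsa-miR-21-5p_chr9(+)_chr17(+)'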
#############################################################################################################################################################################################################

"""

This function identifies and classifies the miRNAs which are detected by the alignment tool.

"""

def sam_edit(mature_mirnas, path, file, case, l, samples, data, file_order, unmap_seq, names_n_seqs, deseq, mirna_names, ini_sample, unmap_counts):

    # Read the SAM file
    ini_sam = read(path, 0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]  # skip the SAM header lines
    unique_seq = [x for x in main_sam if x[1] == '0' and len(x[9]) >= 18 and len(x[9]) <= 26]  # keep only reads mapped to the forward strand with length 18-26 nt (functional miRNAs)
    filter_sam = [[x[0], x[1], x[2], len(x[9])] for x in main_sam]  # keep only the necessary fields from the SAM records (read id, flag, reference miRNA, read length)

    sorted_uni_arms = []

    for i in range(0, len(mature_mirnas), 2):
        tmp_count_reads = 0  # total number of reads
        tmp_count_seq = 0    # total number of sequences
        for j in range(len(unique_seq)):

            if "{" in unique_seq[j][2].split("_")[0]:  # check whether a miRNA is generated from two different loci on the same chromosome
                mirna = unique_seq[j][2].split("_")[0][:-4]
            else:
                mirna = unique_seq[j][2].split("_")[0]

            # Detection of the differences between the canonical miRNA and the detected miRNA
            if mature_mirnas[i].split(" ")[0][1:] == mirna:

                temp_mature = mature_mirnas[i+1].strip().replace("U", "T")
                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                # Handling of some special miRNAs like hsa-miR-8485
                if mat_diff[1] != 0 and unique_diff[1] != 0:
                    unique_seq[j] = 1
                    pre_pos = 0
                    post_pos = 0

                elif mat_diff[0] != 0 and unique_diff[0] != 0:
                    unique_seq[j] = 1
                    pre_pos = 0
                    post_pos = 0

                else:
                    # Keep the findings
                    pre_pos = mat_diff[0] - unique_diff[0]
                    post_pos = unique_diff[1] - mat_diff[1]
                    tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                    tmp_count_seq = tmp_count_seq + 1

                # Store the detected miRNAs with new names according to the findings
                if pre_pos != 0 or post_pos != 0:
                    if pre_pos == 0:
                        unique_seq[j][2] = unique_seq[j][2].split("_")[0] + "_" + unique_seq[j][2].split("_")[2] + "_t_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos)
                    elif post_pos == 0:
                        unique_seq[j][2] = unique_seq[j][2].split("_")[0] + "_" + unique_seq[j][2].split("_")[2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
                    else:
                        unique_seq[j][2] = unique_seq[j][2].split("_")[0] + "_" + unique_seq[j][2].split("_")[2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + '{:+d}'.format(post_pos)

        # Remove the placeholder value "1" left behind by the handling of special miRNAs (hsa-miR-8485)
        for x in range(unique_seq.count(1)):
            unique_seq.remove(1)

        # Metrics for the production of the database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])

    # Sort the metrics for the database
    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)

    # Collapse the miRNAs and remove duplicates
    collapsed_mirnas = remove_duplicates(unique_seq)

    # Correct the metrics, after collapsing and deduplication, for the production of the database
    for y in sorted_uni_arms:
        counts = 0
        seqs = 0
        for x in collapsed_mirnas:
            if y[0] == x[2].split("_")[0]:
                counts += int(x[0].split("-")[1])
                seqs += 1

        y[1] = seqs
        y[2] = counts


    # Output variables
    temp_mirna_names = []

    l.acquire()

    if case == "c" or case == "t":
        temp_mirna_names.extend(z[2] for z in collapsed_mirnas)
        names_n_seqs.extend([[y[2], y[9]] for y in collapsed_mirnas])
        deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in collapsed_mirnas])
        mirna_names.extend(temp_mirna_names)
        unmap_seq.value += sum([1 for x in main_sam if x[1] == '4'])  # number of unmapped unique sequences (used later to produce a graph)
        unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4'])  # read counts of unmapped sequences (used later to produce a graph)
        file_order.append(file)  # keep the SAM file names in the order they were read by the function (avoids problems due to multiprocessing)
        samples.append(collapsed_mirnas)  # return the processed detected miRNAs
        data.append([case, file, collapsed_mirnas, sorted_uni_arms])
        ini_sample.append(filter_sam)  # return the filtered SAM file

    l.release()


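# Rough usage sketch (hypothetical paths and shared objects; the real driver lives in the
# tool's wrapper script). sam_edit is written to run in parallel, one process per SAM file,
# with Manager-backed lists and shared Values collecting the results:
#
#   from multiprocessing import Process, Lock, Manager, Value
#
#   matures = []
#   download_matures(matures, "hsa")                      # reference mature miRNAs
#   m = Manager()
#   samples, data, file_order = m.list(), m.list(), m.list()
#   names_n_seqs, deseq, mirna_names, ini_sample = m.list(), m.list(), m.list(), m.list()
#   unmap_seq, unmap_counts = Value('i', 0), Value('i', 0)
#   lock = Lock()
#
#   p = Process(target=sam_edit, args=(matures, "control_1.sam", "control_1.sam", "c", lock,
#                                      samples, data, file_order, unmap_seq, names_n_seqs,
#                                      deseq, mirna_names, ini_sample, unmap_counts))
#   p.start(); p.join()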
######################################################################################################################################

"""

Read a SAM file from Bowtie and do the following:

1) Remove reads mapped to the reverse strand
2) Remove unmapped reads
3) Remove all sequences with fewer than 11 reads
4) Sort the arms by number of sequences, in decreasing order
5) Sort the sequences of every arm by number of reads, in decreasing order
6) Calculate the total number of sequences of every arm
7) Calculate the total number of reads of the sequences of every arm
8) Store all the information in a txt file

"""

def non_sam_edit(mature_mirnas, path, file, case, l, data, file_order, n_deseq, names_n_seqs):

    # Read the SAM file
    ini_sam = read(path, 0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = []
    unique_seq = [x for x in main_sam if x[1] == '4' and len(x[9]) >= 18 and len(x[9]) <= 26]  # keep unmapped reads with length 18-26 nt

    uni_seq = []

    # Calculate the shifted positions for every non-templated miRNA and add them to its name
    sorted_uni_arms = []
    for i in range(1, len(mature_mirnas), 2):
        tmp_count_reads = 0  # total number of reads
        tmp_count_seq = 0    # total number of sequences

        for j in range(len(unique_seq)):

            temp_mature = mature_mirnas[i].strip().replace("U", "T")

            # Detection of the differences between the canonical miRNA and the detected non-templated miRNA
            if temp_mature in unique_seq[j][9]:

                off_part = longestSubstring(temp_mature, unique_seq[j][9])

                mat_diff = temp_mature.split(off_part)
                mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

                unique_diff = unique_seq[j][9].split(off_part)
                if len(unique_diff) <= 2:
                    unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

                    pre_pos = mat_diff[0] - unique_diff[0]
                    post_pos = unique_diff[1] - mat_diff[1]

                    lengthofmir = len(off_part) + post_pos
                    if pre_pos == 0 and post_pos < 4:
                        tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                        tmp_count_seq = tmp_count_seq + 1

                        t_name = unique_seq[j].copy()
                        t_name[2] = mature_mirnas[i-1].split(" ")[0][1:] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
                        uni_seq.append(t_name)

        # Metrics for the production of the database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])

    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple, uni_seq))))

    # Output variables
    l.acquire()
    if case == "c" or case == "t":
        names_n_seqs.extend([[y[2], y[9]] for y in unique_seq if y[2] != "*"])
        n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2] != "*"])
        file_order.append(file)
        data.append([case, file, unique_seq, sorted_uni_arms])
    l.release()

#################################################################################################################################################################################################################

def black_white(mirna_names_1, mirna_names_2, group, manager):

    add_names = [x for x in mirna_names_1 if x not in mirna_names_2]
    add_names.sort()
    add_names = list(add_names for add_names, _ in itertools.groupby(add_names))

    group.sort()
    group = list(group for group, _ in itertools.groupby(group))

    zeros = ["0"] * (len(group[0]) - 2)
    [add_names[i].extend(zeros) for i, _ in enumerate(add_names)]
    group = group + add_names

    manager.extend(group)

################################################################################################################################################################################################################################

def merging_dupes(group, f_dupes):

    dupes = []
    final_mat = []

    for num, _ in enumerate(group):

        if group[num][1] not in final_mat and group[num][0] not in final_mat:
            final_mat.append(group[num][1])
            final_mat.append(group[num][0])
        else:
            dupes.append(group[num][1])

    dupes = list(set(dupes))

    dupes = [[x] for x in dupes]

    for x in group:
        for y in dupes:
            if x[1] == y[0]:
                fl = 0
                if len(y) == 1:
                    y.append(x[0])
                else:
                    for i in range(1, len(y)):
                        if y[i].split("_")[0] == x[0].split("_")[0]:
                            fl = 1
                            if len(x[0]) < len(y[i]):
                                del y[i]
                                y.append(x[0])
                            break

                    if fl == 0:
                        y.append((x[0]))

    for y in dupes:
        if len(y) > 2:
            for i in range(len(y) - 1, 1, -1):
                y[1] = y[1] + "/" + y[i]
                del y[i]

    f_dupes.extend(dupes)

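# Illustrative sketch (hypothetical rows of the form [name, sequence, per-sample counts...]):
# two different names that share the same sequence, e.g.
#   [['hsa-miR-12_chr1(+)', 'ACGTACGT', '10'], ['hsa-miR-99_chr2(-)', 'ACGTACGT', '7']]
# produce one dupes entry pairing the sequence with a merged name,
#   ['ACGTACGT', 'hsa-miR-12_chr1(+)/hsa-miR-99_chr2(-)']
# which apply_merging_dupes() then writes back into every row carrying that sequence.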
###############################################################################################################################################################################################################################

def apply_merging_dupes(group, dupes, manager):

    for x in group:
        for y in dupes:
            if x[1] == y[0]:
                x[0] = y[1]

    group.sort()
    group = list(group for group, _ in itertools.groupby(group))
    manager.extend(group)

##################################################################################################################################################################################################################


def filter_low_counts(c_group, t_group, fil_c_group, fil_t_group, per, counts):

    t_group_new = []
    c_group_new = []

    percent = int(per) / 100
    c_col_filter = round(percent * (len(c_group[1]) - 2))
    t_col_filter = round(percent * (len(t_group[1]) - 2))

    for i, _ in enumerate(c_group):

        c_cols = sum([1 for j in range(len(c_group[i]) - 2) if int(c_group[i][j+2]) >= int(counts)])
        t_cols = sum([1 for j in range(len(t_group[i]) - 2) if int(t_group[i][j+2]) >= int(counts)])

        if c_cols >= c_col_filter or t_cols >= t_col_filter:
            t_group_new.append(t_group[i])
            c_group_new.append(c_group[i])

    fil_c_group.extend(c_group_new)
    fil_t_group.extend(t_group_new)

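# Worked sketch of the filter (hypothetical numbers): with per="50" and counts="10",
# a miRNA row with sample columns ['5', '12', '30', '2'] in the control group has
# 2 of its 4 columns >= 10, so it meets the 50% threshold (c_col_filter = 2) and the
# row is kept in BOTH groups; rows failing the threshold in both groups are dropped
# from both, keeping the two count tables aligned.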
##########################################################################################################################################


def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, group_name1, group_name2, per):

    if flag == 1 and int(per) != -1:
        fp = open('Counts/Filtered ' + group_name2 + ' Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in tre_file_order:
            fp.write("\t" + y)

        for x in fil_tre:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

        fp = open('Counts/Filtered ' + group_name1 + ' Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in con_file_order:
            fp.write("\t" + y)

        for x in fil_con:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

    if flag == 2 and int(per) != -1:
        fp = open('Counts/Filtered ' + group_name2 + ' Non-Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in tre_file_order:
            fp.write("\t" + y)

        for x in fil_tre:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

        fp = open('Counts/Filtered ' + group_name1 + ' Non-Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in con_file_order:
            fp.write("\t" + y)

        for x in fil_con:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

    if flag == 1:
        fp = open('Counts/Raw ' + group_name2 + ' Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in tre_file_order:
            fp.write("\t" + y)

        for x in raw_tre:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

        fp = open('Counts/Raw ' + group_name1 + ' Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in con_file_order:
            fp.write("\t" + y)

        for x in raw_con:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

    if flag == 2:
        fp = open('Counts/Raw ' + group_name2 + ' Non-Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in tre_file_order:
            fp.write("\t" + y)

        for x in raw_tre:
            fp.write("\n%s" % "\t".join(x))
        fp.close()

        fp = open('Counts/Raw ' + group_name1 + ' Non-Templated Counts', 'w')
        fp.write("Name\t")
        fp.write("Sequence")
        for y in con_file_order:
            fp.write("\t" + y)

        for x in raw_con:
            fp.write("\n%s" % "\t".join(x))
        fp.close()


#########################################################################################################################################

def temp_counts_to_diff(names, samp, folder):

    for i in range(2, len(samp[0])):

        fp = open(folder + names[i-2] + '.txt', 'w')
        fp.write("miRNA id" + "\t" + names[i-2] + "\n")

        for x in samp:
            fp.write("%s" % "\t".join([x[0], x[i]]) + "\n")
        fp.close()

##################################################################################################################

def DB_write(con, name, unique_seq, sorted_uni_arms, f):

    if f == 1:
        # Write a txt file with all the information
        if con == "c":
            fp = open('split1/' + name, 'w')
            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads", "Name of isomir", "Sequence"))
        if con == "t":
            fp = open('split2/' + name, 'w')
            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads", "Name of isomir", "Sequence"))

        for i in range(len(sorted_uni_arms)):
            temp = []
            for j in range(len(unique_seq)):

                if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]:
                    temp.append(unique_seq[j])

            temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
            fp.write("*********************************************************************************************************\n")
            fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|", str(sorted_uni_arms[i][0]), "Sequence count = " + str(sorted_uni_arms[i][1]), "Total reads = " + str(sorted_uni_arms[i][2]), "|"))
            fp.write("*********************************************************************************************************\n\n")
            [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2], x[9])) for x in temp]
            fp.write("\n" + "\n")
        fp.close()

    if f == 2:

        if con == "c":
            fp = open('split3/' + name, 'w')
            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads", "Name of isomir", "Sequence"))
        if con == "t":
            fp = open('split4/' + name, 'w')
            fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads", "Name of isomir", "Sequence"))

        for i in range(len(sorted_uni_arms)):
            temp = []
            for j in range(len(unique_seq)):
                if sorted_uni_arms[i][0] == unique_seq[j][2].split("_nont_")[0]:
                    temp.append(unique_seq[j])
            if temp != []:
                temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
                fp.write("*********************************************************************************************************\n")
                fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|", str(sorted_uni_arms[i][0]), "Sequence count = " + str(sorted_uni_arms[i][1]), "Total reads = " + str(sorted_uni_arms[i][2]), "|"))
                fp.write("*********************************************************************************************************\n\n")
                [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2], x[9])) for x in temp]
                fp.write("\n" + "\n")
        fp.close()


##########################################################################################################################

def new_mat_seq(pre_unique_seq, mat_mirnas, l):

    unique_iso = []
    for x in pre_unique_seq:
        if len(x[2].split("_")) == 3:
            for y in pre_unique_seq:
                if x[2] in y[2] and int(x[0].split("-")[1]) < int(y[0].split("-")[1]):
                    if any(y[2] in lst2 for lst2 in unique_iso) == False:
                        y[2] = ">" + y[2]
                        unique_iso.append(y)
    l.acquire()
    for x in unique_iso:
        mat_mirnas.append(x[2])
        mat_mirnas.append(x[9])
    l.release()

#########################################################################################################################

def merging_names(ini_mat, new):

    dupes = []
    final_mat = []

    for num in range(len(ini_mat)):

        if ini_mat[num][1] not in final_mat and ini_mat[num][0] not in final_mat:
            final_mat.append(ini_mat[num][1])
            final_mat.append(ini_mat[num][0])
        else:
            dupes.append(ini_mat[num][1])

    dupes = list(set(dupes))

    for i in range(len(dupes)):
        dupes[i] = [dupes[i]]

    for x in ini_mat:
        for y in dupes:
            if x[1] == y[0]:
                fl = 0
                if len(y) == 1:
                    y.append(x[0])
                else:
                    for i in range(1, len(y)):
                        if y[i].split("_")[0] == x[0].split("_")[0]:
                            fl = 1
                            if len(x[0]) < len(y[i]):
                                del y[i]
                                y.append(x[0])
                            break

                    if fl == 0:
                        y.append((x[0]))

    for y in dupes:
        if len(y) > 2:
            for i in range(len(y) - 1, 1, -1):
                y[1] = y[1] + "/" + y[i]
                del y[i]


    for x in ini_mat:
        for y in dupes:
            if x[1] == y[0]:
                x[0] = y[1]

    ini_mat.sort()
    ini_mat = list(ini_mat for ini_mat, _ in itertools.groupby(ini_mat))

    new.extend(ini_mat)


######################################################################################################################################################

def nontemp_counts_to_diff(tem_names, tem_samp, non_names, non_samp, folder):

    for i in range(2, len(tem_samp[0])):

        fp = open(folder + tem_names[i-2] + '.txt', 'w')
        fp.write("miRNA id" + "\t" + tem_names[i-2] + "\n")

        for x in tem_samp:
            fp.write("%s" % "\t".join([x[0], x[i]]) + "\n")

        for j in range(len(non_names)):
            if non_names[j] == tem_names[i-2]:
                for x in non_samp:
                    fp.write("%s" % "\t".join([x[0], x[j+2]]) + "\n")
        fp.close()

###################################################################################################################################################################################################################

"""

This function downloads the miRNAs of all species from miRBase
and filters them by the requested organism.

input : organism
output: a list with the miRNA sequences in FASTA format

"""

def download_matures(matures, org_name):

    url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'
    data = urllib.request.urlopen(url).read()
    file_mirna = gzip.decompress(data).decode('utf-8')
    file_mirna = file_mirna.split("\n")

    for i in range(0, len(file_mirna) - 1, 2):

        if org_name in file_mirna[i]:
            matures.append(file_mirna[i])
            matures.append(file_mirna[i+1])

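# Usage sketch (assumes miRBase still serves this FTP path and that headers look like
# ">hsa-let-7a-5p MIMAT0000062 Homo sapiens let-7a-5p"):
#
#   matures = []
#   download_matures(matures, "hsa")   # "hsa" selects the human entries
#   # matures now alternates header lines and sequences, e.g.
#   # ['>hsa-let-7a-5p MIMAT0000062 Homo sapiens let-7a-5p', 'UGAGGUAGUAGGUUGUAUAGUU', ...]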
###################################################################################################################################################################################################################


"""

This function keeps all miRNA isoforms which were detected in the SAM files of the first part of the analysis.
These isoforms will be used, together with the canonical (reference) miRNAs, as reference sequences for the
detection of non-templated miRNAs.

"""


def non_template_ref(c_samples, t_samples, all_isoforms):

    pre_uni_seq_con = list(c_samples)
    pre_uni_seq_tre = list(t_samples)

    for x in pre_uni_seq_con:
        for y in x:
            # if ">"+y[2] not in all_isoforms and ")_" in y[2]:
            if ">" + y[2] not in all_isoforms and "_t_" in y[2]:
                all_isoforms.append(">" + y[2])
                all_isoforms.append(y[9])

    for x in pre_uni_seq_tre:
        for y in x:
            # if ">"+y[2] not in all_isoforms and ")_" in y[2]:
            if ">" + y[2] not in all_isoforms and "_t_" in y[2]:
                all_isoforms.append(">" + y[2])
                all_isoforms.append(y[9])

################################################################################################################################################################################################

"""

This function adds the miRNAs that were not detected in common among samples,
so that all samples end up with the same length.

"""

def uncommon_mirnas(sample, mir_names, l, new_d, sample_name, sample_order):

    for y in mir_names:
        flag = 0
        for x in sample:
            if y[0] == x[0]:  # check whether the miRNA already exists in the sample
                flag = 1
                break
        if flag == 0:
            sample.append([y[0], "0", y[1]])  # add the missing miRNA to the sample with zero counts and its sequence

    # Sort and remove duplicates
    sample.sort(key=lambda x: x[0])
    sample = list(sample for sample, _ in itertools.groupby(sample))

    # Return the updated sample
    l.acquire()
    new_d.append(sample)
    sample_order.append(sample_name)
    l.release()

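# Usage sketch (hypothetical data): mir_names holds [name, sequence] pairs collected over all
# samples, while each sample holds [name, counts, sequence] triples. Any miRNA missing from a
# sample is padded with zero counts so that every sample ends up with the same rows:
#
#   sample = [['hsa-miR-21-5p', '150', 'TAGCTTATCAGACTGATGTTGA']]
#   mir_names = [['hsa-miR-21-5p', 'TAGCTTATCAGACTGATGTTGA'],
#                ['hsa-let-7a-5p', 'TGAGGTAGTAGGTTGTATAGTT']]
#   # after uncommon_mirnas(sample, mir_names, lock, new_d, "control_1", sample_order)
#   # new_d[-1] == [['hsa-let-7a-5p', '0', 'TGAGGTAGTAGGTTGTATAGTT'],
#   #               ['hsa-miR-21-5p', '150', 'TAGCTTATCAGACTGATGTTGA']]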
###############################################################################################################################################################################################